From e07dfe6a7091db80bef5c94d10eaf6360d7224de Mon Sep 17 00:00:00 2001 From: amoitra Date: Mon, 21 Oct 2019 12:30:06 -0700 Subject: [PATCH 001/383] Set xla_gpu_use_cudnn_batchnorm to true --- tensorflow/compiler/xla/debug_options_flags.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) mode change 100644 => 100755 tensorflow/compiler/xla/debug_options_flags.cc diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc old mode 100644 new mode 100755 index ec0059d37d9..acffd7734b3 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -42,9 +42,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_use_mkl_dnn(true); #endif // INTEL_MKL opts.set_xla_gpu_max_kernel_unroll_factor(4); - // Set cudnn batchnorm off by default; it does not provide a performance win - // on average. - opts.set_xla_gpu_use_cudnn_batchnorm(false); + // Set cudnn batchnorm on by default. + opts.set_xla_gpu_use_cudnn_batchnorm(true); // Run all GPU work on one stream by default. Using multiple streams // increases memory usage and we lack strong motivating benchmarks for tuning From b64e97b4448b86bafc8dc72d6a5db64e15dfe30e Mon Sep 17 00:00:00 2001 From: amoitra Date: Wed, 13 Nov 2019 12:14:10 -0800 Subject: [PATCH 002/383] Always expand batchnorm inference --- tensorflow/compiler/xla/service/gpu/gpu_compiler.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc index 95e21a84f29..e7877a26db8 100755 --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -165,6 +165,10 @@ Status GpuCompiler::OptimizeHloModule( // where possible. Not every batchnorm op can be implemented as a call to // cudnn, so decompose any remaining batchnorm ops into a soup of HLOs. if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) { + pass.AddPass( + /*rewrite_training_op=*/false, + /*rewrite_inference_op=*/true, + /*rewrite_grad_op=*/false); pass.AddPass(); } pass.AddPass( From 400bba1be9dfb1518b7a0748041d279532b7f7c0 Mon Sep 17 00:00:00 2001 From: amoitra Date: Wed, 13 Nov 2019 13:17:15 -0800 Subject: [PATCH 003/383] Add comments --- tensorflow/compiler/xla/service/gpu/gpu_compiler.cc | 2 ++ 1 file changed, 2 insertions(+) mode change 100755 => 100644 tensorflow/compiler/xla/service/gpu/gpu_compiler.cc diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc old mode 100755 new mode 100644 index e7877a26db8..3100e740afb --- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc @@ -165,6 +165,8 @@ Status GpuCompiler::OptimizeHloModule( // where possible. Not every batchnorm op can be implemented as a call to // cudnn, so decompose any remaining batchnorm ops into a soup of HLOs. if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) { + // Since BatchNorm inference is essentially pointwise operations, it is + // always advantageous to use kernel fusion rather than cudnn. 
pass.AddPass( /*rewrite_training_op=*/false, /*rewrite_inference_op=*/true, From 6a74e16e94f05375b5b220cc046e3e0b1d5d2055 Mon Sep 17 00:00:00 2001 From: Agoniii <815244047@qq.com> Date: Thu, 28 Nov 2019 15:01:34 +0800 Subject: [PATCH 004/383] add label for xlaop --- tensorflow/compiler/tf2xla/kernels/data_format_ops.cc | 5 +++++ tensorflow/compiler/tf2xla/xla_compiler.cc | 11 ----------- tensorflow/compiler/tf2xla/xla_op_registry.cc | 8 ++++++++ tensorflow/compiler/tf2xla/xla_op_registry.h | 5 +++++ 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc index fea2407a5d1..34e909b745a 100644 --- a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc @@ -143,6 +143,11 @@ class DataFormatVecPermuteOp : public XlaOpKernel { REGISTER_XLA_OP( Name("DataFormatVecPermute").TypeConstraint("T", {DT_INT32, DT_INT64}), DataFormatVecPermuteOp); +REGISTER_XLA_OP( + Name("DataFormatVecPermute") + .Label("host") + .TypeConstraint("T", {DT_INT32, DT_INT64}), + DataFormatVecPermuteOp); } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 9d10be1d90a..7124506cf46 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -723,17 +723,6 @@ Status XlaCompiler::CompileFunction( std::unique_ptr graph = GetGraph(fbody); - // Clear the "_kernel" attribute if it is set to "host". This is used to - // indicate that a computation should happen on the host instead of the - // accelerator, but doesn't make sense in XLA. - const char* const kKernelAttr = "_kernel"; - for (Node* n : graph->nodes()) { - string value; - if (TryGetNodeAttr(n->attrs(), kKernelAttr, &value) && value == "host") { - n->ClearAttr(kKernelAttr); - } - } - // _Arg and _Retval nodes don't exist in the stored subgraph for the function; // they are added by the function body looked up. Therefore, they don't have // core assignments here. diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index e70012f761a..6718d4ec4c2 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -61,6 +61,7 @@ XlaOpRegistry::~XlaOpRegistry() = default; /* static */ bool XlaOpRegistry::IsCompatible(const OpRegistration& x, const OpRegistration& y) { if (x.name != y.name) return true; + if (x.label != y.label) return true; // The registrations refer to the same Op: ensures they are compatible and // are restricted to different device whitelists. 
if (x.compilation_only != y.compilation_only) { @@ -256,6 +257,7 @@ void XlaOpRegistry::RegisterCompilationKernels() { std::unique_ptr kdef(new KernelDef); kdef->set_op(op_registration->name); kdef->set_device_type(backend.first); + kdef->set_label(op_registration->label); // Constrain each type attribute to the intersection of: // a) the types supported by the backend, and @@ -539,6 +541,12 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::IsMetadataOp() { return *this; } +XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::Label( + absl::string_view label) { + registration_->label = string(label); + return *this; +} + std::unique_ptr XlaOpRegistrationBuilder::Build( XlaOpRegistry::Factory factory) { registration_->factory = factory; diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index af08790e02e..3a384c7b1d8 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -270,6 +270,8 @@ class XlaOpRegistry { // operands and not their values. bool is_metadata_op = false; + string label; + // Factory used to build OpKernels that perform symbolic execution. Factory factory; }; @@ -350,6 +352,9 @@ class XlaOpRegistrationBuilder { // operands and not their values. XlaOpRegistrationBuilder& IsMetadataOp(); + // Specifies a particular value for the "_kernel" attr. + XlaOpRegistrationBuilder& Label(absl::string_view label); + std::unique_ptr Build( XlaOpRegistry::Factory factory); From 75057371b960dc9e468ea1977031c37e729aeeba Mon Sep 17 00:00:00 2001 From: Johan Euphrosine Date: Mon, 2 Dec 2019 15:15:02 +0900 Subject: [PATCH 005/383] lite/microfrontend: fix FilterbankState unsigned type missmatch FilterbankState work is uint64_t*, casting a signed type prevent the libraries to compile w/ the esp32 arduino core toolchain. --- tensorflow/lite/experimental/microfrontend/lib/filterbank.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/experimental/microfrontend/lib/filterbank.c b/tensorflow/lite/experimental/microfrontend/lib/filterbank.c index a65af382b9a..80f8738f001 100644 --- a/tensorflow/lite/experimental/microfrontend/lib/filterbank.c +++ b/tensorflow/lite/experimental/microfrontend/lib/filterbank.c @@ -118,7 +118,7 @@ static uint32_t Sqrt64(uint64_t num) { uint32_t* FilterbankSqrt(struct FilterbankState* state, int scale_down_shift) { const int num_channels = state->num_channels; - const int64_t* work = state->work + 1; + const uint64_t* work = state->work + 1; // Reuse the work buffer since we're fine clobbering it at this point to hold // the output. 
uint32_t* output = (uint32_t*)state->work; From 90bf975aeccb36c9a816849e3d696730bf50088c Mon Sep 17 00:00:00 2001 From: Agoniii <815244047@qq.com> Date: Mon, 2 Dec 2019 20:04:00 +0800 Subject: [PATCH 006/383] using std::string --- tensorflow/compiler/tf2xla/xla_op_registry.cc | 4 ++-- tensorflow/compiler/tf2xla/xla_op_registry.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index 6718d4ec4c2..5a822712a4a 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -542,8 +542,8 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::IsMetadataOp() { } XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::Label( - absl::string_view label) { - registration_->label = string(label); + std::string label) { + registration_->label = label; return *this; } diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.h b/tensorflow/compiler/tf2xla/xla_op_registry.h index 3a384c7b1d8..5717d86e7ad 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.h +++ b/tensorflow/compiler/tf2xla/xla_op_registry.h @@ -270,7 +270,7 @@ class XlaOpRegistry { // operands and not their values. bool is_metadata_op = false; - string label; + std::string label; // Factory used to build OpKernels that perform symbolic execution. Factory factory; @@ -353,7 +353,7 @@ class XlaOpRegistrationBuilder { XlaOpRegistrationBuilder& IsMetadataOp(); // Specifies a particular value for the "_kernel" attr. - XlaOpRegistrationBuilder& Label(absl::string_view label); + XlaOpRegistrationBuilder& Label(std::string label); std::unique_ptr Build( XlaOpRegistry::Factory factory); From 147de48ad973a6a05e8113af815988014652caf2 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Tue, 3 Dec 2019 01:51:40 +0000 Subject: [PATCH 007/383] Return new instance of AutoCastVariable after assignment Assignments and sparse updates will now return a new instance of `AutoCastVariable` wrapping the [`_UnreadVariable`](https://github.com/tensorflow/tensorflow/blob/2692ea8ec1953e42952597adb5b5099181a679b2/tensorflow/python/ops/resource_variable_ops.py#L1806) returned from the assignment op. 
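For illustration, the user-visible effect is roughly the following sketch. It leans on the internal create_autocast_variable helper and AutoCastVariable class that appear in the diff below, and on TensorFlow of roughly this vintage, so treat it as an approximation rather than supported public API:

import tensorflow as tf

from tensorflow.python.keras.mixed_precision.experimental import autocast_variable

v = autocast_variable.create_autocast_variable(tf.Variable(7.0))
w = v.assign_add(1.0)
# Before this change `w` was the raw result of the underlying assignment (an
# _UnreadVariable); after it, `w` is wrapped in a fresh AutoCastVariable, so
# reads of `w` still auto-cast under a float16 autocast scope.
print(isinstance(w, autocast_variable.AutoCastVariable))  # expected: True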
--- .../experimental/autocast_variable.py | 42 ++++++++++++------- .../experimental/autocast_variable_test.py | 22 ++++++---- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py index 5957b3d8ce0..469c6902025 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py @@ -185,46 +185,60 @@ class AutoCastVariable(variables.Variable): return self._variable.constraint def assign(self, value, use_locking=None, name=None, read_value=True): - return self._variable.assign(value, use_locking, name, read_value) + assign_op = self._variable.assign(value, use_locking, name, read_value) + return create_autocast_variable(assign_op) if read_value else assign_op def assign_add(self, delta, use_locking=None, name=None, read_value=True): - return self._variable.assign_add(delta, use_locking, name, read_value) + assign_op = self._variable.assign_add(delta, use_locking, name, read_value) + return create_autocast_variable(assign_op) if read_value else assign_op def assign_sub(self, delta, use_locking=None, name=None, read_value=True): - return self._variable.assign_sub(delta, use_locking, name, read_value) + assign_op = self._variable.assign_sub(delta, use_locking, name, read_value) + return create_autocast_variable(assign_op) if read_value else assign_op def scatter_sub(self, sparse_delta, use_locking=False, name=None): - return self._variable.scatter_sub(sparse_delta, use_locking, name) + var = self._variable.scatter_sub(sparse_delta, use_locking, name) + return create_autocast_variable(var) def scatter_add(self, sparse_delta, use_locking=False, name=None): - return self._variable.scatter_add(sparse_delta, use_locking, name) + var = self._variable.scatter_add(sparse_delta, use_locking, name) + return create_autocast_variable(var) def scatter_max(self, sparse_delta, use_locking=False, name=None): - return self._variable.scatter_max(sparse_delta, use_locking, name) + var = self._variable.scatter_max(sparse_delta, use_locking, name) + return create_autocast_variable(var) def scatter_min(self, sparse_delta, use_locking=False, name=None): - return self._variable.scatter_min(sparse_delta, use_locking, name) + var = self._variable.scatter_min(sparse_delta, use_locking, name) + return create_autocast_variable(var) def scatter_mul(self, sparse_delta, use_locking=False, name=None): - return self._variable.scatter_mul(sparse_delta, use_locking, name) + var = self._variable.scatter_mul(sparse_delta, use_locking, name) + return create_autocast_variable(var) def scatter_div(self, sparse_delta, use_locking=False, name=None): - return self._variable.scatter_div(sparse_delta, use_locking, name) + var = self._variable.scatter_div(sparse_delta, use_locking, name) + return create_autocast_variable(var) def scatter_update(self, sparse_delta, use_locking=False, name=None): - return self._variable.scatter_update(sparse_delta, use_locking, name) + var = self._variable.scatter_update(sparse_delta, use_locking, name) + return create_autocast_variable(var) def batch_scatter_update(self, sparse_delta, use_locking=False, name=None): - return self._variable.batch_scatter_update(sparse_delta, use_locking, name) + var = self._variable.batch_scatter_update(sparse_delta, use_locking, name) + return create_autocast_variable(var) def scatter_nd_sub(self, indices, updates, 
name=None): - return self._variable.scatter_nd_sub(indices, updates, name) + var = self._variable.scatter_nd_sub(indices, updates, name) + return create_autocast_variable(var) def scatter_nd_add(self, indices, updates, name=None): - return self._variable.scatter_nd_add(indices, updates, name) + var = self._variable.scatter_nd_add(indices, updates, name) + return create_autocast_variable(var) def scatter_nd_update(self, indices, updates, name=None): - return self._variable.scatter_nd_update(indices, updates, name) + var = self._variable.scatter_nd_update(indices, updates, name) + return create_autocast_variable(var) def load(self, value, session=None): return self._variable.load(value, session) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py index ea4d262edab..205adbb04eb 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function import os +from functools import partial from absl.testing import parameterized import numpy as np @@ -71,6 +72,11 @@ def get_var(val, dtype, name=None): @test_util.run_all_in_graph_and_eager_modes class AutoCastVariableTest(test.TestCase, parameterized.TestCase): + def check_and_evaluate(self, var, dtype=None): + self.assertIsInstance(var, autocast_variable.AutoCastVariable) + if dtype: + self.assertEqual(var.dtype, dtype) + return self.evaluate(var) @parameterized.named_parameters(*TESTCASES) def test_read(self, distribute): @@ -157,25 +163,25 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): # Test AutoCastVariable correctly delegates Variable methods to the # underlying variable. 
with get_distribute_scope(distribute): - evaluate = self.evaluate for read_dtype in (dtypes.float32, dtypes.float16): + evaluate = partial(self.check_and_evaluate, dtype=read_dtype) x = get_var(7., dtypes.float32) x = autocast_variable.create_autocast_variable(x) with ops.get_default_graph()._enable_auto_casting_variables( read_dtype): - evaluate(x.initializer) - self.assertEqual(evaluate(x.value()), 7) - self.assertEqual(evaluate(x.read_value()), 7) + self.evaluate(x.initializer) + self.assertEqual(self.evaluate(x.value()), 7) + self.assertEqual(self.evaluate(x.read_value()), 7) self.assertTrue(x.trainable) self.assertEqual(x.synchronization, x._variable.synchronization) self.assertEqual(x.aggregation, x._variable.aggregation) - self.assertEqual(evaluate(x.initialized_value()), 7) + self.assertEqual(self.evaluate(x.initialized_value()), 7) if not context.executing_eagerly(): if not distribute: # These functions are not supported for DistributedVariables x.load(9) self.assertEqual(x.eval(), 9) - self.assertEqual(evaluate(x.initial_value), 7) + self.assertEqual(self.evaluate(x.initial_value), 7) self.assertEqual(x.op, x._variable.op) self.assertEqual(x.graph, x._variable.graph) if not distribute: @@ -197,8 +203,8 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): x = autocast_variable.create_autocast_variable(x) with ops.get_default_graph()._enable_auto_casting_variables( read_dtype): - evaluate(x.initializer) - self.assertAllEqual(evaluate(x.value()), [7, 8]) + self.evaluate(x.initializer) + self.assertAllEqual(self.evaluate(x.value()), [7, 8]) def slices(val, index): return indexed_slices.IndexedSlices( From be36fd93ef817281115eac37c37a800e8b182001 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Tue, 26 Nov 2019 22:19:46 -0800 Subject: [PATCH 008/383] switch BucketBySequenceLengthTest to use combinations --- .../bucket_by_sequence_length_test.py | 32 ++++++++----------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py index d9c463d744d..d829863b994 100644 --- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py @@ -25,11 +25,11 @@ from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -73,14 +73,12 @@ def _get_record_shape(sparse): return tensor_shape.TensorShape([None]) -@test_util.run_all_in_graph_and_eager_modes class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCase): - @parameterized.named_parameters( - ("WithoutPadding", True), - ("WithPadding", False), - ) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(param_no_padding=[True, False]))) def testBucketDropReminder(self, param_no_padding): boundaries = [10, 20, 30] @@ -201,10 +199,9 @@ 
class BucketBySequenceLengthTest(test_base.DatasetTestBase, _test_bucket_by_padding(param_no_padding) - @parameterized.named_parameters( - ("WithoutPadding", True), - ("WithPadding", False), - ) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(param_no_padding=[True, False]))) def testBucket(self, param_no_padding): boundaries = [10, 20, 30] @@ -347,10 +344,9 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, self.assertAllEqual(batches[4], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - @parameterized.named_parameters( - ("WithoutPadding", True), - ("WithPadding", False), - ) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(param_no_padding=[True, False]))) def testTupleElements(self, param_no_padding): def build_dataset(sparse): @@ -381,10 +377,10 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, _test_tuple_elements_by_padding(param_no_padding) - @parameterized.named_parameters( - ("DoDropRemainder", True), - ("DoNotDropRemainder", False), - ) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(param_drop_remainder=[True, False]))) def testBucketSparse(self, param_drop_remainder): # pylint: disable=g-doc-args """Tests bucketing of sparse tensors (case where `no_padding` == True). From c5c6f9686f1be93a54d344e5f2f864f1855e1bed Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 10:11:03 -0800 Subject: [PATCH 009/383] Switch CopyToDeviceTest to use TF combination --- .../kernel_tests/copy_to_device_test.py | 68 +++++++++++++------ 1 file changed, 46 insertions(+), 22 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py index 36c61636798..08769f9622f 100644 --- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.core.protobuf import config_pb2 from tensorflow.python.compat import compat from tensorflow.python.data.experimental.ops import prefetching_ops @@ -24,6 +26,7 @@ from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import structure +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -35,9 +38,10 @@ from tensorflow.python.util import compat as util_compat # TODO(b/117581999): add eager coverage when supported. 
-class CopyToDeviceTest(test_base.DatasetTestBase): +class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -62,7 +66,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceInt32(self): host_dataset = dataset_ops.Dataset.from_tensors([0, 1, 2, 3]) device_dataset = host_dataset.apply( @@ -86,7 +91,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToSameDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -111,7 +117,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceWithPrefetch(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -136,7 +143,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyDictToDevice(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -161,7 +169,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyDictToDeviceWithPrefetch(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -186,7 +195,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopySparseTensorsToDevice(self): def make_tensor(i): @@ -219,7 +229,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopySparseTensorsToDeviceWithPrefetch(self): def make_tensor(i): @@ -252,7 +263,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceGpu(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -273,7 +285,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with 
self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceGpuWithPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -294,7 +307,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceGpuWithMap(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -332,7 +346,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceGpuInt32(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -352,7 +367,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceGpuInt32AndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -372,7 +388,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceGpuStrings(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -392,7 +409,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceGpuStringsAndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -412,7 +430,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDevicePingPongCPUGPU(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -436,7 +455,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceWithReInit(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -465,7 +485,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceWithReInitAndPrefetch(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -494,7 +515,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with 
self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceGpuWithReInit(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -518,7 +540,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testCopyToDeviceGpuWithReInitAndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -542,7 +565,8 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testIteratorGetNextAsOptionalOnGPU(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") From 7fc9eb7bfee7dd62000fd39c44a566444503a93a Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 10:24:12 -0800 Subject: [PATCH 010/383] Switch CounterTest to use TF combinations --- .../python/data/experimental/kernel_tests/counter_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py index 79e4523ea43..11629573ad1 100644 --- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py @@ -17,17 +17,19 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.experimental.ops import counter from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class CounterTest(test_base.DatasetTestBase): +class CounterTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testCounter(self): """Test dataset construction using `count`.""" dataset = counter.Counter(start=3, step=4) From 1246058569a00f9bc2d479eeb94dc5f3e4c708c4 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 10:33:02 -0800 Subject: [PATCH 011/383] Switch CsvDatasetTest to use TF combinations --- .../kernel_tests/csv_dataset_test.py | 54 +++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py index 4b349ebd811..941ca209848 100644 --- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py @@ -22,21 +22,22 @@ import gzip import os import zlib +from absl.testing import parameterized + from tensorflow.python.data.experimental.ops import error_ops from tensorflow.python.data.experimental.ops import readers from tensorflow.python.data.kernel_tests import test_base from 
tensorflow.python.data.ops import readers as core_readers from tensorflow.python.eager import context +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class CsvDatasetTest(test_base.DatasetTestBase): +class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def _setup_files(self, inputs, linebreak='\n', compression_type=None): filenames = [] @@ -117,26 +118,31 @@ class CsvDatasetTest(test_base.DatasetTestBase): dataset = readers.CsvDataset(filenames, **kwargs) self._verify_output_or_err(dataset, expected_output, expected_err_re) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_requiredFields(self): record_defaults = [[]] * 4 inputs = [['1,2,3,4']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_int(self): record_defaults = [[0]] * 4 inputs = [['1,2,3,4', '5,6,7,8']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_float(self): record_defaults = [[0.0]] * 4 inputs = [['1.0,2.1,3.2,4.3', '5.4,6.5,7.6,8.7']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_string(self): record_defaults = [['']] * 4 inputs = [['1.0,2.1,hello,4.3', '5.4,6.5,goodbye,8.7']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withEmptyFields(self): record_defaults = [[0]] * 4 inputs = [[',,,', '1,1,1,', ',2,2,2']] @@ -144,6 +150,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errWithUnquotedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,2"3,4']] @@ -152,6 +159,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected_err_re='Unquoted fields cannot have quotes inside', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errWithUnescapedQuotes(self): record_defaults = [['']] * 3 inputs = [['"a"b","c","d"']] @@ -161,6 +169,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): 'Quote inside a string has to be escaped by another quote', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_ignoreErrWithUnescapedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,"2"3",4', '1,"2"3",4",5,5', 'a,b,"c"d"', 'e,f,g']] @@ -169,6 +178,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): dataset = dataset.apply(error_ops.ignore_errors()) self._verify_output_or_err(dataset, [['e', 'f', 'g']]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_ignoreErrWithUnquotedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,2"3,4', 'a,b,c"d', '9,8"7,6,5', 'e,f,g']] @@ -177,12 +187,14 @@ class CsvDatasetTest(test_base.DatasetTestBase): dataset = dataset.apply(error_ops.ignore_errors()) 
self._verify_output_or_err(dataset, [['e', 'f', 'g']]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNoQuoteDelimAndUnquotedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,2"3,4']] self._test_by_comparison( inputs, record_defaults=record_defaults, use_quote_delim=False) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_mixedTypes(self): record_defaults = [ constant_op.constant([], dtype=dtypes.int32), @@ -193,30 +205,35 @@ class CsvDatasetTest(test_base.DatasetTestBase): inputs = [['1,2.1,3.2,4.3', '5,6.5,7.6,8.7']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withUseQuoteDelimFalse(self): record_defaults = [['']] * 4 inputs = [['1,2,"3,4"', '"5,6",7,8']] self._test_by_comparison( inputs, record_defaults=record_defaults, use_quote_delim=False) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withFieldDelim(self): record_defaults = [[0]] * 4 inputs = [['1:2:3:4', '5:6:7:8']] self._test_by_comparison( inputs, record_defaults=record_defaults, field_delim=':') + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNaValue(self): record_defaults = [[0]] * 4 inputs = [['1,NA,3,4', 'NA,6,7,8']] self._test_by_comparison( inputs, record_defaults=record_defaults, na_value='NA') + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withSelectCols(self): record_defaults = [['']] * 2 inputs = [['1,2,3,4', '"5","6","7","8"']] self._test_by_comparison( inputs, record_defaults=record_defaults, select_cols=[1, 2]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withSelectColsTooHigh(self): record_defaults = [[0]] * 2 inputs = [['1,2,3,4', '5,6,7,8']] @@ -226,23 +243,27 @@ class CsvDatasetTest(test_base.DatasetTestBase): record_defaults=record_defaults, select_cols=[3, 4]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withOneCol(self): record_defaults = [['NA']] inputs = [['0', '', '2']] self._test_dataset( inputs, [['0'], ['NA'], ['2']], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withMultipleFiles(self): record_defaults = [[0]] * 4 inputs = [['1,2,3,4', '5,6,7,8'], ['5,6,7,8']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withLeadingAndTrailingSpaces(self): record_defaults = [[0.0]] * 4 inputs = [['0, 1, 2, 3']] expected = [[0.0, 1.0, 2.0, 3.0]] self._test_dataset(inputs, expected, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithMissingDefault(self): record_defaults = [[]] * 2 inputs = [['0,']] @@ -251,6 +272,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected_err_re='Field 1 is required but missing in record!', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithFewerDefaultsThanFields(self): record_defaults = [[0.0]] * 2 inputs = [['0,1,2,3']] @@ -259,6 +281,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected_err_re='Expect 2 fields but have more in record', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithMoreDefaultsThanFields(self): 
record_defaults = [[0.0]] * 5 inputs = [['0,1,2,3']] @@ -267,6 +290,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected_err_re='Expect 5 fields but have 4 in record', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withHeader(self): record_defaults = [[0]] * 2 inputs = [['col1,col2', '1,2']] @@ -278,6 +302,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withHeaderAndNoRecords(self): record_defaults = [[0]] * 2 inputs = [['col1,col2']] @@ -289,6 +314,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithHeaderEmptyFile(self): record_defaults = [[0]] * 2 inputs = [[]] @@ -300,12 +326,14 @@ class CsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withEmptyFile(self): record_defaults = [['']] * 2 inputs = [['']] # Empty file self._test_dataset( inputs, expected_output=[], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithEmptyRecord(self): record_defaults = [['']] * 2 inputs = [['', '1,2']] # First record is empty @@ -314,6 +342,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected_err_re='Expect 2 fields but have 1 in record', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withChainedOps(self): # Testing that one dataset can create multiple iterators fine. # `repeat` creates multiple iterators from the same C++ Dataset. @@ -325,6 +354,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): ds_actual.repeat(5).prefetch(1), ds_expected.repeat(5).prefetch(1)) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withTypeDefaults(self): # Testing using dtypes as record_defaults for required fields record_defaults = [dtypes.float32, [0.0]] @@ -335,6 +365,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): record_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCsvDataset_fieldOrder(self): data = [[ '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19', @@ -352,6 +383,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): ## The following tests exercise parsing logic for quoted fields + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withQuoted(self): record_defaults = [['']] * 4 inputs = [['"a","b","c :)","d"', '"e","f","g :(","h"']] @@ -363,6 +395,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset( inputs, [['0'], ['1'], ['2']], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNewLine(self): # In this case, we expect it to behave differently from # TextLineDataset->map(decode_csv) since that flow has bugs @@ -371,6 +404,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']] self._test_dataset(inputs, expected, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNewLineInUnselectedCol(self): record_defaults = [['']] inputs = [['1,"2\n3",4', '5,6,7']] @@ -380,6 +414,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): 
record_defaults=record_defaults, select_cols=[0]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withMultipleNewLines(self): # In this case, we expect it to behave differently from # TextLineDataset->map(decode_csv) since that flow has bugs @@ -388,6 +423,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']] self._test_dataset(inputs, expected, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithTerminateMidRecord(self): record_defaults = [['']] * 4 inputs = [['a,b,c,"a']] @@ -397,6 +433,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): 'Reached end of file without closing quoted field in record', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withEscapedQuotes(self): record_defaults = [['']] * 4 inputs = [['1.0,2.1,"she said: ""hello""",4.3', '5.4,6.5,goodbye,8.7']] @@ -406,6 +443,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): ## Testing that parsing works with all buffer sizes, quoted/unquoted fields, ## and different types of line breaks + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withInvalidBufferSize(self): record_defaults = [['']] * 4 inputs = [['a,b,c,d']] @@ -432,6 +470,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): record_defaults=record_defaults, buffer_size=i) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withLF(self): record_defaults = [['NA']] * 3 inputs = [['abc,def,ghi', '0,1,2', ',,']] @@ -439,6 +478,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\n', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCR(self): # Test that when the line separator is '\r', parsing works with all buffer # sizes @@ -448,6 +488,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCRLF(self): # Test that when the line separator is '\r\n', parsing works with all buffer # sizes @@ -457,6 +498,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r\n', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withBufferSizeAndQuoted(self): record_defaults = [['NA']] * 3 inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] @@ -465,6 +507,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\n', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCRAndQuoted(self): # Test that when the line separator is '\r', parsing works with all buffer # sizes @@ -475,6 +518,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCRLFAndQuoted(self): # Test that when the line separator is '\r\n', parsing works with all buffer # sizes @@ -485,6 +529,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): 
self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r\n', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withGzipCompressionType(self): record_defaults = [['NA']] * 3 inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] @@ -497,6 +542,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): compression_type='GZIP', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withZlibCompressionType(self): record_defaults = [['NA']] * 3 inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] @@ -509,6 +555,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): compression_type='ZLIB', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withScalarDefaults(self): record_defaults = [constant_op.constant(0, dtype=dtypes.int64)] * 4 inputs = [[',,,', '1,1,1,', ',2,2,2']] @@ -516,6 +563,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_with2DDefaults(self): record_defaults = [constant_op.constant([[0]], dtype=dtypes.int64)] * 4 inputs = [[',,,', '1,1,1,', ',2,2,2']] From b9350874eb8bbf4f0576234b1ecdc997d65bea1c Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 11:11:12 -0800 Subject: [PATCH 012/383] Switch DenseToSparseBatchTest to use TF combinations --- .../kernel_tests/dense_to_sparse_batch_test.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py index cca7ae073ee..5dd1bb0532c 100644 --- a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py @@ -17,20 +17,21 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class DenseToSparseBatchTest(test_base.DatasetTestBase): +class DenseToSparseBatchTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDataset(self): components = np.random.randint(12, size=(100,)).astype(np.int32) dataset = dataset_ops.Dataset.from_tensor_slices( @@ -53,6 +54,7 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDatasetWithUnknownShape(self): components = np.random.randint(5, size=(40,)).astype(np.int32) dataset = dataset_ops.Dataset.from_tensor_slices( @@ -80,12 +82,14 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase): with 
self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDatasetWithInvalidShape(self): input_tensor = array_ops.constant([[1]]) with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"): dataset_ops.Dataset.from_tensors(input_tensor).apply( batching.dense_to_sparse_batch(4, [-2])) + @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDatasetShapeErrors(self): def dataset_fn(input_tensor): From 12a6cc569963354dfa4a5d10af291a9e6fcc3b06 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 11:17:18 -0800 Subject: [PATCH 013/383] Switch DirectedInterleaveDatasetTest to use TF combinations --- .../kernel_tests/directed_interleave_dataset_test.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py index 4a8c7d1ccc6..fc18afaa842 100644 --- a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py @@ -17,22 +17,24 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import interleave_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import random_seed -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class DirectedInterleaveDatasetTest(test_base.DatasetTestBase): +class DirectedInterleaveDatasetTest(test_base.DatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testBasic(self): selector_dataset = dataset_ops.Dataset.range(10).repeat(100) input_datasets = [ @@ -76,6 +78,7 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase): return freqs + @combinations.generate(test_base.default_test_combinations()) def testSampleFromDatasets(self): random_seed.set_random_seed(1619) num_samples = 5000 @@ -95,6 +98,7 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase): freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples) self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2) + @combinations.generate(test_base.default_test_combinations()) def testSelectFromDatasets(self): words = [b"foo", b"bar", b"baz"] datasets = [dataset_ops.Dataset.from_tensors(w).repeat() for w in words] @@ -107,6 +111,7 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testErrors(self): with self.assertRaisesRegexp(ValueError, r"vector of length `len\(datasets\)`"): From b7be69fcce909f9aef82dc975025d14eb374aa45 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 11:29:58 -0800 Subject: [PATCH 014/383] Switch GetSingleElementTest to use TF combinations --- 
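Patches 008 through 016 all apply the same migration; a minimal self-contained sketch of the pattern, using only the helpers already visible in these diffs plus a made-up test body, looks like this:

from absl.testing import parameterized

from tensorflow.python.data.kernel_tests import test_base
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import combinations
from tensorflow.python.platform import test


class ExampleTest(test_base.DatasetTestBase, parameterized.TestCase):

  # Replaces class-level @test_util.run_all_in_graph_and_eager_modes (or
  # per-method @test_util.deprecated_graph_mode_only) with an explicit grid of
  # (tf_api_version, mode) combinations, optionally crossed with parameters.
  @combinations.generate(
      combinations.times(
          test_base.default_test_combinations(),
          combinations.combine(param_drop_remainder=[True, False])))
  def testExample(self, param_drop_remainder):
    dataset = dataset_ops.Dataset.range(10).batch(
        4, drop_remainder=param_drop_remainder)
    expected = [[0, 1, 2, 3], [4, 5, 6, 7]]
    if not param_drop_remainder:
      expected.append([8, 9])
    self.assertDatasetProduces(dataset, expected)


if __name__ == "__main__":
  test.main()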
.../kernel_tests/get_single_element_test.py | 25 +++++++++++-------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py index f65740c5651..98fb3713b80 100644 --- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py @@ -23,25 +23,27 @@ from tensorflow.python.data.experimental.ops import get_single_element from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import function +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): - @parameterized.named_parameters( - ("Zero", 0, 1), - ("Five", 5, 1), - ("Ten", 10, 1), - ("Empty", 100, 1, errors.InvalidArgumentError, "Dataset was empty."), - ("MoreThanOne", 0, 2, errors.InvalidArgumentError, - "Dataset had more than one element."), - ) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine( + skip=[0, 5, 10], take=[1], error=[None], error_msg=[None]) + + combinations.combine( + skip=[100], take=[1], error=[errors.InvalidArgumentError], + error_msg=["Dataset was empty."]) + + combinations.combine( + skip=[0], take=[2], error=[errors.InvalidArgumentError], + error_msg=["Dataset had more than one element."]))) def testGetSingleElement(self, skip, take, error=None, error_msg=None): def make_sparse(x): @@ -62,6 +64,7 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaisesRegexp(error, error_msg): self.evaluate(get_single_element.get_single_element(dataset)) + @combinations.generate(test_base.default_test_combinations()) def testWindow(self): """Test that `get_single_element()` can consume a nested dataset.""" def flat_map_func(ds): @@ -73,6 +76,7 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces( dataset, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) + @combinations.generate(test_base.default_test_combinations()) def testSideEffect(self): counter_var = variables.Variable(0) @@ -92,6 +96,7 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertEqual(self.evaluate(fn()), b"hello") self.assertEqual(self.evaluate(counter_var), 1) + @combinations.generate(test_base.default_test_combinations()) def testAutomaticControlDependencies(self): counter_var = variables.Variable(1) From b48761c689d575c92995e15dd78990cb291badf3 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 11:35:25 -0800 Subject: [PATCH 015/383] Switch GroupByReducerTest to use TF combinations --- .../kernel_tests/group_by_reducer_test.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py index 0e9042b2ef8..bf823143d57 100644 --- 
a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py @@ -17,25 +17,26 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class GroupByReducerTest(test_base.DatasetTestBase): +class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testSum(self): reducer = grouping.Reducer( init_func=lambda _: np.int64(0), @@ -49,6 +50,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[(i - 1) * i, i * i]) + @combinations.generate(test_base.default_test_combinations()) def testAverage(self): def reduce_fn(x, y): @@ -68,6 +70,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[i - 1, i]) + @combinations.generate(test_base.default_test_combinations()) def testConcat(self): components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray) reducer = grouping.Reducer( @@ -84,6 +87,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[b"acegikmoqs"[:i], b"bdfhjlnprt"[:i]]) + @combinations.generate(test_base.default_test_combinations()) def testSparseSum(self): def _sparse(i): return sparse_tensor.SparseTensorValue( @@ -103,6 +107,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[(i - 1) * i, i * i]) + @combinations.generate(test_base.default_test_combinations()) def testChangingStateShape(self): def reduce_fn(x, _): @@ -130,6 +135,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testTypeMismatch(self): reducer = grouping.Reducer( init_func=lambda x: constant_op.constant(1, dtype=dtypes.int32), @@ -144,6 +150,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): grouping.group_by_reducer(lambda _: np.int64(0), reducer)) # TODO(b/78665031): Remove once non-scalar keys are supported. + @combinations.generate(test_base.default_test_combinations()) def testInvalidKeyShape(self): reducer = grouping.Reducer( init_func=lambda x: np.int64(0), @@ -157,6 +164,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): grouping.group_by_reducer(lambda _: np.int64((0, 0)), reducer)) # TODO(b/78665031): Remove once non-int64 keys are supported. 
+  @combinations.generate(test_base.default_test_combinations())
   def testInvalidKeyType(self):
     reducer = grouping.Reducer(
         init_func=lambda x: np.int64(0),
@@ -169,6 +177,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       dataset.apply(
           grouping.group_by_reducer(lambda _: "wrong", reducer))
 
+  @combinations.generate(test_base.default_test_combinations())
   def testTuple(self):
     def init_fn(_):
       return np.array([], dtype=np.int64), np.int64(0)

From 4be4b406a7507f8ab3b0c6643399f1383b6f3b7d Mon Sep 17 00:00:00 2001
From: Fei Hu
Date: Wed, 27 Nov 2019 11:40:44 -0800
Subject: [PATCH 016/383] Switch GroupByWindowTest to use TF combinations

---
 .../kernel_tests/group_by_window_test.py | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
index e529364e509..2495083cf63 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
@@ -17,17 +17,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import combinations
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
@@ -37,8 +38,7 @@ from tensorflow.python.platform import test
 # NOTE(mrry): These tests are based on the tests in bucket_ops_test.py.
 # Currently, they use a constant batch size, though should be made to use a
 # different batch size per key.
-@test_util.run_all_in_graph_and_eager_modes -class GroupByWindowTest(test_base.DatasetTestBase): +class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): def _dynamicPad(self, bucket, window, window_size): # TODO(mrry): To match `tf.contrib.training.bucket()`, implement a @@ -51,6 +51,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): 32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape( [None]), tensor_shape.TensorShape([3]))))) + @combinations.generate(test_base.default_test_combinations()) def testSingleBucket(self): def _map_fn(v): @@ -80,6 +81,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual(expected_unk_int64, bucketed_values[1]) self.assertAllEqual(expected_vec3_str, bucketed_values[2]) + @combinations.generate(test_base.default_test_combinations()) def testEvenOddBuckets(self): def _map_fn(v): @@ -132,6 +134,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1]) self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2]) + @combinations.generate(test_base.default_test_combinations()) def testEvenOddBucketsFilterOutAllOdd(self): def _map_fn(v): @@ -173,6 +176,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual( np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"]) + @combinations.generate(test_base.default_test_combinations()) def testDynamicWindowSize(self): components = np.arange(100).astype(np.int64) @@ -202,6 +206,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertEqual(batches, 15) + @combinations.generate(test_base.default_test_combinations()) def testSimple(self): components = np.random.randint(100, size=(200,)).astype(np.int64) dataset = dataset_ops.Dataset.from_tensor_slices( @@ -222,6 +227,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertGreaterEqual(num_full_batches, 24) self.assertTrue(all(c == 4 for c in counts[:num_full_batches])) + @combinations.generate(test_base.default_test_combinations()) def testImmediateOutput(self): components = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64) @@ -240,6 +246,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next())) self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next())) + @combinations.generate(test_base.default_test_combinations()) def testSmallGroups(self): components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64) dataset = dataset_ops.Dataset.from_tensor_slices(components).apply( @@ -252,6 +259,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual([0, 0, 0], self.evaluate(get_next())) self.assertAllEqual([1], self.evaluate(get_next())) + @combinations.generate(test_base.default_test_combinations()) def testEmpty(self): dataset = dataset_ops.Dataset.range(4).apply( grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0)) @@ -262,6 +270,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): "Window size must be greater than zero, but got 0."): print(self.evaluate(get_next())) + @combinations.generate(test_base.default_test_combinations()) def testReduceFuncError(self): components = np.random.randint(100, size=(200,)).astype(np.int64) @@ -280,6 +289,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def 
testConsumeWindowDatasetMoreThanOnce(self): components = np.random.randint(50, size=(200,)).astype(np.int64) @@ -311,6 +321,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): counts.append(tight_result.shape[0]) self.assertEqual(len(components), sum(counts)) + @combinations.generate(test_base.default_test_combinations()) def testShortCircuit(self): dataset = dataset_ops.Dataset.range(10) From 27a2f9f39f8a43fb203107ac0c297d0fe7ef4b22 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 11:46:02 -0800 Subject: [PATCH 017/383] Switch IgnoreErrorsTest to use TF combinations --- .../experimental/kernel_tests/ignore_errors_test.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py index c37439f328b..5ed72767425 100644 --- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py @@ -19,14 +19,15 @@ from __future__ import print_function import os +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import error_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.lib.io import python_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import io_ops @@ -36,9 +37,9 @@ from tensorflow.python.util import compat _NUMPY_RANDOM_SEED = 42 -@test_util.run_all_in_graph_and_eager_modes -class IgnoreErrorsTest(test_base.DatasetTestBase): +class IgnoreErrorsTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) @@ -53,6 +54,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testParallelMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) @@ -67,6 +69,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testReadFileIgnoreError(self): def write_string_to_file(value, filename): @@ -102,6 +105,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testTFRecordDatasetIgnoreError(self): filenames = [] for i in range(5): @@ -126,6 +130,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testZipIgnoreError(self): a = dataset_ops.Dataset.from_tensor_slices([1., 2., 0., 4.]) b = a.map(lambda x: array_ops.check_numerics(1. 
/ x, "error")) From e0a77c884f83772c80334dbb152773a377d3049c Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 11:51:16 -0800 Subject: [PATCH 018/383] Switch MakeBatchedFeaturesDatasetTest to use TF combinations --- .../make_batched_features_dataset_test.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py index 2ddff457bc4..980fd03b073 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py @@ -17,26 +17,29 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.ops import readers +from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers as core_readers from tensorflow.python.data.util import nest +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.ops import io_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class MakeBatchedFeaturesDatasetTest( - reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase): + reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testRead(self): for batch_size in [1, 2]: for num_epochs in [1, 10]: @@ -85,6 +88,7 @@ class MakeBatchedFeaturesDatasetTest( with self.assertRaises(errors.OutOfRangeError): self._next_actual_batch() + @combinations.generate(test_base.default_test_combinations()) def testReadWithEquivalentDataset(self): features = { "file": parsing_ops.FixedLenFeature([], dtypes.int64), @@ -103,6 +107,7 @@ class MakeBatchedFeaturesDatasetTest( with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testReadWithFusedShuffleRepeatDataset(self): num_epochs = 5 total_records = num_epochs * self._num_records @@ -151,6 +156,7 @@ class MakeBatchedFeaturesDatasetTest( all_equal = all_equal and np.array_equal(batch1[i], batch2[i]) self.assertFalse(all_equal) + @combinations.generate(test_base.default_test_combinations()) def testParallelReadersAndParsers(self): num_epochs = 5 for batch_size in [1, 2]: @@ -186,6 +192,7 @@ class MakeBatchedFeaturesDatasetTest( with self.assertRaises(errors.OutOfRangeError): self._next_actual_batch() + @combinations.generate(test_base.default_test_combinations()) def testDropFinalBatch(self): for batch_size in [1, 2]: for num_epochs in [1, 10]: @@ -201,6 +208,7 @@ class MakeBatchedFeaturesDatasetTest( if isinstance(tensor, ops.Tensor): # Guard against SparseTensor. 
self.assertEqual(tensor.shape[0], batch_size) + @combinations.generate(test_base.default_test_combinations()) def testIndefiniteRepeatShapeInference(self): dataset = self.make_batch_feature( filenames=self.test_filenames[0], @@ -213,6 +221,7 @@ class MakeBatchedFeaturesDatasetTest( if issubclass(clazz, ops.Tensor): self.assertEqual(32, shape[0]) + @combinations.generate(test_base.default_test_combinations()) def testOldStyleReader(self): with self.assertRaisesRegexp( TypeError, r"The `reader` argument must return a `Dataset` object. " From 0e445ffe0c5eebec53bca5a0893a850288fb5f88 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 11:56:03 -0800 Subject: [PATCH 019/383] Switch MakeCsvDatasetTest to use TF combinations --- .../kernel_tests/make_csv_dataset_test.py | 23 ++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py index 16c323b3790..5f8382f43c4 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py @@ -21,21 +21,21 @@ import gzip import os import zlib +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import readers from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class MakeCsvDatasetTest(test_base.DatasetTestBase): +class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def _make_csv_dataset(self, filenames, batch_size, num_epochs=1, **kwargs): return readers.make_csv_dataset( @@ -126,6 +126,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): self._verify_output(dataset, batch_size, num_epochs, label_name, expected_output, expected_keys) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset(self): """Tests making a CSV dataset with keys and defaults provided.""" record_defaults = [ @@ -157,6 +158,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withBatchSizeAndEpochs(self): """Tests making a CSV dataset with keys and defaults provided.""" record_defaults = [ @@ -188,6 +190,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withCompressionType(self): """Tests `compression_type` argument.""" record_defaults = [ @@ -221,6 +224,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): compression_type=compression_type, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withCompressionTypeAndNoColumnNames(self): """Tests `compression_type` argument.""" record_defaults = [ @@ -269,6 +273,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): compression_type="ZLIB", ) + @combinations.generate(test_base.default_test_combinations()) def 
testMakeCSVDataset_withBadInputs(self): """Tests that exception is raised when input is malformed. """ @@ -304,6 +309,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): label_name="not_a_real_label", column_names=column_names) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNoLabel(self): """Tests making a CSV dataset with no label provided.""" record_defaults = [ @@ -333,6 +339,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNoHeader(self): """Tests that datasets can be created from CSV files with no header line. """ @@ -363,6 +370,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypes(self): """Tests that defaults can be a dtype instead of a Tensor for required vals. """ @@ -394,6 +402,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNoColNames(self): """Tests that datasets can be created when column names are not specified. @@ -427,6 +436,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypeInferenceMismatch(self): # Test that error is thrown when num fields doesn't match columns column_names = ["col%d" % i for i in range(5)] @@ -442,6 +452,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): batch_size=2, num_epochs=10) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypeInference(self): """Tests that datasets can be created when no defaults are specified. @@ -468,6 +479,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypeInferenceFallthrough(self): """Tests that datasets can be created when no defaults are specified. 
@@ -498,6 +510,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNAValuesAndFieldDelim(self): """Tests that datasets can be created from different delim and na_value.""" column_names = ["col%d" % i for i in range(5)] @@ -520,6 +533,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): field_delim=" ", ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withSelectCols(self): record_defaults = [ constant_op.constant([], dtypes.int32), @@ -588,6 +602,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): select_columns=[column_names[i] for i in select_cols], ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withSelectColsError(self): record_defaults = [ constant_op.constant([], dtypes.int32), @@ -626,6 +641,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): label_name=None, select_columns=["invalid_col_name"]) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withShuffle(self): record_defaults = [ constant_op.constant([], dtypes.int32), @@ -710,6 +726,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): all_equal = all_equal and np.array_equal(batch1[i], batch2[i]) self.assertFalse(all_equal) + @combinations.generate(test_base.default_test_combinations()) def testIndefiniteRepeatShapeInference(self): column_names = ["col%d" % i for i in range(5)] inputs = [[",".join(x for x in column_names), "0,1,2,3,4", "5,6,7,8,9"], [ From c0e7479f2d2ff61f87d4839f79b03e6140f3fd2c Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 11:59:53 -0800 Subject: [PATCH 020/383] Switch MakeTFRecordDatasetTest to use TF combinations --- .../kernel_tests/make_tf_record_dataset_test.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py index ec1760398fa..a67ccd92842 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py @@ -17,19 +17,22 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.ops import readers +from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import string_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class MakeTFRecordDatasetTest( - reader_dataset_ops_test_base.TFRecordDatasetTestBase): + reader_dataset_ops_test_base.TFRecordDatasetTestBase, + parameterized.TestCase): def _read_test(self, batch_size, num_epochs, file_index=None, num_parallel_reads=1, drop_final_batch=False, parser_fn=False): @@ -63,6 +66,7 @@ class MakeTFRecordDatasetTest( with self.assertRaises(errors.OutOfRangeError): self.evaluate(outputs()) + @combinations.generate(test_base.default_test_combinations()) def 
testRead(self): for batch_size in [1, 2]: for num_epochs in [1, 3]: @@ -78,6 +82,7 @@ class MakeTFRecordDatasetTest( # Basic test: read from both files, with parallel reads. self._read_test(batch_size, num_epochs, num_parallel_reads=8) + @combinations.generate(test_base.default_test_combinations()) def testDropFinalBatch(self): for batch_size in [1, 2, 10]: for num_epochs in [1, 3]: @@ -91,6 +96,7 @@ class MakeTFRecordDatasetTest( self._read_test(batch_size, num_epochs, num_parallel_reads=8, drop_final_batch=True) + @combinations.generate(test_base.default_test_combinations()) def testParserFn(self): for batch_size in [1, 2]: for num_epochs in [1, 3]: @@ -145,6 +151,7 @@ class MakeTFRecordDatasetTest( actual.extend(b) self.assertAllEqual(sorted(expected), sorted(actual)) + @combinations.generate(test_base.default_test_combinations()) def testShuffle(self): for batch_size in [1, 2]: for num_epochs in [1, 3]: @@ -156,6 +163,7 @@ class MakeTFRecordDatasetTest( self._shuffle_test(batch_size, num_epochs, num_parallel_reads, seed=21345) + @combinations.generate(test_base.default_test_combinations()) def testIndefiniteRepeatShapeInference(self): dataset = readers.make_tf_record_dataset( file_pattern=self.test_filenames, num_epochs=None, batch_size=32) From 49f85cceb9d0a5382465438bfd75153ddcf974e8 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 12:08:52 -0800 Subject: [PATCH 021/383] Switch MapDefunTest to use TF combinations --- .../kernel_tests/map_defun_op_test.py | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py index a42ce40fb29..2c085352c50 100644 --- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py @@ -19,10 +19,13 @@ from __future__ import print_function import time +from absl.testing import parameterized + from tensorflow.python.client import session from tensorflow.python.data.experimental.ops import map_defun from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.eager import function +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -39,8 +42,10 @@ from tensorflow.python.platform import test @test_util.run_v1_only("b/123903858: Add eager and V2 test coverage") -class MapDefunTest(test_base.DatasetTestBase): +class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testNoIntraOpLimit(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -55,6 +60,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = elems * 2 + 3 self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunSimple(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -67,6 +74,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = elems * 2 + 3 self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunMismatchedTypes(self): 
@function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)]) @@ -79,6 +88,8 @@ class MapDefunTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(r) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunReduceDim(self): # Tests where the output has a different rank from the input @@ -92,6 +103,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = constant_op.constant([1, 3, 5]) self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunMultipleOutputs(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -105,6 +118,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = [elems, elems * 2 + 3] self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunShapeInference(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -116,6 +131,8 @@ class MapDefunTest(test_base.DatasetTestBase): result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)])[0] self.assertEqual(result.get_shape(), (3, 2)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunPartialShapeInference(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -126,6 +143,8 @@ class MapDefunTest(test_base.DatasetTestBase): result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)]) self.assertEqual(result[0].get_shape().as_list(), [None, 2]) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunRaisesErrorOnRuntimeShapeMismatch(self): @function.defun(input_signature=[ @@ -145,6 +164,8 @@ class MapDefunTest(test_base.DatasetTestBase): "All inputs must have the same dimension 0."): sess.run(result, feed_dict={elems1: [1, 2, 3, 4, 5], elems2: [1, 2, 3]}) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunRaisesDefunError(self): @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)]) @@ -157,6 +178,8 @@ class MapDefunTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(result) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunCancelledCorrectly(self): @function.defun(input_signature=[tensor_spec.TensorSpec([5], dtypes.int64)]) @@ -173,6 +196,8 @@ class MapDefunTest(test_base.DatasetTestBase): r"indices = 10 is not in \[0, 5\)"): self.evaluate(map_defun_op) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithUnspecifiedOutputShape(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -190,6 +215,8 @@ class MapDefunTest(test_base.DatasetTestBase): self.assertAllEqual(self.evaluate(r[1]), self.evaluate(expected + 1)) self.assertAllEqual(self.evaluate(r[2]), self.evaluate(expected + 2)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithDifferentOutputShapeEachRun(self): @function.defun( @@ -204,6 +231,8 @@ class MapDefunTest(test_base.DatasetTestBase): self.assertAllEqual( sess.run(r, feed_dict={elems: [[0], [1]]}), [[3], [5]]) + @combinations.generate( + combinations.combine(tf_api_version=[1], 
mode=["graph"])) def testMapDefunWithWrongOutputShape(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -216,6 +245,8 @@ class MapDefunTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(r) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithInvalidInput(self): @function.defun( @@ -233,6 +264,8 @@ class MapDefunTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): sess.run(r, feed_dict={p: 0}) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithParentCancellation(self): # Checks that a cancellation of the parent graph is threaded through to # MapDefunOp correctly. @@ -254,6 +287,8 @@ class MapDefunTest(test_base.DatasetTestBase): sess.close() thread.join() + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithCapturedInputs(self): c = constant_op.constant(2) @@ -266,6 +301,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = x + c self.assertAllEqual(self.evaluate(expected), self.evaluate(map_defun_op)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithVariantTensor(self): @function.defun( @@ -288,6 +325,8 @@ class MapDefunTest(test_base.DatasetTestBase): actual = self.evaluate(deserialized) self.assertValuesEqual(expected, actual) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithVariantTensorAsCaptured(self): st = sparse_tensor.SparseTensor( @@ -309,6 +348,8 @@ class MapDefunTest(test_base.DatasetTestBase): actual = self.evaluate(deserialized) self.assertValuesEqual(expected, actual) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithStrTensor(self): @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) From 8f6b18cfd4ce507899b19078cf9a6444aa460b3d Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 13:34:21 -0800 Subject: [PATCH 022/383] Switch OverrideThreadpoolTest to use TF combinations --- .../kernel_tests/override_threadpool_test.py | 49 +++++++++---------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py index 811a58262ef..65565d183e2 100644 --- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py @@ -28,14 +28,13 @@ from tensorflow.python.data.experimental.ops import threadpool from tensorflow.python.data.experimental.ops import unique from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import script_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class OverrideThreadpoolTest(test_base.DatasetTestBase, parameterized.TestCase): @@ -70,17 +69,15 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase, # perform work. 
      self.assertLessEqual(len(thread_ids), num_threads)
 
-  @parameterized.named_parameters(
-      ("1", 1, None),
-      ("2", 2, None),
-      ("3", 4, None),
-      ("4", 8, None),
-      ("5", 16, None),
-      ("6", 4, -1),
-      ("7", 4, 0),
-      ("8", 4, 1),
-      ("9", 4, 4),
-  )
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          combinations.combine(
+              num_threads=[1, 2, 4, 8, 16], max_intra_op_parallelism=[None]) +
+          combinations.combine(
+              num_threads=[4], max_intra_op_parallelism=[0, 1, 4]) +
+          combinations.combine(
+              num_threads=[5], max_intra_op_parallelism=[-1])))
   def testNumThreadsDeprecated(self, num_threads, max_intra_op_parallelism):
 
     def override_threadpool_fn(dataset):
@@ -93,20 +90,17 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
 
     self._testNumThreadsHelper(num_threads, override_threadpool_fn)
 
-  @parameterized.named_parameters(
-      ("1", 1, None),
-      ("2", 2, None),
-      ("3", 4, None),
-      ("4", 8, None),
-      ("5", 16, None),
-      ("6", None, 0),
-      ("7", None, 1),
-      ("8", None, 4),
-      ("9", 4, 0),
-      ("10", 4, 1),
-      ("11", 4, 4),
-      ("12", None, None),
-  )
+  @combinations.generate(
+      combinations.times(
+          test_base.default_test_combinations(),
+          combinations.combine(
+              num_threads=[1, 2, 4, 8, 16], max_intra_op_parallelism=[None]) +
+          combinations.combine(
+              num_threads=[None], max_intra_op_parallelism=[0, 1, 4]) +
+          combinations.combine(
+              num_threads=[4], max_intra_op_parallelism=[0, 1, 4]) +
+          combinations.combine(
+              num_threads=[None], max_intra_op_parallelism=[None])))
   def testNumThreads(self, num_threads, max_intra_op_parallelism):
 
     def override_threadpool_fn(dataset):
@@ -121,6 +115,7 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
 
     self._testNumThreadsHelper(num_threads, override_threadpool_fn)
 
+  @combinations.generate(test_base.default_test_combinations())
   def testMaxIntraOpParallelismAsGraphDefInternal(self):
     dataset = dataset_ops.Dataset.from_tensors(0)
     dataset = dataset_ops._MaxIntraOpParallelismDataset(dataset, 1)

From 776903251503560a7d983a50660a66dc1ebd897a Mon Sep 17 00:00:00 2001
From: Fei Hu
Date: Wed, 27 Nov 2019 13:52:10 -0800
Subject: [PATCH 023/383] Switch ParallelInterleaveTest to use TF combinations

---
 .../kernel_tests/parallel_interleave_test.py | 39 +++++++++++++++++--
 1 file changed, 36 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
index 1fe5655ec02..083c97b24b6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
@@ -22,24 +22,24 @@ import math
 import threading
 import time
 
+from absl.testing import parameterized
 import numpy as np
 from six.moves import zip_longest
 
 from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import combinations
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
-@test_util.run_all_in_graph_and_eager_modes
-class ParallelInterleaveTest(test_base.DatasetTestBase):
+class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def setUp(self):
 
@@ -116,6 +116,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         num_open -= 1
         break
 
+  @combinations.generate(test_base.default_test_combinations())
   def testPythonImplementation(self):
     input_lists = [[4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6],
                    [4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6]]
@@ -136,6 +137,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       self.assertEqual(expected, produced,
                        "Values differ at %s. %s != %s" % (index, expected, produced))
 
+  @combinations.generate(test_base.default_test_combinations())
   def testPythonImplementationBlockLength(self):
     input_lists = [[4] * 4, [5] * 5, [6] * 6] * 2
     expected_elements = [
@@ -147,6 +149,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
      self.assertEqual(expected, produced,
                       "Values differ at %s. %s != %s" % (index, expected, produced))
 
+  @combinations.generate(test_base.default_test_combinations())
   def testPythonImplementationEmptyLists(self):
     input_lists = [[4, 4, 4, 4], [], [6, 6, 6, 6, 6, 6], [4, 4, 4, 4], [],
                    [6, 6, 6, 6, 6, 6]]
@@ -189,18 +192,23 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     with self.assertRaises(errors.OutOfRangeError):
       self.evaluate(next_element())
 
+  @combinations.generate(test_base.default_test_combinations())
   def testSingleThreaded(self):
     self._testSingleThreaded()
 
+  @combinations.generate(test_base.default_test_combinations())
   def testSingleThreadedSloppy(self):
     self._testSingleThreaded(sloppy=True)
 
+  @combinations.generate(test_base.default_test_combinations())
   def testSingleThreadedPrefetch1Itr(self):
     self._testSingleThreaded(prefetch_input_elements=1)
 
+  @combinations.generate(test_base.default_test_combinations())
   def testSingleThreadedPrefetch1ItrSloppy(self):
     self._testSingleThreaded(prefetch_input_elements=1, sloppy=True)
 
+  @combinations.generate(test_base.default_test_combinations())
   def testSingleThreadedRagged(self):
     # Tests a sequence with wildly different elements per iterator.
self.skipTest("b/131722904") @@ -259,9 +267,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContention(self): self._testTwoThreadsNoContention() + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionSloppy(self): self._testTwoThreadsNoContention(sloppy=True) @@ -306,9 +316,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRaces(self): self._testTwoThreadsNoContentionWithRaces() + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRacesSloppy(self): self._testTwoThreadsNoContentionWithRaces(sloppy=True) @@ -343,9 +355,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionBlockLength(self): self._testTwoThreadsNoContentionBlockLength() + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionBlockLengthSloppy(self): self._testTwoThreadsNoContentionBlockLength(sloppy=True) @@ -391,9 +405,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRacesAndBlocking(self): self._testTwoThreadsNoContentionWithRacesAndBlocking() + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRacesAndBlockingSloppy(self): self._testTwoThreadsNoContentionWithRacesAndBlocking(sloppy=True) @@ -411,9 +427,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testEmptyInput(self): self._testEmptyInput() + @combinations.generate(test_base.default_test_combinations()) def testEmptyInputSloppy(self): self._testEmptyInput(sloppy=True) @@ -431,9 +449,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testNonEmptyInputIntoEmptyOutputs(self): self._testNonEmptyInputIntoEmptyOutputs() + @combinations.generate(test_base.default_test_combinations()) def testNonEmptyInputIntoEmptyOutputsSloppy(self): self._testNonEmptyInputIntoEmptyOutputs(sloppy=True) @@ -469,12 +489,15 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): "At index %s: %s expected, got: %s" % (i, expected_element, actual_element)) + @combinations.generate(test_base.default_test_combinations()) def testPartiallyEmptyOutputs(self): self._testPartiallyEmptyOutputs() + @combinations.generate(test_base.default_test_combinations()) def testPartiallyEmptyOutputsSloppy(self): self._testPartiallyEmptyOutputs(sloppy=True, prefetch_input_elements=0) + @combinations.generate(test_base.default_test_combinations()) def testDelayedOutputSloppy(self): # Explicitly control the sequence of events to ensure we correctly avoid # head-of-line blocking. 
@@ -500,6 +523,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testBlockLengthWithContentionSloppy(self): self.skipTest("b/131722904") self._clear_coordination_events() @@ -557,9 +581,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): self.read_coordination_events[i].acquire() self.write_coordination_events[i].set() + @combinations.generate(test_base.default_test_combinations()) def testEarlyExit(self): self._testEarlyExit() + @combinations.generate(test_base.default_test_combinations()) def testEarlyExitSloppy(self): self._testEarlyExit(sloppy=True) @@ -584,12 +610,15 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2) self.assertItemsEqual(output_values, expected_values) + @combinations.generate(test_base.default_test_combinations()) def testTooManyReaders(self): self._testTooManyReaders() + @combinations.generate(test_base.default_test_combinations()) def testTooManyReadersSloppy(self): self._testTooManyReaders(sloppy=True) + @combinations.generate(test_base.default_test_combinations()) def testSparse(self): def _map_fn(i): return sparse_tensor.SparseTensor( @@ -610,6 +639,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testErrorsInOutputFn(self): self.skipTest("b/131722904") self._clear_coordination_events() @@ -642,6 +672,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testErrorsInInputFn(self): def map_py_fn(x): @@ -687,6 +718,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testErrorsInInterleaveFn(self): def map_py_fn(x): @@ -730,6 +762,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testShutdownRace(self): dataset = dataset_ops.Dataset.range(20) map_fn = lambda x: dataset_ops.Dataset.range(20 * x, 20 * (x + 1)) From 47e66474095f153c7be377269e132652a9fcbd52 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 14:01:59 -0800 Subject: [PATCH 024/383] Switch ParseExampleDatasetTest to use TF combinations --- .../parse_example_dataset_test.py | 33 +++++++++++++++---- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py index 794f72365df..f82fcb5a3eb 100644 --- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import copy +from absl.testing import parameterized import numpy as np from tensorflow.core.example import example_pb2 @@ -28,11 +29,11 @@ from tensorflow.python.data.experimental.ops import parsing_ops as contrib_parsi from tensorflow.python.data.kernel_tests import test_base from 
tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import test_util from tensorflow.python.ops import parsing_ops from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test @@ -50,8 +51,8 @@ feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d) sequence_example = example_pb2.SequenceExample -@test_util.run_all_in_graph_and_eager_modes -class ParseExampleDatasetTest(test_base.DatasetTestBase): +class ParseExampleDatasetTest(test_base.DatasetTestBase, + parameterized.TestCase): def _compare_output_to_expected(self, dict_tensors, expected_tensors): self.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys())) @@ -107,6 +108,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): self.assertEqual( dataset_ops.get_legacy_output_shapes(dataset)[k].as_list()[1], None) + @combinations.generate(test_base.default_test_combinations()) def testEmptySerializedWithAllDefaults(self): sparse_name = "st_a" a_name = "a" @@ -145,7 +147,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) - @test_util.run_deprecated_v1 + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) def testEmptySerializedWithoutDefaultsShouldFail(self): input_features = { "st_a": @@ -179,7 +182,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_err=(errors_impl.InvalidArgumentError, "Feature: c \\(data type: float\\) is required")) - @test_util.run_deprecated_v1 + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) def testDenseNotMatchingShapeShouldFail(self): original = [ example(features=features({ @@ -197,6 +201,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_err=(errors_impl.InvalidArgumentError, "Key: a, Index: 1. 
Number of float values")) + @combinations.generate(test_base.default_test_combinations()) def testDenseDefaultNoShapeShouldFail(self): original = [example(features=features({"a": float_feature([1, 1, 3]),})),] @@ -207,6 +212,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): {"a": parsing_ops.FixedLenFeature(None, dtypes.float32)}, expected_err=(ValueError, "Missing shape for feature a")) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingSparse(self): original = [ example(features=features({ @@ -248,6 +254,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingSparseFeature(self): original = [ example(features=features({ @@ -284,6 +291,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingSparseFeatureReuse(self): original = [ example(features=features({ @@ -325,6 +333,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContaining3DSparseFeature(self): original = [ example(features=features({ @@ -370,6 +379,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDense(self): aname = "a" bname = "b*has+a:tricky_name" @@ -407,6 +417,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): # This test is identical as the previous one except # for the creation of 'serialized'. 
+ @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDenseWithConcat(self): aname = "a" bname = "b*has+a:tricky_name" @@ -452,6 +463,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDenseScalar(self): original = [ example(features=features({ @@ -476,6 +488,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDenseWithDefaults(self): original = [ example(features=features({ @@ -514,6 +527,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedSparseAndSparseFeatureAndDenseWithNoDefault(self): expected_st_a = sparse_tensor.SparseTensorValue( # indices, values, shape np.empty((0, 2), dtype=np.int64), # indices @@ -569,6 +583,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testerializedContainingSparseAndSparseFeatureWithReuse(self): expected_idx = sparse_tensor.SparseTensorValue( # indices, values, shape np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64), @@ -667,11 +682,13 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingVarLenDenseLargerBatch(self): np.random.seed(3456) for batch_size in (1, 10, 20, 100, 256): self._testSerializedContainingVarLenDenseLargerBatch(batch_size) + @combinations.generate(test_base.default_test_combinations()) def testSerializedShapeMismatch(self): aname = "a" bname = "b" @@ -724,7 +741,8 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_err=(ValueError, "Cannot reshape a tensor with 0 elements to shape")) - @test_util.run_deprecated_v1 + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) def testSerializedContainingVarLenDense(self): aname = "a" bname = "b" @@ -877,6 +895,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): "Unsupported: FixedLenSequenceFeature requires " "allow_missing to be True.")) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingRaggedFeatureWithNoPartitions(self): original = [ example( @@ -922,6 +941,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingRaggedFeatureWithOnePartition(self): original = [ example( @@ -1040,6 +1060,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingRaggedFeatureWithMultiplePartitions(self): original = [ # rt shape: [(batch), 2, None, None] From 0216a93efc666c720496f9c9805768f62300231c Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 14:06:35 -0800 Subject: [PATCH 025/383] Switch PrefetchToDeviceTest to use TF combinations --- 
.../kernel_tests/prefetch_to_device_test.py | 26 +++++++++++++------ 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py index f51da6e8b66..1b6295864ea 100644 --- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py @@ -17,11 +17,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.experimental.ops import prefetching_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import structure +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -31,9 +34,10 @@ from tensorflow.python.platform import test # TODO(b/117581999): add eager coverage when supported. -class PrefetchToDeviceTest(test_base.DatasetTestBase): +class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testPrefetchToDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -57,7 +61,8 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testPrefetchToSameDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -82,7 +87,8 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testPrefetchDictToDevice(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -106,7 +112,8 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testPrefetchSparseTensorsToDevice(self): def make_tensor(i): return sparse_tensor.SparseTensorValue( @@ -136,7 +143,8 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testPrefetchToDeviceGpu(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -156,7 +164,8 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testPrefetchToDeviceWithReInit(self): host_dataset = 
dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -184,7 +193,8 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate( + combinations.combine(tf_api_version=[1, 2], mode=["graph"])) def testPrefetchToDeviceGpuWithReInit(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") From b8e36b1a6f8563645413595f8f51aae8c0a91a69 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 14:20:07 -0800 Subject: [PATCH 026/383] Switch PrefetchWithSlackTest to use TF combinations --- .../kernel_tests/prefetch_with_slack_test.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py index 5de98189322..43111faeb91 100644 --- a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py @@ -24,16 +24,17 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import multi_device_iterator_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): - @test_util.run_v1_only("b/121264236") + # TODO(b/121264236) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) def testPrefetchWithSlackOption(self): """Determines slack_period based on num devices attached to iterator.""" dataset = dataset_ops.Dataset.range(10) @@ -60,6 +61,7 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): self.evaluate(elem_on_1) self.evaluate(elem_on_2) + @combinations.generate(test_base.default_test_combinations()) def testPrefetchWithSlackOptionWithoutIterator(self): """Defaults to slack period of 1 without iterator.""" dataset = dataset_ops.Dataset.range(10) @@ -72,6 +74,7 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): dataset.options()._static_optimization_configs()) self.assertDatasetProduces(dataset, range(10)) + @combinations.generate(test_base.default_test_combinations()) def testWithPassthroughDataset(self): """Should still work with a passthrough dataset after prefetch().""" dataset = dataset_ops.Dataset.range(10) @@ -82,6 +85,7 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = dataset.with_options(options) self.assertDatasetProduces(dataset, range(1, 11)) + @combinations.generate(test_base.default_test_combinations()) def testErrorWithoutPrefetch(self): """The rewrite fails if there is no prefetch() in the pipeline.""" dataset = dataset_ops.Dataset.range(10) @@ -92,6 +96,7 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): get_next = self.getNext(dataset) self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testErrorWithInvalidDataset(self): """With a nested dataset op after prefetch, the rewrite should 
fail.""" dataset = dataset_ops.Dataset.range(10) From ff631e250778409449cf5f402835e95239590210 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 14:29:07 -0800 Subject: [PATCH 027/383] Switch RebatchDatasetTest to use TF combinations --- .../kernel_tests/rebatch_dataset_test.py | 44 +++++++++++++++---- 1 file changed, 35 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py index 32bcdbe183b..30496658529 100644 --- a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py @@ -32,8 +32,8 @@ from tensorflow.python.data.experimental.ops import scan_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes -from tensorflow.python.framework import test_util from tensorflow.python.lib.io import python_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -47,13 +47,11 @@ def _flat_shapes(dataset): return nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)) -@test_util.run_all_in_graph_and_eager_modes class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): - drop_remainder_cases = [("WithDropRemainder", True), - ("WithoutDropRemainder", False)] - - @parameterized.named_parameters(drop_remainder_cases) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(drop_remainder=[True, False]))) def testBasic(self, drop_remainder): dataset = dataset_ops.Dataset.range(1024).batch( 32, drop_remainder=drop_remainder) @@ -64,13 +62,16 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1024, 8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testScalarInputError(self): dataset = dataset_ops.Dataset.range(1024) distribute._RebatchDataset(dataset.batch(4), num_replicas=4) with self.assertRaisesRegexp(ValueError, "at least one dimension"): distribute._RebatchDataset(dataset, num_replicas=4) - @parameterized.named_parameters(drop_remainder_cases) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(drop_remainder=[True, False]))) def testBatchNotDivisibleByNumReplicas(self, drop_remainder): dataset = dataset_ops.Dataset.range(1024).batch( 32, drop_remainder=drop_remainder) @@ -89,6 +90,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): i += 4 self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testBatchSizeNotDivisibleByNumReplicas2(self): dataset = dataset_ops.Dataset.range(32).batch(16, drop_remainder=True) rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=5) @@ -102,6 +104,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output.extend([[]]) # Last replica gets an empty batch self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def 
testTupleOutput(self): dataset = dataset_ops.Dataset.range(1024).map(lambda x: (x, x)).batch(32) rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=4) @@ -110,6 +113,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 1024, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testNestedDictionaryOutput(self): dataset = dataset_ops.Dataset.range(1024).map( lambda x: {"a": x, "b": {"c": x}}).batch(32) @@ -119,7 +123,9 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 1024, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) - @parameterized.named_parameters(drop_remainder_cases) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(drop_remainder=[True, False]))) def testFinalPartialBatch(self, drop_remainder): dataset = dataset_ops.Dataset.range(1032).batch( 32, drop_remainder=drop_remainder) @@ -136,7 +142,9 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): [[k for k in range(i, i + 2)] for i in range(1024, 1032, 2)]) self.assertDatasetProduces(rebatched_dataset, expected_output) - @parameterized.named_parameters(drop_remainder_cases) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(drop_remainder=[True, False]))) def testFinalPartialBatchAfterRebatch(self, drop_remainder): dataset = dataset_ops.Dataset.range(34).batch( 32, drop_remainder=drop_remainder) @@ -150,6 +158,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output += [[32], [33], [], []] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testMultipleBatches(self): dataset = dataset_ops.Dataset.range(128).batch(4).batch(8) self.assertEqual([[None, None]], @@ -170,6 +179,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 128, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testMapAndBatch(self): dataset = dataset_ops.Dataset.range(1024).apply( batching.map_and_batch(math_ops.square, 32)) @@ -180,6 +190,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 1024, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testMapAndBatchWithCapturedInput(self): captured_t = variables.Variable(42) dataset = dataset_ops.Dataset.range(1024).apply( @@ -193,6 +204,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces( rebatched_dataset, expected_output, requires_initialization=True) + @combinations.generate(test_base.default_test_combinations()) def testPaddedBatch(self): dataset = dataset_ops.Dataset.range(128).batch( 4, drop_remainder=True).padded_batch( @@ -213,6 +225,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 128, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testConcatenate(self): dataset1 = dataset_ops.Dataset.range(64).batch(8) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -224,6 +237,7 @@ class 
RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): [[i, i + 1] for i in range(0, 32, 2)]) self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testConcatenateDifferentShapes(self): dataset1 = dataset_ops.Dataset.range(64).batch(16) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -235,6 +249,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): [[i, i + 1] for i in range(0, 32, 2)]) self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testZip(self): dataset1 = dataset_ops.Dataset.range(64).batch(8) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -245,6 +260,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [([i, i + 1], [i, i + 1]) for i in range(0, 32, 2)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testZipDifferentShapes(self): dataset1 = dataset_ops.Dataset.range(64).batch(16) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -256,6 +272,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 32, 2)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testFlatMapBatching(self): dataset = dataset_ops.Dataset.range(2).flat_map( lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda @@ -274,6 +291,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 32, 8)] # generates 4 elements self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testInterleaveBatching(self): dataset = dataset_ops.Dataset.range(2).interleave( lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda @@ -290,6 +308,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output += expected_output self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testParallelInterleaveBatching(self): dataset = dataset_ops.Dataset.range(2).interleave( lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda @@ -307,6 +326,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output += expected_output self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowStaticBatch(self): dataset = dataset_ops.Dataset.from_tensor_slices( [[array_ops.constant(i, dtype=dtypes.int64)] * 3 for i in range(40)]) @@ -326,6 +346,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for k in range(2)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowDynamicBatch(self): # {0, 1, 0, 1, ...} dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2) @@ -350,6 +371,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[value] * batch_size for batch_size, value in pairs] self.assertDatasetProduces(dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def 
testGroupByWindowDynamicBatchWithPartialBatch(self): # {0, 1, 0, 1, ...} dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2) @@ -371,6 +393,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[value] * batch_size for batch_size, value in pairs] self.assertDatasetProduces(dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowDynamicBatchWithPartialBatchWithDropRemainder(self): # This test exercises nested batch functionality, dynamic batch size # and drop_remainder=True together. @@ -395,6 +418,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[value] * batch_size for batch_size, value in pairs] self.assertDatasetProduces(dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testScanAfterBatch(self): dataset = dataset_ops.Dataset.range(40).batch(10).apply( scan_ops.scan(np.int64(2), lambda state, value: (state, value * state))) @@ -405,6 +429,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[i * 2 for i in range(j*5, (j+1)*5)] for j in range(8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testMakeBatchedFeaturesDataset(self): # Set up fn = os.path.join(self.get_temp_dir(), "tf_record.txt") @@ -438,6 +463,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): } for i in range(0, 1024, 8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testRaggedTensorDataset(self): # Set up a dataset that produces ragged tensors with a static batch size. 
row_lengths = np.random.randint(8, size=128) From dae7cfc9f07d5d12d4aad82fe1d4e5df87b4e39c Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 14:39:34 -0800 Subject: [PATCH 028/383] Switch RejectionResampleTest to use TF combinations --- .../kernel_tests/rejection_resample_test.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py index 673e77fc3bb..e11a4ca9203 100644 --- a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py @@ -24,9 +24,9 @@ import numpy as np from tensorflow.python.data.experimental.ops import resampling from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import string_ops @@ -34,12 +34,12 @@ from tensorflow.python.platform import test from tensorflow.python.util import compat -@test_util.run_all_in_graph_and_eager_modes class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase): - @parameterized.named_parameters( - ("InitialDistributionKnown", True), - ("InitialDistributionUnknown", False)) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(initial_known=[True, False]))) def testDistribution(self, initial_known): classes = np.random.randint(5, size=(20000,)) # Uniformly sampled target_dist = [0.9, 0.05, 0.05, 0.0, 0.0] @@ -72,9 +72,10 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase): returned_dist = class_counts / total_returned self.assertAllClose(target_dist, returned_dist, atol=1e-2) - @parameterized.named_parameters( - ("OnlyInitial", True), - ("NotInitial", False)) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(only_initial_dist=[True, False]))) def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist): init_dist = [0.5, 0.5] target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0] @@ -99,6 +100,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase): while True: returned.append(self.evaluate(get_next())) + @combinations.generate(test_base.default_test_combinations()) def testRandomClasses(self): init_dist = [0.25, 0.25, 0.25, 0.25] target_dist = [0.0, 0.0, 0.0, 1.0] From 1c77b6cb31c04d4e1ef28d56aaca35a6c4a7dfe2 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 14:45:33 -0800 Subject: [PATCH 029/383] Switch ShuffleAndRepeatTest to use TF combinations --- .../kernel_tests/shuffle_and_repeat_test.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py index 92ae528b940..8bb109a6519 100644 --- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py @@ -17,18 +17,18 @@ from __future__ import 
absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import shuffle_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class ShuffleAndRepeatTest(test_base.DatasetTestBase): +class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): def _build_ds(self, seed, count=5, num_elements=20): return dataset_ops.Dataset.range(num_elements).apply( @@ -44,6 +44,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): self.evaluate(get_next()) return outputs + @combinations.generate(test_base.default_test_combinations()) def testCorrectOutput(self): output = self._gen_outputs(lambda: self._build_ds(10), 100) self.assertSequenceEqual( @@ -52,6 +53,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): for i in range(5): self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20)) + @combinations.generate(test_base.default_test_combinations()) def testReshuffling(self): # Check that the output orders of different epochs are indeed different. output = self._gen_outputs(lambda: self._build_ds(10), 100) @@ -60,17 +62,20 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): epoch2 = output[(i + 1) * 20:(i + 2) * 20] self.assertNotEqual(epoch1, epoch2) + @combinations.generate(test_base.default_test_combinations()) def testSameOrderForSameSeeds(self): output1 = self._gen_outputs(lambda: self._build_ds(10), 100) output2 = self._gen_outputs(lambda: self._build_ds(10), 100) self.assertEqual(output1, output2) + @combinations.generate(test_base.default_test_combinations()) def testDifferentOrderForDifferentSeeds(self): output1 = self._gen_outputs(lambda: self._build_ds(10), 100) output2 = self._gen_outputs(lambda: self._build_ds(20), 100) self.assertNotEqual(output1, output2) self.assertEqual(sorted(output1), sorted(output2)) + @combinations.generate(test_base.default_test_combinations()) def testCountNone(self): output1 = self._gen_outputs( lambda: self._build_ds(10, count=None), 100, verify_exhausted=False) @@ -79,6 +84,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): self.assertNotEqual(output1, output2) self.assertEqual(sorted(output1), sorted(output2)) + @combinations.generate(test_base.default_test_combinations()) def testCountMinusOne(self): output1 = self._gen_outputs( lambda: self._build_ds(10, count=-1), 100, verify_exhausted=False) @@ -87,6 +93,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): self.assertNotEqual(output1, output2) self.assertEqual(sorted(output1), sorted(output2)) + @combinations.generate(test_base.default_test_combinations()) def testInfiniteOutputs(self): # Asserting the iterator is exhausted after producing 100 items should fail. 
with self.assertRaises(AssertionError): @@ -94,6 +101,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): with self.assertRaises(AssertionError): self._gen_outputs(lambda: self._build_ds(10, count=-1), 100) + @combinations.generate(test_base.default_test_combinations()) def testInfiniteEmpty(self): with self.assertRaises(errors.OutOfRangeError): self._gen_outputs(lambda: self._build_ds(10, count=None, num_elements=0), @@ -102,12 +110,14 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): self._gen_outputs(lambda: self._build_ds(10, count=-1, num_elements=0), 100) + @combinations.generate(test_base.default_test_combinations()) def testLargeBufferSize(self): ds = dataset_ops.Dataset.range(20).apply( shuffle_ops.shuffle_and_repeat(buffer_size=21)) get_next = self.getNext(ds) self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testVeryLargeBufferSize(self): num_epochs = 1000 * 1000 # Each element being shuffled and repeated has shape (100,). This will OOM From 680afa490888e88c7ffca2c0d5b5902ec7f65517 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 15:00:40 -0800 Subject: [PATCH 030/383] Switch SqlDatasetTest to use TF combinations --- .../kernel_tests/sql_dataset_test.py | 42 +++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py index f55f62f5cb0..8e1dd4bd8dc 100644 --- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py @@ -18,18 +18,22 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.experimental.kernel_tests import sql_dataset_test_base +from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): +class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, + parameterized.TestCase): # Test that SqlDataset can read from a database table. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSet(self): for _ in range(2): # Run twice to verify statelessness of db operations. dataset = self._createSqlDataset( @@ -44,6 +48,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): num_test_iterations=2) # Test that SqlDataset works on a join query. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetJoinQuery(self): get_next = self.getNext( self._createSqlDataset( @@ -60,6 +65,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that SqlDataset can read a database entry with a null-terminator # in the middle of the text and place the entry in a `string` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetNullTerminator(self): get_next = self.getNext( self._createSqlDataset( @@ -76,6 +82,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that SqlDataset works when used on two different queries. 
# Because the output types of the dataset must be determined at graph-creation # time, the two queries must have the same number and types of columns. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetReuseSqlDataset(self): get_next = self.getNext( self._createSqlDataset( @@ -100,6 +107,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that an `OutOfRangeError` is raised on the first call to # `get_next_str_only` if result set is empty. + @combinations.generate(test_base.default_test_combinations()) def testReadEmptyResultSet(self): get_next = self.getNext( self._createSqlDataset( @@ -110,6 +118,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): self.evaluate(get_next()) # Test that an error is raised when `driver_name` is invalid. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithInvalidDriverName(self): with self.assertRaises(errors.InvalidArgumentError): dataset = self._createSqlDataset( @@ -120,6 +129,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): self.assertDatasetProduces(dataset, expected_output=[]) # Test that an error is raised when a column name in `query` is nonexistent + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithInvalidColumnName(self): get_next = self.getNext( self._createSqlDataset( @@ -130,6 +140,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): self.evaluate(get_next()) # Test that an error is raised when there is a syntax error in `query`. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetOfQueryWithSyntaxError(self): get_next = self.getNext( self._createSqlDataset( @@ -141,6 +152,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that an error is raised when the number of columns in `query` # does not match the length of `, output_types`. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithMismatchBetweenColumnsAndOutputTypes(self): get_next = self.getNext( self._createSqlDataset( @@ -154,6 +166,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # than a select query. In particular, the error refers to the number of # output types passed to the op not matching the number of columns in the # result set of the query (namely, 0 for an insert statement.) + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetOfInsertQuery(self): get_next = self.getNext( self._createSqlDataset( @@ -165,6 +178,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in an `int8` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt8(self): get_next = self.getNext( self._createSqlDataset( @@ -178,6 +192,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int8` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt8NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -191,6 +206,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int8` tensor. 
+ @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt8MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -205,6 +221,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in an `int16` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt16(self): get_next = self.getNext( self._createSqlDataset( @@ -218,6 +235,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int16` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt16NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -231,6 +249,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int16` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt16MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -246,6 +265,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in an `int32` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32(self): get_next = self.getNext( self._createSqlDataset( @@ -257,6 +277,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int32` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -270,6 +291,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int32` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -285,6 +307,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a numeric `varchar` from a SQLite database # table and place it in an `int32` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32VarCharColumnAsInt(self): get_next = self.getNext( self._createSqlDataset( @@ -298,6 +321,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table # and place it in an `int64` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt64(self): get_next = self.getNext( self._createSqlDataset( @@ -311,6 +335,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int64` tensor. 
+ @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt64NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -324,6 +349,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int64` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt64MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -339,6 +365,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in a `uint8` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt8(self): get_next = self.getNext( self._createSqlDataset( @@ -352,6 +379,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read the minimum and maximum uint8 values from a # SQLite database table and place them in `uint8` tensors. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt8MinAndMaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -367,6 +395,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table # and place it in a `uint16` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt16(self): get_next = self.getNext( self._createSqlDataset( @@ -380,6 +409,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read the minimum and maximum uint16 values from a # SQLite database table and place them in `uint16` tensors. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt16MinAndMaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -396,6 +426,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a 0-valued and 1-valued integer from a # SQLite database table and place them as `True` and `False` respectively # in `bool` tensors. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetBool(self): get_next = self.getNext( self._createSqlDataset( @@ -409,6 +440,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer that is not 0-valued or 1-valued # from a SQLite database table and place it as `True` in a `bool` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetBoolNotZeroOrOne(self): get_next = self.getNext( self._createSqlDataset( @@ -422,6 +454,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a float from a SQLite database table # and place it in a `float64` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetFloat64(self): get_next = self.getNext( self._createSqlDataset( @@ -437,6 +470,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a float from a SQLite database table beyond # the precision of 64-bit IEEE, without throwing an error. Test that # `SqlDataset` identifies such a value as equal to itself. 
+ @combinations.generate(test_base.default_test_combinations()) def testReadResultSetFloat64OverlyPrecise(self): get_next = self.getNext( self._createSqlDataset( @@ -458,6 +492,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # representing the largest integer representable as a 64-bit IEEE float # such that the previous integer is also representable as a 64-bit IEEE float. # Test that `SqlDataset` can distinguish these two numbers. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetFloat64LargestConsecutiveWholeNumbersNotEqual(self): get_next = self.getNext( self._createSqlDataset( @@ -472,6 +507,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): self.evaluate(get_next()) # Test that SqlDataset can stop correctly when combined with batch + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithBatchStop(self): dataset = self._createSqlDataset( query="SELECT * FROM data", output_types=(dtypes.int32)) From a18d9cbd553c1e8f355b1e1d5688c573e1a5534f Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 15:08:31 -0800 Subject: [PATCH 031/383] Switch StatsDatasetTest to use TF combinations --- .../kernel_tests/stats_dataset_ops_test.py | 29 +++++++++++++++++-- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py index 4f04a0a3639..756c1e0e743 100644 --- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py @@ -17,14 +17,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np +from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import stats_aggregator from tensorflow.python.data.experimental.ops import stats_ops from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -32,8 +35,10 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): +class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testBytesProduced(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).map( @@ -57,6 +62,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self.assertStatisticsHasCount(handle, "bytes_produced", 100.0, 101) self.assertStatisticsHasSum(handle, "bytes_produced", expected_sum, 101) + @combinations.generate(test_base.default_test_combinations()) def testLatencyStats(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -76,6 +82,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle = self.getHandle(aggregator) 
self.assertStatisticsHasCount(handle, "record_latency", 100.0, 101) + @combinations.generate(test_base.default_test_combinations()) def testPrefetchBufferUtilization(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).map( @@ -117,6 +124,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): 301, offset=2) + @combinations.generate(test_base.default_test_combinations()) def testPrefetchBufferScalars(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(10).map( @@ -140,6 +148,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testFilteredElementsStats(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(101).filter( @@ -167,6 +176,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle, self.regexForNodeName("FilterDataset", "filtered_elements"), 34.0) + @combinations.generate(test_base.default_test_combinations()) def testReinitialize(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -187,6 +197,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self.assertStatisticsHasCount(handle, "record_latency", (j + 1) * 100.0, (j * 100) + 101) + @combinations.generate(test_base.default_test_combinations()) def testNoAggregatorRegistered(self): dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")) @@ -198,6 +209,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testMultipleTags(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -221,6 +233,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle, "record_latency", 100.0, 201, offset=1) self.assertStatisticsHasCount(handle, "record_latency_2", 100.0, 201) + @combinations.generate(test_base.default_test_combinations()) def testRepeatedTags(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -239,6 +252,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle = self.getHandle(aggregator) self.assertStatisticsHasCount(handle, "record_latency", 200.0, 201) + @combinations.generate(test_base.default_test_combinations()) def testMultipleIteratorsSameAggregator(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -259,6 +273,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle = self.getHandle(aggregator) self.assertStatisticsHasCount(handle, "record_latency", 200.0, 201) + @combinations.generate(test_base.default_test_combinations()) def testMultipleDatasetWithPrefixes(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -289,6 +304,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self.assertStatisticsHasCount(handle, "dataset2::record_latency", 100.0, 201) + @combinations.generate(test_base.default_test_combinations()) def testMultiplePrefetchStats(self): aggregator = stats_aggregator.StatsAggregator() @@ -314,8 +330,10 @@ class 
StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self.evaluate(next_element()) -class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): +class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testMapBufferUtilization(self): def dataset_fn(): @@ -326,6 +344,7 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): self.parallelCallsStats( dataset_fn, {"ParallelMapDataset"}, 10, function_processing_time=True) + @combinations.generate(test_base.default_test_combinations()) def testMapAutoTuneBufferUtilization(self): def dataset_fn(): @@ -336,6 +355,7 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): self.parallelCallsStats( dataset_fn, {"ParallelMapDataset"}, 10, function_processing_time=True) + @combinations.generate(test_base.default_test_combinations()) def testInterleaveAutoTuneBufferUtilization(self): def dataset_fn(): @@ -351,6 +371,7 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): self.parallelCallsStats(dataset_fn, {"ParallelInterleaveDatasetV2"}, 10) + @combinations.generate(test_base.default_test_combinations()) def testMapAndBatchAutoTuneBufferUtilization(self): def dataset_fn(): @@ -370,8 +391,10 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): class FeatureStatsDatasetTest( stats_dataset_test_base.StatsDatasetTestBase, - reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase): + reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testFeaturesStats(self): num_epochs = 5 total_records = num_epochs * self._num_records From f318360fe4ff76b61976fa32f2337796a30329d9 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 15:19:06 -0800 Subject: [PATCH 032/383] Switch TakeWhileTest to use TF combinations --- .../kernel_tests/take_while_test.py | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/take_while_test.py b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py index b2b0effb0df..959837faa24 100644 --- a/tensorflow/python/data/experimental/kernel_tests/take_while_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py @@ -23,18 +23,21 @@ import numpy as np from tensorflow.python.data.experimental.ops import take_while_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): - @parameterized.parameters((14, 2), (15, 2), (100, 3)) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(num_elements=[14, 15], window_size=[2]) + + combinations.combine(num_elements=[100], window_size=[3]))) def testTakeWhileDataset(self, num_elements, window_size): def _predicate_func(elem): @@ -49,8 +52,19 @@ class 
TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): expected_num_elements = int(num_elements / window_size) * window_size self.assertDatasetProduces(dataset, np.arange(expected_num_elements)) - @parameterized.parameters((10, 2, False), (16, 7, False), (100, 99, False), - (100, 101, True), (0, 1, True)) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine( + num_elements=[10], upper_bound=[2], out_of_bounds=[False]) + + combinations.combine( + num_elements=[16], upper_bound=[7], out_of_bounds=[False]) + + combinations.combine( + num_elements=[100], upper_bound=[99], out_of_bounds=[False]) + + combinations.combine( + num_elements=[100], upper_bound=[101], out_of_bounds=[True]) + + combinations.combine( + num_elements=[0], upper_bound=[1], out_of_bounds=[True]))) def testTakeWhileDatasetRange(self, num_elements, upper_bound, out_of_bounds): dataset = dataset_ops.Dataset.range(num_elements).apply( take_while_ops.take_while(lambda x: x < upper_bound)) @@ -62,6 +76,7 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): else: self.assertDatasetProduces(dataset, np.arange(upper_bound)) + @combinations.generate(test_base.default_test_combinations()) def testTakeWhileDatasetString(self): def not_equal(string): @@ -79,7 +94,13 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.assertEqual(b"test", self.evaluate(next_element())) - @parameterized.parameters((5, 3), (10, 0), (100, 5), (8, 7)) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(size=[5], index=[3]) + + combinations.combine(size=[10], index=[0]) + + combinations.combine(size=[100], index=[5]) + + combinations.combine(size=[8], index=[7]))) def testTakewhileDatasetShortCircuit(self, size, index): def _predicate_func(data_elem): @@ -98,6 +119,7 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTakeWhileDatasetWithRepeat(self): dataset = dataset_ops.Dataset.range(10).apply( take_while_ops.take_while(lambda x: x < 2)).repeat(5) From f655bfc3779a58ed6e2c8c2d401e6542a6b66c6b Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 15:25:23 -0800 Subject: [PATCH 033/383] Switch TFRecordWriterTest to use TF combinations --- .../kernel_tests/tf_record_writer_test.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py index 136a446bbd8..a327fc82466 100644 --- a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py @@ -19,14 +19,16 @@ from __future__ import print_function import os +from absl.testing import parameterized + from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.experimental.ops import writers from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers from tensorflow.python.eager import function +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes -from tensorflow.python.framework 
import test_util from tensorflow.python.lib.io import python_io from tensorflow.python.lib.io import tf_record from tensorflow.python.ops import string_ops @@ -34,8 +36,7 @@ from tensorflow.python.platform import test from tensorflow.python.util import compat -@test_util.run_all_in_graph_and_eager_modes -class TFRecordWriterTest(test_base.DatasetTestBase): +class TFRecordWriterTest(test_base.DatasetTestBase, parameterized.TestCase): def setUp(self): super(TFRecordWriterTest, self).setUp() @@ -63,11 +64,13 @@ class TFRecordWriterTest(test_base.DatasetTestBase): def _outputFilename(self): return os.path.join(self.get_temp_dir(), "tf_record.out.txt") + @combinations.generate(test_base.default_test_combinations()) def testWrite(self): self.evaluate(self.writer_fn(self._createFile())) for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())): self.assertAllEqual(self._record(i), r) + @combinations.generate(test_base.default_test_combinations()) def testWriteZLIB(self): options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB) self.evaluate( @@ -76,6 +79,7 @@ class TFRecordWriterTest(test_base.DatasetTestBase): tf_record.tf_record_iterator(self._outputFilename(), options=options)): self.assertAllEqual(self._record(i), r) + @combinations.generate(test_base.default_test_combinations()) def testWriteGZIP(self): options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP) self.evaluate( @@ -84,20 +88,24 @@ class TFRecordWriterTest(test_base.DatasetTestBase): tf_record.tf_record_iterator(self._outputFilename(), options=options)): self.assertAllEqual(self._record(i), r) + @combinations.generate(test_base.default_test_combinations()) def testFailDataset(self): with self.assertRaises(TypeError): writers.TFRecordWriter(self._outputFilename(), "").write("whoops") + @combinations.generate(test_base.default_test_combinations()) def testFailDType(self): input_dataset = dataset_ops.Dataset.from_tensors(10) with self.assertRaises(TypeError): writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset) + @combinations.generate(test_base.default_test_combinations()) def testFailShape(self): input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]]) with self.assertRaises(TypeError): writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset) + @combinations.generate(test_base.default_test_combinations()) def testSideEffect(self): def writer_fn(): input_dataset = readers.TFRecordDataset(self._createFile()) @@ -112,6 +120,7 @@ class TFRecordWriterTest(test_base.DatasetTestBase): for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())): self.assertAllEqual(self._record(i), r) + @combinations.generate(test_base.default_test_combinations()) def testShard(self): filename = self._createFile() dataset = readers.TFRecordDataset([filename]) From 7e5022debfd1e79b1c992fe310184ed4ec5c948e Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 15:31:07 -0800 Subject: [PATCH 034/383] Switch UniqueTest to use TF combinations --- .../data/experimental/kernel_tests/unique_test.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py index 42d76a2eb30..2576c25f0f5 100644 --- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py @@ -17,17 +17,18 @@ from __future__ import absolute_import 
from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.experimental.ops import unique from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes -from tensorflow.python.framework import test_util from tensorflow.python.platform import test from tensorflow.python.util import compat -@test_util.run_all_in_graph_and_eager_modes -class UniqueTest(test_base.DatasetTestBase): +class UniqueTest(test_base.DatasetTestBase, parameterized.TestCase): def _testSimpleHelper(self, dtype, test_cases): """Test the `unique()` transformation on a list of test cases. @@ -52,7 +53,8 @@ class UniqueTest(test_base.DatasetTestBase): for element in expected ]) - @test_util.run_deprecated_v1 + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) def testSimpleInt(self): for dtype in [dtypes.int32, dtypes.int64]: self._testSimpleHelper(dtype, [ @@ -65,7 +67,8 @@ class UniqueTest(test_base.DatasetTestBase): ([[1, 1], [1, 1], [2, 2], [3, 3], [1, 1]], [[1, 1], [2, 2], [3, 3]]), ]) - @test_util.run_deprecated_v1 + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) def testSimpleString(self): self._testSimpleHelper(dtypes.string, [ ([], []), From 994dd45bc1ff405a738f3a3b64b0feea241f21ed Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 15:37:11 -0800 Subject: [PATCH 035/383] Switch VariantTest to use TF combinations --- .../data/experimental/kernel_tests/variant_test.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/variant_test.py b/tensorflow/python/data/experimental/kernel_tests/variant_test.py index 6a3a1424d12..897aa223371 100644 --- a/tensorflow/python/data/experimental/kernel_tests/variant_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/variant_test.py @@ -17,16 +17,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.experimental.ops import cardinality from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import test_util +from tensorflow.python.framework import combinations from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class VariantTest(test_base.DatasetTestBase): +class VariantTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testRoundtripRange(self): dataset = dataset_ops.Dataset.range(10) variant = dataset_ops.to_variant(dataset) @@ -35,6 +37,7 @@ class VariantTest(test_base.DatasetTestBase): self.assertDatasetProduces(dataset, range(10)) self.assertEqual(self.evaluate(cardinality.cardinality(dataset)), 10) + @combinations.generate(test_base.default_test_combinations()) def testRoundtripMap(self): dataset = dataset_ops.Dataset.range(10).map(lambda x: x*x) variant = dataset_ops.to_variant(dataset) From 492e4ab4563b2560c04319f557d1b2f78671d090 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Wed, 27 Nov 2019 15:37:39 -0800 Subject: [PATCH 036/383] Switch WrapDatasetVariantTest to use TF combinations --- .../experimental/kernel_tests/map_defun_op_test.py | 3 +-- 
.../experimental/kernel_tests/wrap_unwrap_test.py | 12 ++++++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py index 2c085352c50..a2cc54d104e 100644 --- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py @@ -32,7 +32,6 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_spec -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import data_flow_ops @@ -41,7 +40,7 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test -@test_util.run_v1_only("b/123903858: Add eager and V2 test coverage") +# TODO(b/123903858): Add eager and V2 test coverage class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): @combinations.generate( diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py index 09627d02994..b65c0fb260a 100644 --- a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py @@ -17,18 +17,20 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_dataset_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class WrapDatasetVariantTest(test_base.DatasetTestBase): +class WrapDatasetVariantTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testBasic(self): ds = dataset_ops.Dataset.range(100) ds_variant = ds._variant_tensor # pylint: disable=protected-access @@ -42,7 +44,9 @@ class WrapDatasetVariantTest(test_base.DatasetTestBase): for i in range(100): self.assertEqual(i, self.evaluate(get_next())) - @test_util.run_v1_only("b/123901304") + # TODO("b/123901304") + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) def testSkipEagerGPU(self): ds = dataset_ops.Dataset.range(100) ds_variant = ds._variant_tensor # pylint: disable=protected-access From 031e618642250baa2be5ea702a94157dbdbf93e7 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Sun, 1 Dec 2019 20:41:50 -0800 Subject: [PATCH 037/383] Address the comments --- .../kernel_tests/copy_to_device_test.py | 63 +++++++------------ .../experimental/kernel_tests/counter_test.py | 24 +++---- .../kernel_tests/parallel_interleave_test.py | 1 + .../parse_example_dataset_test.py | 9 +-- .../kernel_tests/prefetch_to_device_test.py | 21 +++---- .../kernel_tests/prefetch_with_slack_test.py | 2 +- .../kernel_tests/stats_dataset_ops_test.py | 2 +- .../experimental/kernel_tests/unique_test.py | 6 +- .../kernel_tests/wrap_unwrap_test.py | 2 +- 9 files 
changed, 47 insertions(+), 83 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py index 08769f9622f..2fa149fcbaa 100644 --- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py @@ -40,8 +40,7 @@ from tensorflow.python.util import compat as util_compat # TODO(b/117581999): add eager coverage when supported. class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -66,8 +65,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceInt32(self): host_dataset = dataset_ops.Dataset.from_tensors([0, 1, 2, 3]) device_dataset = host_dataset.apply( @@ -91,8 +89,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToSameDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -117,8 +114,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceWithPrefetch(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -143,8 +139,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyDictToDevice(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -169,8 +164,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyDictToDeviceWithPrefetch(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -195,8 +189,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopySparseTensorsToDevice(self): def make_tensor(i): @@ -229,8 +222,7 @@ class 
CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopySparseTensorsToDeviceWithPrefetch(self): def make_tensor(i): @@ -263,8 +255,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpu(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -285,8 +276,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuWithPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -307,8 +297,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuWithMap(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -346,8 +335,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuInt32(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -367,8 +355,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuInt32AndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -388,8 +375,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuStrings(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -409,8 +395,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuStringsAndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -430,8 +415,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): 
self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDevicePingPongCPUGPU(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -455,8 +439,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceWithReInit(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -485,8 +468,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceWithReInitAndPrefetch(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -515,8 +497,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuWithReInit(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -540,8 +521,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuWithReInitAndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -565,8 +545,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testIteratorGetNextAsOptionalOnGPU(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py index 11629573ad1..9bbc5789983 100644 --- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py @@ -29,25 +29,21 @@ from tensorflow.python.platform import test class CounterTest(test_base.DatasetTestBase, parameterized.TestCase): - @combinations.generate(test_base.default_test_combinations()) - def testCounter(self): + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(start=3, step=4, expected_output=[[3, 7, 11]]) + + combinations.combine( + start=0, step=-1, expected_output=[[0, -1, -2]]))) + def testCounter(self, start, step, expected_output): """Test dataset construction using `count`.""" - dataset = counter.Counter(start=3, step=4) + dataset = counter.Counter(start, step) self.assertEqual( [], 
dataset_ops.get_legacy_output_shapes(dataset).as_list()) self.assertEqual(dtypes.int64, dataset_ops.get_legacy_output_types(dataset)) get_next = self.getNext(dataset) - - negative_dataset = counter.Counter(start=0, step=-1) - negative_get_next = self.getNext(negative_dataset) - - self.assertEqual(3, self.evaluate(get_next())) - self.assertEqual(3 + 4, self.evaluate(get_next())) - self.assertEqual(3 + 2 * 4, self.evaluate(get_next())) - - self.assertEqual(0, self.evaluate(negative_get_next())) - self.assertEqual(-1, self.evaluate(negative_get_next())) - self.assertEqual(-2, self.evaluate(negative_get_next())) + for expected in expected_output: + self.assertEqual(expected, self.evaluate(get_next())) if __name__ == "__main__": diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py index 083c97b24b6..0fb8c78a7c0 100644 --- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py @@ -39,6 +39,7 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test +#TODO(feihugis): refactor this test to be parameterized. class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): def setUp(self): diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py index f82fcb5a3eb..58cba64617d 100644 --- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py @@ -147,8 +147,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) + @combinations.generate(test_base.graph_only_combinations()) def testEmptySerializedWithoutDefaultsShouldFail(self): input_features = { "st_a": @@ -182,8 +181,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_err=(errors_impl.InvalidArgumentError, "Feature: c \\(data type: float\\) is required")) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) + @combinations.generate(test_base.graph_only_combinations()) def testDenseNotMatchingShapeShouldFail(self): original = [ example(features=features({ @@ -741,8 +739,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_err=(ValueError, "Cannot reshape a tensor with 0 elements to shape")) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) + @combinations.generate(test_base.graph_only_combinations()) def testSerializedContainingVarLenDense(self): aname = "a" bname = "b" diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py index 1b6295864ea..8ac4e239881 100644 --- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py @@ -36,8 +36,7 @@ from tensorflow.python.platform import test # TODO(b/117581999): add eager coverage when supported. 
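The recurring change across these data-kernel test patches is to drop the legacy `@test_util.run_all_in_graph_and_eager_modes` / `@test_util.run_deprecated_v1` decorators in favor of per-test `@combinations.generate(...)` decorators. A minimal sketch of how the pieces fit together follows; the bodies of `default_test_combinations()` and `graph_only_combinations()` are assumptions inferred from how they are substituted in these diffs (their real definitions live in `test_base` and are not shown here):

```python
# Hypothetical sketch of the TF combinations pattern used in these patches.
# The helper bodies below are assumptions; the real ones live in
# tensorflow/python/data/kernel_tests/test_base.py.
from absl.testing import parameterized

from tensorflow.python.framework import combinations
from tensorflow.python.platform import test


def default_test_combinations():
  # Run under both API versions and both execution modes.
  return combinations.combine(tf_api_version=[1, 2], mode=["graph", "eager"])


def graph_only_combinations():
  # Used for tests that are not yet eager-compatible.
  return combinations.combine(tf_api_version=[1, 2], mode=["graph"])


class ExampleTest(parameterized.TestCase):

  # `combinations.times` takes the cross product, so this test method runs
  # once per (tf_api_version, mode, start, step) tuple.
  @combinations.generate(
      combinations.times(
          default_test_combinations(),
          combinations.combine(start=[0, 3], step=[1, 4])))
  def testStartStep(self, start, step):
    # Trivial stand-in assertion; the real tests build tf.data pipelines.
    self.assertEqual([start, start + step],
                     [start + i * step for i in range(2)])


if __name__ == "__main__":
  test.main()
```

The same machinery explains the CounterTest rewrite above: instead of asserting on two hand-built datasets inside one test body, the start/step/expected-output tuples are folded into the combinations, and each tuple becomes its own test instance.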
class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -61,8 +60,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToSameDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -87,8 +85,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchDictToDevice(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -112,8 +109,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchSparseTensorsToDevice(self): def make_tensor(i): return sparse_tensor.SparseTensorValue( @@ -143,8 +139,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToDeviceGpu(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -164,8 +159,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToDeviceWithReInit(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -193,8 +187,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate( - combinations.combine(tf_api_version=[1, 2], mode=["graph"])) + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToDeviceGpuWithReInit(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py index 43111faeb91..2b8d97d20b9 100644 --- a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py @@ -34,7 +34,7 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): # TODO(b/121264236) @combinations.generate( - 
combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) + combinations.combine(tf_api_version=[1], mode=["graph"])) def testPrefetchWithSlackOption(self): """Determines slack_period based on num devices attached to iterator.""" dataset = dataset_ops.Dataset.range(10) diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py index 756c1e0e743..5b3aaea95f6 100644 --- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py @@ -20,12 +20,12 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np -from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_base from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import stats_aggregator from tensorflow.python.data.experimental.ops import stats_ops +from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import combinations from tensorflow.python.framework import errors diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py index 2576c25f0f5..9a51c4224ff 100644 --- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py @@ -53,8 +53,7 @@ class UniqueTest(test_base.DatasetTestBase, parameterized.TestCase): for element in expected ]) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) + @combinations.generate(test_base.graph_only_combinations()) def testSimpleInt(self): for dtype in [dtypes.int32, dtypes.int64]: self._testSimpleHelper(dtype, [ @@ -67,8 +66,7 @@ class UniqueTest(test_base.DatasetTestBase, parameterized.TestCase): ([[1, 1], [1, 1], [2, 2], [3, 3], [1, 1]], [[1, 1], [2, 2], [3, 3]]), ]) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) + @combinations.generate(test_base.graph_only_combinations()) def testSimpleString(self): self._testSimpleHelper(dtypes.string, [ ([], []), diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py index b65c0fb260a..5d05332f0ab 100644 --- a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py @@ -46,7 +46,7 @@ class WrapDatasetVariantTest(test_base.DatasetTestBase, parameterized.TestCase): # TODO("b/123901304") @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph", "eager"])) + combinations.combine(tf_api_version=[1], mode=["graph"])) def testSkipEagerGPU(self): ds = dataset_ops.Dataset.range(100) ds_variant = ds._variant_tensor # pylint: disable=protected-access From 610a78b98569e2908809645626b4bd6afd2a22d8 Mon Sep 17 00:00:00 2001 From: Balint Cristian Date: Tue, 3 Dec 2019 13:29:47 +0200 Subject: [PATCH 038/383] Update systemlibs protobuf vars --- third_party/systemlibs/protobuf.BUILD | 1 - third_party/systemlibs/protobuf.bzl | 4 ++-- 2 files changed, 2 insertions(+), 3 
deletions(-) diff --git a/third_party/systemlibs/protobuf.BUILD b/third_party/systemlibs/protobuf.BUILD index ab96c253bb4..118135d1290 100644 --- a/third_party/systemlibs/protobuf.BUILD +++ b/third_party/systemlibs/protobuf.BUILD @@ -31,7 +31,6 @@ HEADERS = [ "google/protobuf/io/zero_copy_stream.h", "google/protobuf/io/zero_copy_stream_impl_lite.h", "google/protobuf/map.h", - "google/protobuf/port_def.inc", "google/protobuf/repeated_field.h", "google/protobuf/text_format.h", "google/protobuf/timestamp.pb.h", diff --git a/third_party/systemlibs/protobuf.bzl b/third_party/systemlibs/protobuf.bzl index 6818e4a4c0b..bb807e904a3 100644 --- a/third_party/systemlibs/protobuf.bzl +++ b/third_party/systemlibs/protobuf.bzl @@ -274,8 +274,8 @@ def internal_gen_well_known_protos_java(srcs): Args: srcs: the well known protos """ - root = Label("%s//protobuf_java" % (REPOSITORY_NAME)).workspace_root - pkg = PACKAGE_NAME + "/" if PACKAGE_NAME else "" + root = Label("%s//protobuf_java" % (native.repository_name())).workspace_root + pkg = native.package_name() + "/" if native.package_name() else "" if root == "": include = " -I%ssrc " % pkg else: From c45ea870d61ec7146bea04caea6d318dfec6e227 Mon Sep 17 00:00:00 2001 From: Fei Hu Date: Tue, 3 Dec 2019 08:40:35 -0800 Subject: [PATCH 039/383] Update the test cases for testNumThreadsDeprecated --- .../experimental/kernel_tests/override_threadpool_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py index 65565d183e2..d7944042c6e 100644 --- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py @@ -75,9 +75,7 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase, combinations.combine( num_threads=[1, 2, 4, 8, 16], max_intra_op_parallelism=[None]) + combinations.combine( - num_threads=[4], max_intra_op_parallelism=[0, 1, 4]) + - combinations.combine( - num_threads=[5], max_intra_op_parallelism=[-1]))) + num_threads=[4], max_intra_op_parallelism=[-1, 0, 4]))) def testNumThreadsDeprecated(self, num_threads, max_intra_op_parallelism): def override_threadpool_fn(dataset): From 25fc64af4437a43714646b8c9b502f25c42895ef Mon Sep 17 00:00:00 2001 From: amoitra Date: Tue, 3 Dec 2019 11:59:27 -0800 Subject: [PATCH 040/383] Revert change --- tensorflow/compiler/xla/debug_options_flags.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) mode change 100755 => 100644 tensorflow/compiler/xla/debug_options_flags.cc diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc old mode 100755 new mode 100644 index acffd7734b3..ec0059d37d9 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -42,8 +42,9 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_cpu_use_mkl_dnn(true); #endif // INTEL_MKL opts.set_xla_gpu_max_kernel_unroll_factor(4); - // Set cudnn batchnorm on by default. - opts.set_xla_gpu_use_cudnn_batchnorm(true); + // Set cudnn batchnorm off by default; it does not provide a performance win + // on average. + opts.set_xla_gpu_use_cudnn_batchnorm(false); // Run all GPU work on one stream by default. 
Using multiple streams // increases memory usage and we lack strong motivating benchmarks for tuning From c87a0e20d31cccd70a0f1c9687221b45066786fa Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 4 Dec 2019 00:32:52 +0000 Subject: [PATCH 041/383] Test multiple assignment and read_value=False --- .../experimental/autocast_variable_test.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py index 205adbb04eb..9a184fcc27e 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py @@ -311,6 +311,25 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): self.assertAllClose(3.14 * 2, self.evaluate(x.assign_add(3.14))) self.assertAllClose(3.14, self.evaluate(x.assign_sub(3.14))) + # Assign multiple times + assign = x.assign(1.) + self.assertAllClose(1., self.evaluate(assign)) + self.assertAllClose(0., self.evaluate(assign.assign(0.))) + assign_add = x.assign_add(3.14) + self.assertAllClose(3.14, self.evaluate(assign_add)) + self.assertAllClose(3.14 * 2, self.evaluate(assign_add.assign_add(3.14))) + assign_sub = x.assign_sub(3.14) + self.assertAllClose(3.14, self.evaluate(assign_sub)) + self.assertAllClose(0., self.evaluate(assign_sub.assign_sub(3.14))) + + # Assign with read_value=False + self.assertIsNone(self.evaluate(x.assign(1., read_value=False))) + self.assertAllClose(1., self.evaluate(x)) + self.assertIsNone(self.evaluate(x.assign_add(2., read_value=False))) + self.assertAllClose(3., self.evaluate(x)) + self.assertIsNone(self.evaluate(x.assign_sub(3., read_value=False))) + self.assertAllClose(0., self.evaluate(x)) + # Use the tf.assign functions instead of the var.assign methods. 
self.assertAllClose(0., self.evaluate(state_ops.assign(x, 0.))) self.assertAllClose(3.14, self.evaluate(state_ops.assign(x, 3.14))) From ccb2c904d6ee249cd65ef52ec14628b249383184 Mon Sep 17 00:00:00 2001 From: Bairen Yi Date: Fri, 22 Nov 2019 13:17:28 +0800 Subject: [PATCH 042/383] Add headers for networking C API in Python wheel Signed-off-by: Bairen Yi --- tensorflow/tools/pip_package/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD index ec0fa2ec2d2..0cdd013275f 100644 --- a/tensorflow/tools/pip_package/BUILD +++ b/tensorflow/tools/pip_package/BUILD @@ -26,6 +26,7 @@ package(default_visibility = ["//visibility:private"]) transitive_hdrs( name = "included_headers", deps = [ + "//tensorflow/c/experimental:network", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:lib", From 78fbe39329aaf4645c2a308eb805e3661e397bd3 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 4 Dec 2019 10:50:20 +0000 Subject: [PATCH 043/383] Only wrap assignment in AutoCastVariable if resource variable --- .../experimental/autocast_variable.py | 12 +++++++++--- .../experimental/autocast_variable_test.py | 18 +++++++++++------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py index 469c6902025..fbe0cde5e72 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py @@ -186,15 +186,21 @@ class AutoCastVariable(variables.Variable): def assign(self, value, use_locking=None, name=None, read_value=True): assign_op = self._variable.assign(value, use_locking, name, read_value) - return create_autocast_variable(assign_op) if read_value else assign_op + if read_value and resource_variable_ops.is_resource_variable(assign_op): + return create_autocast_variable(assign_op) + return assign_op def assign_add(self, delta, use_locking=None, name=None, read_value=True): assign_op = self._variable.assign_add(delta, use_locking, name, read_value) - return create_autocast_variable(assign_op) if read_value else assign_op + if read_value and resource_variable_ops.is_resource_variable(assign_op): + return create_autocast_variable(assign_op) + return assign_op def assign_sub(self, delta, use_locking=None, name=None, read_value=True): assign_op = self._variable.assign_sub(delta, use_locking, name, read_value) - return create_autocast_variable(assign_op) if read_value else assign_op + if read_value and resource_variable_ops.is_resource_variable(assign_op): + return create_autocast_variable(assign_op) + return assign_op def scatter_sub(self, sparse_delta, use_locking=False, name=None): var = self._variable.scatter_sub(sparse_delta, use_locking, name) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py index 9a184fcc27e..393cfa999a6 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable_test.py @@ -18,7 +18,6 @@ from __future__ import division from __future__ import print_function import os -from functools import partial from absl.testing import parameterized import numpy as np @@ -72,11 +71,6 @@ def get_var(val, dtype, name=None): 
@test_util.run_all_in_graph_and_eager_modes class AutoCastVariableTest(test.TestCase, parameterized.TestCase): - def check_and_evaluate(self, var, dtype=None): - self.assertIsInstance(var, autocast_variable.AutoCastVariable) - if dtype: - self.assertEqual(var.dtype, dtype) - return self.evaluate(var) @parameterized.named_parameters(*TESTCASES) def test_read(self, distribute): @@ -164,7 +158,17 @@ class AutoCastVariableTest(test.TestCase, parameterized.TestCase): # underlying variable. with get_distribute_scope(distribute): for read_dtype in (dtypes.float32, dtypes.float16): - evaluate = partial(self.check_and_evaluate, dtype=read_dtype) + if distribute: + # MirroredVariable.assign will (incorrectly) return a Mirrored value + # instead of a MirroredVariable. So we cannot properly wrap it in an + # AutoCastVariable. + evaluate = self.evaluate + else: + def evaluate(var): + self.assertIsInstance(var, autocast_variable.AutoCastVariable) + self.assertEqual(var.dtype, read_dtype) + return self.evaluate(var) + x = get_var(7., dtypes.float32) x = autocast_variable.create_autocast_variable(x) with ops.get_default_graph()._enable_auto_casting_variables( From 7f986cd6971a509305943594639893759cfe1acd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 03:19:00 -0800 Subject: [PATCH 044/383] Updated broken link in TF Lite iOS getting started doc. PiperOrigin-RevId: 283722601 Change-Id: Iecc8b3d13c001252668ee25e963dc8a3602db023 --- tensorflow/lite/g3doc/guide/ios.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/guide/ios.md b/tensorflow/lite/g3doc/guide/ios.md index 0c7e5dc9c90..fc997bccf9d 100644 --- a/tensorflow/lite/g3doc/guide/ios.md +++ b/tensorflow/lite/g3doc/guide/ios.md @@ -7,7 +7,7 @@ example: image classification example For an explanation of the source code, you should also read -[TensorFlow Lite iOS image classification](https://www.tensorflow.org/code/py/tensorflow_examples/lite/examples/image_classification/ios/EXPLORE_THE_CODE.md). +[TensorFlow Lite iOS image classification](https://www.tensorflow.org/lite/models/image_classification/ios). This example app uses [image classification](https://www.tensorflow.org/lite/models/image_classification/overview) From 813a645f1719c075c20d800ed1449543fb817c4e Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 4 Dec 2019 11:26:22 +0000 Subject: [PATCH 045/383] Fix typo in TFLite quantization spec --- tensorflow/lite/g3doc/performance/quantization_spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/performance/quantization_spec.md b/tensorflow/lite/g3doc/performance/quantization_spec.md index d6b7029ecfd..ea0dc644cce 100644 --- a/tensorflow/lite/g3doc/performance/quantization_spec.md +++ b/tensorflow/lite/g3doc/performance/quantization_spec.md @@ -75,7 +75,7 @@ $A$ is a $m \times n$ matrix of quantized activations.
$B$ is a $n \times p$ matrix of quantized weights.
Consider multiplying the $j$th row of $A$, $a_j$ by the $k$th column of $B$, $b_k$, both of length $n$. The quantized integer values and -zero-points values are $q_a$, $z_a$ and $q_b$, $q_b$ respectively. +zero-points values are $q_a$, $z_a$ and $q_b$, $z_b$ respectively. $$a_j \cdot b_k = \sum_{i=0}^{n} a_{j}^{(i)} b_{k}^{(i)} = \sum_{i=0}^{n} (q_{a}^{(i)} - z_a) (q_{b}^{(i)} - z_b) = From fd6d6f5eda0f1520fc49e60cf88946b53a7a9c14 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 4 Dec 2019 11:28:56 +0000 Subject: [PATCH 046/383] Fix rendering of "and" in MathJax --- tensorflow/lite/g3doc/performance/quantization_spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/performance/quantization_spec.md b/tensorflow/lite/g3doc/performance/quantization_spec.md index ea0dc644cce..4d11906bde4 100644 --- a/tensorflow/lite/g3doc/performance/quantization_spec.md +++ b/tensorflow/lite/g3doc/performance/quantization_spec.md @@ -87,7 +87,7 @@ $$a_j \cdot b_k = \sum_{i=0}^{n} a_{j}^{(i)} b_{k}^{(i)} = The \\(\sum_{i=0}^{n} q_{a}^{(i)} q_{b}^{(i)}\\) term is unavoidable since it’s performing the dot product of the input value and the weight value. -The $$\sum_{i=0}^{n} q_{b}^{(i)} z_a and \sum_{i=0}^{n} z_a z_b$$ terms are made +The $$\sum_{i=0}^{n} q_{b}^{(i)} z_a$$ and $$\sum_{i=0}^{n} z_a z_b$$ terms are made up of constants that remain the same per inference invocation, and thus can be pre-calculated. From fd9697d81d3977b312763ce8a4949bc20d7b2f50 Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Wed, 4 Dec 2019 04:58:12 -0800 Subject: [PATCH 047/383] minor spelling tweaks Closes #250 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/250 from kiszk:spelling_tweaks_201911 50fc04443723190b764e824b6fcd2469fecb56e6 PiperOrigin-RevId: 283733032 Change-Id: Ie3099f3e02f59c5b54c2c7767fc9d282a223b12b --- third_party/mlir/g3doc/DeclarativeRewrites.md | 2 +- third_party/mlir/g3doc/Dialects/SPIR-V.md | 2 +- third_party/mlir/g3doc/OpDefinitions.md | 2 +- third_party/mlir/g3doc/Tutorials/Toy/Ch-2.md | 2 +- third_party/mlir/g3doc/Tutorials/Toy/Ch-5.md | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/third_party/mlir/g3doc/DeclarativeRewrites.md b/third_party/mlir/g3doc/DeclarativeRewrites.md index c7276daccd8..e319b7d7a83 100644 --- a/third_party/mlir/g3doc/DeclarativeRewrites.md +++ b/third_party/mlir/g3doc/DeclarativeRewrites.md @@ -50,7 +50,7 @@ features: * Matching and generating ops with block arguments. * Matching multi-result ops in nested patterns. * Matching and generating variadic operand/result ops in nested patterns. -* Packing and unpacking variaidc operands/results during generation. +* Packing and unpacking variadic operands/results during generation. * [`NativeCodeCall`](#native-code-call-transforming-the-generated-op) returning more than one results. diff --git a/third_party/mlir/g3doc/Dialects/SPIR-V.md b/third_party/mlir/g3doc/Dialects/SPIR-V.md index 82922de6d11..58bd5ee828f 100644 --- a/third_party/mlir/g3doc/Dialects/SPIR-V.md +++ b/third_party/mlir/g3doc/Dialects/SPIR-V.md @@ -474,7 +474,7 @@ the representational differences between SPIR-V dialect and binary format: Similarly, a few transformations are performed during deserialization: * Instructions for execution environment requirements will be placed as - attribues on `spv.module`. + attributes on `spv.module`. * `OpConstant*` instructions are materialized as `spv.constant` at each use site. * `OpPhi` instructions are converted to block arguments. 
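Stepping back to the quantized matmul expansion touched by the two quantization-spec fixes above: a small numeric check (values are invented for illustration, not taken from the spec) confirms that the zero-point terms separate out exactly as described, with only the \\(\sum_i q_{a}^{(i)} q_{b}^{(i)}\\) term depending on both runtime tensors.

```python
# Quick numeric check of the quantized dot-product expansion discussed in the
# quantization-spec hunks above. Values are made up for illustration only.
import numpy as np

q_a = np.array([3, 1, 4, 1, 5])   # quantized activations, row a_j
q_b = np.array([2, 7, 1, 8, 2])   # quantized weights, column b_k
z_a, z_b = 1, 2                   # zero points

# Direct form: sum_i (q_a - z_a) * (q_b - z_b).
direct = int(np.sum((q_a - z_a) * (q_b - z_b)))

# Expanded form. Only the first term couples both runtime tensors; the
# q_b * z_a and z_a * z_b terms are constants that can be pre-calculated.
expanded = (int(np.sum(q_a * q_b))
            - z_b * int(np.sum(q_a))
            - z_a * int(np.sum(q_b))
            + len(q_a) * z_a * z_b)

assert direct == expanded
print(direct, expanded)  # both print -3
```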
diff --git a/third_party/mlir/g3doc/OpDefinitions.md b/third_party/mlir/g3doc/OpDefinitions.md index 25865593800..b72b9937ebb 100644 --- a/third_party/mlir/g3doc/OpDefinitions.md +++ b/third_party/mlir/g3doc/OpDefinitions.md @@ -263,7 +263,7 @@ TODO: Design and implement more primitive constraints Similar to operands, results are specified inside the `dag`-typed `results`, led by `outs`: -```tablgen +```tablegen let results = (outs :$, ... diff --git a/third_party/mlir/g3doc/Tutorials/Toy/Ch-2.md b/third_party/mlir/g3doc/Tutorials/Toy/Ch-2.md index d797624ed72..ce46788f4ae 100755 --- a/third_party/mlir/g3doc/Tutorials/Toy/Ch-2.md +++ b/third_party/mlir/g3doc/Tutorials/Toy/Ch-2.md @@ -434,7 +434,7 @@ invariants of the operation have already been verified: ```tablegen def ConstantOp : Toy_Op<"constant", [NoSideEffect]> { // Provide a summary and description for this operation. This can be used to - // auto-generate documenatation of the operations within our dialect. + // auto-generate documentation of the operations within our dialect. let summary = "constant operation"; let description = [{ Constant operation turns a literal into an SSA value. The data is attached diff --git a/third_party/mlir/g3doc/Tutorials/Toy/Ch-5.md b/third_party/mlir/g3doc/Tutorials/Toy/Ch-5.md index 2cf5e59d1b3..5573354aef1 100644 --- a/third_party/mlir/g3doc/Tutorials/Toy/Ch-5.md +++ b/third_party/mlir/g3doc/Tutorials/Toy/Ch-5.md @@ -118,8 +118,8 @@ struct TransposeOpLowering : public mlir::ConversionPattern { // This allows for using the nice named accessors that are generated // by the ODS. This adaptor is automatically provided by the ODS // framework. - TransposeOpOperandAdaptor tranposeAdaptor(memRefOperands); - mlir::Value *input = tranposeAdaptor.input(); + TransposeOpOperandAdaptor transposeAdaptor(memRefOperands); + mlir::Value *input = transposeAdaptor.input(); // Transpose the elements by generating a load from the reverse // indices. From 222977dffd1689625794a13d9cc2be3d7cbe3bad Mon Sep 17 00:00:00 2001 From: Guangda Lai Date: Wed, 4 Dec 2019 05:53:57 -0800 Subject: [PATCH 048/383] Fix the unit test broken by PR #34288. 
PiperOrigin-RevId: 283739627 Change-Id: If2de1fd30bd8c0e3c77d63ac2b594c2a71383ac5 --- .../tf2tensorrt/convert/convert_nodes_test.cc | 34 +++++-------------- 1 file changed, 9 insertions(+), 25 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 6c2b8fdc091..ef03ab91714 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -654,9 +654,8 @@ class ConverterTest : public ::testing::Test { ConverterTest() { Reset(); } void Reset() { - builder_.reset(nvinfer1::createInferBuilder(logger_)); converter_ = - std::move(Converter::Create(builder_.get(), TrtPrecisionMode::FP32, + std::move(Converter::Create(TrtPrecisionMode::FP32, /*use_calibration=*/false, &logger_) .ValueOrDie()); weight_store_ = &converter_->weight_store_; @@ -702,9 +701,6 @@ class ConverterTest : public ::testing::Test { private: Logger logger_; - // These members are ordered in a way such that the destruction order is: - // converter_ -> builder_ - TrtUniquePtrType builder_; protected: std::unique_ptr converter_; @@ -996,9 +992,7 @@ TEST_F(ConverterTest, MaybeApplyQuantizationRanges) { FakeITensor input, infer_1, infer_2, infer_3; FakeITensor not_infer; Logger logger; - TrtUniquePtrType builder( - nvinfer1::createInferBuilder(logger)); - auto int8_converter = Converter::Create(builder.get(), TrtPrecisionMode::INT8, + auto int8_converter = Converter::Create(TrtPrecisionMode::INT8, /*use_calibration=*/true, &logger) .ValueOrDie(); int8_converter->ProvideQuantizationRange(&input, -5.0f, 5.0f); @@ -1255,12 +1249,8 @@ class OpConverterTest : public ::testing::Test { engine_.reset(nullptr); // Re-create them in proper order. - builder_.reset(nvinfer1::createInferBuilder(logger_)); - builder_->setMaxWorkspaceSize(1 << 26); - - // Reset the converter. converter_ = - std::move(Converter::Create(builder_.get(), precision_mode_to_test_, + std::move(Converter::Create(precision_mode_to_test_, /*use_calibration=*/false, &logger_) .ValueOrDie()); @@ -1294,18 +1284,13 @@ class OpConverterTest : public ::testing::Test { TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(output_info)); // Build the TRT engine. - if (precision_mode == TrtPrecisionMode::FP16) { - builder_->setFp16Mode(true); - } else if (precision_mode == TrtPrecisionMode::INT8) { - // Setting FP16 mode as well allows TRT to also consider FP16 kernels and - // use them in situations where they are faster than INT8 or where INT8 is - // not supported for a given layer. - builder_->setFp16Mode(true); - builder_->setInt8Mode(true); - } ASSERT_EQ(nullptr, engine_.get()); - builder_->setMaxBatchSize(batch_size); - TF_ASSERT_OK(converter_->BuildCudaEngine(&engine_)); + TF_ASSERT_OK( + converter_->BuildCudaEngine(&engine_, + /*max_batch_size=*/batch_size, + /*max_workspace_size_bytes=*/1 << 26, + /*allocator=*/nullptr, + /*calibrator=*/nullptr)); CHECK_NOTNULL(engine_.get()); CheckDataTypeMatches(input_data); CheckDataTypeMatches(*output_data); @@ -1473,7 +1458,6 @@ class OpConverterTest : public ::testing::Test { private: Logger logger_; - TrtUniquePtrType builder_; TrtUniquePtrType engine_; cudaStream_t stream_; // Used to create placeholders with shape and data type information. The From 8f661bace20b40d7a6d1de8294bcc283e87627e9 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 4 Dec 2019 06:53:07 -0800 Subject: [PATCH 049/383] Adds support for unrolling single-result vector operations with iterator type lists and indexing maps to a target vector size. Adds unit tests for unrolling the vector ContractionOp with different iteration orders. PiperOrigin-RevId: 283747503 Change-Id: Ib7e4f757d15760cd89fc09fedc49b7a2dc2a1fe6 --- .../mlir/Dialect/VectorOps/VectorOps.td | 12 + .../VectorOps/VectorTransformPatterns.td | 5 + .../mlir/lib/Dialect/VectorOps/VectorOps.cpp | 38 +++ .../lib/Dialect/VectorOps/VectorToVector.cpp | 270 ++++++++++++++++-- 4 files changed, 304 insertions(+), 21 deletions(-) diff --git a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td index d34fa9a245d..36c26fe577f 100644 --- a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td +++ b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td @@ -157,6 +157,18 @@ def Vector_ContractionOp : static StringRef getParallelIteratorTypeName() { return "parallel"; } + + // Returns the bounds of each dimension in the iteration space spanned + // by the iterator types of this operation. + void getIterationBounds(SmallVectorImpl &iterationBounds); + + // Returns a list of index maps, where there is a list entry for each + // op indexing map attribute (i.e. one for each input and output, with + // the output listed last). Each index map, maps from this operations + // iteration space, to vector dimensions of the maps input/output. + void getIterationIndexMap( + std::vector> &iterationIndexMap); + std::vector> getContractingDimMap(); std::vector> getBatchDimMap(); }]; diff --git a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorTransformPatterns.td b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorTransformPatterns.td index fe0940c0d76..e71679620d6 100644 --- a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorTransformPatterns.td +++ b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorTransformPatterns.td @@ -40,4 +40,9 @@ def : Pat<(AddFOp:$op_results $a, $b), (UnrollVectorOp<[2, 2]> $op_results, $a, $b), [(Constraint> $a)]>; +// TODO(andydavis) Add Constraints on lhs/rhs shapes. +def : Pat<(Vector_ContractionOp:$op_results $a, $b, $c, $masks, $attr0, $attr1), + (UnrollVectorOp<[2, 2, 2]> $op_results, $a, $b, $c), + [(Constraint> $c)]>; + #endif // VECTOR_TRANSFORMS diff --git a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp index 7f3be9d9fa9..ab457a6b833 100644 --- a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp +++ b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp @@ -271,6 +271,44 @@ getDimMap(ArrayRef indexingMaps, ArrayAttr iteratorTypes, return dimMap; } +void ContractionOp::getIterationBounds( + SmallVectorImpl &iterationBounds) { + auto lhsShape = getLhsType().getShape(); + auto resShape = getResultType().getShape(); + SmallVector indexingMaps(getIndexingMaps()); + SmallVector iterationShape; + for (auto it : llvm::enumerate(iterator_types())) { + // Search lhs/rhs map results for 'targetExpr'. + auto targetExpr = getAffineDimExpr(it.index(), getContext()); + auto iteratorTypeName = it.value().cast().getValue(); + if (iteratorTypeName == getReductionIteratorTypeName()) { + // Get reduction dim size from lhs shape (same size in rhsShape). 
+ int64_t lhsDimIndex = getResultIndex(indexingMaps[0], targetExpr); + assert(lhsDimIndex >= 0); + iterationBounds.push_back(lhsShape[lhsDimIndex]); + continue; + } + // Get parallel dimension size from result shape. + int64_t resDimIndex = getResultIndex(indexingMaps[2], targetExpr); + assert(resDimIndex >= 0); + iterationBounds.push_back(resShape[resDimIndex]); + } +} + +void ContractionOp::getIterationIndexMap( + std::vector> &iterationIndexMap) { + unsigned numMaps = indexing_maps().getValue().size(); + iterationIndexMap.resize(numMaps); + for (auto it : llvm::enumerate(indexing_maps())) { + auto index = it.index(); + auto map = it.value().cast().getValue(); + for (unsigned i = 0, e = map.getNumResults(); i < e; ++i) { + auto dim = map.getResult(i).cast(); + iterationIndexMap[index][dim.getPosition()] = i; + } + } +} + std::vector> ContractionOp::getContractingDimMap() { SmallVector indexingMaps(getIndexingMaps()); return getDimMap(indexingMaps, iterator_types(), diff --git a/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp b/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp index 1e2e651189f..0952312b67d 100644 --- a/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp +++ b/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp @@ -77,6 +77,15 @@ static int64_t computeMaxLinearIndex(ArrayRef basis) { return res; } +/// Computes and returns the linearized index of 'offsets' w.r.t. 'basis'. +static int64_t linearize(ArrayRef offsets, ArrayRef basis) { + assert(offsets.size() == basis.size()); + int64_t linearIndex = 0; + for (unsigned idx = 0, e = basis.size(); idx < e; ++idx) + linearIndex += offsets[idx] * basis[idx]; + return linearIndex; +} + /// Given a shape with sizes greater than 0 along all dimensions, returns the /// delinearized components of linearIndex along shape. static SmallVector delinearize(int64_t linearIndex, @@ -151,9 +160,9 @@ static Operation *cloneOpWithOperandsAndTypes(PatternRewriter &builder, Location loc, Operation *op, ArrayRef operands, ArrayRef resultTypes) { - OperationState *res = new OperationState(loc, op->getName().getStringRef(), - operands, resultTypes, {}); - return builder.createOperation(*res); + OperationState res(loc, op->getName().getStringRef(), operands, resultTypes, + op->getAttrs()); + return builder.createOperation(res); } // Helper function for Tablegen. @@ -164,6 +173,223 @@ static bool hasShape(Value *v, ArrayRef shape) { return std::equal(t.getShape().begin(), t.getShape().end(), shape.begin()); } +static Value *makeSplatZero(Location loc, PatternRewriter &rewriter, + VectorType vt) { + auto t = vt.getElementType(); + Value *f = nullptr; + if (t.isBF16() || t.isF16()) + f = rewriter.create(loc, t, rewriter.getF64FloatAttr(0.0f)); + else if (t.isF32()) + f = rewriter.create(loc, t, rewriter.getF32FloatAttr(0.0f)); + else if (t.isF64()) + f = rewriter.create(loc, t, rewriter.getF64FloatAttr(0.0f)); + if (f) + return rewriter.create(loc, vt, f); + llvm_unreachable("Unsupported type in `makeSplatZero`"); +} + +// Populates 'resultElements[indexMap[i]]' with elements from 'inputElements[i]' +// for each index 'i' in inputElements with a valid mapping in 'indexMap'. 
+static void getMappedElements(const DenseMap &indexMap, + ArrayRef inputElements, + SmallVectorImpl &resultElements) { + assert(indexMap.size() == resultElements.size()); + assert(inputElements.size() >= resultElements.size()); + for (unsigned i = 0, e = inputElements.size(); i < e; ++i) { + auto it = indexMap.find(i); + if (it != indexMap.end()) + resultElements[it->second] = inputElements[i]; + } +} + +// UnrolledOperandState aggregates per-operand state required for op unrolling. +struct UnrolledOperandState { + Value *operand; + SmallVector unrolledShape; + SmallVector unrollFactors; + SmallVector basis; + int64_t numInstances; +}; + +// Populates 'state' with unrolled shape, unroll factors, basis and +// num unrolled instances for 'operand'. +static void getUnrolledOperandState(Value *operand, + const DenseMap &indexMap, + ArrayRef targetShape, + UnrolledOperandState &state) { + auto vectorType = operand->getType().cast(); + state.operand = operand; + // Compute unrolled shape of 'operand'. + state.unrolledShape.resize(vectorType.getRank()); + getMappedElements(indexMap, targetShape, state.unrolledShape); + // Compute unroll factors for unrolled shape. + auto maybeUnrollFactors = + shapeRatio(vectorType.getShape(), state.unrolledShape); + assert(maybeUnrollFactors.hasValue()); + state.unrollFactors = *maybeUnrollFactors; + // Compute 'basis' and 'numInstances' based on 'state.unrollFactors'. + state.basis = computeStrides(state.unrollFactors); + state.numInstances = computeMaxLinearIndex(state.unrollFactors); +} + +// Computes and returns the linear index of the unrolled vector at +// 'vectorOffsets' within the vector operand represented by 'state'. +static int64_t +getUnrolledOperandLinearIndex(UnrolledOperandState &state, + ArrayRef vectorOffsets, + DenseMap &indexMap) { + // Compute operand offsets. + SmallVector sliceOffsets(state.unrolledShape.size()); + getMappedElements(indexMap, vectorOffsets, sliceOffsets); + // Compute and return linear index of 'sliceOffsets' w.r.t 'state.basis'. + return linearize(sliceOffsets, state.basis); +} + +// Returns an unrolled vector at 'vectorOffsets' within the vector operand +// represented by 'state'. The value is created if not present in 'cache'. +static Value *getOrCreateUnrolledOperandSlice( + Location loc, UnrolledOperandState &state, ArrayRef vectorOffsets, + ArrayRef offsets, DenseMap &indexMap, + SmallVectorImpl &cache, PatternRewriter &builder) { + // Compute operand offsets. + SmallVector sliceOffsets(state.unrolledShape.size()); + getMappedElements(indexMap, offsets, sliceOffsets); + // TODO(b/144845578) Support non-1 strides. + SmallVector sliceStrides(state.unrolledShape.size(), 1); + // Compute linear index of 'sliceOffsets' w.r.t 'state.basis'. + int64_t sliceLinearIndex = + getUnrolledOperandLinearIndex(state, vectorOffsets, indexMap); + assert(sliceLinearIndex < static_cast(cache.size())); + auto *operandSlice = cache[sliceLinearIndex]; + if (operandSlice == nullptr) { + // Initialize 'cache' with slice from 'state.operand'. + operandSlice = builder.create( + loc, state.operand, sliceOffsets, state.unrolledShape, sliceStrides); + // Store value back to 'cache'. + cache[sliceLinearIndex] = operandSlice; + } + return operandSlice; +} + +// +// unrollSingleResultStructuredOp +// +// Returns a value representing the result of structured operation 'op' +// with iteration bounds 'iterationBounds' unrolled to 'targetShape'. 
+// An iteration space index map argument 'iterationIndexMapList' must be +// specified, with a map for each structured op input and a single map for the +// single result. The last map in the list must be the single result map. +// Extra operands can be passed to unrolled instances of 'op' using the +// 'extraOperands' argument. +// +// Example: +// +// // Before unrolling +// +// operand0 operand1 operand2 +// \ | / +// -------------------- opA -------------------- +// +// // After unrolling by 2 +// +// operand0 operand1 operand2 +// / \ / \ / \ +// slice00 slice01 slice10 slice11 slice20 slice21 +// \ | | | / | +// -------------------- opA0 -------------------- | +// | | | | +// \ | | / +// -------------------- opA1 ------------------- +// | | +// \ / +// insertslice +// | + +// TODO(andydavis) Generalize this to support structured ops beyond +// vector ContractionOp, and merge it with 'unrollSingleResultOpMatchingType' +static Value *unrollSingleResultStructuredOp( + Operation *op, ArrayRef iterationBounds, + std::vector> &iterationIndexMapList, + ArrayRef targetShape, ArrayRef extraOperands, + PatternRewriter &builder) { + auto shapedType = op->getResult(0)->getType().dyn_cast_or_null(); + if (!shapedType || !shapedType.hasStaticShape()) + assert(false && "Expected a statically shaped result type"); + + // Compute unroll factors for 'iterationBounds' based on 'targetShape' + auto maybeUnrollFactors = shapeRatio(iterationBounds, targetShape); + if (!maybeUnrollFactors.hasValue()) + assert(false && "Failed to compute unroll factors for target shape"); + auto unrollFactors = *maybeUnrollFactors; + + // Compute unrolled operation state for each mapped operand. + unsigned numMaps = iterationIndexMapList.size(); + SmallVector unrolledOperandState(numMaps); + assert(op->getNumOperands() >= numMaps); + for (unsigned i = 0; i < numMaps; ++i) { + getUnrolledOperandState(op->getOperand(i), iterationIndexMapList[i], + targetShape, unrolledOperandState[i]); + } + // Compute number of total unrolled instances. + auto numUnrolledInstances = computeMaxLinearIndex(unrollFactors); + auto basis = computeStrides(unrollFactors); + + auto &resultOperandState = unrolledOperandState[numMaps - 1]; + auto unrolledResultType = VectorType::get(resultOperandState.unrolledShape, + shapedType.getElementType()); + + // Initialize caches for intermediate vector results. + std::vector> caches(numMaps); + for (unsigned i = 0; i < numMaps; ++i) { + caches[i].resize(unrolledOperandState[i].numInstances); + } + + // Unroll 'numUnrolledInstances' of 'op', storing results in 'caches'. + for (unsigned i = 0; i < numUnrolledInstances; ++i) { + // De-linearize w.r.t. 'basis'. + auto vectorOffsets = delinearize(i, basis); + // Convert from unrolled vector-space offsets to element-space offsets. + auto offsets = zipMap([](int64_t v1, int64_t v2) { return v1 * v2; }, + vectorOffsets, targetShape); + // Get cached slice (or create slice) for each operand at 'offsets'. + SmallVector operands; + operands.reserve(numMaps); + for (unsigned i = 0; i < numMaps; ++i) { + operands.push_back(getOrCreateUnrolledOperandSlice( + op->getLoc(), unrolledOperandState[i], vectorOffsets, offsets, + iterationIndexMapList[i], caches[i], builder)); + } + // Create op on sliced vector arguments. + operands.append(extraOperands.begin(), extraOperands.end()); + auto resultVector = + cloneOpWithOperandsAndTypes(builder, op->getLoc(), op, operands, + unrolledResultType) + ->getResult(0); + + // Compute linear result index. 
+ int64_t resultIndex = getUnrolledOperandLinearIndex( + resultOperandState, vectorOffsets, iterationIndexMapList[numMaps - 1]); + // Update result cache at 'resultIndex'. + caches[numMaps - 1][resultIndex] = resultVector; + } + + // Make zero splat into which we will insert results from 'cache[numMaps - 1]' + auto resultVectorType = op->getResult(0)->getType().cast(); + auto *res = makeSplatZero(op->getLoc(), builder, resultVectorType); + SmallVector strides(resultOperandState.unrollFactors.size(), 1); + // Insert vector accumulators into output. + for (unsigned i = 0; i < resultOperandState.numInstances; ++i) { + auto vectorOffsets = delinearize(i, resultOperandState.basis); + // Convert from unrolled vector-space offsets to element-space offsets. + auto offsets = zipMap([](int64_t v1, int64_t v2) { return v1 * v2; }, + vectorOffsets, resultOperandState.unrolledShape); + res = builder.create( + op->getLoc(), caches[numMaps - 1][i], res, offsets, strides); + } + + return res; +} + // Entry point for unrolling declarative pattern rewrites. // `op` is unrolled to the `targetShape` as follows, for each of its operands: // 1. the unrolled type `unrolledVectorType` and number of unrolled instances @@ -200,6 +426,26 @@ static bool hasShape(Value *v, ArrayRef shape) { Value * mlir::vector::unrollSingleResultOpMatchingType(PatternRewriter &builder, Operation *op, ArrayRef targetShape) { + if (auto contractionOp = dyn_cast(op)) { + // Get contraction op iteration bounds. + SmallVector iterationBounds; + contractionOp.getIterationBounds(iterationBounds); + assert(iterationBounds.size() == targetShape.size()); + // Get map from iteration space index to lhs/rhs/result shape index. + std::vector> iterationIndexMapList; + contractionOp.getIterationIndexMap(iterationIndexMapList); + // TODO(andydavis) Support unrollable vector masks. + SmallVector masks(contractionOp.masks().begin(), + contractionOp.masks().end()); + // Unroll 'op' 'iterationBounds' to 'targetShape'. + return unrollSingleResultStructuredOp(op, iterationBounds, + iterationIndexMapList, targetShape, + masks, builder); + } + // TODO(andydavis) Create trivial iteration bounds and index map for + // elementwise operations and call 'unrollSingleResultStructuredOp'. Remove + // fakefork/join if possible. + LLVM_DEBUG(dbgs() << "\n[" DEBUG_TYPE "]: unrollSingleResultOpMatchingType on func:\n"); LLVM_DEBUG(op->getParentOfType().print(dbgs())); @@ -365,24 +611,6 @@ struct ConvertFakeForkFromBlockArgsOp : public RewritePattern { } }; -static Value *makeSplatZero(Location loc, PatternRewriter &rewriter, - VectorType vt) { - auto t = vt.getElementType(); - Value *f = nullptr; - if (t.isBF16() || t.isF16()) - f = rewriter.create(loc, t, rewriter.getF16FloatAttr(0.0f)) - .getResult(); - else if (t.isF32()) - f = rewriter.create(loc, t, rewriter.getF32FloatAttr(0.0f)) - .getResult(); - else if (t.isF64()) - f = rewriter.create(loc, t, rewriter.getF64FloatAttr(0.0f)) - .getResult(); - if (f) - return rewriter.create(loc, vt, f).getResult(); - llvm_unreachable("Unsupported type in `makeSplatZero`"); -} - // Rewrites a fakeJoin, whose (unique) operand is a blockArgument, into multiple // vector.strided_slice ops. struct ConvertFakeJoinOp : public RewritePattern { From 48200148281358406eef4e0874a526775e7671f1 Mon Sep 17 00:00:00 2001 From: Julian Gross Date: Wed, 4 Dec 2019 07:17:01 -0800 Subject: [PATCH 050/383] Added new FAbs, FCeil, Cos, Neg, Sign, Tanh operations. 
Closes #251 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/251 from dfki-jugr:new_ops 0398997bf9953016898f873068e22916a062eb2b PiperOrigin-RevId: 283750699 Change-Id: Iada2142257da7e4bade7183d84e7434a6c06707e --- third_party/mlir/g3doc/Dialects/Standard.md | 156 +++++ .../include/mlir/Dialect/StandardOps/Ops.td | 543 ++++++++++-------- 2 files changed, 458 insertions(+), 241 deletions(-) diff --git a/third_party/mlir/g3doc/Dialects/Standard.md b/third_party/mlir/g3doc/Dialects/Standard.md index ed650a56636..cbea654a256 100644 --- a/third_party/mlir/g3doc/Dialects/Standard.md +++ b/third_party/mlir/g3doc/Dialects/Standard.md @@ -454,6 +454,84 @@ tensor_store %8, %10 : memref<4x?xf32, #layout, memspace0> ## Unary Operations +### 'absf' operation + +Syntax: + +``` {.ebnf} +operation ::= ssa-id `=` `absf` ssa-use `:` type +``` + +Examples: + +```mlir {.mlir} +// Scalar absolute value. +%a = absf %b : f64 + +// SIMD vector element-wise absolute value. +%f = absf %g : vector<4xf32> + +// Tensor element-wise absolute value. +%x = absf %y : tensor<4x?xf8> +``` + +The `absf` operation computes the absolute value. It takes one operand and +returns one result of the same type. This type may be a float scalar type, a +vector whose element type is float, or a tensor of floats. It has no standard +attributes. + +### 'ceilf' operation + +Syntax: + +``` {.ebnf} +operation ::= ssa-id `=` `ceilf` ssa-use `:` type +``` + +Examples: + +```mlir {.mlir} +// Scalar ceiling value. +%a = ceilf %b : f64 + +// SIMD vector element-wise ceiling value. +%f = ceilf %g : vector<4xf32> + +// Tensor element-wise ceiling value. +%x = ceilf %y : tensor<4x?xf8> +``` + +The `ceilf` operation computes the ceiling of a given value. It takes one +operand and returns one result of the same type. This type may be a float +scalar type, a vector whose element type is float, or a tensor of floats. It +has no standard attributes. + +### 'cos' operation + +Syntax: + +``` {.ebnf} +operation ::= ssa-id `=` `cos` ssa-use `:` type +``` + +Examples: + +```mlir {.mlir} +// Scalar cosine value. +%a = cos %b : f64 + +// SIMD vector element-wise cosine value. +%f = cos %g : vector<4xf32> + +// Tensor element-wise cosine value. +%x = cos %y : tensor<4x?xf8> +``` + +The `cos` operation computes the cosine of a given value. It takes one operand +and returns one result of the same type. This type may be a float scalar type, +a vector whose element type is float, or a tensor of floats. It has no standard +attributes. + ### 'exp' operation Syntax: @@ -479,6 +557,58 @@ The `exp` operation takes one operand and returns one result of the same type. This type may be a float scalar type, a vector whose element type is float, or a tensor of floats. It has no standard attributes. +### 'negf' operation + +Syntax: + +``` {.ebnf} +operation ::= ssa-id `=` `negf` ssa-use `:` type +``` + +Examples: + +```mlir {.mlir} +// Scalar negation value. +%a = negf %b : f64 + +// SIMD vector element-wise negation value. +%f = negf %g : vector<4xf32> + +// Tensor element-wise negation value. +%x = negf %y : tensor<4x?xf8> +``` + +The `negf` operation computes the negation of a given value. It takes one +operand and returns one result of the same type. This type may be a float +scalar type, a vector whose element type is float, or a tensor of floats. It +has no standard attributes. + +### 'tanh' operation + +Syntax: + +``` {.ebnf} +operation ::= ssa-id `=` `tanh` ssa-use `:` type +``` + +Examples: + +```mlir {.mlir} +// Scalar hyperbolic tangent value. 
+%a = tanh %b : f64 + +// SIMD vector element-wise hyperbolic tangent value. +%f = tanh %g : vector<4xf32> + +// Tensor element-wise hyperbolic tangent value. +%x = tanh %y : tensor<4x?xf8> +``` + +The `tanh` operation computes the hyperbolic tangent. It takes one operand and +returns one result of the same type. This type may be a float scalar type, a +vector whose element type is float, or a tensor of floats. It has no standard +attributes. + ## Arithmetic Operations Basic arithmetic in MLIR is specified by standard operations described in this @@ -675,6 +805,32 @@ compiler is multithreaded, and disallowing SSA values to directly reference a function simplifies this ([rationale](../Rationale.md#multithreading-the-compiler)). +### 'copysign' operation + +Syntax: + +``` {.ebnf} +operation ::= ssa-id `=` `copysign` ssa-use `:` type +``` + +Examples: + +```mlir {.mlir} +// Scalar copysign value. +%a = copysign %b %c : f64 + +// SIMD vector element-wise copysign value. +%f = copysign %g %h : vector<4xf32> + +// Tensor element-wise copysign value. +%x = copysign %y %z : tensor<4x?xf8> +``` + +The `copysign` returns a value with the magnitude of the first operand and the +sign of the second operand. It takes two operands and returns one result of the +same type. This type may be a float scalar type, a vector whose element type is +float, or a tensor of floats. It has no standard attributes. + ### 'divis' operation Signed integer division. Rounds towards zero. Treats the leading bit as sign, diff --git a/third_party/mlir/include/mlir/Dialect/StandardOps/Ops.td b/third_party/mlir/include/mlir/Dialect/StandardOps/Ops.td index 51c7bfbccdc..e7439e49502 100644 --- a/third_party/mlir/include/mlir/Dialect/StandardOps/Ops.td +++ b/third_party/mlir/include/mlir/Dialect/StandardOps/Ops.td @@ -130,6 +130,16 @@ class FloatArithmeticOp traits = []> : ArithmeticOp, Arguments<(ins FloatLike:$lhs, FloatLike:$rhs)>; +def AbsFOp : FloatUnaryOp<"absf"> { + let summary = "floating point absolute-value operation"; + let description = [{ + The `absf` operation computes the absolute value. It takes one operand and + returns one result of the same type. This type may be a float scalar type, + a vector whose element type is float, or a tensor of floats. It has no + standard attributes. + }]; +} + def AddFOp : FloatArithmeticOp<"addf"> { let summary = "floating point addition operation"; let hasFolder = 1; @@ -345,6 +355,63 @@ def CallIndirectOp : Std_Op<"call_indirect", [CallOpInterface]> { let hasCanonicalizer = 1; } +def CeilFOp : FloatUnaryOp<"ceilf"> { + let summary = "ceiling of the specified value"; + let description = [{ + The `ceilf` operation computes the ceiling of a given value. It takes one + operand and returns one result of the same type. This type may be a float + scalar type, a vector whose element type is float, or a tensor of floats. + It has no standard attributes. + }]; +} + +def CmpFOp : Std_Op<"cmpf", + [NoSideEffect, SameTypeOperands, SameOperandsAndResultShape]> { + let summary = "floating-point comparison operation"; + let description = [{ + The "cmpf" operation compares its two operands according to the float + comparison rules and the predicate specified by the respective attribute. + The predicate defines the type of comparison: (un)orderedness, (in)equality + and signed less/greater than (or equal to) as well as predicates that are + always true or false. The operands must have the same type, and this type + must be a float type, or a vector or tensor thereof. 
The result is an i1, + or a vector/tensor thereof having the same shape as the inputs. Unlike cmpi, + the operands are always treated as signed. The u prefix indicates + *unordered* comparison, not unsigned comparison, so "une" means unordered or + not equal. For the sake of readability by humans, custom assembly form for + the operation uses a string-typed attribute for the predicate. The value of + this attribute corresponds to lower-cased name of the predicate constant, + e.g., "one" means "ordered not equal". The string representation of the + attribute is merely a syntactic sugar and is converted to an integer + attribute by the parser. + + %r1 = cmpf "oeq" %0, %1 : f32 + %r2 = cmpf "ult" %0, %1 : tensor<42x42xf64> + %r3 = "std.cmpf"(%0, %1) {predicate: 0} : (f8, f8) -> i1 + }]; + + let arguments = (ins FloatLike:$lhs, FloatLike:$rhs); + let results = (outs BoolLike); + + let builders = [OpBuilder< + "Builder *builder, OperationState &result, CmpFPredicate predicate," + "Value *lhs, Value *rhs", [{ + ::buildCmpFOp(builder, result, predicate, lhs, rhs); + }]>]; + + let extraClassDeclaration = [{ + static StringRef getPredicateAttrName() { return "predicate"; } + static CmpFPredicate getPredicateByName(StringRef name); + + CmpFPredicate getPredicate() { + return (CmpFPredicate)getAttrOfType(getPredicateAttrName()) + .getInt(); + } + }]; + + let hasFolder = 1; +} + def CMPI_P_EQ : I64EnumAttrCase<"eq", 0>; def CMPI_P_NE : I64EnumAttrCase<"ne", 1>; def CMPI_P_SLT : I64EnumAttrCase<"slt", 2>; @@ -415,53 +482,6 @@ def CmpIOp : Std_Op<"cmpi", let hasFolder = 1; } -def CmpFOp : Std_Op<"cmpf", - [NoSideEffect, SameTypeOperands, SameOperandsAndResultShape]> { - let summary = "floating-point comparison operation"; - let description = [{ - The "cmpf" operation compares its two operands according to the float - comparison rules and the predicate specified by the respective attribute. - The predicate defines the type of comparison: (un)orderedness, (in)equality - and signed less/greater than (or equal to) as well as predicates that are - always true or false. The operands must have the same type, and this type - must be a float type, or a vector or tensor thereof. The result is an i1, - or a vector/tensor thereof having the same shape as the inputs. Unlike cmpi, - the operands are always treated as signed. The u prefix indicates - *unordered* comparison, not unsigned comparison, so "une" means unordered or - not equal. For the sake of readability by humans, custom assembly form for - the operation uses a string-typed attribute for the predicate. The value of - this attribute corresponds to lower-cased name of the predicate constant, - e.g., "one" means "ordered not equal". The string representation of the - attribute is merely a syntactic sugar and is converted to an integer - attribute by the parser. 
- - %r1 = cmpf "oeq" %0, %1 : f32 - %r2 = cmpf "ult" %0, %1 : tensor<42x42xf64> - %r3 = "std.cmpf"(%0, %1) {predicate: 0} : (f8, f8) -> i1 - }]; - - let arguments = (ins FloatLike:$lhs, FloatLike:$rhs); - let results = (outs BoolLike); - - let builders = [OpBuilder< - "Builder *builder, OperationState &result, CmpFPredicate predicate," - "Value *lhs, Value *rhs", [{ - ::buildCmpFOp(builder, result, predicate, lhs, rhs); - }]>]; - - let extraClassDeclaration = [{ - static StringRef getPredicateAttrName() { return "predicate"; } - static CmpFPredicate getPredicateByName(StringRef name); - - CmpFPredicate getPredicate() { - return (CmpFPredicate)getAttrOfType(getPredicateAttrName()) - .getInt(); - } - }]; - - let hasFolder = 1; -} - def CondBranchOp : Std_Op<"cond_br", [Terminator]> { let summary = "conditional branch operation"; let description = [{ @@ -602,6 +622,27 @@ def ConstantOp : Std_Op<"constant", let hasFolder = 1; } +def CopySignOp : FloatArithmeticOp<"copysign"> { + let summary = "A copysign operation"; + let description = [{ + The `copysign` returns a value with the magnitude of the first operand and + the sign of the second operand. It takes two operands and returns one + result of the same type. This type may be a float scalar type, a vector + whose element type is float, or a tensor of floats. It has no standard + attributes. + }]; +} + +def CosOp : FloatUnaryOp<"cos"> { + let summary = "cosine of the specified value"; + let description = [{ + The `cos` operation computes the cosine of a given value. It takes one + operand and returns one result of the same type. This type may be a float + scalar type, a vector whose element type is float, or a tensor of floats. + It has no standard attributes. + }]; +} + def DeallocOp : Std_Op<"dealloc"> { let summary = "memory deallocation operation"; let description = [{ @@ -724,24 +765,6 @@ def IndexCastOp : CastOp<"index_cast">, Arguments<(ins AnyType:$in)> { let hasFolder = 0; } -def SIToFPOp : CastOp<"sitofp">, Arguments<(ins AnyType:$in)> { - let summary = "cast from integer type to floating-point"; - let description = [{ - Cast from a value interpreted as signed integer to the corresponding - floating-point value. If the value cannot be exactly represented, it is - rounded using the default rounding mode. Only scalars are currently - supported. - }]; - - let extraClassDeclaration = [{ - /// Return true if `a` and `b` are valid operand and result pairs for - /// the operation. - static bool areCastCompatible(Type a, Type b); - }]; - - let hasFolder = 0; -} - def FPExtOp : CastOp<"fpext">, Arguments<(ins AnyType:$in)> { let summary = "cast from floating-point to wider floating-point"; let description = [{ @@ -866,6 +889,16 @@ def MulIOp : IntArithmeticOp<"muli", [Commutative]> { let hasFolder = 1; } +def NegFOp : FloatUnaryOp<"negf"> { + let summary = "floating point negation"; + let description = [{ + The `negf` operation computes the negation of a given value. It takes one + operand and returns one result of the same type. This type may be a float + scalar type, a vector whose element type is float, or a tensor of floats. + It has no standard attributes. 
+ }]; +} + def OrOp : IntArithmeticOp<"or", [Commutative]> { let summary = "integer binary or"; let hasFolder = 1; @@ -1000,6 +1033,24 @@ def ShlISOp : IntArithmeticOp<"shlis"> { let summary = "signed integer shift left"; } +def SIToFPOp : CastOp<"sitofp">, Arguments<(ins AnyType:$in)> { + let summary = "cast from integer type to floating-point"; + let description = [{ + Cast from a value interpreted as signed integer to the corresponding + floating-point value. If the value cannot be exactly represented, it is + rounded using the default rounding mode. Only scalars are currently + supported. + }]; + + let extraClassDeclaration = [{ + /// Return true if `a` and `b` are valid operand and result pairs for + /// the operation. + static bool areCastCompatible(Type a, Type b); + }]; + + let hasFolder = 0; +} + def SplatOp : Std_Op<"splat", [NoSideEffect]> { let summary = "splat or broadcast operation"; let description = [{ @@ -1026,16 +1077,6 @@ def SplatOp : Std_Op<"splat", [NoSideEffect]> { let hasFolder = 1; } -def SubFOp : FloatArithmeticOp<"subf"> { - let summary = "floating point subtraction operation"; - let hasFolder = 1; -} - -def SubIOp : IntArithmeticOp<"subi"> { - let summary = "integer subtraction operation"; - let hasFolder = 1; -} - def StoreOp : Std_Op<"store"> { let summary = "store operation"; let description = [{ @@ -1075,6 +1116,192 @@ def StoreOp : Std_Op<"store"> { let hasCanonicalizer = 1; } +def SubFOp : FloatArithmeticOp<"subf"> { + let summary = "floating point subtraction operation"; + let hasFolder = 1; +} + +def SubIOp : IntArithmeticOp<"subi"> { + let summary = "integer subtraction operation"; + let hasFolder = 1; +} + +def SubViewOp : Std_Op<"subview", [AttrSizedOperandSegments, NoSideEffect]> { + let summary = "memref subview operation"; + let description = [{ + The "subview" operation converts a memref type to another memref type + which represents a reduced-size view of the original memref as specified by + the operation's offsets, sizes and strides arguments. + + The SubView operation supports the following arguments: + *) Memref: the "base" memref on which to create a "view" memref. + *) Offsets: zero or memref-rank number of dynamic offsets into the "base" + memref at which to create the "view" memref. + *) Sizes: zero or memref-rank dynamic size operands which specify the + dynamic sizes of the result "view" memref type. + *) Strides: zero or memref-rank number of dynamic strides which are applied + multiplicatively to the base memref strides in each dimension. + + Note on the number of operands for offsets, sizes and strides: For + each of these, the number of operands must either be same as the + memref-rank number or empty. For the latter, those values will be + treated as constants. + + Example 1: + + %0 = alloc() : memref<64x4xf32, (d0, d1) -> (d0 * 4 + d1)> + + // Create a sub-view of "base" memref '%0' with offset arguments '%c0', + // dynamic sizes for each dimension, and stride arguments '%c1'. + %1 = subview %0[%c0, %c0][%size0, %size1][%c1, %c1] + : memref<64x4xf32, (d0, d1) -> (d0 * 4 + d1) > to + memref (d0 * s1 + d1 + s0)> + + Example 2: + + %0 = alloc() : memref<8x16x4xf32, (d0, d1, d1) -> (d0 * 64 + d1 * 4 + d2)> + + // Create a sub-view of "base" memref '%0' with dynamic offsets, sizes, + // and strides. 
+ // Note that dynamic offsets are represented by the linearized dynamic + // offset symbol 's0' in the subview memref layout map, and that the + // dynamic strides operands, after being applied to the base memref + // strides in each dimension, are represented in the view memref layout + // map as symbols 's1', 's2' and 's3'. + %1 = subview %0[%i, %j, %k][%size0, %size1, %size2][%x, %y, %z] + : memref<8x16x4xf32, (d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)> to + memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)> + + Example 3: + + %0 = alloc() : memref<8x16x4xf32, (d0, d1, d1) -> (d0 * 64 + d1 * 4 + d2)> + + // Subview with constant offsets, sizes and strides. + %1 = subview %0[][][] + : memref<8x16x4xf32, (d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)> to + memref<4x4x4xf32, (d0, d1, d2) -> (d0 * 16 + d1 * 4 + d2 + 8)> + + Example 4: + + %0 = alloc(%arg0, %arg1) : memref + + // Subview with constant size, but dynamic offsets and + // strides. The resulting memref has a static shape, but if the + // base memref has an affine map to describe the layout, the result + // memref also uses an affine map to describe the layout. The + // strides of the result memref is computed as follows: + // + // Let #map1 represents the layout of the base memref, and #map2 + // represents the layout of the result memref. A #mapsubview can be + // constructed to map an index from the result memref to the base + // memref (note that the description below uses more convenient + // naming for symbols, while in affine maps, symbols are + // represented as unsigned numbers that identify that symbol in the + // given affine map. + // + // #mapsubview = (d0, d1)[o0, o1, t0, t1] -> (d0 * t0 + o0, d1 * t1 + o1) + // + // where, o0, o1, ... are offsets, and t0, t1, ... are strides. Then, + // + // #map2 = #map1.compose(#mapsubview) + // + // If the layout map is represented as + // + // #map1 = (d0, d1)[s0, s1, s2] -> (d0 * s1 + d1 * s2 + s0) + // + // then, + // + // #map2 = (d0, d1)[s0, s1, s2, o0, o1, t0, t1] -> + // (d0 * s1 * t0 + d1 * s2 * t1 + o0 * s1 + o1 * s2 + s0) + // + // Representing this canonically + // + // #map2 = (d0, d1)[r0, r1, r2] -> (d0 * r1 + d1 * r2 + r0) + // + // where, r0 = o0 * s1 + o1 * s2 + s0, r1 = s1 * t0, r2 = s2 * t1. + %1 = subview %0[%i, %j][][%x, %y] : + : memref (d0 * s1 + d1 * s2 + s0)> to + memref<4x4xf32, (d0, d1)[r0, r1, r2] -> (d0 * r1 + d1 * r2 + r0)> + + // Note that the subview op does not gaurantee that the result + // memref is "inbounds" w.r.t to base memref. It is upto the client + // to ensure that the subview is accessed in a manner that is + // in-bounds. + + } + }]; + + // TODO(b/144779634, ravishankarm) : Use different arguments for + // offsets, sizes and strides. + let arguments = (ins + AnyMemRef:$source, + Variadic:$offsets, + Variadic:$sizes, + Variadic:$strides, + I32ElementsAttr:$operand_segment_sizes + ); + let results = (outs AnyMemRef); + + let builders = [ + OpBuilder< + "Builder *b, OperationState &result, Value *source, " + "ArrayRef offsets, ArrayRef sizes, " + "ArrayRef strides, Type resultType = Type(), " + "ArrayRef attrs = {}">, + OpBuilder< + "Builder *builder, OperationState &result, " + "Type resultType, Value *source"> + ]; + + let extraClassDeclaration = [{ + /// Returns the type of the base memref operand. + MemRefType getBaseMemRefType() { + return source()->getType().cast(); + } + + /// The result of a subview is always a memref. + MemRefType getType() { return getResult()->getType().cast(); } + + /// Returns as integer value the number of offset operands. 
+ int64_t getNumOffsets() { return llvm::size(offsets()); } + + /// Returns as integer value the number of size operands. + int64_t getNumSizes() { return llvm::size(sizes()); } + + /// Returns as integer value the number of stride operands. + int64_t getNumStrides() { return llvm::size(strides()); } + + /// Returns the dynamic sizes for this subview operation if specified. + operand_range getDynamicSizes() { return sizes(); } + + /// Returns in `staticStrides` the static value of the stride + /// operands. Returns failure() if the static value of the stride + /// operands could not be retrieved. + LogicalResult getStaticStrides(SmallVectorImpl &staticStrides); + + // Auxiliary range data structure and helper function that unpacks the + // offset, size and stride operands of the SubViewOp into a list of triples. + // Such a list of triple is sometimes more convenient to manipulate. + struct Range { + Value *offset, *size, *stride; + }; + SmallVector getRanges(); + }]; + + let hasCanonicalizer = 1; +} + +def TanhOp : FloatUnaryOp<"tanh"> { + let summary = "hyperbolic tangent of the specified value"; + let description = [{ + The `tanh` operation computes the hyperbolic tangent. It takes one operand + and returns one result of the same type. This type may be a float scalar + type, a vector whose element type is float, or a tensor of floats. It has + no standard attributes. + }]; +} + def TensorCastOp : CastOp<"tensor_cast"> { let summary = "tensor cast operation"; let description = [{ @@ -1248,172 +1475,6 @@ def ViewOp : Std_Op<"view", [NoSideEffect]> { let hasCanonicalizer = 1; } -def SubViewOp : Std_Op<"subview", [AttrSizedOperandSegments, NoSideEffect]> { - let summary = "memref subview operation"; - let description = [{ - The "subview" operation converts a memref type to another memref type - which represents a reduced-size view of the original memref as specified by - the operation's offsets, sizes and strides arguments. - - The SubView operation supports the following arguments: - *) Memref: the "base" memref on which to create a "view" memref. - *) Offsets: zero or memref-rank number of dynamic offsets into the "base" - memref at which to create the "view" memref. - *) Sizes: zero or memref-rank dynamic size operands which specify the - dynamic sizes of the result "view" memref type. - *) Strides: zero or memref-rank number of dynamic strides which are applied - multiplicatively to the base memref strides in each dimension. - - Note on the number of operands for offsets, sizes and strides: For - each of these, the number of operands must either be same as the - memref-rank number or empty. For the latter, those values will be - treated as constants. - - Example 1: - - %0 = alloc() : memref<64x4xf32, (d0, d1) -> (d0 * 4 + d1)> - - // Create a sub-view of "base" memref '%0' with offset arguments '%c0', - // dynamic sizes for each dimension, and stride arguments '%c1'. - %1 = subview %0[%c0, %c0][%size0, %size1][%c1, %c1] - : memref<64x4xf32, (d0, d1) -> (d0 * 4 + d1) > to - memref (d0 * s1 + d1 + s0)> - - Example 2: - - %0 = alloc() : memref<8x16x4xf32, (d0, d1, d1) -> (d0 * 64 + d1 * 4 + d2)> - - // Create a sub-view of "base" memref '%0' with dynamic offsets, sizes, - // and strides. 
- // Note that dynamic offsets are represented by the linearized dynamic - // offset symbol 's0' in the subview memref layout map, and that the - // dynamic strides operands, after being applied to the base memref - // strides in each dimension, are represented in the view memref layout - // map as symbols 's1', 's2' and 's3'. - %1 = subview %0[%i, %j, %k][%size0, %size1, %size2][%x, %y, %z] - : memref<8x16x4xf32, (d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)> to - memref (d0 * s1 + d1 * s2 + d2 * s3 + s0)> - - Example 3: - - %0 = alloc() : memref<8x16x4xf32, (d0, d1, d1) -> (d0 * 64 + d1 * 4 + d2)> - - // Subview with constant offsets, sizes and strides. - %1 = subview %0[][][] - : memref<8x16x4xf32, (d0, d1, d2) -> (d0 * 64 + d1 * 4 + d2)> to - memref<4x4x4xf32, (d0, d1, d2) -> (d0 * 16 + d1 * 4 + d2 + 8)> - - Example 4: - - %0 = alloc(%arg0, %arg1) : memref - - // Subview with constant size, but dynamic offsets and - // strides. The resulting memref has a static shape, but if the - // base memref has an affine map to describe the layout, the result - // memref also uses an affine map to describe the layout. The - // strides of the result memref is computed as follows: - // - // Let #map1 represents the layout of the base memref, and #map2 - // represents the layout of the result memref. A #mapsubview can be - // constructed to map an index from the result memref to the base - // memref (note that the description below uses more convenient - // naming for symbols, while in affine maps, symbols are - // represented as unsigned numbers that identify that symbol in the - // given affine map. - // - // #mapsubview = (d0, d1)[o0, o1, t0, t1] -> (d0 * t0 + o0, d1 * t1 + o1) - // - // where, o0, o1, ... are offsets, and t0, t1, ... are strides. Then, - // - // #map2 = #map1.compose(#mapsubview) - // - // If the layout map is represented as - // - // #map1 = (d0, d1)[s0, s1, s2] -> (d0 * s1 + d1 * s2 + s0) - // - // then, - // - // #map2 = (d0, d1)[s0, s1, s2, o0, o1, t0, t1] -> - // (d0 * s1 * t0 + d1 * s2 * t1 + o0 * s1 + o1 * s2 + s0) - // - // Representing this canonically - // - // #map2 = (d0, d1)[r0, r1, r2] -> (d0 * r1 + d1 * r2 + r0) - // - // where, r0 = o0 * s1 + o1 * s2 + s0, r1 = s1 * t0, r2 = s2 * t1. - %1 = subview %0[%i, %j][][%x, %y] : - : memref (d0 * s1 + d1 * s2 + s0)> to - memref<4x4xf32, (d0, d1)[r0, r1, r2] -> (d0 * r1 + d1 * r2 + r0)> - - // Note that the subview op does not gaurantee that the result - // memref is "inbounds" w.r.t to base memref. It is upto the client - // to ensure that the subview is accessed in a manner that is - // in-bounds. - - } - }]; - - // TODO(b/144779634, ravishankarm) : Use different arguments for - // offsets, sizes and strides. - let arguments = (ins - AnyMemRef:$source, - Variadic:$offsets, - Variadic:$sizes, - Variadic:$strides, - I32ElementsAttr:$operand_segment_sizes - ); - let results = (outs AnyMemRef); - - let builders = [ - OpBuilder< - "Builder *b, OperationState &result, Value *source, " - "ArrayRef offsets, ArrayRef sizes, " - "ArrayRef strides, Type resultType = Type(), " - "ArrayRef attrs = {}">, - OpBuilder< - "Builder *builder, OperationState &result, " - "Type resultType, Value *source"> - ]; - - let extraClassDeclaration = [{ - /// Returns the type of the base memref operand. - MemRefType getBaseMemRefType() { - return source()->getType().cast(); - } - - /// The result of a subview is always a memref. - MemRefType getType() { return getResult()->getType().cast(); } - - /// Returns as integer value the number of offset operands. 
- int64_t getNumOffsets() { return llvm::size(offsets()); } - - /// Returns as integer value the number of size operands. - int64_t getNumSizes() { return llvm::size(sizes()); } - - /// Returns as integer value the number of stride operands. - int64_t getNumStrides() { return llvm::size(strides()); } - - /// Returns the dynamic sizes for this subview operation if specified. - operand_range getDynamicSizes() { return sizes(); } - - /// Returns in `staticStrides` the static value of the stride - /// operands. Returns failure() if the static value of the stride - /// operands could not be retrieved. - LogicalResult getStaticStrides(SmallVectorImpl &staticStrides); - - // Auxiliary range data structure and helper function that unpacks the - // offset, size and stride operands of the SubViewOp into a list of triples. - // Such a list of triple is sometimes more convenient to manipulate. - struct Range { - Value *offset, *size, *stride; - }; - SmallVector getRanges(); - }]; - - let hasCanonicalizer = 1; -} - def XOrOp : IntArithmeticOp<"xor", [Commutative]> { let summary = "integer binary xor"; let hasFolder = 1; From 45dc6b7be1719e80c7126a27c111fce8df25e5d6 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 07:41:55 -0800 Subject: [PATCH 051/383] Loop coalescing: fix pointer chainsing in use-chain traversal In the replaceAllUsesExcept utility function called from loop coalescing the iteration over the use-chain is incorrect. The use list nodes (IROperands) have next/prev links, and bluntly resetting the use would make the loop to continue on uses of the value that was replaced instead of the original one. As a result, it could miss the existing uses and update the wrong ones. Make sure we increment the iterator before updating the use in the loop body. Reported-by: Uday Bondhugula Closes #291. 
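As a purely illustrative sketch of the failure mode described above (a hypothetical `Use` struct with an intrusive `next` link, not MLIR's actual IROperand/use-list classes): remembering the next node before mutating the current one is the manual analogue of the `llvm::make_early_inc_range` call used in the fix below.

```c++
#include <cstdio>

// Hypothetical stand-in for a use-list node with an intrusive `next` link.
struct Use {
  int owner;
  Use *next = nullptr;
};

// Re-links every node of `from` onto `to`. Capturing `next` *before* the
// re-link is the early-increment idea; without it, the walk would continue
// down the replacement's chain and skip the remaining original nodes.
void relinkAll(Use *&from, Use *&to) {
  for (Use *u = from; u != nullptr;) {
    Use *next = u->next;  // early increment: remember the next node first
    u->next = to;         // mutate the current node's links
    to = u;
    u = next;             // continue on the *original* chain
  }
  from = nullptr;
}

int main() {
  Use c{3}, b{2}, a{1};
  a.next = &b;
  b.next = &c;
  Use *orig = &a, *replacement = nullptr;
  relinkAll(orig, replacement);
  for (Use *u = replacement; u; u = u->next)
    std::printf("%d ", u->owner);  // prints: 3 2 1
  return 0;
}
```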
PiperOrigin-RevId: 283754195 Change-Id: Ia9478e50544b2cd30b42e18262e3038009cf05b3 --- third_party/mlir/lib/Transforms/Utils/LoopUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/mlir/lib/Transforms/Utils/LoopUtils.cpp b/third_party/mlir/lib/Transforms/Utils/LoopUtils.cpp index 0ee1220b720..7b38809ce49 100644 --- a/third_party/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/third_party/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -979,7 +979,7 @@ TileLoops mlir::extractFixedOuterLoops(loop::ForOp rootForOp, static void replaceAllUsesExcept(Value *orig, Value *replacement, const SmallPtrSetImpl &exceptions) { - for (auto &use : orig->getUses()) { + for (auto &use : llvm::make_early_inc_range(orig->getUses())) { if (exceptions.count(use.getOwner()) == 0) use.set(replacement); } From 6f7dd8b22d5746a8f86707464eaf9699fa336bbc Mon Sep 17 00:00:00 2001 From: Yuan Tang Date: Wed, 4 Dec 2019 12:11:58 -0500 Subject: [PATCH 052/383] Remove code owners for tf.contrib modules --- CODEOWNERS | 51 --------------------------------------------------- 1 file changed, 51 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index 271e3b5b2ff..3ef02ffd68c 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -13,55 +13,4 @@ /tensorflow/tensorboard/ @jart /tensorflow/tools/docs/ @markdaoust -# contrib - -# NEED OWNER: /tensorflow/contrib/all_reduce -/tensorflow/contrib/autograph/ @mdanatg @kkimdev -/tensorflow/contrib/batching/ @alextp @chrisolston -/tensorflow/contrib/bayesflow/ @ebrevdo @rsepassi @jvdillon -/tensorflow/contrib/boosted_trees/ @sshrdp @yk5 @nataliaponomareva -/tensorflow/contrib/checkpoint/ @allenlavoie -/tensorflow/contrib/contrib/cluster_resolver/ @frankchn -/tensorflow/contrib/cmake/ @mrry -/tensorflow/contrib/copy_graph/ @tucker @poxvoculi -/tensorflow/contrib/crf/ @kentonl -/tensorflow/contrib/data/ @mrry -/tensorflow/tensorflow/contrib/distribute @joshl @priyag @sourabhbajaj @frankchn -/tensorflow/contrib/distributions/ @jvdillon @langmore @rsepassi -/tensorflow/contrib/eager @jaingaurav @alextp -/tensorflow/contrib/factorization/ @agarwal-ashish @xavigonzalvo -/tensorflow/contrib/ffmpeg/ @fredbertsch -/tensorflow/contrib/framework/ @ebrevdo -/tensorflow/contrib/graph_editor/ @purpledog -# NEED OWNER: /tensorflow/contrib/grid_rnn/ -/tensorflow/contrib/hadoop @yongtang -/tensorflow/contrib/hvx/ @satok16 -/tensorflow/contrib/integrate/ @shoyer -/tensorflow/contrib/kernel_methods/ @petrosmol -/tensorflow/contrib/ios_examples/ @petewarden -/tensorflow/contrib/labeled_tensor/ @shoyer -/tensorflow/contrib/layers/ @fchollet @martinwicke -/tensorflow/contrib/learn/ @martinwicke @ispirmustafa @alextp -/tensorflow/contrib/linear_optimizer/ @petrosmol @andreasst @katsiapis -/tensorflow/contrib/lookup/ @ysuematsu @andreasst -/tensorflow/contrib/losses/ @alextp @ispirmustafa -/tensorflow/contrib/makefile/ @petewarden @satok16 @wolffg -/tensorflow/contrib/metrics/ @alextp @honkentuber @ispirmustafa -/tensorflow/contrib/opt/ @strategist333 @alextp -/tensorflow/contrib/pi_examples/ @maciekcc -/tensorflow/contrib/quantization/ @petewarden -/tensorflow/contrib/rnn/ @ebrevdo @scottzhu -/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenlavoie -/tensorflow/contrib/seq2seq/ @ebrevdo @lmthang -/tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh -/tensorflow/contrib/slim/ @sguada @thenbasilmanran -/tensorflow/contrib/stateless/ @girving @alextp -/tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank -/tensorflow/contrib/tensorrt/ @aaroey @smit-hinsu 
@azaks2 -# NEED OWNER: /tensorflow/contrib/testing/ -/tensorflow/contrib/timeseries/ @allenlavoie -/tensorflow/contrib/tpu/ @frankchn @saeta @jhseu @sourabhbajaj -/tensorflow/contrib/training/ @joel-shor @ebrevdo -/tensorflow/contrib/util/ @sherrym - /third_party/systemlibs/ @perfinion From 256811da7e843bc3b5c6a9a36f2f49ea91bae83f Mon Sep 17 00:00:00 2001 From: Scott Todd Date: Wed, 4 Dec 2019 09:15:49 -0800 Subject: [PATCH 053/383] [spirv] Adding sqrt op in the GLSL extension. PiperOrigin-RevId: 283769736 Change-Id: Idf156efd2c223488ba3a4e63ad04c393cde443bc --- .../mlir/Dialect/SPIRV/SPIRVGLSLOps.td | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVGLSLOps.td b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVGLSLOps.td index 217b6d92865..2a1e8f32807 100644 --- a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVGLSLOps.td +++ b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVGLSLOps.td @@ -516,6 +516,36 @@ def SPV_GLSLSSignOp : SPV_GLSLUnaryArithmeticOp<"SSign", 7, SPV_Integer> { // ----- +def SPV_GLSLSqrtOp : SPV_GLSLUnaryArithmeticOp<"Sqrt", 31, SPV_Float> { + let summary = "Returns the square root of the operand"; + + let description = [{ + Result is the square root of x. Result is undefined if x < 0. + + The operand x must be a scalar or vector whose component type is + floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. + + ### Custom assembly format + ``` {.ebnf} + float-scalar-vector-type ::= float-type | + `vector<` integer-literal `x` float-type `>` + sqrt-op ::= ssa-id `=` `spv.GLSL.Sqrt` ssa-use `:` + float-scalar-vector-type + ``` + For example: + + ``` + %2 = spv.GLSL.Sqrt %0 : f32 + %3 = spv.GLSL.Sqrt %1 : vector<3xf16> + ``` + }]; +} + +// ----- + def SPV_GLSLTanhOp : SPV_GLSLUnaryArithmeticOp<"Tanh", 21, SPV_Float16or32> { let summary = "Hyperbolic tangent of operand in radians"; From 4554a1b426401a7d2dbefd48767ead408bb41426 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Wed, 4 Dec 2019 09:29:51 -0800 Subject: [PATCH 054/383] NFC - fix name / comments - isAccessInvariant - the name was misleading; this is really checking if a Value being used to index was loop IV invariant. Update comment. - the method is only used locally; what can be exposed in the future is isAccessInvariant(LoadOrStoreOp op, Value *iv) Signed-off-by: Uday Bondhugula Closes #285 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/285 from bondhugula:quickfix fe5837abe987980c4ab469a9aa7de8e4f0007d9f PiperOrigin-RevId: 283771923 Change-Id: Ic0ac78b08ead73bc56cad843d9c535524a57a921 --- .../mlir/include/mlir/Analysis/LoopAnalysis.h | 17 --------------- .../mlir/lib/Analysis/LoopAnalysis.cpp | 21 ++++++++++++++++--- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/third_party/mlir/include/mlir/Analysis/LoopAnalysis.h b/third_party/mlir/include/mlir/Analysis/LoopAnalysis.h index 8832c1469bc..140d9e91719 100644 --- a/third_party/mlir/include/mlir/Analysis/LoopAnalysis.h +++ b/third_party/mlir/include/mlir/Analysis/LoopAnalysis.h @@ -57,23 +57,6 @@ llvm::Optional getConstantTripCount(AffineForOp forOp); /// this method is thus able to determine non-trivial divisors. uint64_t getLargestDivisorOfTripCount(AffineForOp forOp); -/// Given an induction variable `iv` of type AffineForOp and an `index` of type -/// IndexType, returns `true` if `index` is independent of `iv` and false -/// otherwise. 
-/// The determination supports composition with at most one AffineApplyOp. -/// The at most one AffineApplyOp comes from the fact that composition of -/// AffineApplyOp need to be canonicalized by construction to avoid writing code -/// that composes arbitrary numbers of AffineApplyOps everywhere. To achieve -/// this, at the very least, the compose-affine-apply pass must have been run. -/// -/// Prerequisites: -/// 1. `iv` and `index` of the proper type; -/// 2. at most one reachable AffineApplyOp from index; -/// -/// Returns false in cases with more than one AffineApplyOp, this is -/// conservative. -bool isAccessInvariant(Value *iv, Value *index); - /// Given an induction variable `iv` of type AffineForOp and `indices` of type /// IndexType, returns the set of `indices` that are independent of `iv`. /// diff --git a/third_party/mlir/lib/Analysis/LoopAnalysis.cpp b/third_party/mlir/lib/Analysis/LoopAnalysis.cpp index b297a63cb62..1d88d09d269 100644 --- a/third_party/mlir/lib/Analysis/LoopAnalysis.cpp +++ b/third_party/mlir/lib/Analysis/LoopAnalysis.cpp @@ -158,7 +158,22 @@ uint64_t mlir::getLargestDivisorOfTripCount(AffineForOp forOp) { return gcd.getValue(); } -bool mlir::isAccessInvariant(Value *iv, Value *index) { +/// Given an induction variable `iv` of type AffineForOp and an access `index` +/// of type index, returns `true` if `index` is independent of `iv` and +/// false otherwise. The determination supports composition with at most one +/// AffineApplyOp. The 'at most one AffineApplyOp' comes from the fact that +/// the composition of AffineApplyOp needs to be canonicalized by construction +/// to avoid writing code that composes arbitrary numbers of AffineApplyOps +/// everywhere. To achieve this, at the very least, the compose-affine-apply +/// pass must have been run. +/// +/// Prerequisites: +/// 1. `iv` and `index` of the proper type; +/// 2. at most one reachable AffineApplyOp from index; +/// +/// Returns false in cases with more than one AffineApplyOp, this is +/// conservative. +static bool isAccessIndexInvariant(Value *iv, Value *index) { assert(isForInductionVar(iv) && "iv must be a AffineForOp"); assert(index->getType().isa() && "index must be of IndexType"); SmallVector affineApplyOps; @@ -187,7 +202,7 @@ mlir::getInvariantAccesses(Value *iv, llvm::ArrayRef indices) { llvm::DenseSet res; for (unsigned idx = 0, n = indices.size(); idx < n; ++idx) { auto *val = indices[idx]; - if (isAccessInvariant(iv, val)) { + if (isAccessIndexInvariant(iv, val)) { res.insert(val); } } @@ -249,7 +264,7 @@ static bool isContiguousAccess(Value *iv, LoadOrStoreOp memoryOp, }); // Check access invariance of each operand in 'exprOperands'. for (auto *exprOperand : exprOperands) { - if (!isAccessInvariant(iv, exprOperand)) { + if (!isAccessIndexInvariant(iv, exprOperand)) { if (uniqueVaryingIndexAlongIv != -1) { // 2+ varying indices -> do not vectorize along iv. return false; From 4f26a9d9c6588df692434dc1e8eafa48c0e436bb Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 4 Dec 2019 09:42:14 -0800 Subject: [PATCH 055/383] Additional build visibility for internal use case PiperOrigin-RevId: 283774088 Change-Id: I3bc4a0b3512db2f477f4071b1055d1cc537c75d9 --- tensorflow/core/BUILD | 3 ++- tensorflow/core/platform/build_config.bzl | 2 ++ tensorflow/core/platform/default/build_config.bzl | 3 +++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 588420eb1b6..79d4affa68f 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -105,6 +105,7 @@ load( "//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_additional_core_deps", + "tf_additional_env_hdrs", "tf_additional_lib_deps", "tf_additional_monitoring_hdrs", "tf_additional_test_deps", @@ -398,7 +399,7 @@ filegroup( "//tensorflow/core/platform:file_statistics.h", "//tensorflow/core/platform:file_system.h", "//tensorflow/core/platform:path.h", - ], + ] + tf_additional_env_hdrs(), visibility = ["//visibility:private"], ) diff --git a/tensorflow/core/platform/build_config.bzl b/tensorflow/core/platform/build_config.bzl index 4bea0f946cc..ab8a9b0c93f 100644 --- a/tensorflow/core/platform/build_config.bzl +++ b/tensorflow/core/platform/build_config.bzl @@ -9,6 +9,7 @@ load( _tf_additional_cupti_test_flags = "tf_additional_cupti_test_flags", _tf_additional_cupti_utils_cuda_deps = "tf_additional_cupti_utils_cuda_deps", _tf_additional_device_tracer_srcs = "tf_additional_device_tracer_srcs", + _tf_additional_env_hdrs = "tf_additional_env_hdrs", _tf_additional_lib_deps = "tf_additional_lib_deps", _tf_additional_lib_hdrs = "tf_additional_lib_hdrs", _tf_additional_lib_srcs = "tf_additional_lib_srcs", @@ -44,6 +45,7 @@ tf_additional_core_deps = _tf_additional_core_deps tf_additional_cupti_test_flags = _tf_additional_cupti_test_flags tf_additional_cupti_utils_cuda_deps = _tf_additional_cupti_utils_cuda_deps tf_additional_device_tracer_srcs = _tf_additional_device_tracer_srcs +tf_additional_env_hdrs = _tf_additional_env_hdrs tf_additional_lib_deps = _tf_additional_lib_deps tf_additional_lib_hdrs = _tf_additional_lib_hdrs tf_additional_lib_srcs = _tf_additional_lib_srcs diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl index a95de6632ce..fd6cdbd8bc1 100644 --- a/tensorflow/core/platform/default/build_config.bzl +++ b/tensorflow/core/platform/default/build_config.bzl @@ -600,6 +600,9 @@ def tf_additional_monitoring_srcs(): "default/monitoring.cc", ] +def tf_additional_env_hdrs(): + return [] + def tf_additional_proto_hdrs(): return [ "default/integral_types.h", From de4e14925ee9192c1e3dd566bd139b1ec060cec0 Mon Sep 17 00:00:00 2001 From: David Majnemer Date: Wed, 4 Dec 2019 09:45:12 -0800 Subject: [PATCH 056/383] [XLA] Slightly change some test inputs/tolerances PiperOrigin-RevId: 283774638 Change-Id: I9b5bf178dafe02e18d630e0cb2b867448dade7fa --- tensorflow/compiler/tests/binary_ops_test.py | 8 +++++++- tensorflow/compiler/tests/tensor_array_ops_test.py | 9 +++++---- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py index bfd0ce3d072..4d85ca67777 100644 --- a/tensorflow/compiler/tests/binary_ops_test.py +++ b/tensorflow/compiler/tests/binary_ops_test.py @@ -653,7 +653,13 @@ class BinaryOpsTest(xla_test.XLATestCase): divs = np.arange(-3, 3, .25, dtype=dtype).reshape(1, 24) np_result = np.true_divide(nums, divs) np_result[:, divs[0] == 0] = 
0 - self._testBinary(gen_math_ops.div_no_nan, nums, divs, expected=np_result) + self._testBinary( + gen_math_ops.div_no_nan, + nums, + divs, + expected=np_result, + rtol=7e-15 if dtype == np.float64 else None, + atol=3.9e-15 if dtype == np.float64 else None) if dtype not in self.complex_types: # floordiv unsupported for complex. self._testBinary( diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py index 99847e84c28..1bc88509542 100644 --- a/tensorflow/compiler/tests/tensor_array_ops_test.py +++ b/tensorflow/compiler/tests/tensor_array_ops_test.py @@ -164,7 +164,8 @@ class TensorArrayTest(xla_test.XLATestCase): dtype=tf_dtype, tensor_array_name="foo", size=3) # Unpack a matrix into vectors. - w1 = ta.unstack(convert([[1.0, 1.1], [2.0, 2.1], [3.0, 3.1]])) + w1 = ta.unstack( + convert([[1.0, 1.03125], [2.0, 2.03125], [3.0, 3.03125]])) r0 = w1.read(0) r1 = w1.read(1) r2 = w1.read(2) @@ -172,9 +173,9 @@ class TensorArrayTest(xla_test.XLATestCase): d0, d1, d2 = self.evaluate(xla.compile(fn)) - self.assertAllEqual(convert([1.0, 1.1]), d0) - self.assertAllEqual(convert([2.0, 2.1]), d1) - self.assertAllEqual(convert([3.0, 3.1]), d2) + self.assertAllEqual(convert([1.0, 1.03125]), d0) + self.assertAllEqual(convert([2.0, 2.03125]), d1) + self.assertAllEqual(convert([3.0, 3.03125]), d2) def fn(): # Reset ta because we're going to change the shape, else shape From 9eea1b6de56392fe57950810fc734c1abd2a7a0d Mon Sep 17 00:00:00 2001 From: Jared Duke Date: Wed, 4 Dec 2019 10:17:57 -0800 Subject: [PATCH 057/383] Use int32_t instead of int32 in resource code PiperOrigin-RevId: 283781525 Change-Id: I2f3d51fc934ea98a3ef1c3504cb4b4b29a157d5a --- .../lite/experimental/kernels/hashtable.cc | 3 ++- .../kernels/hashtable_ops_test.cc | 25 +++++++++++-------- .../experimental/resource/resource_base.h | 6 ++--- .../experimental/resource/static_hashtable.cc | 5 ++-- 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/tensorflow/lite/experimental/kernels/hashtable.cc b/tensorflow/lite/experimental/kernels/hashtable.cc index 9fea3566f04..dd0e75d4f54 100644 --- a/tensorflow/lite/experimental/kernels/hashtable.cc +++ b/tensorflow/lite/experimental/kernels/hashtable.cc @@ -86,7 +86,8 @@ TfLiteStatus EvalHashtable(TfLiteContext* context, TfLiteNode* node) { TfLiteTensor* resource_handle_tensor = GetOutput(context, node, kResourceHandleTensor); - auto* resource_handle_data = GetTensorData(resource_handle_tensor); + auto* resource_handle_data = + GetTensorData(resource_handle_tensor); resource_handle_data[0] = resource_id; Subgraph* subgraph = reinterpret_cast(context->impl_); diff --git a/tensorflow/lite/experimental/kernels/hashtable_ops_test.cc b/tensorflow/lite/experimental/kernels/hashtable_ops_test.cc index 8790a2c9960..4c8ca6c476b 100644 --- a/tensorflow/lite/experimental/kernels/hashtable_ops_test.cc +++ b/tensorflow/lite/experimental/kernels/hashtable_ops_test.cc @@ -808,7 +808,8 @@ class HashtableLookupOpModel : public BaseHashtableOpModel { TEST(HashtableOpsTest, TestHashtableLookupIntToInt) { const int kResourceId = 42; - HashtableLookupOpModel m(TensorType_INT32, TensorType_INT32, 3); + HashtableLookupOpModel m(TensorType_INT32, + TensorType_INT32, 3); m.SetResourceId({kResourceId}); m.SetLookup({5, 6, 7}); @@ -818,14 +819,14 @@ TEST(HashtableOpsTest, TestHashtableLookupIntToInt) { kTfLiteInt32, {4, 5, 6}, {1, 2, 3}); m.Invoke(); - EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 4})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({2, 3, 
4})); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3})); } TEST(HashtableOpsTest, TestHashtableLookupIntToFloat) { const int kResourceId = 42; - HashtableLookupOpModel m(TensorType_INT32, TensorType_FLOAT32, - 3); + HashtableLookupOpModel m(TensorType_INT32, + TensorType_FLOAT32, 3); m.SetResourceId({kResourceId}); m.SetLookup({5, 6, 7}); @@ -869,8 +870,8 @@ class HashtableImportOpModel : public BaseHashtableOpModel { TEST(HashtableOpsTest, TestHashtableImport) { const int kResourceId = 42; - HashtableImportOpModel m(TensorType_INT32, TensorType_FLOAT32, - 3); + HashtableImportOpModel m(TensorType_INT32, + TensorType_FLOAT32, 3); EXPECT_EQ(m.GetResources().size(), 0); m.SetResourceId({kResourceId}); m.SetKeys({1, 2, 3}); @@ -890,8 +891,8 @@ TEST(HashtableOpsTest, TestHashtableImport) { TEST(HashtableOpsTest, TestHashtableImportTwice) { const int kResourceId = 42; - HashtableImportOpModel m(TensorType_INT32, TensorType_FLOAT32, - 3); + HashtableImportOpModel m(TensorType_INT32, + TensorType_FLOAT32, 3); EXPECT_EQ(m.GetResources().size(), 0); m.SetResourceId({kResourceId}); m.SetKeys({1, 2, 3}); @@ -929,7 +930,8 @@ class HashtableSizeOpModel : public BaseHashtableOpModel { TEST(HashtableOpsTest, TestHashtableSize) { const int kResourceId = 42; - HashtableSizeOpModel m(TensorType_INT32, TensorType_INT32); + HashtableSizeOpModel m(TensorType_INT32, + TensorType_INT32); m.SetResourceId({kResourceId}); @@ -937,13 +939,14 @@ TEST(HashtableOpsTest, TestHashtableSize) { kTfLiteInt32, {4, 5, 6}, {1, 2, 3}); m.Invoke(); - EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); + EXPECT_THAT(m.GetOutput(), ElementsAreArray({3})); EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1})); } TEST(HashtableOpsTest, TestHashtableSizeNonInitialized) { const int kResourceId = 42; - HashtableSizeOpModel m(TensorType_INT32, TensorType_INT32); + HashtableSizeOpModel m(TensorType_INT32, + TensorType_INT32); m.SetResourceId({kResourceId}); // Invoke without hash table initialization. diff --git a/tensorflow/lite/experimental/resource/resource_base.h b/tensorflow/lite/experimental/resource/resource_base.h index 48a00b93957..ff69c3ab356 100644 --- a/tensorflow/lite/experimental/resource/resource_base.h +++ b/tensorflow/lite/experimental/resource/resource_base.h @@ -15,11 +15,10 @@ limitations under the License. #ifndef TENSORFLOW_LITE_EXPERIMENTAL_RESOURCE_RESOURCE_BASE_H_ #define TENSORFLOW_LITE_EXPERIMENTAL_RESOURCE_RESOURCE_BASE_H_ +#include #include #include -#include "tensorflow/lite/kernels/internal/compatibility.h" - namespace tflite { namespace resource { @@ -35,7 +34,8 @@ class ResourceBase { }; /// WARNING: Experimental interface, subject to change. 
-using ResourceMap = std::unordered_map>; +using ResourceMap = + std::unordered_map>; } // namespace resource } // namespace tflite diff --git a/tensorflow/lite/experimental/resource/static_hashtable.cc b/tensorflow/lite/experimental/resource/static_hashtable.cc index 47e3b762607..f90ae146959 100644 --- a/tensorflow/lite/experimental/resource/static_hashtable.cc +++ b/tensorflow/lite/experimental/resource/static_hashtable.cc @@ -85,7 +85,7 @@ LookupInterface* CreateStaticHashtableWithGivenKey(TfLiteType key_type, TfLiteType value_type) { switch (value_type) { case kTfLiteInt32: - return new StaticHashtable(key_type, value_type); + return new StaticHashtable(key_type, value_type); case kTfLiteString: return new StaticHashtable(key_type, value_type); case kTfLiteFloat32: @@ -99,7 +99,8 @@ LookupInterface* CreateStaticHashtable(TfLiteType key_type, TfLiteType value_type) { switch (key_type) { case kTfLiteInt32: - return CreateStaticHashtableWithGivenKey(key_type, value_type); + return CreateStaticHashtableWithGivenKey(key_type, + value_type); case kTfLiteString: return CreateStaticHashtableWithGivenKey(key_type, value_type); From d2e9dc16d1418b81f9d0d59eac7886fe65798ebd Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Wed, 4 Dec 2019 10:19:20 -0800 Subject: [PATCH 058/383] Print out large elementsattr's such that they are parseable. I found that when running crash reproducers, the elided elementsattr's would prevent parsing the IR repro. I found myself manually going and replacing the "..." with some valid IR. With this change, we now print elided attrs as `opaque<"", "0xDEADBEEF">` to clearly delineate them as being elided while still being parseable. PiperOrigin-RevId: 283781806 Change-Id: I44aef05323e5577f64078a084a7271b3b2c2caa1 --- third_party/mlir/lib/IR/AsmPrinter.cpp | 42 +++++++++++++++++--------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/third_party/mlir/lib/IR/AsmPrinter.cpp b/third_party/mlir/lib/IR/AsmPrinter.cpp index 655a776118c..1d3f9d74403 100644 --- a/third_party/mlir/lib/IR/AsmPrinter.cpp +++ b/third_party/mlir/lib/IR/AsmPrinter.cpp @@ -733,6 +733,19 @@ static void printSymbolReference(StringRef symbolRef, raw_ostream &os) { os << '"'; } +// Print out a valid ElementsAttr that is succinct and can represent any +// potential shape/type, for use when eliding a large ElementsAttr. +// +// We choose to use an opaque ElementsAttr literal with conspicuous content to +// hopefully alert readers to the fact that this has been elided. +// +// Unfortunately, neither of the strings of an opaque ElementsAttr literal will +// accept the string "elided". The first string must be a registered dialect +// name and the latter must be a hex constant. +static void printElidedElementsAttr(raw_ostream &os) { + os << R"(opaque<"", "0xDEADBEEF">)"; +} + void ModulePrinter::printAttribute(Attribute attr, bool mayElideType) { if (!attr) { os << "<>"; @@ -836,19 +849,20 @@ void ModulePrinter::printAttribute(Attribute attr, bool mayElideType) { } case StandardAttributes::OpaqueElements: { auto eltsAttr = attr.cast(); + if (printerFlags.shouldElideElementsAttr(eltsAttr)) { + printElidedElementsAttr(os); + break; + } os << "opaque<\"" << eltsAttr.getDialect()->getNamespace() << "\", "; - os << '"' << "0x"; - - // Check for large ElementsAttr elision. 
- if (printerFlags.shouldElideElementsAttr(eltsAttr)) - os << "..."; - else - os << llvm::toHex(eltsAttr.getValue()); - os << "\">"; + os << '"' << "0x" << llvm::toHex(eltsAttr.getValue()) << "\">"; break; } case StandardAttributes::DenseElements: { auto eltsAttr = attr.cast(); + if (printerFlags.shouldElideElementsAttr(eltsAttr)) { + printElidedElementsAttr(os); + break; + } os << "dense<"; printDenseElementsAttr(eltsAttr); os << '>'; @@ -856,6 +870,11 @@ void ModulePrinter::printAttribute(Attribute attr, bool mayElideType) { } case StandardAttributes::SparseElements: { auto elementsAttr = attr.cast(); + if (printerFlags.shouldElideElementsAttr(elementsAttr.getIndices()) || + printerFlags.shouldElideElementsAttr(elementsAttr.getValues())) { + printElidedElementsAttr(os); + break; + } os << "sparse<"; printDenseElementsAttr(elementsAttr.getIndices()); os << ", "; @@ -916,13 +935,6 @@ void ModulePrinter::printDenseElementsAttr(DenseElementsAttr attr) { return; } - // Check for large elements attr elision. We explicitly check *after* splat, - // as the splat printing is already elided. - if (printerFlags.shouldElideElementsAttr(attr)) { - os << "..."; - return; - } - // Special case for degenerate tensors. auto numElements = type.getNumElements(); if (numElements == 0) { From e3a7bdbebb99352351a19e2e403136166aa52934 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 10:58:00 -0800 Subject: [PATCH 059/383] Pull eigen repo from github mirror instead of bitbucket.org. PiperOrigin-RevId: 283790370 Change-Id: Ifd84ad58b1bc350a17cdf9cf24677365febc6be5 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index a79444c221c..4c73d2e2c94 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -172,11 +172,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "eigen_archive", build_file = clean_dep("//third_party:eigen.BUILD"), patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"), - sha256 = "9edd4860b52813eaf8c023f0de1767ec58e2d67a290b718e6702469208ac5be1", - strip_prefix = "eigen-eigen-54bca9936424", + sha256 = "add24720f99ab4f3222f4c8a887f2609554cf9187d4f7d24a777a151a0ee2548", + strip_prefix = "eigen-git-mirror-4898dcdb06f1b1b0441b8e15119764793f8997e2", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/bitbucket.org/eigen/eigen/get/54bca9936424.tar.gz", - "https://bitbucket.org/eigen/eigen/get/54bca9936424.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/eigenteam/eigen-git-mirror/archive/4898dcdb06f1b1b0441b8e15119764793f8997e2.tar.gz", + "https://github.com/eigenteam/eigen-git-mirror/archive/4898dcdb06f1b1b0441b8e15119764793f8997e2.tar.gz", ], ) From 71421f37a825e6a789ae7cb35f409e1aabbf396f Mon Sep 17 00:00:00 2001 From: Robert David Date: Wed, 4 Dec 2019 11:20:57 -0800 Subject: [PATCH 060/383] Change VectorBatchVectorCwiseProduct(Accumulate) to call VectorVectorCwiseProduct(Accumulate). Also prevent Clang from vectorizing postamble loops after the manually vectorized loops. This reduces code size (both C++ and binary) while having no impact on performance. 
PiperOrigin-RevId: 283795648 Change-Id: I1427778e42290c9b88e0a70fca12d2135907953a --- .../internal/optimized/neon_tensor_utils.cc | 80 +++---------------- .../internal/optimized/neon_tensor_utils.h | 14 ---- .../optimized/neon_tensor_utils_impl.h | 13 --- .../internal/optimized/sse_tensor_utils.h | 14 ---- .../reference/portable_tensor_utils.cc | 26 +----- .../reference/portable_tensor_utils.h | 14 ---- .../reference/portable_tensor_utils_impl.h | 14 ---- .../lite/kernels/internal/tensor_utils.h | 28 +++++-- 8 files changed, 33 insertions(+), 170 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index d5c1f227b9a..5f75699e2ca 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -1831,13 +1831,14 @@ void NeonVectorVectorCwiseProduct(const float* vector1, const float* vector2, int v = 0; for (; v < postamble_start; v += kFloatValuesPerNeonVector) { // Load 4 float values from vector1 and vector2. - float32x4_t v1_f32x4 = vld1q_f32(vector1 + v); - float32x4_t v2_f32x4 = vld1q_f32(vector2 + v); + const float32x4_t v1_f32x4 = vld1q_f32(vector1 + v); + const float32x4_t v2_f32x4 = vld1q_f32(vector2 + v); // Vector multiply 4 float - float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4); + const float32x4_t mul_32x4 = vmulq_f32(v1_f32x4, v2_f32x4); // Save to result array. - vst1q_f32(&result[v], mul_32x4); + vst1q_f32(result + v, mul_32x4); } +#pragma clang loop vectorize(disable) unroll(disable) for (; v < v_size; v++) { result[v] = vector1[v] * vector2[v]; } @@ -1854,83 +1855,20 @@ void NeonVectorVectorCwiseProductAccumulate(const float* vector1, int v = 0; for (; v < postamble_start; v += kFloatValuesPerNeonVector) { // Load 4 float values from vector1 and vector2 and accumulator. - float32x4_t v1_f32x4 = vld1q_f32(vector1 + v); - float32x4_t v2_f32x4 = vld1q_f32(vector2 + v); + const float32x4_t v1_f32x4 = vld1q_f32(vector1 + v); + const float32x4_t v2_f32x4 = vld1q_f32(vector2 + v); float32x4_t acc_32x4 = vld1q_f32(result + v); // Vector multiply-accumulate 4 float acc_32x4 = vmlaq_f32(acc_32x4, v1_f32x4, v2_f32x4); // Save to result array. - vst1q_f32(&result[v], acc_32x4); + vst1q_f32(result + v, acc_32x4); } +#pragma clang loop vectorize(disable) unroll(disable) for (; v < v_size; v++) { result[v] += vector1[v] * vector2[v]; } } -void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size, - const float* batch_vector, int n_batch, - float* result) { - // If v_size is not divisible by the vector size, then we need to process the - // final few elements sequentially. postamble_start shows the start index - // where this should happen. - const int postamble_start = - RoundDownVectors(v_size); - - for (int b = 0; b < n_batch; b++) { - int v = 0; - for (; v < postamble_start; v += kFloatValuesPerNeonVector) { - // Load from memory to vectors. - float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector + v); - float32x4_t vector_f32x4 = vld1q_f32(vector + v); - // Multiply. - float32x4_t result_f32x4 = vmulq_f32(batch_vector_f32x4, vector_f32x4); - // Store. - vst1q_f32(result + v, result_f32x4); - } - // Postamble loop - for (; v < v_size; v++) { - result[v] = vector[v] * batch_vector[v]; - } - // Update the pointers. 
- result += v_size; - batch_vector += v_size; - } -} - -void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, - int v_size, - const float* batch_vector, - int n_batch, float* result) { - // If v_size is not divisible by the vector size, then we need to process the - // final few elements sequentially. postamble_start shows the start index - // where this should happen. - const int postamble_start = - RoundDownVectors(v_size); - - float* result_ptr = result; - const float* batch_vector_ptr = batch_vector; - for (int b = 0; b < n_batch; b++) { - int v = 0; - for (; v < postamble_start; v += kFloatValuesPerNeonVector) { - // Load from memory to vectors. - float32x4_t result_f32x4 = vld1q_f32(result_ptr + v); - float32x4_t batch_vector_f32x4 = vld1q_f32(batch_vector_ptr + v); - float32x4_t vector_f32x4 = vld1q_f32(vector + v); - // Multiply-accumulate. - result_f32x4 = vmlaq_f32(result_f32x4, batch_vector_f32x4, vector_f32x4); - // Store. - vst1q_f32(result_ptr + v, result_f32x4); - } - // Postamble loop - for (; v < v_size; v++) { - result_ptr[v] += vector[v] * batch_vector_ptr[v]; - } - // Update the pointers. - result_ptr += v_size; - batch_vector_ptr += v_size; - } -} - void NeonSub1Vector(const float* vector, int v_size, float* result) { // If v_size is not divisible by the vector size, then we need to process the // final few elements sequentially. postamble_start shows the start index diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h index 626afbe5d8d..892fcebd110 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -172,20 +172,6 @@ void VectorVectorCwiseProductAccumulate(const float* vector1, result); } -void VectorBatchVectorCwiseProduct(const float* vector, int v_size, - const float* batch_vector, int n_batch, - float* result) { - NEON_OR_PORTABLE(VectorBatchVectorCwiseProduct, vector, v_size, batch_vector, - n_batch, result); -} - -void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, - const float* batch_vector, - int n_batch, float* result) { - NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size, - batch_vector, n_batch, result); -} - float VectorVectorDotProduct(const float* vector1, const float* vector2, int v_size) { return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size); diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h index ecd124c315d..5b189f761b6 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h @@ -127,19 +127,6 @@ void NeonVectorVectorCwiseProductAccumulate(const float* vector1, float NeonVectorVectorDotProduct(const float* vector1, const float* vector2, int v_size); -// Cwise product of a vector and a batch-vector. -void NeonVectorBatchVectorCwiseProduct(const float* vector, int v_size, - const float* batch_vector, int n_batch, - float* result); - -// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC -// operation, the assumption here is that result array is initialized to valid -// values. 
-void NeonVectorBatchVectorCwiseProductAccumulate(const float* vector, - int v_size, - const float* batch_vector, - int n_batch, float* result); - // Compute "1.0f - elements of vector" (used in CIFG). void NeonSub1Vector(const float* vector, int v_size, float* result); diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h index 37c1c5ce05a..ebad7b70a95 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h @@ -182,20 +182,6 @@ void VectorVectorCwiseProductAccumulate(const float* vector1, result); } -void VectorBatchVectorCwiseProduct(const float* vector, int v_size, - const float* batch_vector, int n_batch, - float* result) { - NEON_OR_PORTABLE(VectorBatchVectorCwiseProduct, vector, v_size, batch_vector, - n_batch, result); -} - -void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, - const float* batch_vector, - int n_batch, float* result) { - NEON_OR_PORTABLE(VectorBatchVectorCwiseProductAccumulate, vector, v_size, - batch_vector, n_batch, result); -} - float VectorVectorDotProduct(const float* vector1, const float* vector2, int v_size) { return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size); diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index 1ba34d45987..b2b4dd25770 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -513,7 +513,7 @@ void PortableVectorVectorCwiseProduct(const float* vector1, const float* vector2, int v_size, float* result) { for (int v = 0; v < v_size; v++) { - *result++ = *vector1++ * *vector2++; + result[v] = vector1[v] * vector2[v]; } } @@ -554,29 +554,7 @@ void PortableVectorVectorCwiseProductAccumulate(const float* vector1, const float* vector2, int v_size, float* result) { for (int v = 0; v < v_size; v++) { - *result++ += *vector1++ * *vector2++; - } -} - -void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size, - const float* batch_vector, - int n_batch, float* result) { - for (int b = 0; b < n_batch; b++) { - for (int v = 0; v < v_size; v++) { - *result++ = vector[v] * *batch_vector++; - } - } -} - -void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector, - int v_size, - const float* batch_vector, - int n_batch, - float* result) { - for (int b = 0; b < n_batch; b++) { - for (int v = 0; v < v_size; v++) { - *result++ += vector[v] * *batch_vector++; - } + result[v] += vector1[v] * vector2[v]; } } diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index 587501fe2cb..918775234f7 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -197,20 +197,6 @@ void VectorVectorCwiseProductAccumulate(const float* vector1, PortableVectorVectorCwiseProductAccumulate(vector1, vector2, v_size, result); } -void VectorBatchVectorCwiseProduct(const float* vector, int v_size, - const float* batch_vector, int n_batch, - float* result) { - PortableVectorBatchVectorCwiseProduct(vector, v_size, batch_vector, n_batch, - result); -} - -void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, - const float* 
batch_vector, - int n_batch, float* result) { - PortableVectorBatchVectorCwiseProductAccumulate(vector, v_size, batch_vector, - n_batch, result); -} - float VectorVectorDotProduct(const float* vector1, const float* vector2, int v_size) { return PortableVectorVectorDotProduct(vector1, vector2, v_size); diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index 954ef6716b6..448c510e58a 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -99,20 +99,6 @@ void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1, int32_t* result, int result_stride); -// Cwise product of a vector and a batch-vector. -void PortableVectorBatchVectorCwiseProduct(const float* vector, int v_size, - const float* batch_vector, - int n_batch, float* result); - -// Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC -// operation, the assumption here is that result array is initialized to valid -// values. -void PortableVectorBatchVectorCwiseProductAccumulate(const float* vector, - int v_size, - const float* batch_vector, - int n_batch, - float* result); - void PortableMatrixBatchVectorMultiplyAccumulate( const int8_t* input, const int32_t* bias, const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h index fccd058bea5..a9a2b839547 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils.h +++ b/tensorflow/lite/kernels/internal/tensor_utils.h @@ -379,16 +379,32 @@ void BatchVectorBatchVectorDotProduct(const int16_t* vector1, int result_stride); // Cwise product of a vector and a batch-vector. -void VectorBatchVectorCwiseProduct(const float* vector, int v_size, - const float* batch_vector, int n_batch, - float* result); +template +inline void VectorBatchVectorCwiseProduct(const T* vector, int v_size, + const T* batch_vector, int n_batch, + T* result) { + for (int b = 0; b < n_batch; b++) { + VectorVectorCwiseProduct(vector, batch_vector, v_size, result); + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} // Cwise product and accumulate of a vector and a batch-vector. Since it's a MAC // operation, the assumption here is that result array is initialized to valid // values. -void VectorBatchVectorCwiseProductAccumulate(const float* vector, int v_size, - const float* batch_vector, - int n_batch, float* result); +template +inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size, + const T* batch_vector, + int n_batch, T* result) { + for (int b = 0; b < n_batch; b++) { + VectorVectorCwiseProductAccumulate(vector, batch_vector, v_size, result); + // Update the pointers. + result += v_size; + batch_vector += v_size; + } +} // Add another vector for each batch in the batch vector. void VectorBatchVectorAdd(const float* vector, int v_size, int n_batch, From 9bbb12d481de11e2611fa6fbb4dc13cb7bb09176 Mon Sep 17 00:00:00 2001 From: Niels Ole Salscheider Date: Wed, 4 Dec 2019 20:27:42 +0100 Subject: [PATCH 061/383] Fix segfault in error handling In the case that we did not find a node we also cannot call (**node).name() since we never assigned *node. Fix this by printing node_name. 
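A minimal standalone illustration of the bug class (simplified sketch only; the real function is UpdateToEngineNode in convert_graph.cc and uses LOG(FATAL), and the helper name and toy Node struct here are hypothetical):

  #include <cstdlib>
  #include <iostream>
  #include <string>
  #include <vector>

  struct Node { std::string name; };

  // If no candidate matches, *node is never assigned, so the error path must
  // not dereference it; report the name that was searched for instead.
  void FindNodeByName(const std::vector<Node*>& nodes,
                      const std::string& node_name, Node** node) {
    for (Node* n : nodes) {
      if (n->name == node_name) { *node = n; return; }
    }
    // Before the fix: (**node).name() here would read an uninitialized pointer.
    std::cerr << "Node " << node_name << " not found in any engine.\n";
    std::abort();
  }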
--- tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc index d011be2c5af..20804af5229 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_graph.cc @@ -307,7 +307,7 @@ void UpdateToEngineNode(const std::vector& infos, } } } - LOG(FATAL) << "Node " << (**node).name() << " not found in any engine."; + LOG(FATAL) << "Node " << node_name << " not found in any engine."; } // Function to insert a TRT engine node into the graph. From 5cb564092f6f7865ae1134212643042fc2ba5594 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 4 Dec 2019 11:22:25 -0800 Subject: [PATCH 062/383] Ruy: Profile cache ejection. PiperOrigin-RevId: 283795957 Change-Id: I139678b43340a134e74d30704d3a07f73c7ca8c2 --- tensorflow/lite/experimental/ruy/BUILD | 1 + .../lite/experimental/ruy/prepacked_cache.cc | 21 ++++++++++--------- .../lite/experimental/ruy/prepacked_cache.h | 6 ++++++ 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/experimental/ruy/BUILD b/tensorflow/lite/experimental/ruy/BUILD index e6fff55b77e..310cc6e0e40 100644 --- a/tensorflow/lite/experimental/ruy/BUILD +++ b/tensorflow/lite/experimental/ruy/BUILD @@ -118,6 +118,7 @@ cc_library( ":opt_set", ":platform", ":time", + "@gemmlowp//:profiler", ], ) diff --git a/tensorflow/lite/experimental/ruy/prepacked_cache.cc b/tensorflow/lite/experimental/ruy/prepacked_cache.cc index 2bd23f834c4..eab1b6acdfd 100644 --- a/tensorflow/lite/experimental/ruy/prepacked_cache.cc +++ b/tensorflow/lite/experimental/ruy/prepacked_cache.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/lite/experimental/ruy/prepacked_cache.h" +#include "profiling/instrumentation.h" #include "tensorflow/lite/experimental/ruy/matrix.h" namespace ruy { @@ -26,7 +27,7 @@ CacheIterator PrepackedCache::FindAndUpdate(const CacheKey &key) { auto itr = cache_.find(key); // If found, update with new access time for this entry. if (itr != cache_.end()) { - const TimePoint time = CoarseNow(); + const TimePoint time = CacheNow(); itr->second.second = time; } return itr; @@ -47,12 +48,15 @@ void PrepackedCache::Insert(const CacheKey &key, } void PrepackedCache::EjectOne() { - TimePoint oldest_time = CoarseNow(); + TimePoint oldest_time = CacheNow(); auto oldest = cache_.begin(); - for (auto itr = cache_.begin(); itr != cache_.end(); ++itr) { - if (itr->second.second < oldest_time) { - oldest_time = itr->second.second; - oldest = itr; + { + gemmlowp::ScopedProfilingLabel label("PepackedCacheEjection"); + for (auto itr = cache_.begin(); itr != cache_.end(); ++itr) { + if (itr->second.second < oldest_time) { + oldest_time = itr->second.second; + oldest = itr; + } } } PrepackedMatrix &pmatrix = oldest->second.first; @@ -70,10 +74,7 @@ void PrepackedCache::AllocatePrepackedMatrix(PrepackedMatrix *pmatrix) { void PrepackedCache::DoInsert(const CacheKey &key, const PrepackedMatrix &matrix) { - // TODO(talumbau) Profile timestamps on relevant models to see if - // this level of granularity is sufficient. CoarseNow is cheap so - // it would be nice to keep it. 
- const TimePoint t = CoarseNow(); + const TimePoint t = CacheNow(); const MatrixWithTimeStamp mts({matrix, t}); cache_.insert({key, mts}); } diff --git a/tensorflow/lite/experimental/ruy/prepacked_cache.h b/tensorflow/lite/experimental/ruy/prepacked_cache.h index 9c77c48cf69..3f25b451ce1 100644 --- a/tensorflow/lite/experimental/ruy/prepacked_cache.h +++ b/tensorflow/lite/experimental/ruy/prepacked_cache.h @@ -106,6 +106,12 @@ class PrepackedCache { // Returns the total size (in bytes) of data held in this cache. int TotalSize() const { return cache_size_; } + // All calls to get current TimePoints go through here. + // TODO(b/145625614) Profile timestamps on relevant models to see if + // this level of granularity is sufficient. CoarseNow is cheap so + // it would be nice to keep it. + TimePoint CacheNow() const { return CoarseNow(); } + // Performs the memory allocation for the `data` and `sums` members of a // PrepackedMatrix. void AllocatePrepackedMatrix(PrepackedMatrix *pmatrix); From 8ba0ba1ea275da4c9e4920e3ba354e7ca3e4fb85 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Wed, 4 Dec 2019 11:30:12 -0800 Subject: [PATCH 063/383] [tflite] Fix OSS build: int32 --> int32_t PiperOrigin-RevId: 283797658 Change-Id: I1ff08717dbc83f88cd3aed3b413c3a482bc904e5 --- tensorflow/lite/testing/tflite_driver.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc index 47293016ab6..3d988eb624a 100644 --- a/tensorflow/lite/testing/tflite_driver.cc +++ b/tensorflow/lite/testing/tflite_driver.cc @@ -256,11 +256,11 @@ bool TfLiteDriver::DataExpectation::QuantizedCheck(bool verbose, auto* quantization = reinterpret_cast(tensor.quantization.params); const float scale = quantization->scale->data[0]; - const int32 zero_point = quantization->zero_point->data[0]; + const int32_t zero_point = quantization->zero_point->data[0]; bool good_result = true; for (int i = 0; i < tensor.bytes; i++) { - const int32 computed = tensor.data.int8[i]; + const int32_t computed = tensor.data.int8[i]; const float dequantized = static_cast(scale * (computed - zero_point)); const float reference = Value(data_.get(), i); From 4abe9e27bc075d646c48dd250df572423ada9683 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 4 Dec 2019 11:34:01 -0800 Subject: [PATCH 064/383] [spirv] Define a few more extensions in SPIRVBase.td PiperOrigin-RevId: 283798496 Change-Id: I24b5782eac3eb0660836cb0c0a41d9e86442499f --- .../include/mlir/Dialect/SPIRV/SPIRVBase.td | 46 +++++++++++++++++-- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td index 2ee8f3bdd43..dd15895fbb6 100644 --- a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td +++ b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td @@ -62,20 +62,60 @@ def SPV_Dialect : Dialect { // https://github.com/KhronosGroup/SPIRV-Registry has the full list. 
def SPV_KHR_16bit_storage : StrEnumAttrCase<"SPV_KHR_16bit_storage">; def SPV_KHR_8bit_storage : StrEnumAttrCase<"SPV_KHR_8bit_storage">; +def SPV_KHR_device_group : StrEnumAttrCase<"SPV_KHR_device_group">; def SPV_KHR_float_controls : StrEnumAttrCase<"SPV_KHR_float_controls">; +def SPV_KHR_physical_storage_buffer : StrEnumAttrCase<"SPV_KHR_physical_storage_buffer">; +def SPV_KHR_multiview : StrEnumAttrCase<"SPV_KHR_multiview">; +def SPV_KHR_no_integer_wrap_decoration : StrEnumAttrCase<"SPV_KHR_no_integer_wrap_decoration">; +def SPV_KHR_post_depth_coverage : StrEnumAttrCase<"SPV_KHR_post_depth_coverage">; def SPV_KHR_shader_atomic_counter_ops : StrEnumAttrCase<"SPV_KHR_shader_atomic_counter_ops">; def SPV_KHR_shader_ballot : StrEnumAttrCase<"SPV_KHR_shader_ballot">; +def SPV_KHR_shader_draw_parameters : StrEnumAttrCase<"SPV_KHR_shader_draw_parameters">; def SPV_KHR_storage_buffer_storage_class : StrEnumAttrCase<"SPV_KHR_storage_buffer_storage_class">; def SPV_KHR_subgroup_vote : StrEnumAttrCase<"SPV_KHR_subgroup_vote">; def SPV_KHR_variable_pointers : StrEnumAttrCase<"SPV_KHR_variable_pointers">; def SPV_KHR_vulkan_memory_model : StrEnumAttrCase<"SPV_KHR_vulkan_memory_model">; +def SPV_EXT_fragment_fully_covered : StrEnumAttrCase<"SPV_EXT_fragment_fully_covered">; +def SPV_EXT_fragment_invocation_density : StrEnumAttrCase<"SPV_EXT_fragment_invocation_density">; +def SPV_EXT_fragment_shader_interlock : StrEnumAttrCase<"SPV_EXT_fragment_shader_interlock">; +def SPV_EXT_physical_storage_buffer : StrEnumAttrCase<"SPV_EXT_physical_storage_buffer">; +def SPV_EXT_shader_stencil_export : StrEnumAttrCase<"SPV_EXT_shader_stencil_export">; + +def SPV_AMD_shader_explicit_vertex_parameter : StrEnumAttrCase<"SPV_AMD_shader_explicit_vertex_parameter">; + +def SPV_GOOGLE_user_type : StrEnumAttrCase<"SPV_GOOGLE_user_type">; + +def SPV_NV_compute_shader_derivatives : StrEnumAttrCase<"SPV_NV_compute_shader_derivatives">; +def SPV_NV_fragment_shader_barycentric : StrEnumAttrCase<"SPV_NV_fragment_shader_barycentric">; +def SPV_NV_geometry_shader_passthrough : StrEnumAttrCase<"SPV_NV_geometry_shader_passthrough">; +def SPV_NV_mesh_shader : StrEnumAttrCase<"SPV_NV_mesh_shader">; +def SPV_NV_ray_tracing : StrEnumAttrCase<"SPV_NV_ray_tracing">; +def SPV_NV_sample_mask_override_coverage : StrEnumAttrCase<"SPV_NV_sample_mask_override_coverage">; +def SPV_NV_shader_sm_builtins : StrEnumAttrCase<"SPV_NV_shader_sm_builtins">; +def SPV_NV_shading_rate : StrEnumAttrCase<"SPV_NV_shading_rate">; +def SPV_NV_stereo_view_rendering : StrEnumAttrCase<"SPV_NV_stereo_view_rendering">; +def SPV_NV_viewport_array2 : StrEnumAttrCase<"SPV_NV_viewport_array2">; + +def SPV_NVX_multiview_per_view_attributes : StrEnumAttrCase<"SPV_NVX_multiview_per_view_attributes">; + def SPV_ExtensionAttr : StrEnumAttr<"Extension", "supported SPIR-V extensions", [ - SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_float_controls, + SPV_KHR_16bit_storage, SPV_KHR_8bit_storage, SPV_KHR_device_group, + SPV_KHR_float_controls, SPV_KHR_physical_storage_buffer, SPV_KHR_multiview, + SPV_KHR_no_integer_wrap_decoration, SPV_KHR_post_depth_coverage, SPV_KHR_shader_atomic_counter_ops, SPV_KHR_shader_ballot, - SPV_KHR_storage_buffer_storage_class, SPV_KHR_subgroup_vote, - SPV_KHR_variable_pointers, SPV_KHR_vulkan_memory_model + SPV_KHR_shader_draw_parameters, SPV_KHR_storage_buffer_storage_class, + SPV_KHR_subgroup_vote, SPV_KHR_variable_pointers, + SPV_KHR_vulkan_memory_model, SPV_EXT_fragment_fully_covered, + SPV_EXT_fragment_invocation_density, 
SPV_EXT_fragment_shader_interlock, + SPV_EXT_physical_storage_buffer, SPV_EXT_shader_stencil_export, + SPV_AMD_shader_explicit_vertex_parameter, SPV_GOOGLE_user_type, + SPV_NV_compute_shader_derivatives, SPV_NV_fragment_shader_barycentric, + SPV_NV_geometry_shader_passthrough, SPV_NV_mesh_shader, SPV_NV_ray_tracing, + SPV_NV_sample_mask_override_coverage, SPV_NV_shader_sm_builtins, + SPV_NV_shading_rate, SPV_NV_stereo_view_rendering, + SPV_NV_viewport_array2, SPV_NVX_multiview_per_view_attributes, ]> { let cppNamespace = "::mlir::spirv"; } From fb23e44515781d6a22d84b10768eb583a57cd566 Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Wed, 4 Dec 2019 11:37:37 -0800 Subject: [PATCH 065/383] Handle edge cases gracefully in GetInvSqrtQuantizedMultiplierExp. PiperOrigin-RevId: 283799303 Change-Id: Iecffd57dbce7fa231cc20f3db5efa3f2bb9d474a --- tensorflow/lite/kernels/internal/BUILD | 1 + tensorflow/lite/kernels/internal/common.h | 13 +++++++++- .../internal/quantization_util_test.cc | 25 +++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index 646f14680ac..d71b36547f2 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -361,6 +361,7 @@ cc_test( name = "quantization_util_test", srcs = ["quantization_util_test.cc"], deps = [ + ":common", ":quantization_util", "@com_google_googletest//:gtest_main", ], diff --git a/tensorflow/lite/kernels/internal/common.h b/tensorflow/lite/kernels/internal/common.h index 0c4fbc1e84e..5e4ba25b711 100644 --- a/tensorflow/lite/kernels/internal/common.h +++ b/tensorflow/lite/kernels/internal/common.h @@ -432,12 +432,23 @@ inline int32 GetReciprocal(int32 x, int x_integer_digits, inline void GetInvSqrtQuantizedMultiplierExp(int32 input, int reverse_shift, int32* output_inv_sqrt, int* output_shift) { + TFLITE_DCHECK_GE(input, 0); + if (input <= 1) { + // Handle the input value 1 separately to avoid overflow in that case + // in the general computation below (b/143972021). Also handle 0 as if it + // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid + // but rare/unrealistic input value. We can expect both to occur in some + // incompletely trained models, but probably not in fully trained models. + *output_inv_sqrt = std::numeric_limits::max(); + *output_shift = 0; + return; + } + TFLITE_DCHECK_GT(input, 1); *output_shift = 11; while (input >= (1 << 29)) { input /= 4; ++*output_shift; } - TFLITE_DCHECK_GT(input, 0); const unsigned max_left_shift_bits = CountLeadingZeros(static_cast(input)) - 1; const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2; diff --git a/tensorflow/lite/kernels/internal/quantization_util_test.cc b/tensorflow/lite/kernels/internal/quantization_util_test.cc index 132befbb020..053b3116a15 100644 --- a/tensorflow/lite/kernels/internal/quantization_util_test.cc +++ b/tensorflow/lite/kernels/internal/quantization_util_test.cc @@ -14,8 +14,11 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/lite/kernels/internal/quantization_util.h" +#include + #include #include +#include "tensorflow/lite/kernels/internal/common.h" namespace tflite { namespace { @@ -397,6 +400,28 @@ TEST(QuantizationUtilTest, QuantizeMultiplierUnderflow) { } #endif +TEST(QuantizationUtilTest, GetInvSqrtQuantizedMultiplierExp) { + auto inv_sqrt = [](std::int32_t input) { + int32_t output; + int output_shift; + GetInvSqrtQuantizedMultiplierExp(input, 1, &output, &output_shift); + return std::pair{output, output_shift}; + }; + + const auto kInt32Max = std::numeric_limits::max(); + EXPECT_THAT(inv_sqrt(0), Pair(kInt32Max, 0)); + EXPECT_THAT(inv_sqrt(1), Pair(kInt32Max, 0)); + EXPECT_THAT(inv_sqrt(2), Pair(1518498372, 0)); + EXPECT_THAT(inv_sqrt(3), Pair(1239850284, 0)); + EXPECT_THAT(inv_sqrt(4), Pair(1073741828, 0)); + EXPECT_THAT(inv_sqrt(100), Pair(214748363, 0)); + EXPECT_THAT(inv_sqrt(10000), Pair(343597361, 4)); + EXPECT_THAT(inv_sqrt(1000000), Pair(274877901, 7)); + EXPECT_THAT(inv_sqrt(100000000), Pair(219902323, 10)); + EXPECT_THAT(inv_sqrt((1 << 30)), Pair(268435457, 12)); + EXPECT_THAT(inv_sqrt(kInt32Max), Pair(189812531, 12)); +} + TEST(QuantizationUtilTest, PreprocessSoftmaxScaling) { auto quantize = [](double beta, double scale, int integer_bits) { int32_t q; From be81dfd5160daac5ef293d4541e38249846803f1 Mon Sep 17 00:00:00 2001 From: Robert Suderman Date: Wed, 4 Dec 2019 11:43:24 -0800 Subject: [PATCH 066/383] Extend support for TF Transpose Lowering TF Transpose lowering should support if permutations are i32. Added a secondary pass to handle. PiperOrigin-RevId: 283800701 Change-Id: I2e907c8c8c37bf2d36d0dad4265ca2812d17a47b --- tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir | 8 ++++++++ .../compiler/mlir/xla/transforms/legalize_tf_patterns.td | 5 +++-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 8aa9b5ef101..25a50309bb9 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1019,6 +1019,14 @@ func @transpose_2d(%arg0: tensor<2x3xf32>) -> tensor<3x2xf32> { return %0 : tensor<3x2xf32> } +// CHECK-LABEL: @transpose_3d_int32 +func @transpose_3d_int32(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { + %permutation = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi32>} : () -> (tensor<3xi32>) + // CHECK: "xla_hlo.transpose" + %0 = "tf.Transpose"(%arg0, %permutation) : (tensor<1x2x3xf32>, tensor<3xi32>) -> tensor<3x2x1xf32> + return %0 : tensor<3x2x1xf32> +} + // CHECK-LABEL: @transpose_3d func @transpose_3d(%arg0: tensor<1x2x3xf32>) -> tensor<3x2x1xf32> { %permutation = "tf.Const"() {value = dense<[2, 1, 0]> : tensor<3xi64>} : () -> (tensor<3xi64>) diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index fb8c6736309..c921e12f1f2 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -383,8 +383,9 @@ foreach Mapping = [ def : Pat<(TF_CastOp HLO_Tensor:$arg, ConstBoolAttrFalse), (HLO_ConvertOp $arg)>; -def : Pat<(TF_TransposeOp:$res $arg, (TF_ConstOp I64ElementsAttr:$permutation)), - (HLO_TransposeOp $arg, (CastIntElementsAttr $permutation))>; + +def : Pat<(TF_TransposeOp:$res $arg, (TF_ConstOp $permutation)), + 
(HLO_TransposeOp $arg, (CastElementsToI64Elements $permutation))>; foreach TfOp = [TF_ExpandDimsOp, TF_ReshapeOp, TF_SqueezeOp, ] in { def : Pat<(TfOp:$res AnyStaticShapeTensor:$arg, $ignored), From eeeb33262fad18c1c84774709828e3ad0630e47e Mon Sep 17 00:00:00 2001 From: Robert Suderman Date: Wed, 4 Dec 2019 11:56:49 -0800 Subject: [PATCH 067/383] Extend tf.split to include partial dynamic shape support If the splitting dimension is static then we can still support divide tf.split into a set of slice operations. PiperOrigin-RevId: 283803626 Change-Id: I3eaf2d565c7b39f069b101176720b54c600aa910 --- tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir | 10 ++++++++++ tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc | 8 ++++++-- .../mlir/xla/transforms/legalize_tf_patterns.td | 1 - 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 25a50309bb9..5af7a1ffc31 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1932,6 +1932,16 @@ func @split_match_and_split_into_two(%input: tensor<4x6xf32>) -> (tensor<2x6xf32 return %0#0, %0#1 : tensor<2x6xf32>, tensor<2x6xf32> } +// CHECK-LABEL: @split_match_and_split_into_two_dynamic +func @split_match_and_split_into_two_dynamic(%input: tensor<4x?xf32>) -> (tensor<2x?xf32>, tensor<2x?xf32>) { + %cst = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + // CHECK: %[[ONE:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[2, -1]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x?xf32>) -> tensor<2x?xf32> + // CHECK: %[[TWO:.*]] = "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[4, -1]> : tensor<2xi64>, start_indices = dense<[2, 0]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x?xf32>) -> tensor<2x?xf32> + %0:2 = "tf.Split"(%cst, %input) : (tensor, tensor<4x?xf32>) -> (tensor<2x?xf32>, tensor<2x?xf32>) + // CHECK: return %[[ONE]], %[[TWO]] + return %0#0, %0#1 : tensor<2x?xf32>, tensor<2x?xf32> +} + // CHECK-LABEL: @split_match_and_split_into_three func @split_match_and_split_into_three(%input: tensor<4x6xf32>) -> (tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>) { %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index f0ba67e2fd5..262091eb4c2 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -962,9 +962,9 @@ class ConvertSplitOp : public OpRewritePattern { PatternMatchResult matchAndRewrite(TF::SplitOp op, PatternRewriter &rewriter) const override { - // We can only match when the tensor to be split has fully static shape. + // We can only split along static dimensions. auto input_type = op.value()->getType().dyn_cast(); - if (!input_type || !input_type.hasStaticShape()) return matchFailure(); + if (!input_type) return matchFailure(); // We can only match when the split dimension is a constant scalar. DenseIntElementsAttr split_dim_attr; @@ -978,6 +978,10 @@ class ConvertSplitOp : public OpRewritePattern { // Calculate the dimension size for each slice along the split dimension. int64_t input_dim_size = input_type.getDimSize(dim_index); + // If we are splitting along the dynamic dimension then we cannot compute + // the static dimension length. 
+ if (TensorType::isDynamic(input_dim_size)) return matchFailure(); + int64_t num_splits = op.getNumResults(); int64_t slice_size = input_dim_size / num_splits; diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index c921e12f1f2..ca3a8406a2e 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -383,7 +383,6 @@ foreach Mapping = [ def : Pat<(TF_CastOp HLO_Tensor:$arg, ConstBoolAttrFalse), (HLO_ConvertOp $arg)>; - def : Pat<(TF_TransposeOp:$res $arg, (TF_ConstOp $permutation)), (HLO_TransposeOp $arg, (CastElementsToI64Elements $permutation))>; From 87ca1e5b05ce38035f5ba6f5ee0ab144d4b48ef5 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Wed, 4 Dec 2019 12:03:00 -0800 Subject: [PATCH 068/383] [XLA GPU] Remove degenerate dimensions for the reduction operands. This allows us to perform fast tiled reduce for reductions which weren't supported before, due to the presence of degenerate dimensions, e.g. reducing dimensions (0,2,3) on [100,1,200,1,300]. This also allows us in the future commit to simplify the pattern-matching logic for row/column reductions. PiperOrigin-RevId: 283805142 Change-Id: Ic9c45f922427e6fa7d1f6abac12eaad764682404 --- tensorflow/compiler/xla/service/gpu/BUILD | 22 ++++ .../xla/service/gpu/nvptx_compiler.cc | 3 + .../gpu/reduction_degenerate_dim_remover.cc | 103 ++++++++++++++++++ .../gpu/reduction_degenerate_dim_remover.h | 53 +++++++++ .../compiler/xla/service/gpu/tests/BUILD | 27 +++++ .../reduction_degenerate_dim_remover_test.cc | 65 +++++++++++ 6 files changed, 273 insertions(+) create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.cc create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.h create mode 100644 tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 9634401fe96..96cf0c5c22b 100755 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -1196,6 +1196,7 @@ cc_library( ":gpu_conv_padding_legalization", ":gpu_conv_rewriter", ":gpu_layout_assignment", + ":reduction_degenerate_dim_remover", ":stream_executor_util", ":target_constants", "//tensorflow/compiler/xla:status_macros", @@ -1664,3 +1665,24 @@ tf_cc_test( "//tensorflow/core:test", ], ) + +cc_library( + name = "reduction_degenerate_dim_remover", + srcs = ["reduction_degenerate_dim_remover.cc"], + hdrs = ["reduction_degenerate_dim_remover.h"], + deps = [ + ":ir_emission_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:pattern_matcher", + "//tensorflow/core:lib", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 489cbd101e2..0c46910e86e 100755 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ 
-32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/gpu_conv_rewriter.h" #include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" +#include "tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/service/gpu/target_constants.h" #include "tensorflow/compiler/xla/service/hlo_constant_folding.h" @@ -154,6 +155,8 @@ Status NVPTXCompiler::OptimizeHloPostLayoutAssignment( /*allow_mixed_precision=*/false, LayoutAssignment::InstructionCanChangeLayout); + pipeline.AddPass(); + // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. AlgebraicSimplifierOptions options; diff --git a/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.cc b/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.cc new file mode 100644 index 00000000000..e3762aaef3a --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.cc @@ -0,0 +1,103 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.h" + +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace gpu { + +class ReductionDegenerateDimRemoverVisitor : public DfsHloRewriteVisitor { + public: + Status HandleReduce(HloInstruction *instr) override { + HloInstruction *reduced_op = instr->mutable_operand(0); + const Shape &input_shape = reduced_op->shape(); + const Shape &reduce_shape = instr->shape(); + + if (!instr->shape().IsArray() || + !ShapeUtil::HasDegenerateDimensions(reduced_op->shape())) { + return Status::OK(); + } + Shape canonical_input_shape = + ShapeUtil::DropDegenerateDimensions(input_shape); + + Shape canonical_reduce_shape = + ShapeUtil::DropDegenerateDimensions(reduce_shape); + + const std::vector &reduced_dimensions = instr->dimensions(); + std::vector updated_reduced_dimensions; + int64 shift = 0; + + for (int dim = 0; dim < input_shape.rank(); dim++) { + if (input_shape.dimensions(dim) == 1) { + shift++; + } else { + if (absl::c_linear_search(reduced_dimensions, dim)) { + updated_reduced_dimensions.push_back(dim - shift); + } + } + } + + HloInstruction *input_reshape = instr->parent()->AddInstruction( + HloInstruction::CreateBitcast(canonical_input_shape, reduced_op)); + + std::unique_ptr new_reduce = HloInstruction::CreateReduce( + canonical_reduce_shape, input_reshape, instr->mutable_operand(1), + updated_reduced_dimensions, instr->to_apply()); + + if (canonical_reduce_shape != reduce_shape) { + HloInstruction *wrapped_reduce = + instr->parent()->AddInstruction(std::move(new_reduce)); + new_reduce = HloInstruction::CreateBitcast(reduce_shape, wrapped_reduce); + } + + return ReplaceWithNewInstruction(instr, std::move(new_reduce)); + } +}; + +template +static Status RunVisitor(HloModule *module, bool *changed) { + for (const auto &computation : module->computations()) { + Visitor visitor; + TF_RETURN_IF_ERROR(computation->Accept(&visitor)); + *changed |= visitor.changed(); + } + return Status::OK(); +} + +StatusOr ReductionDegenerateDimRemover::Run(HloModule *module) { + bool changed = false; + TF_RETURN_IF_ERROR( + RunVisitor(module, &changed)); + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.h b/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.h new file mode 100644 index 00000000000..eeb26da607a --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.h @@ -0,0 +1,53 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_DEGENERATE_DIM_REMOVER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_DEGENERATE_DIM_REMOVER_H_ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { +namespace gpu { + +// Enforces the invariant that reduction input and output have no degenerate +// (size 1) dimension. Since these dimensions are physically meaningless, they +// are removed using bitcasts. +// +// For example, +// +// f[1] out = reduce(f[100, 1, 1] input, dimensions={0, 1}) +// +// becomes: +// +// +// f[100] tmp1 = f[100] bitcast(f[100, 1, 1], input) +// f[] tmp2 = reduce(f[100] tmp1, dimensions={0}) +// f[1] out = f[] bitcast(tmp2) +// +class ReductionDegenerateDimRemover : public HloModulePass { + public: + absl::string_view name() const override { + return "reduction-degenerate-dim-remover"; + } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_DEGENERATE_DIM_REMOVER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index 11cb5f0cbf7..a472bfc19d2 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -81,6 +81,33 @@ tf_cc_test( ], ) +tf_cc_test( + name = "reduction_degenerate_dim_remover_test", + srcs = [ + "reduction_degenerate_dim_remover_test.cc", + ], + tags = tf_cuda_tests_tags(), + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gemm_rewriter", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/memory", + ], +) + tf_cc_test( name = "gpu_copy_test", srcs = ["gpu_copy_test.cc"], diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc new file mode 100644 index 00000000000..9dd8a6fc664 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc @@ -0,0 +1,65 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace gpu { + +namespace { + +class ReductionDegenerateDimRemoverTest : public GpuCodegenTest {}; + +TEST_F(ReductionDegenerateDimRemoverTest, ReductionWithDegenerateDimensions) { + const char* hlo_text = R"( +HloModule ReduceWithDegenerateDimensions + +add { + accum = f32[] parameter(0) + op = f32[] parameter(1) + ROOT out = f32[] add(accum, op) +} + +ENTRY main { + input = f32[1,3,1,4,1,5,1] parameter(0) + zero = f32[] constant(0) + + ROOT out = f32[1,1,1,1] reduce(input, zero), dimensions={1,3,5}, to_apply=add +} + +)"; + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); + MatchOptimizedHloWithShapes(hlo_text, + R"( +// CHECK: f32[] reduce(f32[3,4,5]{2,1,0} {{.+}}, f32[] {{.+}}), dimensions={0,1,2}, to_apply=%add + )"); +} + +} // namespace +} // namespace gpu +} // namespace xla From 53b5ce0b59b4d1e349975105791bfc01665b91d3 Mon Sep 17 00:00:00 2001 From: River Riddle Date: Wed, 4 Dec 2019 12:05:52 -0800 Subject: [PATCH 069/383] NFC: Fix mismatches between LangRef.md and actual parser implementation. PiperOrigin-RevId: 283805832 Change-Id: I593e7a75775e4f92ce01a03145738d59ac68511b --- third_party/mlir/g3doc/LangRef.md | 42 ++++++++++++++------------ third_party/mlir/lib/Parser/Lexer.cpp | 2 +- third_party/mlir/lib/Parser/Parser.cpp | 14 ++++++--- 3 files changed, 33 insertions(+), 25 deletions(-) diff --git a/third_party/mlir/g3doc/LangRef.md b/third_party/mlir/g3doc/LangRef.md index 4e4345d450d..d084f0fa149 100644 --- a/third_party/mlir/g3doc/LangRef.md +++ b/third_party/mlir/g3doc/LangRef.md @@ -36,8 +36,8 @@ representing many different concepts: allocating buffers, producing views to transform them, target-independent arithmetic, target-specific operations, and even arbitrary user-defined high-level operations including the [Module](#module) and [Function](#functions) operations. Operations may contain -[Regions](#regions) that contain a Control Flow Graph (CFG) of -[Blocks](#blocks), which contain operations and end with a +[Regions](#regions) that represent a Control Flow Graph (CFG) of +[Blocks](#blocks), that contain operations and end with a [terminator operation](#terminator-operations) (like branches). 
Here's an example of an MLIR module: @@ -162,9 +162,10 @@ Syntax: // Identifiers bare-id ::= (letter|[_]) (letter|digit|[_$.])* bare-id-list ::= bare-id (`,` bare-id)* -ssa-id ::= `%` (digit+ | ((letter|id-punct) (letter|id-punct|digit)*)) +ssa-id ::= `%` suffix-id +suffix-id ::= (digit+ | ((letter|id-punct) (letter|id-punct|digit)*)) -symbol-ref-id ::= `@` (bare-id | string-literal) +symbol-ref-id ::= `@` (suffix-id | string-literal) ssa-id-list ::= ssa-id (`,` ssa-id)* // Uses of an SSA value, e.g. in an operand list to an operation. @@ -238,13 +239,17 @@ GPUs), and are required to align with the LLVM definition of these intrinsics. Syntax: ``` {.ebnf} -operation ::= op-result? string-literal `(` ssa-use-list? `)` - (`[` successor-list `]`)? (`(` region-list `)`)? - attribute-dict? `:` function-type -op-result ::= ssa-id ((`:` integer-literal) | (`,` ssa-id)*) `=` -successor ::= caret-id (`:` bb-arg-list)? -successor-list ::= successor (`,` successor)* -region-list ::= region (`,` region)* +operation ::= op-result-list? (generic-operation | custom-operation) + trailing-location? +generic-operation ::= string-literal '(' ssa-use-list? ')' attribute-dict? + `:` function-type +custom-operation ::= bare-id custom-operation-format +op-result-list ::= op-result (`,` op-result)* `=` +op-result ::= ssa-id (`:` integer-literal) +successor-list ::= successor (`,` successor)* +successor ::= caret-id (`:` bb-arg-list)? +region-list ::= region (`,` region)* +trailing-location ::= (`loc` `(` location `)`)? ``` MLIR introduces a uniform concept called _operations_ to enable describing many @@ -276,7 +281,6 @@ Example: // Invoke a TensorFlow function called tf.scramble with two inputs // and an attribute "fruit". %2 = "tf.scramble"(%result#0, %bar) {fruit: "banana"} : (f32, i32) -> f32 - ``` In addition to the basic syntax above, dialects may register known operations. @@ -374,16 +378,16 @@ func @example_fn_attr() attributes {dialectName.attrName = false} Syntax: ``` {.ebnf} -block ::= bb-label operation+ -bb-label ::= bb-id bb-arg-list? `:` -bb-id ::= caret-id -caret-id ::= `^` bare-id +block ::= block-label operation+ +block-label ::= block-id block-arg-list? `:` +block-id ::= caret-id +caret-id ::= `^` suffix-id ssa-id-and-type ::= ssa-id `:` type // Non-empty list of names and types. ssa-id-and-type-list ::= ssa-id-and-type (`,` ssa-id-and-type)* -bb-arg-list ::= `(` ssa-id-and-type-list? `)` +block-arg-list ::= `(` ssa-id-and-type-list? `)` ``` A [block](https://en.wikipedia.org/wiki/Basic_block) is a sequential list of @@ -444,7 +448,7 @@ The first block in the region cannot be a successor of any other block. The syntax for the region is as follows: ``` {.ebnf} -region ::= `{` block+ `}` +region ::= `{` block* `}` ``` The function body is an example of a region: it consists of a CFG of blocks and @@ -1120,7 +1124,7 @@ attribute-value ::= attribute-alias | dialect-attribute | standard-attribute ### Attribute Value Aliases ``` {.ebnf} -attribute-alias ::= '#' alias-name '=' 'type' type +attribute-alias ::= '#' alias-name '=' attribute-value attribute-alias ::= '#' alias-name ``` diff --git a/third_party/mlir/lib/Parser/Lexer.cpp b/third_party/mlir/lib/Parser/Lexer.cpp index e8034d82c15..29104c82e23 100644 --- a/third_party/mlir/lib/Parser/Lexer.cpp +++ b/third_party/mlir/lib/Parser/Lexer.cpp @@ -315,7 +315,7 @@ Token Lexer::lexNumber(const char *tokStart) { /// Lex an identifier that starts with a prefix followed by suffix-id. 
/// -/// affine-map-id ::= `#` suffix-id +/// attribute-id ::= `#` suffix-id /// ssa-id ::= '%' suffix-id /// block-id ::= '^' suffix-id /// type-id ::= '!' suffix-id diff --git a/third_party/mlir/lib/Parser/Parser.cpp b/third_party/mlir/lib/Parser/Parser.cpp index ccd5e17bda2..3ccee376985 100644 --- a/third_party/mlir/lib/Parser/Parser.cpp +++ b/third_party/mlir/lib/Parser/Parser.cpp @@ -324,7 +324,7 @@ public: /// Parse an optional trailing location. /// - /// trailing-location ::= location? + /// trailing-location ::= (`loc` `(` location `)`)? /// ParseResult parseOptionalTrailingLocation(Location &loc) { // If there is a 'loc' we parse a trailing location. @@ -3519,10 +3519,14 @@ Value *OperationParser::createForwardRefPlaceholder(SMLoc loc, Type type) { /// Parse an operation. /// -/// operation ::= -/// operation-result? string '(' ssa-use-list? ')' attribute-dict? -/// `:` function-type trailing-location? -/// operation-result ::= ssa-id ((`:` integer-literal) | (`,` ssa-id)*) `=` +/// operation ::= op-result-list? +/// (generic-operation | custom-operation) +/// trailing-location? +/// generic-operation ::= string-literal '(' ssa-use-list? ')' attribute-dict? +/// `:` function-type +/// custom-operation ::= bare-id custom-operation-format +/// op-result-list ::= op-result (`,` op-result)* `=` +/// op-result ::= ssa-id (`:` integer-literal) /// ParseResult OperationParser::parseOperation() { auto loc = getToken().getLoc(); From 63803fbbfac00cc5fde367c2e31dfd35e7c43661 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Wed, 4 Dec 2019 12:11:08 -0800 Subject: [PATCH 070/383] Drop MaterializeVectorTransfers in favor of simpler declarative unrolling Now that we have unrolling as a declarative pattern, we can drop a full pass that has gone stale. In the future we may want to add specific unrolling patterns for VectorTransferReadOp. PiperOrigin-RevId: 283806880 Change-Id: Icaaff22e7057f1779c6637afa69fcaf528ab185b --- third_party/mlir/BUILD | 1 - .../lib/Dialect/VectorOps/VectorToVector.cpp | 27 +- .../mlir/lib/Transforms/CMakeLists.txt | 1 - .../lib/Transforms/MaterializeVectors.cpp | 778 ------------------ 4 files changed, 14 insertions(+), 793 deletions(-) delete mode 100644 third_party/mlir/lib/Transforms/MaterializeVectors.cpp diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 26e03c46df9..62343699e1e 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -1281,7 +1281,6 @@ cc_library( "lib/Transforms/LoopTiling.cpp", "lib/Transforms/LoopUnroll.cpp", "lib/Transforms/LoopUnrollAndJam.cpp", - "lib/Transforms/MaterializeVectors.cpp", "lib/Transforms/MemRefDataFlowOpt.cpp", "lib/Transforms/PipelineDataTransfer.cpp", "lib/Transforms/SimplifyAffineStructures.cpp", diff --git a/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp b/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp index 0952312b67d..4654aff4582 100644 --- a/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp +++ b/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp @@ -549,8 +549,8 @@ struct ConvertMatchingFakeForkFakeJoinOp : public RewritePattern { // Rewrites a fakeFork, whose (unique) operand is a blockArgument, into multiple // vector.strided_slice ops. 
-struct ConvertFakeForkFromBlockArgsOp : public RewritePattern { - ConvertFakeForkFromBlockArgsOp(MLIRContext *context) +struct ConvertFakeForkFromBlockArgsOrTransferReadOp : public RewritePattern { + ConvertFakeForkFromBlockArgsOrTransferReadOp(MLIRContext *context) // low-benefit to kick-in late : RewritePattern(kFakeForkOp, 0, context) {} @@ -564,8 +564,9 @@ struct ConvertFakeForkFromBlockArgsOp : public RewritePattern { return matchSuccess(); } - auto *blockArg = op->getOperand(0); - if (!isa(blockArg)) + auto *arg = op->getOperand(0); + if (!isa(arg) && + !isa(arg->getDefiningOp())) return matchFailure(); LLVM_DEBUG(dbgs() << "\n[" DEBUG_TYPE @@ -579,14 +580,14 @@ struct ConvertFakeForkFromBlockArgsOp : public RewritePattern { if (unrollFactors.empty()) { // No more unrollFactors, just sanity check + forward the unique operand. assert(op->getNumResults() == 1); - assert(op->getOperand(0)->getType() == op->getResult(0)->getType()); - rewriter.replaceOp(op, op->getOperand(0)); + assert(arg->getType() == op->getResult(0)->getType()); + rewriter.replaceOp(op, arg); return matchSuccess(); } // Strides are always 1 for now. // TODO(b/144845578) support non-1 strides. - auto forkedVectorType = op->getOperand(0)->getType().cast(); + auto forkedVectorType = arg->getType().cast(); SmallVector strides(unrollFactors.size(), 1); auto nUnrolled = computeMaxLinearIndex(unrollFactors); SmallVector extractedVectors; @@ -602,8 +603,8 @@ struct ConvertFakeForkFromBlockArgsOp : public RewritePattern { leadingSize, unrollFactors); extractedVectors.push_back( rewriter - .create(op->getLoc(), blockArg, offsets, - sizes, strides) + .create(op->getLoc(), arg, offsets, sizes, + strides) .getResult()); } rewriter.replaceOp(op, extractedVectors); @@ -679,8 +680,8 @@ void mlir::populateVectorToVectorConversionPatterns( MLIRContext *context, OwningRewritePatternList &patterns, ArrayRef coarseVectorShape, ArrayRef fineVectorShape) { vector::populateWithGenerated(context, &patterns); - patterns.insert, DCEPattern>( - context); + patterns + .insert, DCEPattern>(context); } diff --git a/third_party/mlir/lib/Transforms/CMakeLists.txt b/third_party/mlir/lib/Transforms/CMakeLists.txt index 304e0547edb..d6c5bd88f7f 100644 --- a/third_party/mlir/lib/Transforms/CMakeLists.txt +++ b/third_party/mlir/lib/Transforms/CMakeLists.txt @@ -13,7 +13,6 @@ add_llvm_library(MLIRTransforms LoopTiling.cpp LoopUnrollAndJam.cpp LoopUnroll.cpp - MaterializeVectors.cpp MemRefDataFlowOpt.cpp PipelineDataTransfer.cpp SimplifyAffineStructures.cpp diff --git a/third_party/mlir/lib/Transforms/MaterializeVectors.cpp b/third_party/mlir/lib/Transforms/MaterializeVectors.cpp deleted file mode 100644 index 33f5927d88e..00000000000 --- a/third_party/mlir/lib/Transforms/MaterializeVectors.cpp +++ /dev/null @@ -1,778 +0,0 @@ -//===- MaterializeVectors.cpp - MaterializeVectors Pass Impl --------------===// -// -// Copyright 2019 The MLIR Authors. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// ============================================================================= -// -// This file implements target-dependent materialization of super-vectors to -// vectors of the proper size for the hardware. -// -//===----------------------------------------------------------------------===// - -#include "mlir/Analysis/AffineAnalysis.h" -#include "mlir/Analysis/Dominance.h" -#include "mlir/Analysis/LoopAnalysis.h" -#include "mlir/Analysis/NestedMatcher.h" -#include "mlir/Analysis/SliceAnalysis.h" -#include "mlir/Analysis/Utils.h" -#include "mlir/Dialect/AffineOps/AffineOps.h" -#include "mlir/Dialect/StandardOps/Ops.h" -#include "mlir/Dialect/VectorOps/Utils.h" -#include "mlir/Dialect/VectorOps/VectorOps.h" -#include "mlir/IR/AffineExpr.h" -#include "mlir/IR/AffineMap.h" -#include "mlir/IR/Attributes.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Location.h" -#include "mlir/IR/OperationSupport.h" -#include "mlir/IR/Types.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Support/Functional.h" -#include "mlir/Support/LLVM.h" -#include "mlir/Transforms/Passes.h" - -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/raw_ostream.h" - -/// -/// Implements target-dependent materialization of virtual super-vectors to -/// vectors of the proper size for the hardware. -/// -/// While the physical vector size is target-dependent, the pass is written in -/// a target-independent way: the target vector size is specified as a parameter -/// to the pass. This pass is thus a partial lowering that opens the "greybox" -/// that is the super-vector abstraction. In particular, this pass can turn the -/// vector.transfer_read and vector.transfer_write ops in either: -/// 1. a loop nest with either scalar and vector load/store operations; or -/// 2. a loop-nest with DmaStartOp / DmaWaitOp; or -/// 3. a pre-existing blackbox library call that can be written manually or -/// synthesized using search and superoptimization. -/// An important feature that either of these 3 target lowering abstractions -/// must handle is the handling of "non-effecting" padding with the proper -/// neutral element in order to guarantee that all "partial tiles" are actually -/// "full tiles" in practice. -/// -/// In particular this pass is a MLIR-MLIR rewriting and does not concern itself -/// with target-specific instruction-selection and register allocation. These -/// will happen downstream in LLVM. -/// -/// In this sense, despite performing lowering to a target-dependent size, this -/// pass is still target-agnostic. -/// -/// Implementation details -/// ====================== -/// The current decisions made by the super-vectorization pass guarantee that -/// use-def chains do not escape an enclosing vectorized AffineForOp. In other -/// words, this pass operates on a scoped program slice. Furthermore, since we -/// do not vectorize in the presence of conditionals for now, sliced chains are -/// guaranteed not to escape the innermost scope, which has to be either the top -/// Function scope or the innermost loop scope, by construction. As a -/// consequence, the implementation just starts from vector.transfer_write -/// operations and builds the slice scoped the innermost loop enclosing the -/// current vector.transfer_write. These assumptions and the implementation -/// details are subject to revision in the future. -/// -/// Example -/// ======== -/// In the following, the single vector.transfer_write op operates on a -/// vector<4x4x4xf32>. 
Let's assume the HW supports vector<4x4xf32>. -/// Materialization is achieved by instantiating each occurrence of the leading -/// dimension of vector<4x4x4xf32> into a vector<4x4xf32>. -/// The program transformation that implements this instantiation is a -/// multi-loop unroll-and-jam (it can be partial or full depending on the ratio -/// of super-vector shape to HW-vector shape). -/// -/// As a simple case, the following: -/// -/// ```mlir -/// mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) { -/// %A = alloc (%M, %N, %O, %P) : memref -/// %f1 = constant dense, 1.000000e+00> : -/// vector<4x4x4xf32> affine.for %i0 = 0 to %M step 4 { -/// affine.for %i1 = 0 to %N step 4 { -/// affine.for %i2 = 0 to %O { -/// affine.for %i3 = 0 to %P step 4 { -/// vector.transfer_write %f1, %A[%i0, %i1, %i2, %i3] -/// {permutation_map: (d0, d1, d2, d3) -> (d3, d1, d0)} : -/// vector<4x4x4xf32>, memref -/// }}}} -/// return -/// } -/// ``` -/// -/// is instantiated by unroll-and-jam (just unroll in this case) into: -/// -/// ```mlir -/// mlfunc @materialize(%M : index, %N : index, %O : index, %P : index) { -/// %A = alloc (%M, %N, %O, %P) : memref -/// %f1 = constant dense, 1.000000e+00> : vector<4x4x4xf32> -/// affine.for %i0 = 0 to %arg0 step 4 { -/// affine.for %i1 = 0 to %arg1 step 4 { -/// affine.for %i2 = 0 to %arg2 { -/// affine.for %i3 = 0 to %arg3 step 4 { -/// vector.transfer_write f1, %0[%i0, %i1, %i2, %i3] -/// {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} : -/// vector<4x4xf32>, memref -/// %i3p1 = affine.apply (d0) -> (d0 + 1)(%i3) -/// vector.transfer_write {{.*}}, %0[%i0, %i1, %i2, %i3p1] -/// {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} : -/// vector<4x4xf32>, memref -/// %i3p2 = affine.apply (d0) -> (d0 + 2)(%i3) -/// vector.transfer_write {{.*}}, %0[%i0, %i1, %i2, %i3p2] -/// {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} : -/// vector<4x4xf32>, memref -/// %i3p3 = affine.apply (d0) -> (d0 + 3)(%i3) -/// vector.transfer_write {{.*}}, %0[%i0, %i1, %i2, %i3p3] -/// {permutation_map: (d0, d1, d2, d3) -> (d1, d0)} : -/// vector<4x4xf32>, memref -/// }}}} -/// return -/// } -/// ``` - -using llvm::dbgs; -using llvm::SetVector; - -using namespace mlir; -using vector::TransferReadOp; -using vector::TransferWriteOp; - -using functional::makePtrDynCaster; -using functional::map; - -static llvm::cl::list - clVectorSize("vector-size", - llvm::cl::desc("Specify the HW vector size for vectorization"), - llvm::cl::ZeroOrMore); - -#define DEBUG_TYPE "materialize-vect" - -namespace { -struct MaterializationState { - /// In practice, the determination of the HW-specific vector type to use when - /// lowering a super-vector type must be based on the elemental type. The - /// elemental type must be retrieved from the super-vector type. In the future - /// information about hardware vector type for a particular elemental type - /// will be part of the contract between MLIR and the backend. - /// - /// For example, 8xf32 has the same size as 16xf16 but the targeted HW itself - /// may exhibit the following property: - /// 1. have a special unit for a 128xf16 datapath; - /// 2. no F16 FPU support on the regular 8xf32/16xf16 vector datapath. - /// - /// For now, we just assume hwVectorSize has the proper information regardless - /// of the type and we assert everything is f32. - /// TODO(ntv): relax the assumptions on admissible element type once a - /// contract exists. 
- MaterializationState(SmallVector sizes) : hwVectorSize(sizes) {} - - SmallVector hwVectorSize; - VectorType superVectorType; - VectorType hwVectorType; - SmallVector hwVectorInstance; - DenseMap *substitutionsMap; -}; - -/// Base state for the vector materialization pass. -/// Command line arguments are preempted by non-empty pass arguments. -struct MaterializeVectorsPass : public FunctionPass { - MaterializeVectorsPass() - : hwVectorSize(clVectorSize.begin(), clVectorSize.end()) {} - MaterializeVectorsPass(ArrayRef hwVectorSize) - : MaterializeVectorsPass() { - if (!hwVectorSize.empty()) - this->hwVectorSize.assign(hwVectorSize.begin(), hwVectorSize.end()); - } - - SmallVector hwVectorSize; - void runOnFunction() override; -}; - -} // end anonymous namespace - -/// Given a shape with sizes greater than 0 along all dimensions, -/// returns the distance, in number of elements, between a slice in a dimension -/// and the next slice in the same dimension. -/// e.g. shape[3, 4, 5] -> strides[20, 5, 1] -static SmallVector makeStrides(ArrayRef shape) { - SmallVector tmp; - tmp.reserve(shape.size()); - int64_t running = 1; - for (auto rit = shape.rbegin(), reit = shape.rend(); rit != reit; ++rit) { - assert(*rit > 0 && "size must be greater than 0 along all dimensions of " - "shape"); - tmp.push_back(running); - running *= *rit; - } - return SmallVector(tmp.rbegin(), tmp.rend()); -} - -/// Given a shape with sizes greater than 0 along all dimensions, returns the -/// delinearized components of linearIndex along shape. -static SmallVector delinearize(int64_t linearIndex, - ArrayRef shape) { - SmallVector res; - res.reserve(shape.size()); - auto strides = makeStrides(shape); - for (unsigned idx = 0; idx < strides.size(); ++idx) { - assert(strides[idx] > 0); - auto val = linearIndex / strides[idx]; - res.push_back(val); - assert(val < shape[idx] && "delinearization is out of bounds"); - linearIndex %= strides[idx]; - } - // Sanity check. - assert(linearIndex == 0 && "linear index constructed from shape must " - "have 0 remainder after delinearization"); - return res; -} - -static Operation *instantiate(OpBuilder b, Operation *opInst, - VectorType hwVectorType, - DenseMap *substitutionsMap); - -/// Not all Values belong to a program slice scoped within the immediately -/// enclosing loop. -/// One simple example is constants defined outside the innermost loop scope. -/// For such cases the substitutionsMap has no entry and we allow an additional -/// insertion. -/// For now, this is limited to ConstantOp because we do not vectorize loop -/// indices and will need to be extended in the future. -/// -/// If substitution fails, returns nullptr. -static Value *substitute(Value *v, VectorType hwVectorType, - DenseMap *substitutionsMap) { - auto it = substitutionsMap->find(v); - if (it == substitutionsMap->end()) { - auto *opInst = v->getDefiningOp(); - if (isa(opInst)) { - OpBuilder b(opInst); - auto *op = instantiate(b, opInst, hwVectorType, substitutionsMap); - auto res = substitutionsMap->insert(std::make_pair(v, op->getResult(0))); - assert(res.second && "Insertion failed"); - return res.first->second; - } - v->getDefiningOp()->emitError("missing substitution"); - return nullptr; - } - return it->second; -} - -/// Returns a list of single result AffineApplyOps that reindex the -/// `memRefIndices` by the multi-dimensional `hwVectorInstance`. This is used by -/// the function that materializes a vector.transfer operation to use hardware -/// vector types instead of super-vector types. 
-/// -/// The general problem this function solves is as follows: -/// Assume a vector.transfer operation at the super-vector granularity that has -/// `l` enclosing loops (AffineForOp). Assume the vector transfer operation -/// operates on a MemRef of rank `r`, a super-vector of rank `s` and a hardware -/// vector of rank `h`. For the purpose of illustration assume l==4, r==3, s==2, -/// h==1 and that the super-vector is vector<3x32xf32> and the hardware vector -/// is vector<8xf32>. Assume the following MLIR snippet after -/// super-vectorization has been applied: -/// -/// ```mlir -/// affine.for %i0 = 0 to %M { -/// affine.for %i1 = 0 to %N step 3 { -/// affine.for %i2 = 0 to %O { -/// affine.for %i3 = 0 to %P step 32 { -/// %r = vector.transfer_read(%A, map0(%i..), map1(%i..), map2(%i..)) : -/// vector<3x32xf32>, memref -/// ... -/// }}}} -/// ``` -/// -/// where map denotes an AffineMap operating on enclosing loops with properties -/// compatible for vectorization (i.e. some contiguity left unspecified here). -/// Note that the vectorized loops are %i1 and %i3. -/// This function translates the vector.transfer_read operation to multiple -/// instances of vector.transfer_read that operate on vector<8x32>. -/// -/// Without loss of generality, we assume hwVectorInstance is: {2, 1}. -/// The only constraints on hwVectorInstance is they belong to: -/// [0, 2] x [0, 3], which is the span of ratio of super-vector shape to -/// hardware vector shape in our example. -/// -/// This function instantiates the iteration <2, 1> of vector.transfer_read -/// into the set of operations in pseudo-MLIR: -/// -/// ```mlir -/// #map2 = (d0, d1, d2, d3) -> (d0, d1 + 2, d2, d3 + 1 * 8) -/// #map3 = #map o #map2 // where o denotes composition -/// aff0 = affine.apply #map3.0(%i..) -/// aff1 = affine.apply #map3.1(%i..) -/// aff2 = affine.apply #map3.2(%i..) -/// %r = vector.transfer_read(%A, %aff0, %aff1, %aff2): -// vector<3x32xf32>, memref -/// ``` -/// -/// Practical considerations -/// ======================== -/// For now, `map` is assumed to be the identity map and the indices are -/// specified just as vector.transfer_read%A[%i0, %i1, %i2, %i3]. This will be -/// extended in the future once we have a proper Op for vector transfers. -/// Additionally, the example above is specified in pseudo-MLIR form; once we -/// have proper support for generic maps we can generate the code and show -/// actual MLIR. -/// -/// TODO(ntv): support a concrete AffineMap and compose with it. -/// TODO(ntv): these implementation details should be captured in a -/// vectorization trait at the op level directly. -static SmallVector -reindexAffineIndices(OpBuilder b, VectorType hwVectorType, - ArrayRef hwVectorInstance, - ArrayRef memrefIndices) { - auto vectorShape = hwVectorType.getShape(); - assert(hwVectorInstance.size() >= vectorShape.size()); - - unsigned numIndices = memrefIndices.size(); - auto numMemRefIndices = numIndices - hwVectorInstance.size(); - auto numVectorIndices = hwVectorInstance.size() - vectorShape.size(); - - SmallVector affineExprs; - // TODO(ntv): support a concrete map and composition. - unsigned i = 0; - // The first numMemRefIndices correspond to AffineForOp that have not been - // vectorized, the transformation is the identity on those. - for (i = 0; i < numMemRefIndices; ++i) { - auto d_i = b.getAffineDimExpr(i); - affineExprs.push_back(d_i); - } - // The next numVectorIndices correspond to super-vector dimensions that - // do not have a hardware vector dimension counterpart. 
For those we only - // need to increment the index by the corresponding hwVectorInstance. - for (i = numMemRefIndices; i < numMemRefIndices + numVectorIndices; ++i) { - auto d_i = b.getAffineDimExpr(i); - auto offset = hwVectorInstance[i - numMemRefIndices]; - affineExprs.push_back(d_i + offset); - } - // The remaining indices correspond to super-vector dimensions that - // have a hardware vector dimension counterpart. For those we to increment the - // index by "hwVectorInstance" multiples of the corresponding hardware - // vector size. - for (; i < numIndices; ++i) { - auto d_i = b.getAffineDimExpr(i); - auto offset = hwVectorInstance[i - numMemRefIndices]; - auto stride = vectorShape[i - numMemRefIndices - numVectorIndices]; - affineExprs.push_back(d_i + offset * stride); - } - - // Create a bunch of single result AffineApplyOp. - SmallVector res; - res.reserve(affineExprs.size()); - for (auto expr : affineExprs) { - auto map = AffineMap::get(numIndices, 0, expr); - res.push_back(makeComposedAffineApply(b, b.getInsertionPoint()->getLoc(), - map, memrefIndices)); - } - return res; -} - -/// Returns attributes with the following substitutions applied: -/// - constant splat is replaced by constant splat of `hwVectorType`. -/// TODO(ntv): add more substitutions on a per-need basis. -static SmallVector -materializeAttributes(Operation *opInst, VectorType hwVectorType) { - SmallVector res; - for (auto a : opInst->getAttrs()) { - if (auto splat = a.second.dyn_cast()) { - auto attr = SplatElementsAttr::get(hwVectorType, splat.getSplatValue()); - res.push_back(NamedAttribute(a.first, attr)); - } else { - res.push_back(a); - } - } - return res; -} - -/// Creates an instantiated version of `opInst`. -/// Ops other than VectorTransferReadOp/VectorTransferWriteOp require no -/// affine reindexing. Just substitute their Value operands and be done. For -/// this case the actual instance is irrelevant. Just use the values in -/// substitutionsMap. -/// -/// If the underlying substitution fails, this fails too and returns nullptr. -static Operation *instantiate(OpBuilder b, Operation *opInst, - VectorType hwVectorType, - DenseMap *substitutionsMap) { - assert(!isa(opInst) && - "Should call the function specialized for VectorTransferReadOp"); - assert(!isa(opInst) && - "Should call the function specialized for VectorTransferWriteOp"); - if (opInst->getNumRegions() != 0) - return nullptr; - - bool fail = false; - auto operands = map( - [hwVectorType, substitutionsMap, &fail](Value *v) -> Value * { - auto *res = - fail ? nullptr : substitute(v, hwVectorType, substitutionsMap); - fail |= !res; - return res; - }, - opInst->getOperands()); - if (fail) - return nullptr; - - auto attrs = materializeAttributes(opInst, hwVectorType); - - OperationState state(opInst->getLoc(), opInst->getName().getStringRef(), - operands, {hwVectorType}, attrs); - return b.createOperation(state); -} - -/// Computes the permutationMap required for a VectorTransferOp from the memref -/// to the `hwVectorType`. -/// This is achieved by returning the projection of the permutationMap along the -/// dimensions of the super-vector type that remain in the hwVectorType. -/// In particular, if a dimension is fully instantiated (i.e. unrolled) then it -/// is projected out in the final result. 
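
A rough, self-contained illustration of the reindexing scheme documented above (an editorial sketch, not code from the patch): `reindexAffineIndices` leaves non-vectorized memref indices alone, shifts super-vector dimensions that have no hardware counterpart by the instance coordinate, and shifts the remaining dimensions by the instance coordinate times the hardware vector size. The rank-4 memref, the <8xf32> hardware vector and the {2, 1} instance below are assumed values chosen only for the example.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Offsets added to each memref index for one hardware-vector instance,
// mirroring the three loops in reindexAffineIndices above.
static std::vector<int64_t> indexOffsets(const std::vector<int64_t> &instance,
                                         const std::vector<int64_t> &hwShape,
                                         size_t numIndices) {
  size_t numMemRefIndices = numIndices - instance.size();
  size_t numVectorIndices = instance.size() - hwShape.size();
  std::vector<int64_t> offsets(numIndices, 0);
  // Super-vector dims without a hardware counterpart: shift by the instance.
  for (size_t i = numMemRefIndices; i < numMemRefIndices + numVectorIndices; ++i)
    offsets[i] = instance[i - numMemRefIndices];
  // Dims with a hardware counterpart: shift by instance * hardware size.
  for (size_t i = numMemRefIndices + numVectorIndices; i < numIndices; ++i)
    offsets[i] = instance[i - numMemRefIndices] *
                 hwShape[i - numMemRefIndices - numVectorIndices];
  return offsets;
}

int main() {
  // Rank-4 memref, rank-2 super-vector, rank-1 hw vector <8xf32>,
  // instance {2, 1}  ->  offsets [0, 0, 2, 8].
  for (int64_t offset : indexOffsets({2, 1}, {8}, 4)) std::cout << offset << ' ';
  std::cout << '\n';
}
```

With these assumed inputs, the affine maps produced by the pass would add the constant offsets [0, 0, 2, 8] to the original loop indices.
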
-template -static AffineMap projectedPermutationMap(VectorTransferOpTy transfer, - VectorType hwVectorType) { - static_assert(std::is_same::value || - std::is_same::value, - "Must be called on a VectorTransferOp"); - auto superVectorType = transfer.getVectorType(); - auto optionalRatio = shapeRatio(superVectorType, hwVectorType); - assert(optionalRatio && - (optionalRatio->size() == superVectorType.getShape().size()) && - "Shape and ratio not of the same size"); - unsigned dim = 0; - SmallVector keep; - MLIRContext *context = transfer.getContext(); - functional::zipApply( - [&dim, &keep, context](int64_t shape, int64_t ratio) { - assert(shape >= ratio && "shape dim must be greater than ratio dim"); - if (shape != ratio) { - // HW vector is not full instantiated along this dim, keep it. - keep.push_back(getAffineDimExpr(dim, context)); - } - ++dim; - }, - superVectorType.getShape(), *optionalRatio); - auto permutationMap = transfer.permutation_map(); - LLVM_DEBUG(permutationMap.print(dbgs() << "\npermutationMap: ")); - if (keep.empty()) { - return permutationMap; - } - auto projectionMap = AffineMap::get(optionalRatio->size(), 0, keep); - LLVM_DEBUG(projectionMap.print(dbgs() << "\nprojectionMap: ")); - return simplifyAffineMap(projectionMap.compose(permutationMap)); -} - -/// Creates an instantiated version of `read` for the instance of -/// `hwVectorInstance` when lowering from a super-vector type to -/// `hwVectorType`. `hwVectorInstance` represents one particular instance of -/// `hwVectorType` int the covering of the super-vector type. For a more -/// detailed description of the problem, see the description of -/// reindexAffineIndices. -static Operation *instantiate(OpBuilder b, TransferReadOp read, - VectorType hwVectorType, - ArrayRef hwVectorInstance, - DenseMap *substitutionsMap) { - SmallVector indices = - map(makePtrDynCaster(), read.indices()); - auto affineIndices = - reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices); - auto map = projectedPermutationMap(read, hwVectorType); - if (!map) { - return nullptr; - } - auto cloned = b.create( - read.getLoc(), hwVectorType, read.memref(), affineIndices, - AffineMapAttr::get(map), read.padding()); - return cloned.getOperation(); -} - -/// Creates an instantiated version of `write` for the instance of -/// `hwVectorInstance` when lowering from a super-vector type to -/// `hwVectorType`. `hwVectorInstance` represents one particular instance of -/// `hwVectorType` int the covering of th3e super-vector type. For a more -/// detailed description of the problem, see the description of -/// reindexAffineIndices. -static Operation *instantiate(OpBuilder b, TransferWriteOp write, - VectorType hwVectorType, - ArrayRef hwVectorInstance, - DenseMap *substitutionsMap) { - SmallVector indices = - map(makePtrDynCaster(), write.indices()); - auto affineIndices = - reindexAffineIndices(b, hwVectorType, hwVectorInstance, indices); - auto cloned = b.create( - write.getLoc(), - substitute(write.vector(), hwVectorType, substitutionsMap), - write.memref(), affineIndices, - AffineMapAttr::get(projectedPermutationMap(write, hwVectorType))); - return cloned.getOperation(); -} - -/// Returns `true` if op instance is properly cloned and inserted, false -/// otherwise. -/// The multi-dimensional `hwVectorInstance` belongs to the shapeRatio of -/// super-vector type to hw vector type. -/// A cloned instance of `op` is formed as follows: -/// 1. vector.transfer_read: the return `superVectorType` is replaced by -/// `hwVectorType`. 
Additionally, affine indices are reindexed with -/// `reindexAffineIndices` using `hwVectorInstance` and vector type -/// information; -/// 2. vector.transfer_write: the `valueToStore` type is simply substituted. -/// Since we operate on a topologically sorted slice, a substitution must -/// have been registered for non-constant ops. Additionally, affine indices -/// are reindexed in the same way as for vector.transfer_read; -/// 3. constant ops are splats of the super-vector type by construction. -/// They are cloned to a splat on the hw vector type with the same value; -/// 4. remaining ops are cloned to version of the op that returns a hw vector -/// type, all operands are substituted according to `substitutions`. Thanks -/// to the topological order of a slice, the substitution is always -/// possible. -/// -/// Returns true on failure. -static bool instantiateMaterialization(Operation *op, - MaterializationState *state) { - LLVM_DEBUG(dbgs() << "\ninstantiate: " << *op); - - // Create a builder here for unroll-and-jam effects. - OpBuilder b(op); - // AffineApplyOp are ignored: instantiating the proper vector op will take - // care of AffineApplyOps by composing them properly. - if (isa(op)) { - return false; - } - if (op->getNumRegions() != 0) - return op->emitError("NYI path Op with region"), true; - - if (auto write = dyn_cast(op)) { - auto *clone = instantiate(b, write, state->hwVectorType, - state->hwVectorInstance, state->substitutionsMap); - return clone == nullptr; - } - if (auto read = dyn_cast(op)) { - auto *clone = instantiate(b, read, state->hwVectorType, - state->hwVectorInstance, state->substitutionsMap); - if (!clone) { - return true; - } - state->substitutionsMap->insert( - std::make_pair(read.getResult(), clone->getResult(0))); - return false; - } - // The only op with 0 results reaching this point must, by construction, be - // VectorTransferWriteOps and have been caught above. Ops with >= 2 results - // are not yet supported. So just support 1 result. - if (op->getNumResults() != 1) { - return op->emitError("NYI: ops with != 1 results"), true; - } - if (op->getResult(0)->getType() != state->superVectorType) { - return op->emitError("op does not return a supervector."), true; - } - auto *clone = - instantiate(b, op, state->hwVectorType, state->substitutionsMap); - if (!clone) { - return true; - } - state->substitutionsMap->insert( - std::make_pair(op->getResult(0), clone->getResult(0))); - return false; -} - -/// Takes a slice and rewrites the operations in it so that occurrences -/// of `superVectorType` are replaced by `hwVectorType`. -/// -/// Implementation -/// ============== -/// 1. computes the shape ratio of super-vector to HW vector shapes. This -/// gives for each op in the slice, how many instantiations are required -/// in each dimension; -/// 2. performs the concrete materialization. Note that in a first -/// implementation we use full unrolling because it pragmatically removes -/// the need to explicitly materialize an AllocOp. Thanks to the properties -/// of super-vectors, this unrolling is always possible and simple: -/// vectorizing to a super-vector abstraction already achieved the -/// equivalent of loop strip-mining + loop sinking and encoded this in the -/// vector type. -/// -/// Returns true on failure. -/// -/// TODO(ntv): materialized allocs. -/// TODO(ntv): full loops + materialized allocs. -/// TODO(ntv): partial unrolling + materialized allocs. 
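
To make the two steps above concrete (an editorial sketch, not code from the patch): reusing the vector<4x4x4xf32> to vector<4x4xf32> example from the header comment of this file, the snippet below computes the super-to-hardware shape ratio and the resulting number of fully unrolled instances. Right-aligning the hardware shape against the trailing super-vector dimensions is an assumption about how `shapeRatio` behaves, based on the surrounding code.

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Assumed convention: the hardware shape is right-aligned against the
// super-vector shape; leading super-vector dims keep their full extent.
static std::vector<int64_t> shapeRatio(const std::vector<int64_t> &superShape,
                                       const std::vector<int64_t> &hwShape) {
  std::vector<int64_t> ratio(superShape);
  size_t lead = superShape.size() - hwShape.size();
  for (size_t i = 0; i < hwShape.size(); ++i)
    ratio[lead + i] = superShape[lead + i] / hwShape[i];
  return ratio;
}

int main() {
  // vector<4x4x4xf32> materialized with a vector<4x4xf32> hardware vector.
  std::vector<int64_t> ratio = shapeRatio({4, 4, 4}, {4, 4});  // [4, 1, 1]
  int64_t numInstances = 1;
  for (int64_t r : ratio) numInstances *= r;
  std::cout << numInstances << " instances\n";  // 4
  // Delinearized instance indices: {0,0,0}, {1,0,0}, {2,0,0}, {3,0,0}.
}
```

The four instances correspond to the four unrolled vector.transfer_write ops shown in the example near the top of this file.
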
-static bool emitSlice(MaterializationState *state, - SetVector *slice) { - auto ratio = shapeRatio(state->superVectorType, state->hwVectorType); - assert(ratio.hasValue() && - "ratio of super-vector to HW-vector shape is not integral"); - // The number of integer points in a hyperrectangular region is: - // shape[0] * strides[0]. - auto numValueToUnroll = (*ratio)[0] * makeStrides(*ratio)[0]; - // Full unrolling to hardware vectors in a first approximation. - for (unsigned idx = 0; idx < numValueToUnroll; ++idx) { - // Fresh RAII instanceIndices and substitutionsMap. - MaterializationState scopedState = *state; - scopedState.hwVectorInstance = delinearize(idx, *ratio); - DenseMap substitutionMap; - scopedState.substitutionsMap = &substitutionMap; - // slice are topologically sorted, we can just clone them in order. - for (auto *op : *slice) { - auto fail = instantiateMaterialization(op, &scopedState); - if (fail) { - op->emitError("unhandled super-vector materialization failure"); - return true; - } - } - } - - LLVM_DEBUG(dbgs() << "\nFunction is now\n"); - LLVM_DEBUG((*slice)[0]->getParentOfType().print(dbgs())); - - // slice are topologically sorted, we can just erase them in reverse - // order. Reverse iterator does not just work simply with an operator* - // dereference. - for (int idx = slice->size() - 1; idx >= 0; --idx) { - LLVM_DEBUG(dbgs() << "\nErase: "); - LLVM_DEBUG((*slice)[idx]->print(dbgs())); - (*slice)[idx]->erase(); - } - return false; -} - -/// Materializes super-vector types into concrete hw vector types as follows: -/// 1. start from super-vector terminators (current vector.transfer_write -/// ops); -/// 2. collect all the operations that can be reached by transitive use-defs -/// chains; -/// 3. get the superVectorType for this particular terminator and the -/// corresponding hardware vector type (for now limited to F32) -/// TODO(ntv): be more general than F32. -/// 4. emit the transitive useDef set to operate on the finer-grain vector -/// types. -/// -/// Notes -/// ===== -/// The `slice` is sorted in topological order by construction. -/// Additionally, this set is limited to operations in the same lexical scope -/// because we currently disallow vectorization of defs that come from another -/// scope. -/// TODO(ntv): please document return value. -static bool materialize(FuncOp f, const SetVector &terminators, - MaterializationState *state) { - DenseSet seen; - DominanceInfo domInfo(f); - for (auto *term : terminators) { - // Short-circuit test, a given terminator may have been reached by some - // other previous transitive use-def chains. - if (seen.count(term) > 0) { - continue; - } - - auto terminator = cast(term); - LLVM_DEBUG(dbgs() << "\nFrom terminator:" << *term); - - // Get the transitive use-defs starting from terminator, limited to the - // current enclosing scope of the terminator. See the top of the function - // Note for the justification of this restriction. - // TODO(ntv): relax scoping constraints. - auto *enclosingScope = term->getParentOp(); - auto keepIfInSameScope = [enclosingScope, &domInfo](Operation *op) { - assert(op && "NULL op"); - if (!enclosingScope) { - // by construction, everyone is always under the top scope (null scope). - return true; - } - return domInfo.properlyDominates(enclosingScope, op); - }; - SetVector slice = - getSlice(term, keepIfInSameScope, keepIfInSameScope); - assert(!slice.empty()); - - // Sanity checks: transitive slice must be completely disjoint from - // what we have seen so far. 
- LLVM_DEBUG(dbgs() << "\nTransitive use-defs:"); - for (auto *ud : slice) { - LLVM_DEBUG(dbgs() << "\nud:" << *ud); - assert(seen.count(ud) == 0 && - "Transitive use-defs not disjoint from already seen"); - seen.insert(ud); - } - - // Emit the current slice. - // Set scoped super-vector and corresponding hw vector types. - state->superVectorType = terminator.getVectorType(); - assert((state->superVectorType.getElementType() == - FloatType::getF32(term->getContext())) && - "Only f32 supported for now"); - state->hwVectorType = VectorType::get( - state->hwVectorSize, state->superVectorType.getElementType()); - auto fail = emitSlice(state, &slice); - if (fail) { - return true; - } - LLVM_DEBUG(dbgs() << "\nFunction is now\n"); - LLVM_DEBUG(f.print(dbgs())); - } - return false; -} - -void MaterializeVectorsPass::runOnFunction() { - // Thread-safe RAII local context, BumpPtrAllocator freed on exit. - NestedPatternContext mlContext; - - // TODO(ntv): Check to see if this supports arbitrary top-level code. - FuncOp f = getFunction(); - if (f.getBlocks().size() != 1) - return; - - using matcher::Op; - LLVM_DEBUG(dbgs() << "\nMaterializeVectors on Function\n"); - LLVM_DEBUG(f.print(dbgs())); - - MaterializationState state(hwVectorSize); - // Get the hardware vector type. - // TODO(ntv): get elemental type from super-vector type rather than force f32. - auto subVectorType = - VectorType::get(hwVectorSize, FloatType::getF32(&getContext())); - - // Capture terminators; i.e. vector.transfer_write ops involving a strict - // super-vector of subVectorType. - auto filter = [subVectorType](Operation &op) { - if (!isa(op)) { - return false; - } - return matcher::operatesOnSuperVectorsOf(op, subVectorType); - }; - auto pat = Op(filter); - SmallVector matches; - pat.match(f, &matches); - SetVector terminators; - for (auto m : matches) { - terminators.insert(m.getMatchedOperation()); - } - - if (materialize(f, terminators, &state)) - signalPassFailure(); -} - -std::unique_ptr> -mlir::createMaterializeVectorsPass(llvm::ArrayRef vectorSize) { - return std::make_unique(vectorSize); -} - -static PassRegistration - pass("affine-materialize-vectors", - "Materializes super-vectors to vectors of the " - "proper size for the hardware"); - -#undef DEBUG_TYPE From 7609150c701c1ee379ccb9676774a3855ecb2268 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 12:24:58 -0800 Subject: [PATCH 071/383] Create tensorflow/core/framework:numeric_types and tensorflow/core/framework:bfloat16 targets. 
PiperOrigin-RevId: 283809597 Change-Id: Ib29daa2921d2d83f9e4645613c238d734f24aa6e --- tensorflow/core/BUILD | 2 ++ tensorflow/core/framework/BUILD | 44 +++++++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 79d4affa68f..7f042072f49 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2696,6 +2696,8 @@ tf_cuda_library( "@com_google_absl//absl/time", "//third_party/eigen3", "//tensorflow/core/framework:attr_value_proto_text", + "//tensorflow/core/framework:bfloat16", + "//tensorflow/core/framework:numeric_types", "//tensorflow/core/kernels:bounds_check", "//tensorflow/core/platform/default/build_config:platformlib", "//tensorflow/core/profiler/lib:traceme", diff --git a/tensorflow/core/framework/BUILD b/tensorflow/core/framework/BUILD index fd3b3a34ad6..2f975195391 100644 --- a/tensorflow/core/framework/BUILD +++ b/tensorflow/core/framework/BUILD @@ -25,8 +25,6 @@ exports_files( "allocator_registry.h", "attr_value_util.cc", "attr_value_util.h", - "bfloat16.cc", - "bfloat16.h", "bounds_check.h", "cancellation.cc", "cancellation.h", @@ -73,7 +71,6 @@ exports_files( "node_def_util.cc", "node_def_util.h", "numeric_op.h", - "numeric_types.h", "op.cc", "op.h", "op_def_builder.cc", @@ -267,12 +264,13 @@ filegroup( "**/*test*", "**/*main.cc", "allocator.cc", - "cpu_allocator_impl.cc", "allocator_registry.cc", - "tracking_allocator.cc", + "bfloat16.cc", + "cpu_allocator_impl.cc", "fake_input.*", "op_gen_lib.*", "reader_base.*", + "tracking_allocator.cc", ], ), ) @@ -317,6 +315,42 @@ filegroup( ], ) +# Individual targets. These should be prefered over tensorflow/core:framework +# whenever possible. +cc_library( + name = "bfloat16", + srcs = ["bfloat16.cc"], + hdrs = ["bfloat16.h"], + visibility = ["//tensorflow/core:__subpackages__"], + deps = [ + ":numeric_types", + "//tensorflow/core/platform:byte_order", + "//tensorflow/core/platform:types", + ], +) + +cc_library( + name = "numeric_types", + hdrs = ["numeric_types.h"], + visibility = ["//tensorflow/core:__subpackages__"], + deps = [ + "//tensorflow/core/lib/bfloat16", + "//tensorflow/core/platform:types", + "//third_party/eigen3", + ], +) + +# Files whose users still need to be migrated from core:framework to the +# above targets. +# TODO(gonnet): Remove these files once targets depending on them have +# been cleaned up. +exports_files( + srcs = [ + "bfloat16.h", + "numeric_types.h", + ], +) + # All framewrok protos are self-contained, i.e. they only import other # protos from the same package, so we can build the protos here and then # link them from core:protos_all without circular dependencies. From 269ebca78c6e57eee11de6017e8c2765962d9a54 Mon Sep 17 00:00:00 2001 From: River Riddle Date: Wed, 4 Dec 2019 12:31:36 -0800 Subject: [PATCH 072/383] [CSE] NFC: Hash the attribute dictionary pointer instead of the list of attributes. PiperOrigin-RevId: 283810829 Change-Id: I4964113a1293e34e0b3859a75ac33cb9bd74e2d9 --- third_party/mlir/include/mlir/IR/Attributes.h | 7 +++++++ third_party/mlir/lib/Transforms/CSE.cpp | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/third_party/mlir/include/mlir/IR/Attributes.h b/third_party/mlir/include/mlir/IR/Attributes.h index b5b9a0491f4..ebff99ddcd5 100644 --- a/third_party/mlir/include/mlir/IR/Attributes.h +++ b/third_party/mlir/include/mlir/IR/Attributes.h @@ -1373,6 +1373,13 @@ public: : attrs((attrs && !attrs.empty()) ? 
attrs : nullptr) {} NamedAttributeList(ArrayRef attributes); + bool operator!=(const NamedAttributeList &other) const { + return !(*this == other); + } + bool operator==(const NamedAttributeList &other) const { + return attrs == other.attrs; + } + /// Return the underlying dictionary attribute. This may be null, if this list /// has no attributes. DictionaryAttr getDictionary() const { return attrs; } diff --git a/third_party/mlir/lib/Transforms/CSE.cpp b/third_party/mlir/lib/Transforms/CSE.cpp index c3d30bf65a1..70eb69c2f9c 100644 --- a/third_party/mlir/lib/Transforms/CSE.cpp +++ b/third_party/mlir/lib/Transforms/CSE.cpp @@ -47,7 +47,7 @@ struct SimpleOperationInfo : public llvm::DenseMapInfo { // - Result Types // - Operands return hash_combine( - op->getName(), op->getAttrs(), + op->getName(), op->getAttrList().getDictionary(), hash_combine_range(op->result_type_begin(), op->result_type_end()), hash_combine_range(op->operand_begin(), op->operand_end())); } @@ -68,7 +68,7 @@ struct SimpleOperationInfo : public llvm::DenseMapInfo { lhs->getNumResults() != rhs->getNumResults()) return false; // Compare attributes. - if (lhs->getAttrs() != rhs->getAttrs()) + if (lhs->getAttrList() != rhs->getAttrList()) return false; // Compare operands. if (!std::equal(lhs->operand_begin(), lhs->operand_end(), From ecb57cfd7abf428f6ddb5f0453bbf05560132c9c Mon Sep 17 00:00:00 2001 From: Tong Shen Date: Wed, 4 Dec 2019 12:44:40 -0800 Subject: [PATCH 073/383] Add debug information for _Arg nodes. PiperOrigin-RevId: 283813323 Change-Id: I6696b29d6f4fb56af72fde91ee59d5ba6924e0a4 --- tensorflow/compiler/tf2xla/xla_compiler.cc | 7 ++++++- tensorflow/compiler/tf2xla/xla_compiler.h | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index 9d10be1d90a..b9801e7d15a 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -1059,7 +1059,12 @@ Status XlaCompiler::BuildArguments( const XlaCompiler::Argument& arg = args[input_to_args->at(i)]; VLOG(2) << " XLA arg " << i << " shape: " << xla::ShapeUtil::HumanString(arg_shapes[i]) - << " name: " << arg.name << " TF arg " << input_to_args->at(i); + << " name: " << arg.name << " TF arg " << input_to_args->at(i) + << " node name: " << arg.node_name + << (arg_shardings.find(i) == arg_shardings.end() + ? "" + : absl::StrCat(" sharding: ", + arg_shardings.at(i).DebugString())); XlaExpression& arg_expression = (*arg_expressions)[input_to_args->at(i)]; switch (arg.kind) { case XlaCompiler::Argument::kResource: { diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index c3e9b3edeca..670da043c1a 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -147,6 +147,9 @@ class XlaCompiler { // The name of this argument, used for debugging. string name; + // The name of TensorFlow _Arg node, used for debugging. + string node_name; + // For a kResource, what kind of resource is it? XlaResource::Kind resource_kind = XlaResource::kInvalid; From e12ad06b215ab66cb9abf679ea4ed5ca16be41ed Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 4 Dec 2019 12:55:36 -0800 Subject: [PATCH 074/383] [MLIR:TF/XLA] Handle nested regions in side-effect analysis. 
PiperOrigin-RevId: 283815697 Change-Id: I41c59f6a6f3a455e639f4413b7c3300bc0745cc5 --- .../analysis/side_effect_analysis.cc | 174 +++++++++++------- .../analysis/side_effect_analysis.h | 35 +++- .../tests/side-effect-analysis-test.mlir | 83 ++++++--- 3 files changed, 198 insertions(+), 94 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index 8d43c9330d0..898393479b0 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -34,6 +34,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/Support/LLVM.h" // TF:local_config_mlir #include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/tf2xla/resource_operation_table.h" @@ -99,12 +100,13 @@ void ResourceAliasAnalysis::AnalyzeFunction(FuncOp func_op) { auto forward_input_to_output = [&](Value* operand, Value* result) { if (!mlir::getElementTypeOrSelf(result->getType()).isa()) return; + auto& result_ids = resource_value_to_ids_[result]; auto operand_it = resource_value_to_ids_.find(operand); assert(operand_it != resource_value_to_ids_.end() && "A resource-type output does not have the corresponding " "resource-type input."); - resource_value_to_ids_[result].insert(operand_it->getSecond().begin(), - operand_it->getSecond().end()); + result_ids.insert(operand_it->getSecond().begin(), + operand_it->getSecond().end()); }; // TODO(yuanzx): Consider control-flow ops. func_op.walk([&](Operation* op) { @@ -119,6 +121,16 @@ void ResourceAliasAnalysis::AnalyzeFunction(FuncOp func_op) { forward_input_to_output(std::get<0>(operand_and_result), std::get<1>(operand_and_result)); } + } else if (auto replicate = llvm::dyn_cast(op)) { + // The nested block for RepliateOp is handled separately in side-effect + // analysis. Inside that block, we can still treat its block arguments as + // different resources. + for (auto arg : replicate.GetBody().getArguments()) { + if (mlir::getElementTypeOrSelf(arg->getType()) + .isa()) { + resource_value_to_ids_[arg].insert(next_unique_id++); + } + } } else { for (auto result : op->getResults()) { if (!mlir::getElementTypeOrSelf(result->getType()) @@ -261,9 +273,36 @@ void SideEffectAnalysis::AddPredecessorsForAccess(int64_t resource_id, void SideEffectAnalysis::AnalyzeFunction( FuncOp func_op, const ResourceAliasAnalysis& alias_analysis) { - // This function populates control_predecessors_ and control_successors_ by - // walking through func_op's body, and tracking resource accesses in - // per_resource_access_info_. + // AnalyzeRegion() recursively analyzes the function body, and only populates + // control_predecessors_. + AnalyzeRegion(&func_op.getBody(), alias_analysis); + // Populate sorted_control_predecessors_ and sorted_control_successors_ based + // on control_predecessors. 
+ for (auto& entry : control_predecessors_) { + auto op = entry.getFirst(); + auto& sorted_predecessors = sorted_control_predecessors_[op]; + for (auto predecessor : entry.getSecond()) { + sorted_predecessors.push_back(predecessor); + sorted_control_successors_[predecessor].push_back(op); + } + } + control_predecessors_.clear(); + for (auto& entry : sorted_control_predecessors_) { + llvm::sort(entry.getSecond(), [](Operation* a, Operation* b) { + return a->isBeforeInBlock(b); + }); + } + for (auto& entry : sorted_control_successors_) { + llvm::sort(entry.getSecond(), [](Operation* a, Operation* b) { + return a->isBeforeInBlock(b); + }); + } +} + +void SideEffectAnalysis::AnalyzeRegion( + Region* region, const ResourceAliasAnalysis& alias_analysis) { + // This function populates control_predecessors_ by walking through the + // region, and tracking resource accesses in per_resource_access_info_. // Returns whether an access to `resource` can skip control edges from // prevoius accesses to unknown resources, due to that earlier accesses to @@ -284,82 +323,93 @@ void SideEffectAnalysis::AnalyzeFunction( (it->second.tracked_last_unknown_read || no_unknown_read); }; - func_op.walk([&](Operation* op) { - // We do not need explicit control edges for declaration ops. - if (OpIsDeclaration(op, alias_analysis)) return; - - auto resource_op_info = GetResourceInfoForOp(op); - if (!resource_op_info && op->hasNoSideEffect()) return; - - llvm::SmallDenseSet resources = - resource_op_info ? FindAccessedResources(op, alias_analysis) - : UnknownResourceSet(); - assert(!resources.empty()); - const bool is_unknown = resources.count(kUnknownResourceId) > 0; - const bool read_only = OpIsReadOnly(op); - bool indirectly_tracked_unknown_access = false; - // First add edges from known resources. - if (is_unknown) { - for (auto& entry : per_resource_access_info_) { - if (entry.getFirst() == kUnknownResourceId) continue; - AddPredecessorsForAccess(entry.getFirst(), op, read_only); - indirectly_tracked_unknown_access |= - unknown_access_indirectly_tracked_by_resource(entry.getFirst(), - read_only); + // We explicitly iterates through the regions and blocks, in order to handle + // different nested regions separately. + for (auto& block : *region) { + for (auto& op : block) { + if (op.getNumRegions() > 0) { + llvm::SmallVector child_analyses; + for (auto& child_region : op.getRegions()) { + child_analyses.emplace_back(); + child_analyses.back().AnalyzeRegion(&child_region, alias_analysis); + } + ConsumeChildAnalyses(std::move(child_analyses)); } - } else { - for (int64_t resource : resources) { - AddPredecessorsForAccess(resource, op, read_only); - indirectly_tracked_unknown_access |= - unknown_access_indirectly_tracked_by_resource(resource, read_only); - // Update access info for known resources. - TrackAccess(resource, op, read_only); - } - } - // If not indirectly tracked, add edges from the unknown resource. - if (!indirectly_tracked_unknown_access) { - AddPredecessorsForAccess(kUnknownResourceId, op, read_only); - } - if (is_unknown) { - // Update access info for unknown resource. - TrackAccess(kUnknownResourceId, op, read_only); - } - }); - // Populate control_successors_ based on control_predecessors_. - for (auto& entry : control_predecessors_) { - auto op = entry.getFirst(); - for (auto predecessor : entry.getSecond()) { - control_successors_[predecessor].insert(op); + // We do not need explicit control edges for declaration ops. 
+ if (OpIsDeclaration(&op, alias_analysis)) continue; + + auto resource_op_info = GetResourceInfoForOp(&op); + if (!resource_op_info && op.hasNoSideEffect()) continue; + + llvm::SmallDenseSet resources = + resource_op_info ? FindAccessedResources(&op, alias_analysis) + : UnknownResourceSet(); + assert(!resources.empty()); + const bool is_unknown = resources.count(kUnknownResourceId) > 0; + const bool read_only = OpIsReadOnly(&op); + bool indirectly_tracked_unknown_access = false; + // First add edges from known resources. + if (is_unknown) { + for (auto& entry : per_resource_access_info_) { + if (entry.getFirst() == kUnknownResourceId) continue; + AddPredecessorsForAccess(entry.getFirst(), &op, read_only); + indirectly_tracked_unknown_access |= + unknown_access_indirectly_tracked_by_resource(entry.getFirst(), + read_only); + } + } else { + for (int64_t resource : resources) { + AddPredecessorsForAccess(resource, &op, read_only); + indirectly_tracked_unknown_access |= + unknown_access_indirectly_tracked_by_resource(resource, + read_only); + // Update access info for known resources. + TrackAccess(resource, &op, read_only); + } + } + // If not indirectly tracked, add edges from the unknown resource. + if (!indirectly_tracked_unknown_access) { + AddPredecessorsForAccess(kUnknownResourceId, &op, read_only); + } + if (is_unknown) { + // Update access info for unknown resource. + TrackAccess(kUnknownResourceId, &op, read_only); + } } } } -llvm::SmallVector SideEffectAnalysis::DirectControlPredecessors( +void SideEffectAnalysis::ConsumeChildAnalyses( + llvm::SmallVector&& children) { + for (auto& child : children) { + for (auto& entry : child.control_predecessors_) { + control_predecessors_[entry.getFirst()] = std::move(entry.getSecond()); + } + } +} + +llvm::SmallVector SideEffectAnalysis::DirectControlPredecessors( Operation* op, llvm::function_ref filter) const { - llvm::SmallVector result; - auto it = control_predecessors_.find(op); - if (it == control_predecessors_.end()) return result; + llvm::SmallVector result; + auto it = sorted_control_predecessors_.find(op); + if (it == sorted_control_predecessors_.end()) return result; result.reserve(it->getSecond().size()); for (auto predecessor : it->getSecond()) { if (!filter || filter(predecessor)) result.push_back(predecessor); } - llvm::sort(result, - [](Operation* a, Operation* b) { return a->isBeforeInBlock(b); }); return result; } -llvm::SmallVector SideEffectAnalysis::DirectControlSuccessors( +llvm::SmallVector SideEffectAnalysis::DirectControlSuccessors( Operation* op, llvm::function_ref filter) const { - llvm::SmallVector result; - auto it = control_successors_.find(op); - if (it == control_successors_.end()) return result; + llvm::SmallVector result; + auto it = sorted_control_successors_.find(op); + if (it == sorted_control_successors_.end()) return result; result.reserve(it->getSecond().size()); for (auto successor : it->getSecond()) { if (!filter || filter(successor)) result.push_back(successor); } - llvm::sort(result, - [](Operation* a, Operation* b) { return a->isBeforeInBlock(b); }); return result; } diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h index 5eee28a6ae0..3d65217db27 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h @@ -32,6 +32,9 @@ namespace TF { // An analysis that runs on a function and maps each resource-type 
value to a // set of unique int64_t IDs representing the possible resources it could alias. +// +// If there are nested regions, each region is handled separately. This means +// cross-region aliasing cannot be checked by this analysis. class ResourceAliasAnalysis { public: explicit ResourceAliasAnalysis(Operation* op); @@ -63,8 +66,12 @@ class ResourceAliasAnalysis { // interfering with all known resource op accesses. It distinguishes accesses // based on whether they are read-only, and read-only ops do not interfer with // each other. +// +// If there are nested regions, each region is handled separately, and control +// dependencies are only tracked for ops under the same parent op. class SideEffectAnalysis { public: + explicit SideEffectAnalysis() = default; explicit SideEffectAnalysis(Operation* op); SideEffectAnalysis(SideEffectAnalysis&& other) = default; ~SideEffectAnalysis() = default; @@ -72,23 +79,32 @@ class SideEffectAnalysis { // Returns a vector of ops that are direct control predecessors of `op`, // sorted in program order. If `filter` is provided, only predecessors that // pass the filter (returning true) will be included. - llvm::SmallVector DirectControlPredecessors( + llvm::SmallVector DirectControlPredecessors( Operation* op, llvm::function_ref filter = nullptr) const; // Returns a vector of ops that are direct control successors of `op`, sorted // in program order. If `filter` is provided, only successors that pass the // filter (returning true) will be included. - llvm::SmallVector DirectControlSuccessors( + llvm::SmallVector DirectControlSuccessors( Operation* op, llvm::function_ref filter = nullptr) const; private: - // Runs the analysis on `func_op` and populates control_predecessors_ and - // control_successors_. + // Runs the analysis on `func_op` and populates sorted_control_predecessors_ + // and sorted_control_successors_. void AnalyzeFunction(FuncOp func_op, const ResourceAliasAnalysis& alias_analysis); + // Runs the analysis on `region` and populates control_predecessors_. + void AnalyzeRegion(Region* region, + const ResourceAliasAnalysis& alias_analysis); + + // Moves the control_predecessors_ fields in `children` analyses to this + // current analysis. + void ConsumeChildAnalyses( + llvm::SmallVector&& children); + // Updates control_predecessors_ for `op` that is being visted, on the given // `resource_id`. void AddPredecessorsForAccess(int64_t resource_id, Operation* op, @@ -98,11 +114,14 @@ class SideEffectAnalysis { void TrackAccess(int64_t resource_id, Operation* op, bool read_only); // Maps from an op to its control predecessors. - llvm::SmallDenseMap, 8> + llvm::SmallDenseMap, 8> control_predecessors_; - // Maps from an op to its control successors. - llvm::SmallDenseMap, 8> - control_successors_; + // Maps from an op to its control predecessors sorted in program order. + llvm::SmallDenseMap, 8> + sorted_control_predecessors_; + // Maps from an op to its control successors sorted in program order. + llvm::SmallDenseMap, 8> + sorted_control_successors_; // Internal per-resource data structure when we build the dependencies. 
struct PerResourceAcessInfo { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir index c6eb4663e57..678c2373a1b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir @@ -6,18 +6,15 @@ // CHECK-LABEL: func @non_aliasing_reads_writes func @non_aliasing_reads_writes( // expected-remark@above {{ID: 13}} -// expected-remark@above {{Predecessors: {12}}} %arg0: tensor<*x!tf.resource>>, %arg1: tensor<*x!tf.resource>>, %arg2: tensor<32xf32>) -> (tensor<32xf32>) { %graph = tf_executor.graph { // expected-remark@above {{ID: 11}} - // expected-remark@above {{Predecessors: {10}}} // expected-remark@above {{Successors: {12}}} // CHECK: tf_executor.island %island:2 = tf_executor.island { // expected-remark@above {{ID: 9}} - // expected-remark@above {{Predecessors: {8}}} // expected-remark@above {{Successors: {10}}} %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> // expected-remark@above {{ID: 0}} @@ -49,17 +46,14 @@ func @non_aliasing_reads_writes( tf_executor.yield %read3 : tensor<32xf32> // expected-remark@above {{ID: 8}} // expected-remark@above {{Predecessors: {4,5,7}}} - // expected-remark@above {{Successors: {9}}} } tf_executor.fetch %island#0 : tensor<32xf32> // expected-remark@above {{ID: 10}} // expected-remark@above {{Predecessors: {9}}} - // expected-remark@above {{Successors: {11}}} } return %graph : tensor<32xf32> // expected-remark@above {{ID: 12}} // expected-remark@above {{Predecessors: {11}}} - // expected-remark@above {{Successors: {13}}} } // ----- @@ -70,15 +64,12 @@ func @non_aliasing_reads_writes( // CHECK-LABEL: func @aliasing_reads_writes func @aliasing_reads_writes(%arg0: tensor<32xf32>) -> () { // expected-remark@above {{ID: 14}} -// expected-remark@above {{Predecessors: {13}}} tf_executor.graph { // expected-remark@above {{ID: 12}} - // expected-remark@above {{Predecessors: {11}}} // expected-remark@above {{Successors: {13}}} // CHECK: tf_executor.island %island = tf_executor.island { // expected-remark@above {{ID: 10}} - // expected-remark@above {{Predecessors: {9}}} // expected-remark@above {{Successors: {11}}} %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> tensor<*x!tf.resource>> // expected-remark@above {{ID: 0}} @@ -112,17 +103,14 @@ func @aliasing_reads_writes(%arg0: tensor<32xf32>) -> () { tf_executor.yield // expected-remark@above {{ID: 9}} // expected-remark@above {{Predecessors: {8}}} - // expected-remark@above {{Successors: {10}}} } tf_executor.fetch %island : !tf_executor.control // expected-remark@above {{ID: 11}} // expected-remark@above {{Predecessors: {10}}} - // expected-remark@above {{Successors: {12}}} } return // expected-remark@above {{ID: 13}} // expected-remark@above {{Predecessors: {12}}} - // expected-remark@above {{Successors: {14}}} } // ----- @@ -133,15 +121,12 @@ func @aliasing_reads_writes(%arg0: tensor<32xf32>) -> () { // CHECK-LABEL: func @unknown_side_effecting_op func @unknown_side_effecting_op(%arg0: tensor<32xf32>) -> () { // expected-remark@above {{ID: 13}} -// expected-remark@above {{Predecessors: {12}}} tf_executor.graph { // expected-remark@above {{ID: 11}} - // expected-remark@above {{Predecessors: {10}}} // expected-remark@above {{Successors: {12}}} // CHECK: tf_executor.island %island = tf_executor.island { // expected-remark@above {{ID: 9}} - // 
expected-remark@above {{Predecessors: {8}}} // expected-remark@above {{Successors: {10}}} %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> tensor<*x!tf.resource>> // expected-remark@above {{ID: 0}} @@ -172,17 +157,14 @@ func @unknown_side_effecting_op(%arg0: tensor<32xf32>) -> () { tf_executor.yield // expected-remark@above {{ID: 8}} // expected-remark@above {{Predecessors: {6,7}}} - // expected-remark@above {{Successors: {9}}} } tf_executor.fetch %island : !tf_executor.control // expected-remark@above {{ID: 10}} // expected-remark@above {{Predecessors: {9}}} - // expected-remark@above {{Successors: {11}}} } return // expected-remark@above {{ID: 12}} // expected-remark@above {{Predecessors: {11}}} - // expected-remark@above {{Successors: {13}}} } // ----- @@ -193,15 +175,12 @@ func @unknown_side_effecting_op(%arg0: tensor<32xf32>) -> () { // CHECK-LABEL: func @read_only_unknown_resource func @read_only_unknown_resource(%arg0: tensor<32xf32>) -> () { // expected-remark@above {{ID: 10}} -// expected-remark@above {{Predecessors: {9}}} tf_executor.graph { // expected-remark@above {{ID: 8}} - // expected-remark@above {{Predecessors: {7}}} // expected-remark@above {{Successors: {9}}} // CHECK: tf_executor.island %island = tf_executor.island { // expected-remark@above {{ID: 6}} - // expected-remark@above {{Predecessors: {5}}} // expected-remark@above {{Successors: {7}}} %vh0 = "tf._UnknownSideEffectingOp_"() : () -> tensor<*x!tf.resource>> // expected-remark@above {{ID: 0}} @@ -223,15 +202,71 @@ func @read_only_unknown_resource(%arg0: tensor<32xf32>) -> () { tf_executor.yield // expected-remark@above {{ID: 5}} // expected-remark@above {{Predecessors: {4}}} - // expected-remark@above {{Successors: {6}}} } tf_executor.fetch %island : !tf_executor.control // expected-remark@above {{ID: 7}} // expected-remark@above {{Predecessors: {6}}} - // expected-remark@above {{Successors: {8}}} } return // expected-remark@above {{ID: 9}} // expected-remark@above {{Predecessors: {8}}} - // expected-remark@above {{Successors: {10}}} +} + +// ----- + +// Tests that the pass adds control dependencies in nested regions with +// tf_device.replicate + +func @with_replicate( + // expected-remark@above {{ID: 12}} + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor<*x!tf.resource>>, + %arg2: tensor<*x!tf.resource>>, + %arg3: tensor<*x!tf.resource>>) { + tf_executor.graph { + // expected-remark@above {{ID: 10}} + // expected-remark@above {{Successors: {11}}} + %island = tf_executor.island { + // expected-remark@above {{ID: 8}} + // expected-remark@above {{Successors: {9}}} + %u0:2 = "tf._UnknownSideEffectingOp_"() : () -> (tensor<32xf32>, tensor<32xf32>) + // expected-remark@above {{ID: 0}} + // expected-remark@above {{Successors: {5}}} + tf_device.replicate( + // expected-remark@above {{ID: 5}} + // expected-remark@above {{Predecessors: {0}}} + // expected-remark@above {{Successors: {6}}} + [%arg0, %arg1] as %r0: tensor<*x!tf.resource>>, + [%arg2, %arg3] as %r1: tensor<*x!tf.resource>>, + [%u0#0, %u0#1] as %u : tensor<32xf32>) + {n = 2 : i32, devices = ["/CPU:0", "/GPU:1"]} { + %read0 = "tf.ReadVariableOp"(%r0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + // expected-remark@above {{ID: 1}} + // expected-remark@above {{Successors: {4}}} + "tf.AssignVariableOp"(%r1, %u) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + // expected-remark@above {{ID: 2}} + // expected-remark@above {{Successors: {3}}} + %read1 = "tf.ReadVariableOp"(%r1) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + // 
expected-remark@above {{ID: 3}} + // expected-remark@above {{Predecessors: {2}}} + // expected-remark@above {{Successors: {4}}} + tf_device.return + // expected-remark@above {{ID: 4}} + // expected-remark@above {{Predecessors: {1,3}}} + } + "tf._UnknownSideEffectingOp_"() : () -> () + // expected-remark@above {{ID: 6}} + // expected-remark@above {{Predecessors: {5}}} + // expected-remark@above {{Successors: {7}}} + tf_executor.yield + // expected-remark@above {{ID: 7}} + // expected-remark@above {{Predecessors: {6}}} + } + tf_executor.fetch %island : !tf_executor.control + // expected-remark@above {{ID: 9}} + // expected-remark@above {{Predecessors: {8}}} + } + return + // expected-remark@above {{ID: 11}} + // expected-remark@above {{Predecessors: {10}}} } From 68600d92a9c17b2b1d4f7e358d10dcc75cf76d6c Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 12:59:49 -0800 Subject: [PATCH 075/383] Add memory kinds to memcpy details in device events. PiperOrigin-RevId: 283816667 Change-Id: I3ab241fca42460c5b6e363bd4789bbb4fb9e9262 --- tensorflow/core/profiler/internal/gpu/cupti_tracer.cc | 5 +++++ tensorflow/core/profiler/internal/gpu/cupti_tracer.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index 34d6427cd4a..17b4362eca2 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -473,6 +473,8 @@ void AddMemcpyActivityEvent(CuptiTraceCollector *collector, event.memcpy_info.num_bytes = memcpy->bytes; event.memcpy_info.destination = memcpy->deviceId; event.memcpy_info.async = memcpy->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC; + event.memcpy_info.src_mem_kind = memcpy->srcKind; + event.memcpy_info.dst_mem_kind = memcpy->dstKind; collector->AddEvent(std::move(event)); } @@ -495,6 +497,8 @@ void AddMemcpy2ActivityEvent(CuptiTraceCollector *collector, event.memcpy_info.num_bytes = memcpy2->bytes; event.memcpy_info.destination = memcpy2->dstDeviceId; event.memcpy_info.async = memcpy2->flags & CUPTI_ACTIVITY_FLAG_MEMCPY_ASYNC; + event.memcpy_info.src_mem_kind = memcpy2->srcKind; + event.memcpy_info.dst_mem_kind = memcpy2->dstKind; collector->AddEvent(std::move(event)); } @@ -951,6 +955,7 @@ class CudaEventRecorder { event.memcpy_info.destination = ordinal_; // TODO: support differentiate sync and async memcpy. event.memcpy_info.async = false; + // TODO: set src_mem_kind and dst_mem_kind. collector_->AddEvent(std::move(event)); return Status::OK(); } diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h index bcfe1c27d38..12a61af1fa5 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.h +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.h @@ -41,6 +41,10 @@ struct MemcpyDetails { // This contains CUpti_ActivityMemcpyKind for activity event (on device). // For events from other CuptiTracerEventSource, it is always 0. int8 kind; + // CUpti_ActivityMemoryKind of source. + int8 src_mem_kind; + // CUpti_ActivityMemoryKind of destination. + int8 dst_mem_kind; }; struct MemAllocDetails { From c293fc8040144dd0efbc0f70e483550125fb146e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 13:00:14 -0800 Subject: [PATCH 076/383] Add canonicalization patterns for vector CreateMaskOp and StridedSliceOp to be used in the unroll vector op transformation. 
Adds a ConstantMaskOp to the vector ops dialect. Adds the following canonicalization patterns: CreateMaskOp -> ConstantMaskOp StridedSliceOp(ConstantMaskOp) -> ConstantMaskOp PiperOrigin-RevId: 283816752 Change-Id: I489b26e9fc64b0f1d71634bf865bf36146d4366f --- third_party/mlir/BUILD | 1 + .../mlir/Dialect/VectorOps/VectorOps.td | 41 ++++- .../mlir/lib/Dialect/VectorOps/VectorOps.cpp | 162 +++++++++++++++++- 3 files changed, 198 insertions(+), 6 deletions(-) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 62343699e1e..b298b0d897f 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -484,6 +484,7 @@ cc_library( deps = [ ":EDSC", ":IR", + ":StandardOps", ":Support", ":VectorOpsIncGen", ":VectorTransformPatterns", diff --git a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td index 36c26fe577f..f4bfeb73dd7 100644 --- a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td +++ b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td @@ -402,6 +402,7 @@ def Vector_StridedSliceOp : static StringRef getStridesAttrName() { return "strides"; } VectorType getVectorType(){ return vector()->getType().cast(); } }]; + let hasCanonicalizer = 1; } def Vector_TransferReadOp : @@ -639,7 +640,41 @@ def Vector_TypeCastOp : }]; } -// TODO(andydavis) Add constant folding support. +def Vector_ConstantMaskOp : + Vector_Op<"constant_mask", [NoSideEffect]>, + Arguments<(ins I64ArrayAttr:$mask_dim_sizes)>, + Results<(outs VectorOf<[I1]>)> { + let summary = "creates a constant vector mask"; + let description = [{ + Creates and returns a vector mask where elements of the result vector + are set to '0' or '1', based on whether the element indices are contained + within a hyper-rectangular region specified by the 'mask_dim_sizes' + array attribute argument. Each element of the 'mask_dim_sizes' array, + specifices an exclusive upper bound [0, mask-dim-size-element-value) + for a unique dimension in the vector result. The conjunction of the ranges + define a hyper-rectangular region within which elements values are set to 1 + (otherwise element values are set to 0). + + Example: create a constant vector mask of size 4x3xi1 with elements in range + 0 <= row <= 2 and 0 <= col <= 1 are set to 1 (others to 0). + + %1 = vector.constant_mask [3, 2] : vector<4x3xi1> + + print %1 + columns + 0 1 2 + |------------ + 0 | 1 1 0 + rows 1 | 1 1 0 + 2 | 1 1 0 + 3 | 0 0 0 + }]; + + let extraClassDeclaration = [{ + static StringRef getMaskDimSizesAttrName() { return "mask_dim_sizes"; } + }]; +} + def Vector_CreateMaskOp : Vector_Op<"create_mask", [NoSideEffect]>, Arguments<(ins Variadic:$operands)>, Results<(outs VectorOf<[I1]>)> { @@ -649,7 +684,7 @@ def Vector_CreateMaskOp : are set to '0' or '1', based on whether the element indices are contained within a hyper-rectangular region specified by the operands. Specifically, each operand specifies a range [0, operand-value) for a unique dimension in - the vector result. The conjunction of the operand ranges define + the vector result. The conjunction of the operand ranges define a hyper-rectangular region within which elements values are set to 1 (otherwise element values are set to 0). 
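For readers unfamiliar with the mask semantics described above, the following is a minimal standalone C++ sketch of the documented behaviour; it assumes only the prose description, not the actual dialect implementation. Element (row, col) of the result is 1 exactly when both indices fall inside the hyper-rectangle given by the mask dimension sizes.

#include <cstdint>
#include <iostream>
#include <vector>

// Builds the rank-2 mask described above: element (i, j) is 1 iff
// i < maskDims[0] and j < maskDims[1], i.e. the element lies inside the
// hyper-rectangular region, and 0 otherwise.
std::vector<std::vector<int8_t>> buildConstantMask(
    const std::vector<int64_t> &shape, const std::vector<int64_t> &maskDims) {
  std::vector<std::vector<int8_t>> mask(shape[0],
                                        std::vector<int8_t>(shape[1], 0));
  for (int64_t i = 0; i < shape[0]; ++i)
    for (int64_t j = 0; j < shape[1]; ++j)
      mask[i][j] = (i < maskDims[0] && j < maskDims[1]) ? 1 : 0;
  return mask;
}

int main() {
  // Mirrors the documented example: vector.constant_mask [3, 2] : vector<4x3xi1>.
  for (const auto &row : buildConstantMask({4, 3}, {3, 2})) {
    for (int8_t v : row) std::cout << int(v) << ' ';
    std::cout << '\n';  // Prints: 1 1 0 / 1 1 0 / 1 1 0 / 0 0 0
  }
  return 0;
}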
@@ -667,6 +702,8 @@ def Vector_CreateMaskOp : 2 | 1 1 0 3 | 0 0 0 }]; + + let hasCanonicalizer = 1; } // TODO(andydavis) Delete this op once ContractOp is converted to use VectorMask diff --git a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp index ab457a6b833..f96d3bacacf 100644 --- a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp +++ b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp @@ -21,10 +21,12 @@ //===----------------------------------------------------------------------===// #include "mlir/Dialect/VectorOps/VectorOps.h" +#include "mlir/Dialect/StandardOps/Ops.h" #include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" #include "mlir/IR/OpImplementation.h" +#include "mlir/IR/PatternMatch.h" #include "mlir/IR/TypeUtilities.h" #include "mlir/Support/Functional.h" #include "mlir/Support/LLVM.h" @@ -342,8 +344,9 @@ static Type inferExtractElementOpResultType(VectorType vectorType, vectorType.getElementType()); } -void ExtractElementOp::build(Builder *builder, OperationState &result, - Value *source, ArrayRef position) { +void vector::ExtractElementOp::build(Builder *builder, OperationState &result, + Value *source, + ArrayRef position) { result.addOperands(source); auto positionAttr = builder->getI32ArrayAttr(position); result.addTypes(inferExtractElementOpResultType( @@ -351,7 +354,7 @@ void ExtractElementOp::build(Builder *builder, OperationState &result, result.addAttribute(getPositionAttrName(), positionAttr); } -static void print(OpAsmPrinter &p, ExtractElementOp op) { +static void print(OpAsmPrinter &p, vector::ExtractElementOp op) { p << op.getOperationName() << " " << *op.vector() << op.position(); p.printOptionalAttrDict(op.getAttrs(), {"position"}); p << " : " << op.vector()->getType(); @@ -387,7 +390,7 @@ static ParseResult parseExtractElementOp(OpAsmParser &parser, parser.addTypeToList(resType, result.types)); } -static LogicalResult verify(ExtractElementOp op) { +static LogicalResult verify(vector::ExtractElementOp op) { auto positionAttr = op.position().getValue(); if (positionAttr.empty()) return op.emitOpError("expected non-empty position attribute"); @@ -841,6 +844,74 @@ static LogicalResult verify(StridedSliceOp op) { return success(); } +namespace { + +static void populateFromInt64AttrArray(ArrayAttr arrayAttr, + SmallVectorImpl &results) { + for (auto attr : arrayAttr) + results.push_back(attr.cast().getInt()); +} + +// Pattern to rewrite a StridedSliceOp(ConstantMaskOp) -> ConstantMaskOp. +class StridedSliceConstantMaskFolder final + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(StridedSliceOp stridedSliceOp, + PatternRewriter &rewriter) const override { + // Return if 'stridedSliceOp' operand is not defined by a ConstantMaskOp. + auto defOp = stridedSliceOp.vector()->getDefiningOp(); + auto constantMaskOp = dyn_cast_or_null(defOp); + if (!constantMaskOp) + return matchFailure(); + // Return if 'stridedSliceOp' has non-unit strides. + if (llvm::any_of(stridedSliceOp.strides(), [](Attribute attr) { + return attr.cast().getInt() != 1; + })) + return matchFailure(); + // Gather constant mask dimension sizes. + SmallVector maskDimSizes; + populateFromInt64AttrArray(constantMaskOp.mask_dim_sizes(), maskDimSizes); + // Gather strided slice offsets and sizes. 
+ SmallVector sliceOffsets; + populateFromInt64AttrArray(stridedSliceOp.offsets(), sliceOffsets); + SmallVector sliceSizes; + populateFromInt64AttrArray(stridedSliceOp.sizes(), sliceSizes); + + // Compute slice of vector mask region. + SmallVector sliceMaskDimSizes; + assert(sliceOffsets.size() == maskDimSizes.size()); + for (const auto &it : llvm::zip(maskDimSizes, sliceOffsets, sliceSizes)) { + int64_t maskDimSize = std::get<0>(it); + int64_t sliceOffset = std::get<1>(it); + int64_t sliceSize = std::get<2>(it); + int64_t sliceMaskDimSize = std::max( + static_cast(0), + std::min(sliceOffset + sliceSize, maskDimSize) - sliceOffset); + sliceMaskDimSizes.push_back(sliceMaskDimSize); + } + // If any of 'sliceMaskDimSizes' are zero, then set all to zero (masked + // region is a conjunction of mask dim intervals). + if (llvm::any_of(sliceMaskDimSizes, [](int64_t sz) { return sz == 0; })) + sliceMaskDimSizes.assign(maskDimSizes.size(), 0); + + // Replace 'stridedSliceOp' with ConstantMaskOp with sliced mask region. + rewriter.replaceOpWithNewOp( + stridedSliceOp, stridedSliceOp.getResult()->getType(), + rewriter.getI64ArrayAttr(sliceMaskDimSizes)); + return matchSuccess(); + } +}; + +} // end anonymous namespace + +void StridedSliceOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + // Pattern to rewrite a StridedSliceOp(ConstantMaskOp) -> ConstantMaskOp. + results.insert(context); +} + //===----------------------------------------------------------------------===// // TransferReadOp //===----------------------------------------------------------------------===// @@ -1033,6 +1104,53 @@ static LogicalResult verify(TypeCastOp &op) { return success(); } +//===----------------------------------------------------------------------===// +// ConstantMaskOp +//===----------------------------------------------------------------------===// + +ParseResult parseConstantMaskOp(OpAsmParser &parser, OperationState &result) { + Type resultType; + ArrayAttr maskDimSizesAttr; + StringRef attrName = ConstantMaskOp::getMaskDimSizesAttrName(); + return failure( + parser.parseOptionalAttrDict(result.attributes) || + parser.parseAttribute(maskDimSizesAttr, attrName, result.attributes) || + parser.parseColonType(resultType) || + parser.addTypeToList(resultType, result.types)); +} + +static void print(OpAsmPrinter &p, ConstantMaskOp &op) { + p << op.getOperationName() << ' ' << op.mask_dim_sizes(); + p << " : " << op.getResult()->getType(); +} + +static LogicalResult verify(ConstantMaskOp &op) { + // Verify that array attr size matches the rank of the vector result. + auto resultType = op.getResult()->getType().cast(); + if (static_cast(op.mask_dim_sizes().size()) != resultType.getRank()) + return op.emitOpError( + "must specify array attr of size equal vector result rank"); + // Verify that each array attr element is in bounds of corresponding vector + // result dimension size. + auto resultShape = resultType.getShape(); + SmallVector maskDimSizes; + for (auto it : llvm::enumerate(op.mask_dim_sizes())) { + int64_t attrValue = it.value().cast().getInt(); + if (attrValue < 0 || attrValue > resultShape[it.index()]) + return op.emitOpError( + "array attr of size out of bounds of vector result dimension size"); + maskDimSizes.push_back(attrValue); + } + // Verify that if one mask dim size is zero, they all should be zero (because + // the mask region is a conjunction of each mask dimension interval). 
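// The rewrite above reduces a unit-stride slice of a constant mask to another
// constant mask. A standalone C++ sketch of that interval arithmetic, plus the
// "any empty dimension empties the whole mask" normalization that the
// ConstantMaskOp verifier below enforces (plain C++, not the MLIR pattern
// itself):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int64_t> sliceMaskDims(const std::vector<int64_t> &maskDims,
                                   const std::vector<int64_t> &offsets,
                                   const std::vector<int64_t> &sizes) {
  std::vector<int64_t> result;
  for (std::size_t d = 0; d < maskDims.size(); ++d) {
    // Intersect the slice window [offset, offset + size) with the masked
    // prefix [0, maskDim) of this dimension.
    int64_t upper = std::min(offsets[d] + sizes[d], maskDims[d]);
    result.push_back(std::max<int64_t>(0, upper - offsets[d]));
  }
  // The masked region is the conjunction of per-dimension intervals, so one
  // empty interval makes the sliced mask all zeros.
  if (std::any_of(result.begin(), result.end(),
                  [](int64_t s) { return s == 0; }))
    result.assign(result.size(), 0);
  return result;
}

int main() {
  // Slicing a [3, 2] mask of a 4x3 vector with offsets [2, 0] and sizes [2, 3]
  // leaves one masked row and two masked columns.
  assert((sliceMaskDims({3, 2}, {2, 0}, {2, 3}) == std::vector<int64_t>{1, 2}));
  return 0;
}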
+ bool any_zeros = llvm::is_contained(maskDimSizes, 0); + bool all_zeros = llvm::all_of(maskDimSizes, [](int64_t s) { return s == 0; }); + if (any_zeros && !all_zeros) + return op.emitOpError("expected all mask dim sizes to be zeros, " + "as a result of conjunction with zero mask dim"); + return success(); +} + //===----------------------------------------------------------------------===// // CreateMaskOp //===----------------------------------------------------------------------===// @@ -1064,6 +1182,42 @@ static LogicalResult verify(CreateMaskOp &op) { return success(); } +namespace { + +// Pattern to rewrite a CreateMaskOp with a ConstantMaskOp. +class CreateMaskFolder final : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(CreateMaskOp createMaskOp, + PatternRewriter &rewriter) const override { + // Return if any of 'createMaskOp' operands are not defined by a constant. + auto is_not_def_by_constant = [](Value *operand) { + return !isa_and_nonnull(operand->getDefiningOp()); + }; + if (llvm::any_of(createMaskOp.operands(), is_not_def_by_constant)) + return matchFailure(); + // Gather constant mask dimension sizes. + SmallVector maskDimSizes; + for (auto *operand : createMaskOp.operands()) { + auto defOp = operand->getDefiningOp(); + maskDimSizes.push_back(cast(defOp).getValue()); + } + // Replace 'createMaskOp' with ConstantMaskOp. + rewriter.replaceOpWithNewOp( + createMaskOp, createMaskOp.getResult()->getType(), + rewriter.getI64ArrayAttr(maskDimSizes)); + return matchSuccess(); + } +}; + +} // end anonymous namespace + +void CreateMaskOp::getCanonicalizationPatterns( + OwningRewritePatternList &results, MLIRContext *context) { + results.insert(context); +} + //===----------------------------------------------------------------------===// // IndexTupleOp //===----------------------------------------------------------------------===// From 20510af488041ebafa7eb602f0a1846c42d48ab5 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Wed, 4 Dec 2019 13:12:34 -0800 Subject: [PATCH 077/383] [MLIR:TF/XLA] Use side-effect analysis in breakup-islands. Add only necessary control edges inside an island. 
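Concretely, "only necessary control edges" means chaining accesses per resource rather than serializing everything in an island. The snippet below is a rough standalone sketch of that chaining rule, assuming resources have already been resolved to integer ids; it illustrates the idea and is not the actual SideEffectAnalysis implementation. A read needs an edge only from the last write to the same resource, and a write needs edges from the last write and from every read issued since that write.

#include <cassert>
#include <map>
#include <set>
#include <utility>
#include <vector>

struct Access {
  int op;        // topological position of the op inside the island
  int resource;  // resolved resource id
  bool is_write;
};

// Returns the required control edges as (predecessor op, successor op) pairs.
std::vector<std::pair<int, int>> ControlEdges(const std::vector<Access> &accesses) {
  struct State {
    int last_write = -1;
    std::set<int> reads_since_write;
  };
  std::map<int, State> per_resource;
  std::vector<std::pair<int, int>> edges;
  for (const Access &a : accesses) {
    State &s = per_resource[a.resource];
    if (a.is_write) {
      if (s.last_write >= 0) edges.emplace_back(s.last_write, a.op);
      for (int r : s.reads_since_write) edges.emplace_back(r, a.op);
      s.last_write = a.op;
      s.reads_since_write.clear();
    } else {
      if (s.last_write >= 0) edges.emplace_back(s.last_write, a.op);
      s.reads_since_write.insert(a.op);
    }
  }
  return edges;
}

int main() {
  // Mirrors the start of @non_aliasing_reads_writes in the test below:
  // op 0 reads resource A, op 1 writes A, op 2 reads resource B.
  // Only the edge 0 -> 1 is required; the two reads stay independent.
  auto edges = ControlEdges({{0, 0, false}, {1, 0, true}, {2, 1, false}});
  assert(edges.size() == 1 && edges[0] == std::make_pair(0, 1));
  return 0;
}

Roughly speaking, an op with unknown side effects behaves like a write to every resource, which matches how tf._UnknownSideEffectingOp_ still serializes against the surrounding reads and writes in the tests below.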
PiperOrigin-RevId: 283819679 Change-Id: I59c4ba0af5567985ace052afa9bb962d91611ae7 --- .../tensorflow/tests/breakup-islands.mlir | 112 +++++++++++-- .../tensorflow/translate/breakup-islands.cc | 158 ++++++++++++++---- 2 files changed, 227 insertions(+), 43 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir index 67c3982fe3b..d5a5c16cbff 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/breakup-islands.mlir @@ -18,7 +18,7 @@ func @multiple_return(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<*xi32 // CHECK-LABEL: func @multiple_return // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg1) -// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island(%[[ADD1_control]]) wraps "tf.Add"(%[[ADD1]], %arg1) +// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1]], %arg1) // CHECK: tf_executor.fetch %[[ADD1]], %[[ADD2]] : // CHECK: } // CHECK: return %[[GRAPH]]#0, %[[GRAPH]]#1 @@ -41,7 +41,12 @@ func @multiple_islands(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<*xi3 %res = "tf.Print"(%sub) { message = "sub result" } : (tensor<*xi32>) -> (tensor<*xi32>) tf_executor.yield } - tf_executor.fetch %island1#1, %island2#1, %island3 : tensor<*xi32>, tensor<*xi32>, !tf_executor.control + %island4 = tf_executor.island(%island1#2, %island2#2) { + %add = "tf.Add"(%island1#1, %island1#1) : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + %res = "tf.Print"(%add) { message = "add result" } : (tensor<*xi32>) -> (tensor<*xi32>) + tf_executor.yield + } + tf_executor.fetch %island1#1, %island2#1, %island3, %island4 : tensor<*xi32>, tensor<*xi32>, !tf_executor.control, !tf_executor.control } return %graph#0, %graph#1 : tensor<*xi32>, tensor<*xi32> } @@ -49,12 +54,17 @@ func @multiple_islands(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<*xi3 // CHECK-LABEL: func @multiple_islands // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg1) -// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island(%[[ADD1_control]]) wraps "tf.Add"(%[[ADD1]], %arg1) +// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1]], %arg1) // CHECK: %[[SUB1:.*]], %[[SUB1_control:.*]] = tf_executor.island(%[[ADD2_control]]) wraps "tf.Sub"(%arg0, %arg1) -// CHECK: %[[MUL:.*]], %[[MUL_control:.*]] = tf_executor.island(%[[SUB1_control]]) wraps "tf.Mul"(%[[SUB1]], %arg1) +// CHECK: %[[MUL:.*]], %[[MUL_control:.*]] = tf_executor.island wraps "tf.Mul"(%[[SUB1]], %arg1) // CHECK: %[[SUB2:.*]], %[[SUB2_control:.*]] = tf_executor.island(%[[ADD2_control]], %[[MUL_control]]) wraps "tf.Sub"(%[[ADD1]], %[[SUB1]]) -// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island(%[[SUB2_control]]) wraps "tf.Print"(%[[SUB2]]) {message = "sub result"} -// CHECK: tf_executor.fetch %[[ADD2]], %[[MUL]], %[[PRINT_control]] : +// CHECK: %[[PRINT1:.*]], %[[PRINT1_control:.*]] = tf_executor.island wraps "tf.Print"(%[[SUB2]]) {message = "sub result"} +// CHECK: %[[ISLAND1:.*]] = tf_executor.island(%[[ADD2_control]], %[[MUL_control]]) { +// CHECK: tf_executor.yield +// CHECK: } +// CHECK: %[[ADD3:.*]], %[[ADD3_control:.*]] = tf_executor.island(%[[ISLAND1]], %[[ADD2_control]]) wraps "tf.Add"(%[[ADD2]], %[[ADD2]]) +// CHECK: %[[PRINT2:.*]], 
%[[PRINT2_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD3]]) {message = "add result"} +// CHECK: tf_executor.fetch %[[ADD2]], %[[MUL]], %[[PRINT1_control]], %[[PRINT2_control:.*]] : // CHECK: } // CHECK: return %[[GRAPH]]#0, %[[GRAPH]]#1 @@ -74,8 +84,8 @@ func @dangling_print(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<*xi32> // CHECK-LABEL: func @dangling_print // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg1) -// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island(%[[ADD1_control]]) wraps "tf.Add"(%[[ADD1_control:.*]], %arg1) -// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island(%[[ADD2_control]]) wraps "tf.Print"(%[[ADD2_control:.*]]) {message = "add result"} +// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1_control:.*]], %arg1) +// CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2_control:.*]]) {message = "add result"} // CHECK: tf_executor.fetch %[[ADD1]], %[[ADD2]], %[[PRINT_control]] : // CHECK: } // CHECK: return %[[GRAPH]]#0, %[[GRAPH]]#1 @@ -103,11 +113,14 @@ func @switch_and_merge(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<*xi3 // CHECK-LABEL: func @switch_and_merge(%arg0: tensor<*xi32>, %arg1: tensor) -> (tensor<*xi32>, tensor) { // CHECK: %[[GRAPH:.*]]:2 = tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg1) -// CHECK: %[[LESS:.*]], %[[LESS_control:.*]] = tf_executor.island(%[[ADD1_control]]) wraps "tf.Less"(%arg1, %arg1) -// CHECK: %[[PRINT1:.*]], %[[PRINT1_control:.*]] = tf_executor.island(%[[LESS_control]]) wraps "tf.Print"(%[[ADD1]]) {message = "add result 1"} -// CHECK: %[[SWITCH_false:.*]], %[[SWITCH_true:.*]], {{.*}} = tf_executor.Switch %[[ADD1]], %[[LESS]], %[[PRINT1_control]] +// CHECK: %[[LESS:.*]], %[[LESS_control:.*]] = tf_executor.island wraps "tf.Less"(%arg1, %arg1) +// CHECK: %[[PRINT1:.*]], %[[PRINT1_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD1]]) {message = "add result 1"} +// CHECK: %[[ISLAND1:.*]] = tf_executor.island(%[[LESS_control]], %[[PRINT1_control]]) { +// CHECK: tf_executor.yield +// CHECK: } +// CHECK: %[[SWITCH_false:.*]], %[[SWITCH_true:.*]], {{.*}} = tf_executor.Switch %[[ADD1]], %[[LESS]], %[[ISLAND1]] // CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[SWITCH_false]], %arg1) -// CHECK: %[[PRINT2:.*]], %[[PRINT2_control:.*]] = tf_executor.island(%[[ADD2_control]]) wraps "tf.Print"(%[[ADD2]]) {message = "add result 2"} +// CHECK: %[[PRINT2:.*]], %[[PRINT2_control:.*]] = tf_executor.island wraps "tf.Print"(%[[ADD2]]) {message = "add result 2"} // CHECK: %[[MERGE:.*]], %[[MERGE_index:.*]], %{{.*}} = tf_executor.Merge %[[ADD2]], %[[SWITCH_true]], %[[PRINT2_control]] // CHECK: tf_executor.fetch %[[MERGE]], %[[MERGE_index]] // CHECK: } @@ -130,7 +143,7 @@ func @control_flow_plumbing(%arg0: tensor<*xi32>, %arg1: tensor) -> tensor< // CHECK: %[[GRAPH:.*]] = tf_executor.graph { // CHECK: %[[PRINT:.*]], %[[PRINT_control:.*]] = tf_executor.island wraps "tf.Print"(%arg0) {message = "Random Print"} // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island(%[[PRINT_control]]) wraps "tf.Add"(%arg0, %arg1) -// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island(%[[ADD1_control]]) wraps "tf.Add"(%[[ADD1]], %arg1) +// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1]], %arg1) // CHECK: 
tf_executor.fetch %[[ADD2]] : tensor<*xi32> // CHECK: } // CHECK: return %[[GRAPH]] : tensor<*xi32> @@ -150,6 +163,77 @@ func @fetching_arg(%arg0: tensor<*xi32>) { // CHECK-LABEL: func @fetching_arg // CHECK: tf_executor.graph { // CHECK: %[[ADD1:.*]], %[[ADD1_control:.*]] = tf_executor.island wraps "tf.Add"(%arg0, %arg0) -// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island(%[[ADD1_control]]) wraps "tf.Add"(%[[ADD1]], %arg0) +// CHECK: %[[ADD2:.*]], %[[ADD2_control:.*]] = tf_executor.island wraps "tf.Add"(%[[ADD1]], %arg0) // CHECK: tf_executor.fetch %[[ADD2_control]] : !tf_executor.control // CHECK: } + +func @non_aliasing_reads_writes( + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor<*x!tf.resource>>, + %arg2: tensor<32xf32>) -> (tensor<32xf32>) { + %graph = tf_executor.graph { + %island:2 = tf_executor.island { + %read0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + "tf.AssignVariableOp"(%arg0, %arg2) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + %read1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> tensor<*x!tf.resource>> + %read2 = "tf.ReadVariableOp"(%var_handle) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + "tf.AssignVariableOp"(%arg1, %read0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%arg0, %read2) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + %read3 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + tf_executor.yield %read3 : tensor<32xf32> + } + tf_executor.fetch %island#0 : tensor<32xf32> + } + return %graph : tensor<32xf32> +} + +// CHECK-LABEL: func @non_aliasing_reads_writes +// CHECK: %[[GRAPH:.*]] = tf_executor.graph { +// CHECK: %[[READ0:.*]], %[[READ0_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg0) +// CHECK: %[[ASSIGN0_CONTROL:.*]] = tf_executor.island(%[[READ0_CONTROL]]) wraps "tf.AssignVariableOp"(%arg0, %arg2) +// CHECK: %[[READ1:.*]], %[[READ1_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%arg1) +// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v0"} +// CHECK: %[[READ2:.*]], %[[READ2_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[VH0]]) +// CHECK: %[[ASSIGN1_CONTROL:.*]] = tf_executor.island(%[[READ1_CONTROL]]) wraps "tf.AssignVariableOp"(%arg1, %[[READ0:.*]]) +// CHECK: %[[ASSIGN2_CONTROL:.*]] = tf_executor.island(%[[ASSIGN0_CONTROL]]) wraps "tf.AssignVariableOp"(%arg0, %[[READ2]]) +// CHECK: %[[READ3:.*]], %[[READ3_CONTROL:.*]] = tf_executor.island(%[[ASSIGN2_CONTROL]]) wraps "tf.ReadVariableOp"(%arg0) +// CHECK: %[[ISLAND1:.*]] = tf_executor.island(%[[ASSIGN1_CONTROL]], %[[READ3_CONTROL]]) { +// CHECK: tf_executor.yield +// CHECK: } +// CHECK: tf_executor.fetch %[[READ3]], %[[ISLAND1]] : tensor<32xf32>, !tf_executor.control +// CHECK: } + +func @unknown_side_effecting_op(%arg0: tensor<32xf32>) -> () { + tf_executor.graph { + %island = tf_executor.island { + %vh0 = "tf.VarHandleOp"() {container = "c", shared_name = "v0"} : () -> tensor<*x!tf.resource>> + %vh1 = "tf.VarHandleOp"() {container = "c", shared_name = "v1"} : () -> tensor<*x!tf.resource>> + %read0 = "tf.ReadVariableOp"(%vh0) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + "tf.AssignVariableOp"(%vh1, %arg0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + "tf._UnknownSideEffectingOp_"() : () -> () + %read1 = "tf.ReadVariableOp"(%vh1) : (tensor<*x!tf.resource>>) 
-> tensor<32xf32> + "tf.AssignVariableOp"(%vh0, %read1) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + "tf.AssignVariableOp"(%vh1, %read0) : (tensor<*x!tf.resource>>, tensor<32xf32>) -> () + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// CHECK-LABEL: func @unknown_side_effecting_op +// CHECK: tf_executor.graph { +// CHECK: %[[VH0:.*]], %[[VH0_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v0"} +// CHECK: %[[VH1:.*]], %[[VH1_CONTROL:.*]] = tf_executor.island wraps "tf.VarHandleOp"() {container = "c", shared_name = "v1"} +// CHECK: %[[READ0:.*]], %[[READ0_CONTROL:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[VH0]]) +// CHECK: %[[ASSIGN0_CONTROL:.*]] = tf_executor.island wraps "tf.AssignVariableOp"(%[[VH1]], %arg0) +// CHECK: %[[UNKNOWN_CONTROL:.*]] = tf_executor.island(%[[READ0_CONTROL]], %[[ASSIGN0_CONTROL]]) wraps "tf._UnknownSideEffectingOp_"() +// CHECK: %[[READ1:.*]], %[[READ1_CONTROL:.*]] = tf_executor.island(%[[UNKNOWN_CONTROL]]) wraps "tf.ReadVariableOp"(%[[VH1]]) +// CHECK: %[[ASSIGN1_CONTROL:.*]] = tf_executor.island(%[[UNKNOWN_CONTROL]]) wraps "tf.AssignVariableOp"(%[[VH0]], %[[READ1]]) +// CHECK: %[[ASSIGN2_CONTROL:.*]] = tf_executor.island(%[[READ1_CONTROL]]) wraps "tf.AssignVariableOp"(%[[VH1]], %[[READ0]]) +// CHECK: %[[ISLAND1:.*]] = tf_executor.island(%[[ASSIGN1_CONTROL]], %[[ASSIGN2_CONTROL]]) { +// CHECK: tf_executor.yield +// CHECK: } +// CHECK: tf_executor.fetch %[[ISLAND1]] : !tf_executor.control +// CHECK: } diff --git a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc index 22d04b27dd1..2b3a87ee853 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/breakup-islands.cc @@ -13,14 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir #include "mlir/Support/STLExtras.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" // This pass is used in preparation for Graph export. @@ -38,12 +43,11 @@ struct BreakUpIslands : OperationPass { void runOnOperation() final; void BreakUpIsland(tf_executor::IslandOp op, + const TF::SideEffectAnalysis& side_effect_analysis, llvm::DenseMap>* new_control_edges); }; -} // end anonymous namespace - void BreakUpIslands::runOnOperation() { auto graph_op_range = getOperation().getBody().front().without_terminator(); tf_executor::GraphOp graph_op; @@ -61,12 +65,13 @@ void BreakUpIslands::runOnOperation() { // Map from the users of the existing islands to the list of control // edges that need to be added. llvm::DenseMap> new_control_edges; + auto& side_effect_analysis = getAnalysis(); // Iterate in reverse order to avoid invalidating Operation* stored in // new_control_edges. 
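// The loop below walks the block in reverse and through
// llvm::make_early_inc_range so the current operation can be replaced and
// erased while iterating. A plain-STL sketch of the same "advance first, then
// touch the current element" idiom (assumed to capture the relevant behaviour
// of make_early_inc_range):

#include <list>
#include <string>

void eraseEmptyStrings(std::list<std::string> &items) {
  for (auto it = items.begin(); it != items.end();) {
    auto current = it++;                          // advance before mutating
    if (current->empty()) items.erase(current);   // safe: `it` is unaffected
  }
}

int main() {
  std::list<std::string> xs = {"a", "", "b"};
  eraseEmptyStrings(xs);
  return xs.size() == 2 ? 0 : 1;
}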
for (auto& item : llvm::make_early_inc_range(llvm::reverse(graph_op.GetBody()))) { if (auto island = dyn_cast(&item)) { - BreakUpIsland(island, &new_control_edges); + BreakUpIsland(island, side_effect_analysis, &new_control_edges); } } OpBuilder builder(getOperation()); @@ -106,21 +111,82 @@ void BreakUpIslands::runOnOperation() { } } +// Helper that creates an island. If `sub_op` is not nullptr, it will be moved +// to the island. +tf_executor::IslandOp CreateIsland(ArrayRef result_types, + ArrayRef control_inputs, + const tf_executor::ControlType& control_type, + const Location& loc, Operation* sub_op, + tf_executor::IslandOp original_island) { + OpBuilder builder(original_island); + auto island = builder.create( + loc, result_types, control_type, control_inputs); + island.body().push_back(new Block); + Block* block = &island.body().back(); + if (sub_op) { + sub_op->replaceAllUsesWith(island.outputs()); + sub_op->moveBefore(block, block->begin()); + } + OpBuilder island_builder(original_island); + island_builder.setInsertionPointToEnd(block); + if (sub_op) { + island_builder.create( + loc, llvm::to_vector<4>(sub_op->getResults())); + } else { + island_builder.create(loc, ArrayRef{}); + } + return island; +} + +// A struct contains the operations in an island that do not have incoming or +// outgoing dependencies. +struct IslandSourcesAndSinks { + // Sub-ops that do not depend on other ops in the island. + llvm::SmallPtrSet sources; + // Sub-ops that do not have other sub-ops island depending on them (excluding + // yield). + llvm::SmallPtrSet sinks; +}; + +// Finds IslandSourcesAndSinks for an unmodified island. +IslandSourcesAndSinks FindSourcesAndSinksInIsland( + tf_executor::IslandOp island, + const TF::SideEffectAnalysis& side_effect_analysis) { + IslandSourcesAndSinks result; + auto island_body = island.GetBody().without_terminator(); + for (Operation& sub_op : island_body) { + auto predecessors = side_effect_analysis.DirectControlPredecessors(&sub_op); + result.sinks.insert(&sub_op); + // Remove predecessor from sinks. + for (auto predecessor : predecessors) result.sinks.erase(predecessor); + bool has_in_island_operands = false; + for (auto operand : sub_op.getOperands()) { + auto defining_op = operand->getDefiningOp(); + if (!defining_op || defining_op->getParentOp() != island) continue; + // Remove operands from sinks. + result.sinks.erase(defining_op); + has_in_island_operands = true; + } + if (predecessors.empty() && !has_in_island_operands) { + result.sources.insert(&sub_op); + } + } + return result; +} + // Converts a single island into multiple islands (one for each op). The islands // are chained together by control flow values. void BreakUpIslands::BreakUpIsland( tf_executor::IslandOp op, + const TF::SideEffectAnalysis& side_effect_analysis, llvm::DenseMap>* new_control_edges) { auto island_body = op.GetBody().without_terminator(); // Skip islands that are already only a single op. // Skip islands that are empty (only yield). if (island_body.empty() || has_single_element(island_body)) return; - OpBuilder builder(op); - OpBuilder island_builder(op); auto control_type = tf_executor::ControlType::get(&getContext()); - Value* previous_island = nullptr; - auto tmp_control_inputs = llvm::to_vector<4>(op.controlInputs()); + auto island_control_inputs = llvm::to_vector<4>(op.controlInputs()); // Add control dependencies for yields of values defined by other islands to // the island that defines that fetched value. 
for (auto* fetch : op.GetYield().fetches()) { @@ -130,7 +196,7 @@ void BreakUpIslands::BreakUpIsland( // OK, because it is the same island. } else if (auto island_op = llvm::dyn_cast( fetch->getDefiningOp())) { - tmp_control_inputs.push_back(island_op.control()); + island_control_inputs.push_back(island_op.control()); } else { // TODO(parkers): Any defining op that has a control output can be handled // just like an island. @@ -138,39 +204,71 @@ void BreakUpIslands::BreakUpIsland( return signalPassFailure(); } } - ArrayRef previous_control = tmp_control_inputs; + // If there are multiple control inputs, create an empty island to group them. + if (island_control_inputs.size() > 1) { + auto island = CreateIsland({}, island_control_inputs, control_type, + op.getLoc(), nullptr, op); + island_control_inputs.clear(); + island_control_inputs.push_back(island.control()); + } + // Find sources and sinks inside the original island. + auto sources_and_sinks = + FindSourcesAndSinksInIsland(op, side_effect_analysis); + // The corresponding control output of the new island created for each sub-op. + llvm::SmallDenseMap new_control_for_sub_ops; + // Control outputs of newly created islands that are sinks. + llvm::SmallVector sink_island_controls; // For each operation in the island, construct a new island to wrap the op, // yield all the results, and replace all the usages with the results of the // new island. - for (Operation& sub_op : llvm::make_early_inc_range(island_body)) { - auto loc = sub_op.getLoc(); - auto island = builder.create( - loc, llvm::to_vector<4>(sub_op.getResultTypes()), control_type, - previous_control); - island.body().push_back(new Block); - Block* block = &island.body().back(); - sub_op.replaceAllUsesWith(island.outputs()); - block->getOperations().splice(block->begin(), op.GetBody().getOperations(), - sub_op); - island_builder.setInsertionPointToEnd(block); - island_builder.create( - loc, llvm::to_vector<4>(sub_op.getResults())); - previous_island = island.control(); - previous_control = previous_island; + for (auto& sub_op : llvm::make_early_inc_range(island_body)) { + const auto predecessors = + side_effect_analysis.DirectControlPredecessors(&sub_op); + // Get the controls from the predecessors. + llvm::SmallVector predecessors_control; + predecessors_control.reserve(predecessors.size()); + for (auto predecessor : predecessors) { + predecessors_control.push_back(new_control_for_sub_ops[predecessor]); + } + // If sub_op is a source, use island_control_inputs, because that's required + // by inter-islands dependencies; otherwise, we do not need to include + // island_control_inputs, since they must have been tracked by the (direct + // or indirect) control predecessors or operands. + ArrayRef control = sources_and_sinks.sources.count(&sub_op) > 0 + ? island_control_inputs + : predecessors_control; + auto island = + CreateIsland(llvm::to_vector<4>(sub_op.getResultTypes()), control, + control_type, sub_op.getLoc(), &sub_op, op); + new_control_for_sub_ops[&sub_op] = island.control(); + if (sources_and_sinks.sinks.count(&sub_op)) { + sink_island_controls.push_back(island.control()); + } } - op.control()->replaceAllUsesWith(previous_island); - // All existing outputs need to add a control flow edge to the - // previous_island. + // Create output controls for the sinks. + assert(!sink_island_controls.empty()); + // If there are multiple output controls, create an empty island to group + // them. 
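// A standalone sketch of the source/sink classification used above, on plain
// integer op ids instead of Operation* (an illustration of the idea, not the
// FindSourcesAndSinksInIsland implementation): an op is a source if it has no
// control predecessors and no operands defined inside the island, and a sink
// if no other op in the island, aside from the yield, depends on it.

#include <cassert>
#include <set>
#include <vector>

// deps[i] holds every in-island op that op i depends on, via control or data.
using DepGraph = std::vector<std::set<int>>;

void FindSourcesAndSinks(const DepGraph &deps, std::set<int> *sources,
                         std::set<int> *sinks) {
  for (int i = 0; i < static_cast<int>(deps.size()); ++i) {
    if (deps[i].empty()) sources->insert(i);
    sinks->insert(i);  // assume sink until some later op depends on it
  }
  for (const std::set<int> &d : deps)
    for (int dep : d) sinks->erase(dep);
}

int main() {
  // Three ops where op 1 and op 2 both depend on op 0: one source, two sinks,
  // so the pass would emit one grouping island for the two sink controls.
  DepGraph deps = {{}, {0}, {0}};
  std::set<int> sources, sinks;
  FindSourcesAndSinks(deps, &sources, &sinks);
  assert(sources == std::set<int>{0});
  assert((sinks == std::set<int>{1, 2}));
  return 0;
}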
+ if (sink_island_controls.size() > 1) { + auto island = CreateIsland({}, sink_island_controls, control_type, + op.getLoc(), nullptr, op); + sink_island_controls.clear(); + sink_island_controls.push_back(island.control()); + } + assert(sink_island_controls.size() == 1); + op.control()->replaceAllUsesWith(sink_island_controls[0]); + // All existing outputs need to add a control flow edge from + // sink_island_controls[0]. for (Value* out : op.outputs()) { for (auto& use : out->getUses()) { Operation* owner = use.getOwner(); if (auto island_op = llvm::dyn_cast(owner->getParentOp())) { - (*new_control_edges)[island_op].push_back(previous_island); + (*new_control_edges)[island_op].push_back(sink_island_controls[0]); } else if (llvm::isa(owner) || llvm::isa(owner) || llvm::isa(owner)) { - (*new_control_edges)[owner].push_back(previous_island); + (*new_control_edges)[owner].push_back(sink_island_controls[0]); } else { use.getOwner()->emitError("Adding control dependency not supported"); return signalPassFailure(); @@ -182,6 +280,8 @@ void BreakUpIslands::BreakUpIsland( op.erase(); } +} // namespace + std::unique_ptr> CreateBreakUpIslandsPass() { return std::make_unique(); } From e9829498e93b0251923e6db1875f2fc095fc2f41 Mon Sep 17 00:00:00 2001 From: Jian Li Date: Wed, 4 Dec 2019 13:34:30 -0800 Subject: [PATCH 078/383] Restrict LSTM operator property to support only one variant of LSTM. PiperOrigin-RevId: 283824285 Change-Id: I44e0b3e7f9e21af1219ca19ad72900054babd0ef --- .../lite/tools/optimize/operator_property.cc | 146 ++++++++++-------- 1 file changed, 82 insertions(+), 64 deletions(-) diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index c2e66dd2e35..7b4056c7f9c 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -26,6 +26,9 @@ namespace { // TODO(jianlijianli): extend it to support ops that has multiple variants. struct OpVariant { BuiltinOperator op_code; + bool use_layer_norm = false; + bool use_projection = false; + bool use_peephole = false; }; const OpVariant GetOperatorVariant(const ModelT* model, int subgraph_index, @@ -34,6 +37,19 @@ const OpVariant GetOperatorVariant(const ModelT* model, int subgraph_index, OperatorT* op = model->subgraphs.at(subgraph_index)->operators[op_index].get(); op_variant.op_code = model->operator_codes[op->opcode_index]->builtin_code; + if (op_variant.op_code == BuiltinOperator_LSTM) { + const int cell_to_output_weight_index = 11; + const int forget_layer_norm_coefficients_index = 21; + const int projection_weights_index = 16; + op_variant.use_projection = op->inputs[projection_weights_index] != -1; + op_variant.use_peephole = op->inputs[cell_to_output_weight_index] != -1; + if (op->inputs.size() == 20) { + op_variant.use_layer_norm = false; + } else { + op_variant.use_layer_norm = + op->inputs[forget_layer_norm_coefficients_index] != -1; + } + } return op_variant; } } // namespace @@ -180,72 +196,74 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // TODO(jianlijianli): extend this to other variants of LSTM. // LSTM needs 5 intermediate tensors. 
This agrees with the fully quantized // kernels in lstm_eval.cc - static const float alpha = static_cast(std::pow(2, -10)); + if (op_variant.use_layer_norm && op_variant.use_projection && + !op_variant.use_peephole) { + static const float alpha = static_cast(std::pow(2, -10)); - TensorProperty tensor_property_12; - tensor_property_12.use_derived_scale = true; - tensor_property_12.number_of_bits = 32; - tensor_property_12.derived_scale = {{20}, {}, {alpha}}; - TensorProperty tensor_property_13; - tensor_property_13.use_derived_scale = true; - tensor_property_13.number_of_bits = 32; - tensor_property_13.derived_scale = {{21}, {}, {alpha}}; - TensorProperty tensor_property_14; - tensor_property_14.use_derived_scale = true; - tensor_property_14.number_of_bits = 32; - tensor_property_14.derived_scale = {{22}, {}, {alpha}}; - TensorProperty tensor_property_15; - tensor_property_15.use_derived_scale = true; - tensor_property_15.number_of_bits = 32; - tensor_property_15.derived_scale = {{23}, {}, {alpha}}; - TensorProperty tensor_property_17; - tensor_property_17.use_derived_scale = true; - tensor_property_17.number_of_bits = 32; - tensor_property_17.derived_scale = {{16}, {4}, {}}; - TensorProperty tensor_property_19; - tensor_property_19.extend_to_power_of_two = true; - tensor_property_19.number_of_bits = 16; - tensor_property_19.state_tensor = true; - tensor_property_19.symmetric = true; - TensorProperty tensor_property_20; - tensor_property_20.number_of_bits = 16; - tensor_property_20.symmetric = true; + TensorProperty tensor_property_12; + tensor_property_12.use_derived_scale = true; + tensor_property_12.number_of_bits = 32; + tensor_property_12.derived_scale = {{20}, {}, {alpha}}; + TensorProperty tensor_property_13; + tensor_property_13.use_derived_scale = true; + tensor_property_13.number_of_bits = 32; + tensor_property_13.derived_scale = {{21}, {}, {alpha}}; + TensorProperty tensor_property_14; + tensor_property_14.use_derived_scale = true; + tensor_property_14.number_of_bits = 32; + tensor_property_14.derived_scale = {{22}, {}, {alpha}}; + TensorProperty tensor_property_15; + tensor_property_15.use_derived_scale = true; + tensor_property_15.number_of_bits = 32; + tensor_property_15.derived_scale = {{23}, {}, {alpha}}; + TensorProperty tensor_property_17; + tensor_property_17.use_derived_scale = true; + tensor_property_17.number_of_bits = 32; + tensor_property_17.derived_scale = {{16}, {4}, {}}; + TensorProperty tensor_property_19; + tensor_property_19.extend_to_power_of_two = true; + tensor_property_19.number_of_bits = 16; + tensor_property_19.state_tensor = true; + tensor_property_19.symmetric = true; + TensorProperty tensor_property_20; + tensor_property_20.number_of_bits = 16; + tensor_property_20.symmetric = true; - property.inputs = { - {0, {}}, - {1, {}}, - {2, {}}, - {3, {}}, - {4, {}}, - {5, {}}, - {6, {}}, - {7, {}}, - {8, {}}, - {9, {}}, - {10, {}}, - {11, {}}, - {16, {}}, - {19, tensor_property_19}, - {20, tensor_property_20}, - {21, tensor_property_20}, - {22, tensor_property_20}, - {23, tensor_property_20}, - {12, tensor_property_12}, - {13, tensor_property_13}, - {14, tensor_property_14}, - {15, tensor_property_15}, - {17, tensor_property_17}, - }; - property.outputs = {{0, {}}}; - property.intermediates = { - {0, tensor_property_20}, - {1, tensor_property_20}, - {2, tensor_property_20}, - {3, tensor_property_20}, - {4, {}}, - }; - property.restrict_scale = {{18, 0}}; - property.version = 2; + property.inputs = { + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + {4, {}}, + {5, 
{}}, + {6, {}}, + {7, {}}, + {8, {}}, + {16, {}}, + {19, tensor_property_19}, + {20, tensor_property_20}, + {21, tensor_property_20}, + {22, tensor_property_20}, + {23, tensor_property_20}, + {12, tensor_property_12}, + {13, tensor_property_13}, + {14, tensor_property_14}, + {15, tensor_property_15}, + {17, tensor_property_17}, + }; + property.outputs = {{0, {}}}; + property.intermediates = { + {0, tensor_property_20}, + {1, tensor_property_20}, + {2, tensor_property_20}, + {3, tensor_property_20}, + {4, {}}, + }; + property.restrict_scale = {{18, 0}}; + property.version = 2; + } else { + property.quantizable = false; + } break; } case BuiltinOperator_L2_NORMALIZATION: { From ae7ec95f2aeb689150df842f107895b20298b322 Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Wed, 4 Dec 2019 13:39:01 -0800 Subject: [PATCH 079/383] [XLA:Python] Expose GetDefaultDeviceAssignment Python binding. This is necessary to replicate values that will eventually be passed to a compiled executable without doing compilation. PiperOrigin-RevId: 283825308 Change-Id: Id69b3cbb0f6d3f90f040b23ed0f8af4c4c6f767a --- .../python/tpu_driver/client/tpu_client.py | 3 +++ .../tpu_driver/client/tpu_client_extension.cc | 15 ++++++++++++++ tensorflow/compiler/xla/python/xla.cc | 15 ++++++++++++++ tensorflow/compiler/xla/python/xla_client.py | 20 +++++++++++++++++++ 4 files changed, 53 insertions(+) diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py index 43c0d1a40c3..a3ad8b117ef 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client.py @@ -104,6 +104,9 @@ class TpuBackend(xla_client.Backend): options, self.client, compile_options.device_assignment) + def get_default_device_assignment(self, num_replicas): + return self.client.GetDefaultDeviceAssignment(num_replicas) + def serialize(self, executable): return self.client.SerializeExecutable(executable) diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc index e7d1e2ef9d9..09d3350b590 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc @@ -32,6 +32,21 @@ PYBIND11_MODULE(tpu_client_extension, m) { .def("devices", &PyTpuClient::devices) .def("local_devices", &PyTpuClient::local_devices) .def("host_id", &PyTpuClient::host_id) + .def("GetDefaultDeviceAssignment", + [](PyTpuClient* client, int num_replicas) + -> StatusOr>> { + TF_ASSIGN_OR_RETURN( + DeviceAssignment device_assignment, + client->GetDefaultDeviceAssignment(num_replicas)); + std::vector> result; + for (int i = 0; i < num_replicas; ++i) { + int device_id = device_assignment(i, 0); + auto iter = client->id_to_device().find(device_id); + CHECK(iter != client->id_to_device().end()) << device_id; + result.push_back(iter->second); + } + return result; + }) .def("TransferToInfeed", [](PyTpuClient* client, const LiteralSlice& literal, int device_ordinal) { diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 054c1da9e03..0e594982202 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -366,6 +366,21 @@ PYBIND11_MODULE(xla_extension, m) { .def("devices", &PyLocalClient::devices) .def("local_devices", &PyLocalClient::local_devices) .def("host_id", 
&PyLocalClient::host_id) + .def("GetDefaultDeviceAssignment", + [](PyLocalClient* client, int num_replicas) + -> StatusOr>> { + TF_ASSIGN_OR_RETURN( + DeviceAssignment device_assignment, + client->GetDefaultDeviceAssignment(num_replicas)); + std::vector> result; + for (int i = 0; i < num_replicas; ++i) { + int device_id = device_assignment(i, 0); + auto iter = client->id_to_device().find(device_id); + CHECK(iter != client->id_to_device().end()) << device_id; + result.push_back(iter->second); + } + return result; + }) .def("TransferToInfeed", [](PyLocalClient* client, const LiteralSlice& literal, int device_ordinal) { diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index c8f66f704d7..9477b3c2b1d 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -91,6 +91,23 @@ class Backend(object): def compile(self, computation, compile_options): """Compiles a computation. Returns an executable.""" + @abc.abstractmethod + def get_default_device_assignment(self, num_replicas): + """Returns the default device assignment that `compile` would use. + + If `compile_options.device_assignment` isn't set, `compile` will pick a + deterministic device assignment based on the number of replicas, possibly + optimizing for device locality. This method returns that assignment, which + is useful for e.g. manually replicating a value before passing it to a + compiled executable. + + Args: + num_replicas: the number of replicas needed. + + Returns: + A list of Devices of length `num_replicas` indexed by replica ID. + """ + class LocalBackend(Backend): """XLA backend implemented using the in-process xla::LocalClient API.""" @@ -143,6 +160,9 @@ class LocalBackend(Backend): options, self.client, compile_options.device_assignment) + def get_default_device_assignment(self, num_replicas): + return self.client.GetDefaultDeviceAssignment(num_replicas) + def serialize(self, executable): return self.client.SerializeExecutable(executable) From 15715cb2c8e877c18f8d969cc51a37ff26e8397b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 13:48:14 -0800 Subject: [PATCH 080/383] Updating out-of-date comment. PiperOrigin-RevId: 283827304 Change-Id: I2ba2f88f1fe31cdae2588642404b4bf40ea07f21 --- tensorflow/core/grappler/costs/virtual_scheduler.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc index c3100b4c3a4..559101c22f0 100644 --- a/tensorflow/core/grappler/costs/virtual_scheduler.cc +++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc @@ -1163,9 +1163,7 @@ void VirtualScheduler::GenerateRunMetadata(RunMetadata* metadata) { node_stats->set_timeline_label(timeline_label); } node_stats->set_node_name(node_def->name()); - // Timestamps in microseconds. - // TODO(b/138165866): Remove once TimelineServer support is no longer - // needed. + // Timestamps in microseconds (can be used by timeline_server). node_stats->set_op_start_rel_micros(0); node_stats->set_all_start_micros( nodestate.time_scheduled.asMicroSeconds().count()); @@ -1175,7 +1173,7 @@ void VirtualScheduler::GenerateRunMetadata(RunMetadata* metadata) { node_stats->set_all_end_rel_micros( nodestate.time_finished.asMicroSeconds().count() - nodestate.time_scheduled.asMicroSeconds().count()); - // Timestamps in nanoseconds. + // Timestamps in nanoseconds (can be used by xprof trace). 
node_stats->set_op_start_rel_nanos(0); node_stats->set_all_start_nanos(nodestate.time_scheduled.count()); node_stats->set_op_end_rel_nanos(nodestate.time_finished.count() - From 41228d7f14496ff661e7c22361a987b0255cf812 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Wed, 4 Dec 2019 14:15:24 -0800 Subject: [PATCH 081/383] Add a CL option to Standard to LLVM lowering to use alloca instead of malloc/free. In the future, a more configurable malloc and free interface should be used and exposed via extra parameters to the `createLowerToLLVMPass`. Until requirements are gathered, a simple CL flag allows generating code that runs successfully on hardware that cannot use the stdlib. PiperOrigin-RevId: 283833424 Change-Id: I56115a960e7d5a1fc14cabdc71dd3e33d9f6812c --- .../ConvertStandardToLLVMPass.h | 27 +++- .../StandardToLLVM/ConvertStandardToLLVM.cpp | 124 +++++++++++++----- 2 files changed, 110 insertions(+), 41 deletions(-) diff --git a/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h b/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h index 98e105aa2b5..c5c17b36f5e 100644 --- a/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h +++ b/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h @@ -57,25 +57,40 @@ void populateStdToLLVMConversionPatterns(LLVMTypeConverter &converter, OwningRewritePatternList &patterns); /// Creates a pass to convert the Standard dialect into the LLVMIR dialect. -std::unique_ptr> createLowerToLLVMPass(); +/// By default stdlib malloc/free are used for allocating MemRef payloads. +/// Specifying `useAlloca-true` emits stack allocations instead. In the future +/// this may become an enum when we have concrete uses for other options. +std::unique_ptr> +createLowerToLLVMPass(bool useAlloca = false); /// Creates a pass to convert operations to the LLVMIR dialect. The conversion /// is defined by a list of patterns and a type converter that will be obtained /// during the pass using the provided callbacks. +/// By default stdlib malloc/free are used for allocating MemRef payloads. +/// Specifying `useAlloca-true` emits stack allocations instead. In the future +/// this may become an enum when we have concrete uses for other options. std::unique_ptr> createLowerToLLVMPass(LLVMPatternListFiller patternListFiller, - LLVMTypeConverterMaker typeConverterMaker); + LLVMTypeConverterMaker typeConverterMaker, + bool useAlloca = false); /// Creates a pass to convert operations to the LLVMIR dialect. The conversion /// is defined by a list of patterns obtained during the pass using the provided /// callback and an optional type conversion class, an instance is created /// during the pass. +/// By default stdlib malloc/free are used for allocating MemRef payloads. +/// Specifying `useAlloca-true` emits stack allocations instead. In the future +/// this may become an enum when we have concrete uses for other options. 
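// A rough illustration, in plain C++ rather than generated LLVM IR, of the
// trade-off the `useAlloca` option selects: a malloc'ed payload must be freed
// explicitly, so the dealloc lowering emits a call to `free`, whereas a stack
// buffer is reclaimed automatically at scope exit and must not outlive the
// function. That is why, later in this patch, DeallocOpLowering simply erases
// the dealloc op when `useAlloca` is set.

#include <cstdlib>

void heapPayload(std::size_t bytes) {
  void *buffer = std::malloc(bytes);  // corresponds to the malloc lowering path
  // ... use buffer ...
  std::free(buffer);                  // corresponds to the lowered dealloc
}

void stackPayload() {
  char buffer[1024];                  // corresponds to the alloca path
  // ... use buffer ...
  (void)buffer;                       // reclaimed automatically on return
}

int main() {
  heapPayload(1024);
  stackPayload();
  return 0;
}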
template std::unique_ptr> -createLowerToLLVMPass(LLVMPatternListFiller patternListFiller) { - return createLowerToLLVMPass(patternListFiller, [](MLIRContext *context) { - return std::make_unique(context); - }); +createLowerToLLVMPass(LLVMPatternListFiller patternListFiller, + bool useAlloca = false) { + return createLowerToLLVMPass( + patternListFiller, + [](MLIRContext *context) { + return std::make_unique(context); + }, + useAlloca); } namespace LLVM { diff --git a/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp b/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp index 793997e9045..23c7be310a9 100644 --- a/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp +++ b/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp @@ -38,9 +38,20 @@ #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" using namespace mlir; +#define PASS_NAME "convert-std-to-llvm" + +static llvm::cl::OptionCategory + clOptionsCategory("Standard to LLVM lowering options"); + +static llvm::cl::opt + clUseAlloca(PASS_NAME "-use-alloca", + llvm::cl::desc("Replace emission of malloc/free by alloca"), + llvm::cl::init(false)); + LLVMTypeConverter::LLVMTypeConverter(MLIRContext *ctx) : llvmDialect(ctx->getRegisteredDialect()) { assert(llvmDialect && "LLVM IR dialect is not registered"); @@ -764,6 +775,11 @@ static bool isSupportedMemRefType(MemRefType type) { struct AllocOpLowering : public LLVMLegalizationPattern { using LLVMLegalizationPattern::LLVMLegalizationPattern; + AllocOpLowering(LLVM::LLVMDialect &dialect_, LLVMTypeConverter &converter, + bool useAlloca = false) + : LLVMLegalizationPattern(dialect_, converter), + useAlloca(useAlloca) {} + PatternMatchResult match(Operation *op) const override { MemRefType type = cast(op).getType(); if (isSupportedMemRefType(type)) @@ -825,32 +841,43 @@ struct AllocOpLowering : public LLVMLegalizationPattern { cumulativeSize = rewriter.create( loc, getIndexType(), ArrayRef{cumulativeSize, elementSize}); - // Insert the `malloc` declaration if it is not already present. - auto module = op->getParentOfType(); - auto mallocFunc = module.lookupSymbol("malloc"); - if (!mallocFunc) { - OpBuilder moduleBuilder(op->getParentOfType().getBodyRegion()); - mallocFunc = moduleBuilder.create( - rewriter.getUnknownLoc(), "malloc", - LLVM::LLVMType::getFunctionTy(getVoidPtrType(), getIndexType(), - /*isVarArg=*/false)); - } - // Allocate the underlying buffer and store a pointer to it in the MemRef // descriptor. - Value *align = nullptr; - if (auto alignAttr = allocOp.alignment()) { - align = createIndexConstant(rewriter, loc, - alignAttr.getValue().getSExtValue()); - cumulativeSize = rewriter.create( - loc, rewriter.create(loc, cumulativeSize, align), one); + Value *allocated = nullptr; + int alignment = 0; + Value *alignmentValue = nullptr; + if (auto alignAttr = allocOp.alignment()) + alignment = alignAttr.getValue().getSExtValue(); + + if (useAlloca) { + allocated = rewriter.create(loc, getVoidPtrType(), + cumulativeSize, alignment); + } else { + // Insert the `malloc` declaration if it is not already present. 
+ auto module = op->getParentOfType(); + auto mallocFunc = module.lookupSymbol("malloc"); + if (!mallocFunc) { + OpBuilder moduleBuilder( + op->getParentOfType().getBodyRegion()); + mallocFunc = moduleBuilder.create( + rewriter.getUnknownLoc(), "malloc", + LLVM::LLVMType::getFunctionTy(getVoidPtrType(), getIndexType(), + /*isVarArg=*/false)); + } + if (alignment != 0) { + alignmentValue = createIndexConstant(rewriter, loc, alignment); + cumulativeSize = rewriter.create( + loc, + rewriter.create(loc, cumulativeSize, alignmentValue), + one); + } + allocated = rewriter + .create( + loc, getVoidPtrType(), + rewriter.getSymbolRefAttr(mallocFunc), cumulativeSize) + .getResult(0); } - Value *allocated = - rewriter - .create(loc, getVoidPtrType(), - rewriter.getSymbolRefAttr(mallocFunc), - cumulativeSize) - .getResult(0); + auto structElementType = lowering.convertType(elementType); auto elementPtrType = structElementType.cast().getPointerTo( type.getMemorySpace()); @@ -878,13 +905,17 @@ struct AllocOpLowering : public LLVMLegalizationPattern { // Field 2: Actual aligned pointer to payload. Value *bitcastAligned = bitcastAllocated; - if (align) { + if (!useAlloca && alignment != 0) { + assert(alignmentValue); // offset = (align - (ptr % align))% align Value *intVal = rewriter.create( loc, this->getIndexType(), allocated); - Value *ptrModAlign = rewriter.create(loc, intVal, align); - Value *subbed = rewriter.create(loc, align, ptrModAlign); - Value *offset = rewriter.create(loc, subbed, align); + Value *ptrModAlign = + rewriter.create(loc, intVal, alignmentValue); + Value *subbed = + rewriter.create(loc, alignmentValue, ptrModAlign); + Value *offset = + rewriter.create(loc, subbed, alignmentValue); Value *aligned = rewriter.create(loc, allocated->getType(), allocated, offset); bitcastAligned = rewriter.create( @@ -930,6 +961,8 @@ struct AllocOpLowering : public LLVMLegalizationPattern { // Return the final value of the descriptor. 
rewriter.replaceOp(op, {memRefDescriptor}); } + + bool useAlloca; }; // A CallOp automatically promotes MemRefType to a sequence of alloca/store and @@ -1001,9 +1034,17 @@ struct CallIndirectOpLowering : public CallOpInterfaceLowering { struct DeallocOpLowering : public LLVMLegalizationPattern { using LLVMLegalizationPattern::LLVMLegalizationPattern; + DeallocOpLowering(LLVM::LLVMDialect &dialect_, LLVMTypeConverter &converter, + bool useAlloca = false) + : LLVMLegalizationPattern(dialect_, converter), + useAlloca(useAlloca) {} + PatternMatchResult matchAndRewrite(Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const override { + if (useAlloca) + return rewriter.eraseOp(op), matchSuccess(); + assert(operands.size() == 1 && "dealloc takes one operand"); OperandAdaptor transformed(operands); @@ -1026,6 +1067,8 @@ struct DeallocOpLowering : public LLVMLegalizationPattern { op, ArrayRef(), rewriter.getSymbolRefAttr(freeFunc), casted); return matchSuccess(); } + + bool useAlloca; }; struct MemRefCastOpLowering : public LLVMLegalizationPattern { @@ -1759,7 +1802,6 @@ void mlir::populateStdToLLVMConversionPatterns( patterns.insert< AddFOpLowering, AddIOpLowering, - AllocOpLowering, AndOpLowering, BranchOpLowering, CallIndirectOpLowering, @@ -1768,7 +1810,6 @@ void mlir::populateStdToLLVMConversionPatterns( CmpIOpLowering, CondBranchOpLowering, ConstLLVMOpLowering, - DeallocOpLowering, DimOpLowering, DivFOpLowering, DivISOpLowering, @@ -1800,6 +1841,10 @@ void mlir::populateStdToLLVMConversionPatterns( ViewOpLowering, XOrOpLowering, ZeroExtendIOpLowering>(*converter.getDialect(), converter); + patterns.insert< + AllocOpLowering, + DeallocOpLowering>( + *converter.getDialect(), converter, clUseAlloca.getValue()); // clang-format on } @@ -1873,6 +1918,7 @@ struct LLVMLoweringPass : public ModulePass { // By default, the patterns are those converting Standard operations to the // LLVMIR dialect. 
explicit LLVMLoweringPass( + bool useAlloca = false, LLVMPatternListFiller patternListFiller = populateStdToLLVMConversionPatterns, LLVMTypeConverterMaker converterBuilder = makeStandardToLLVMTypeConverter) @@ -1911,17 +1957,25 @@ struct LLVMLoweringPass : public ModulePass { }; } // end namespace -std::unique_ptr> mlir::createLowerToLLVMPass() { - return std::make_unique(); +std::unique_ptr> +mlir::createLowerToLLVMPass(bool useAlloca) { + return std::make_unique(useAlloca); } std::unique_ptr> mlir::createLowerToLLVMPass(LLVMPatternListFiller patternListFiller, - LLVMTypeConverterMaker typeConverterMaker) { - return std::make_unique(patternListFiller, + LLVMTypeConverterMaker typeConverterMaker, + bool useAlloca) { + return std::make_unique(useAlloca, patternListFiller, typeConverterMaker); } static PassRegistration - pass("convert-std-to-llvm", "Convert scalar and vector operations from the " - "Standard to the LLVM dialect"); + pass("convert-std-to-llvm", + "Convert scalar and vector operations from the " + "Standard to the LLVM dialect", + [] { + return std::make_unique( + clUseAlloca.getValue(), populateStdToLLVMConversionPatterns, + makeStandardToLLVMTypeConverter); + }); From 6d7926bb87c1a91ffd110aa3407c003b2ae54009 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Wed, 4 Dec 2019 14:22:31 -0800 Subject: [PATCH 082/383] Update grappler/cluster_test.py reflecting changes in random_uniform op PiperOrigin-RevId: 283834913 Change-Id: If2869e4786d7c10c180bf6e7cb2ba5361a46c3cb --- tensorflow/python/grappler/cluster_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/grappler/cluster_test.py b/tensorflow/python/grappler/cluster_test.py index 2014c0dde3f..b192ba726f4 100644 --- a/tensorflow/python/grappler/cluster_test.py +++ b/tensorflow/python/grappler/cluster_test.py @@ -81,9 +81,9 @@ class ClusterTest(test.TestCase): self.assertLessEqual(1, len(peak_mem)) snapshot = peak_mem['/job:localhost/replica:0/task:0/device:CPU:0'] peak_usage = snapshot[0] - self.assertEqual(52, peak_usage) + self.assertEqual(12, peak_usage) live_tensors = snapshot[1] - self.assertEqual(15, len(live_tensors)) + self.assertEqual(5, len(live_tensors)) def testVirtualCluster(self): with ops.Graph().as_default() as g: @@ -107,8 +107,8 @@ class ClusterTest(test.TestCase): disable_timeline=False, devices=[named_device]) op_perfs, run_time, _ = grappler_cluster.MeasureCosts(grappler_item) - self.assertEqual(run_time, 0.000545) - self.assertEqual(len(op_perfs), 15) + self.assertEqual(run_time, 0.000209) + self.assertEqual(len(op_perfs), 5) estimated_perf = grappler_cluster.EstimatePerformance(named_device) self.assertEqual(7680.0, estimated_perf) From 23275fb35cf17482d147f88ce7d8f4ce9c2376f3 Mon Sep 17 00:00:00 2001 From: Igor Ganichev Date: Wed, 4 Dec 2019 14:37:00 -0800 Subject: [PATCH 083/383] Strip default attributes before calling into WorkerService Strip default attributes before calling RegisterGraph from ClusterFunctionLibraryRuntime and MasterSession. This change is the second part of stripping and re-inserting default attributes across WorkerService, which is needed to support forward compatibility across RPCs, i.e. an old server trying to run a new graph. 
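To make the call pattern concrete, here is a minimal sketch of how a caller strips default-valued attributes before handing a graph to RegisterGraph. The wrapper function name is illustrative and not part of this patch; the StripDefaultAttributes call itself is taken from the diff below:

    #include "tensorflow/core/framework/graph.pb.h"
    #include "tensorflow/core/framework/graph_def_util.h"
    #include "tensorflow/core/framework/op.h"

    // Illustrative helper only. Attributes whose value still equals the
    // default declared in the local op registry are removed from each NodeDef,
    // so an older worker whose op definitions predate those attributes can
    // still parse and run the graph; the worker re-inserts the defaults it
    // knows about when it registers the graph.
    void StripDefaultsBeforeRegisterGraph(tensorflow::GraphDef* gdef) {
      tensorflow::StripDefaultAttributes(*tensorflow::OpRegistry::Global(),
                                         gdef->mutable_node());
    }
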
PiperOrigin-RevId: 283838038 Change-Id: I0a1a898f9d7ff164b51a9121a2fa20e2263da6e4 --- .../distributed_runtime/cluster_function_library_runtime.cc | 3 +++ tensorflow/core/distributed_runtime/master_session.cc | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc index f7d1e345a2d..2f6e97a4aee 100644 --- a/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc +++ b/tensorflow/core/distributed_runtime/cluster_function_library_runtime.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/common_runtime/process_function_library_runtime.h" #include "tensorflow/core/distributed_runtime/worker_session.h" #include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/tensor.pb.h" @@ -223,6 +224,8 @@ void ClusterFunctionLibraryRuntime::Instantiate( req->set_session_handle(worker_session_->session_name()); req->set_create_worker_session_called(create_worker_session_called_); *req->mutable_graph_def() = std::move(gdef); + StripDefaultAttributes(*OpRegistry::Global(), + req->mutable_graph_def()->mutable_node()); req->mutable_graph_options() ->mutable_optimizer_options() ->set_do_function_inlining(true); diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc index 03970b91505..9c95c29b020 100644 --- a/tensorflow/core/distributed_runtime/master_session.cc +++ b/tensorflow/core/distributed_runtime/master_session.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/framework/allocation_description.pb.h" #include "tensorflow/core/framework/collective.h" #include "tensorflow/core/framework/cost_graph.pb.h" +#include "tensorflow/core/framework/graph_def_util.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/tensor.h" @@ -472,6 +473,8 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions( c->req.set_session_handle(session_handle_); c->req.set_create_worker_session_called(!should_deregister_); c->req.mutable_graph_def()->Swap(&graph_partitions[part.name]); + StripDefaultAttributes(*OpRegistry::Global(), + c->req.mutable_graph_def()->mutable_node()); *c->req.mutable_config_proto() = session_opts_.config; *c->req.mutable_graph_options() = session_opts_.config.graph_options(); *c->req.mutable_debug_options() = @@ -741,7 +744,7 @@ Status MasterSession::ReffedClientGraph::RunPartitionsHelper( // Waits for the RunGraph calls. call_opts->SetCancelCallback([&calls]() { LOG(INFO) << "Client requested cancellation for RunStep, cancelling " - "worker operations."; + "worker operations."; calls.StartCancel(); }); auto token = cm->get_cancellation_token(); From 92e07d0d13c645bbfdbc7187fe3e9080c723d9b5 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 4 Dec 2019 14:47:39 -0800 Subject: [PATCH 084/383] Internal build rule updates PiperOrigin-RevId: 283840388 Change-Id: I6949582e330932f655a1678394a5832240265173 --- tensorflow/python/ops/ragged/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD index 010a1a4cd71..083953ee837 100644 --- a/tensorflow/python/ops/ragged/BUILD +++ b/tensorflow/python/ops/ragged/BUILD @@ -6,6 +6,7 @@ package( "//intelligence/datum/prensor:__pkg__", "//learning/brain/contrib/text:__pkg__", "//nlp/nlx/bert:__pkg__", + "//nlp/nlx/i18n/saft:__subpackages__", "//nlp/nlx/infrastructure/multiscale:__subpackages__", "//nlp/projects/atc/tf/ops:__pkg__", "//research/socrates:__subpackages__", From 1f3fa0d852e1cf662721e508857e2e9fabf5bcae Mon Sep 17 00:00:00 2001 From: Reed Wanderman-Milne Date: Wed, 4 Dec 2019 14:56:20 -0800 Subject: [PATCH 085/383] Fix flaky normalization test. Before test_batchnorm_mixed_precision would fail ~0.3% of the time. PiperOrigin-RevId: 283842177 Change-Id: I918576a96be7533a5e0fb8ab96015b8ff0de7ab6 --- tensorflow/python/keras/layers/normalization_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py index 48e08ba763f..cbff4b48977 100644 --- a/tensorflow/python/keras/layers/normalization_test.py +++ b/tensorflow/python/keras/layers/normalization_test.py @@ -378,8 +378,8 @@ def _run_batchnorm_correctness_test(layer, dtype='float32', fused=False): out -= keras.backend.eval(norm.beta) out /= keras.backend.eval(norm.gamma) - np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1) - np.testing.assert_allclose(out.std(), 1.0, atol=1e-1) + np.testing.assert_allclose(out.mean(), 0.0, atol=2e-1) + np.testing.assert_allclose(out.std(), 1.0, atol=2e-1) @parameterized.parameters( From f540109342f8b7cb9b96163dae455013249c3128 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 14:57:05 -0800 Subject: [PATCH 086/383] Explicitly export files needed by other packages PiperOrigin-RevId: 283842350 Change-Id: I04e11deedd2611ec9706d58a38d4483f7daab074 --- tensorflow/compiler/mlir/xla/BUILD | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index ac3475cebc4..6a617206823 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -23,6 +23,8 @@ package_group( ], ) +exports_files(["ir/hlo_ops.td"]) + filegroup( name = "hlo_ops_td_files", srcs = [ From c29529aa7d55bc66b040917a36acdb5722231043 Mon Sep 17 00:00:00 2001 From: Jian Li Date: Wed, 4 Dec 2019 15:04:41 -0800 Subject: [PATCH 087/383] Add VectorBatchVectorCwiseProductAccumulate for int16 input. This is used in integer LSTM for peephole connections. 
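At its core the new routine is a saturating, rescaled multiply-accumulate over each batch. A simplified standalone sketch of that arithmetic follows; the function name is hypothetical and a plain right shift stands in for the MultiplyByQuantizedMultiplier rescaling used by the real kernel in the diff below:

    #include <algorithm>
    #include <cstdint>

    // Simplified sketch only: accumulate vector[v] * batch_vector[b][v] into
    // result, rescale by a non-negative right shift, and saturate the running
    // sum to the int16 range, mirroring the portable implementation below.
    void CwiseProductAccumulateSketch(const int16_t* vector, int v_size,
                                      const int16_t* batch_vector, int n_batch,
                                      int shift, int16_t* result) {
      for (int b = 0; b < n_batch; ++b) {
        for (int v = 0; v < v_size; ++v) {
          int32_t prod = static_cast<int32_t>(vector[v]) * (*batch_vector++);
          prod >>= shift;  // stand-in for the quantized-multiplier rescaling
          int32_t acc = prod + *result;
          *result++ = static_cast<int16_t>(
              std::min(32767, std::max(-32768, acc)));
        }
      }
    }
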
PiperOrigin-RevId: 283844221 Change-Id: Ic47e0fc433599782db5a33c2d461b7ab4968915e --- .../internal/optimized/neon_tensor_utils.h | 8 +++ .../internal/optimized/sse_tensor_utils.h | 8 +++ .../reference/portable_tensor_utils.cc | 14 +++++ .../reference/portable_tensor_utils.h | 8 +++ .../reference/portable_tensor_utils_impl.h | 4 ++ .../lite/kernels/internal/tensor_utils.h | 6 ++ .../kernels/internal/tensor_utils_test.cc | 55 ++++++++++++++++++- 7 files changed, 102 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h index 892fcebd110..4c7ef2cf3fe 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -172,6 +172,14 @@ void VectorVectorCwiseProductAccumulate(const float* vector1, result); } +void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, + const int16_t* batch_vector, + int n_batch, int32_t multiplier, + int shift, int16_t* result) { + PortableVectorBatchVectorCwiseProductAccumulate( + vector, v_size, batch_vector, n_batch, multiplier, shift, result); +} + float VectorVectorDotProduct(const float* vector1, const float* vector2, int v_size) { return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size); diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h index ebad7b70a95..7b08823e1ac 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h @@ -182,6 +182,14 @@ void VectorVectorCwiseProductAccumulate(const float* vector1, result); } +void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, + const int16_t* batch_vector, + int n_batch, int32_t multiplier, + int shift, int16_t* result) { + PortableVectorBatchVectorCwiseProductAccumulate( + vector, v_size, batch_vector, n_batch, multiplier, shift, result); +} + float VectorVectorDotProduct(const float* vector1, const float* vector2, int v_size) { return NEON_OR_PORTABLE(VectorVectorDotProduct, vector1, vector2, v_size); diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index b2b4dd25770..dcf0df8ebed 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -558,6 +558,20 @@ void PortableVectorVectorCwiseProductAccumulate(const float* vector1, } } +void PortableVectorBatchVectorCwiseProductAccumulate( + const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch, + int32_t multiplier, int shift, int16_t* result) { + for (int b = 0; b < n_batch; b++) { + for (int v = 0; v < v_size; v++) { + int32_t prod = vector[v] * *batch_vector++; + prod = MultiplyByQuantizedMultiplier(prod, multiplier, shift); + int32_t output = prod + *result; + output = std::max(std::min(32767, output), -32768); + *result++ = output; + } + } +} + void PortableVectorBatchVectorAdd(const float* vector, int v_size, int n_batch, float* batch_vector) { for (int b = 0; b < n_batch; b++) { diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index 918775234f7..068fe3a8593 100644 --- 
a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -197,6 +197,14 @@ void VectorVectorCwiseProductAccumulate(const float* vector1, PortableVectorVectorCwiseProductAccumulate(vector1, vector2, v_size, result); } +void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, + const int16_t* batch_vector, + int n_batch, int32_t multiplier, + int shift, int16_t* result) { + PortableVectorBatchVectorCwiseProductAccumulate( + vector, v_size, batch_vector, n_batch, multiplier, shift, result); +} + float VectorVectorDotProduct(const float* vector1, const float* vector2, int v_size) { return PortableVectorVectorDotProduct(vector1, vector2, v_size); diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index 448c510e58a..fe06f582320 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -99,6 +99,10 @@ void PortableBatchVectorBatchVectorDotProduct(const int16_t* vector1, int32_t* result, int result_stride); +void PortableVectorBatchVectorCwiseProductAccumulate( + const int16_t* vector, int v_size, const int16_t* batch_vector, int n_batch, + int32_t multiplier, int shift, int16_t* result); + void PortableMatrixBatchVectorMultiplyAccumulate( const int8_t* input, const int32_t* bias, const int8_t* input_to_gate_weights, int32_t multiplier, int32_t shift, diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h index a9a2b839547..b62cc8b089c 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils.h +++ b/tensorflow/lite/kernels/internal/tensor_utils.h @@ -406,6 +406,12 @@ inline void VectorBatchVectorCwiseProductAccumulate(const T* vector, int v_size, } } +// Same as above, but inputs are 16bit integer and output is 16bit integer. +void VectorBatchVectorCwiseProductAccumulate(const int16_t* vector, int v_size, + const int16_t* batch_vector, + int n_batch, int32_t multiplier, + int shift, int16_t* result); + // Add another vector for each batch in the batch vector. 
void VectorBatchVectorAdd(const float* vector, int v_size, int n_batch, float* batch_vector); diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc index a36e4fc4247..cf31bf046b2 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc @@ -1424,7 +1424,60 @@ TEST(uKernels, Sub1VectorInt16Test) { })); } -TEST(uKernels, VectorBatchVectorCwiseProductAccumulate) { +TEST(uKernels, VectorBatchVectorCwiseProductAccumulateInteger) { + constexpr int kVectorSize = 29; + constexpr int kBatchSize = 4; + static int16_t vector[kVectorSize] = {-10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18}; + const std::vector batch_vector = { + /* batch 0 */ + 10, 11, 12, 13, 14, 15, 16, 17, 18, -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, + /* batch 1 */ + -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, + /* batch 2 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10, 11, 12, + 13, 14, 15, 16, 17, 18, + /* batch 3 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10, 11, 12, + 13, 14, 15, 16, 17, 18}; + std::vector batch_output = { + /* batch 0 */ + -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 0, 1, + 2, 3, 4, 5, 6, 7, 8, 9, + /* batch 1 */ + 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -10, 9, 8, 7, 6, 5, + 4, 3, 2, 1, 10, 11, 12, + /* batch 2 */ + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10, 11, 12, + 13, 14, 15, 16, 17, 18, + /* batch 3 */ + 10, 11, 12, 13, 14, 15, 16, 17, 18, -10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 1, + 13, 14, 15, 16, 17, 18}; + // Test with 0.25 scale, which is decomposed into (1073741824, -1). + VectorBatchVectorCwiseProductAccumulate(vector, kVectorSize, + batch_vector.data(), kBatchSize, + 1073741824, -1, batch_output.data()); + + const std::vector expected_output = { + /* batch 0 */ + -35, 34, 32, 30, 27, 24, 20, 16, 11, -2, 10, 13, 16, 18, 19, 20, 21, 21, + 20, 0, 4, 8, 12, 17, 23, 29, 35, 42, 50, + /* batch 1 */ + 27, 24, 20, 18, 15, 14, 12, 12, 1, 2, 2, 6, 10, 15, 20, 26, 32, 39, 26, 9, + 11, 13, 15, 18, 22, 26, 30, 35, 51, + /* batch 2 */ + 11, 15, 4, 7, 8, 10, 10, 11, 10, 10, 8, 12, -6, 15, 14, 14, 12, 11, 8, 6, + 27, 32, 46, 54, 61, 70, 78, 88, 97, + /* batch 3 */ + 17, 21, 14, 17, 18, 20, 20, 21, 20, 20, 18, -7, 13, 14, 13, 13, 11, 10, 7, + 5, 26, 31, 37, 56, 63, 72, 80, 90, 99}; + EXPECT_THAT(batch_output, testing::ElementsAreArray(expected_output)); +} + +TEST(uKernels, VectorBatchVectorCwiseProductAccumulateFloat) { constexpr int kVectorSize = 29; constexpr int kBatchSize = 4; static float input[kVectorSize] = { From 26f7a8d6a9cb992f6498b4cf26188514e2f52a38 Mon Sep 17 00:00:00 2001 From: Yanhua Sun Date: Wed, 4 Dec 2019 15:10:31 -0800 Subject: [PATCH 088/383] random_seed documentation improvement PiperOrigin-RevId: 283845553 Change-Id: If92090d1911007602f368bb6bd06563b80527871 --- tensorflow/python/framework/random_seed.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py index cb7e0320dfa..eff0434f52e 100644 --- a/tensorflow/python/framework/random_seed.py +++ b/tensorflow/python/framework/random_seed.py @@ -96,9 +96,12 @@ def set_random_seed(seed): 1. 
If neither the graph-level nor the operation seed is set: A random seed is used for this op. 2. If the graph-level seed is set, but the operation seed is not: - The system deterministically (determined by the current graph size) picks - an operation seed in conjunction with the graph-level seed so that it gets - a unique random sequence. + The system deterministically picks an operation seed in conjunction with + the graph-level seed so that it gets a unique random sequence. Within the + same version of tensorflow and user code, this sequence is deterministic. + However across different versions, this sequence might change. If the + code depends on particular seeds to work, specify both graph-level + and operation-level seeds explicitly. 3. If the graph-level seed is not set, but the operation seed is set: A default graph-level seed and the specified operation seed are used to determine the random sequence. @@ -198,9 +201,13 @@ def set_seed(seed): 1. If neither the global seed nor the operation seed is set: A randomly picked seed is used for this op. - 2. If the operation seed is not set but the global seed is set: The system - picks an operation seed from a stream of seeds determined by the global - seed. + 2. If the graph-level seed is set, but the operation seed is not: + The system deterministically picks an operation seed in conjunction with + the graph-level seed so that it gets a unique random sequence. Within the + same version of tensorflow and user code, this sequence is deterministic. + However across different versions, this sequence might change. If the + code depends on particular seeds to work, specify both graph-level + and operation-level seeds explicitly. 3. If the operation seed is set, but the global seed is not set: A default global seed and the specified operation seed are used to determine the random sequence. @@ -308,5 +315,4 @@ def set_seed(seed): Args: seed: integer. """ - # TODO(go/tf2-random): change doc, update to match design doc set_random_seed(seed) From 99f0e90b384cfb255103a8965bec0d11a7995e49 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Wed, 4 Dec 2019 15:18:40 -0800 Subject: [PATCH 089/383] Protect EagerContext on worker side when updating cluster. When handling worker failures, the failure handling thread sends update context request to all workers. In the meanwhile, other eager executors might be sending op/function execution requests. This change avoids the necessity of grabbing a global lock on the client side to prevent race conditions of concurrent updating and execution. * Ref count the eager client to avoid deallocating them before pending requests finish. * Hold context lock on worker side to avoid concurrently executing enqueue ops while handling context update. * Adjust local device initialization to avoid clearing the _context_devices list since this can be called multiple times by update_server_def. 
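For reference, a minimal sketch of the caller-side pattern after this change; the wrapper function is illustrative, while the types, the GetClient signature, and the StreamingEnqueueAsync call are taken from the diffs below:

    #include "tensorflow/core/common_runtime/eager/context.h"
    #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
    #include "tensorflow/core/lib/core/errors.h"
    #include "tensorflow/core/lib/core/refcount.h"

    // Illustrative only. GetClient now hands back a core::RefCountPtr, so the
    // client object stays alive while the caller holds it, and the gRPC
    // client's callback wrapper takes its own reference until the RPC
    // completes, even if update_server_def replaces the cluster meanwhile.
    tensorflow::Status EnqueueOnRemote(
        tensorflow::EagerContext* ctx, tensorflow::Device* device,
        const tensorflow::eager::EnqueueRequest* request,
        tensorflow::eager::EnqueueResponse* response) {
      tensorflow::core::RefCountPtr<tensorflow::eager::EagerClient> client;
      TF_RETURN_IF_ERROR(ctx->GetClient(device, &client));
      client->StreamingEnqueueAsync(
          request, response, [](const tensorflow::Status& s) {
            // Completion callback; the reference held by the client's
            // callback wrapper is released after this runs.
          });
      return tensorflow::Status::OK();
    }
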
PiperOrigin-RevId: 283847202 Change-Id: I3f84d56c44cd2adce5136f7fd4f67313a1da3610 --- tensorflow/c/eager/c_api.cc | 8 +- .../core/common_runtime/eager/context.cc | 15 +- .../core/common_runtime/eager/context.h | 14 +- .../core/common_runtime/eager/execute.cc | 5 +- .../core/distributed_runtime/eager/BUILD | 1 + .../eager/cluster_function_library_runtime.cc | 5 +- .../eager/destroy_tensor_handle_node.h | 33 +---- .../distributed_runtime/eager/eager_client.h | 13 +- .../eager/eager_service_impl.cc | 30 ++-- .../eager/eager_service_impl_test.cc | 12 +- .../eager/remote_copy_node.cc | 6 +- .../eager/remote_execute_node.h | 2 + .../eager/remote_tensor_handle_data.cc | 6 +- .../rpc/eager/grpc_eager_client.cc | 131 +++++++++++------- tensorflow/python/eager/context.py | 10 +- 15 files changed, 173 insertions(+), 118 deletions(-) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 46ade1b2e77..8793e308466 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -233,7 +233,7 @@ tensorflow::Status GetReplacedFromExistingWorkers( std::vector responses( existing_workers->size()); for (int i = 0; i < existing_workers->size(); i++) { - tensorflow::eager::EagerClient* eager_client; + tensorflow::core::RefCountPtr eager_client; statuses[i] = client_cache->GetClient(existing_workers->at(i), &eager_client); if (!statuses[i].ok()) { @@ -282,7 +282,7 @@ tensorflow::Status CreateRemoteContexts( continue; } - tensorflow::eager::EagerClient* eager_client; + tensorflow::core::RefCountPtr eager_client; statuses[i] = remote_eager_workers->GetClient(remote_worker, &eager_client); if (eager_client == nullptr) { statuses[i] = tensorflow::errors::Internal( @@ -340,7 +340,7 @@ tensorflow::Status UpdateRemoteContexts( continue; } - tensorflow::eager::EagerClient* eager_client; + tensorflow::core::RefCountPtr eager_client; statuses[i] = remote_eager_workers->GetClient(remote_worker, &eager_client); if (eager_client == nullptr) { statuses[i] = tensorflow::errors::Internal( @@ -819,7 +819,7 @@ TF_CAPI_EXPORT extern bool TFE_ContextCheckAlive(TFE_Context* ctx, } // TODO(yuefengz): support partially specified `worker_name`. - tensorflow::eager::EagerClient* eager_client; + tensorflow::core::RefCountPtr eager_client; status->status = remote_eager_workers->GetClient(worker_name, &eager_client); if (!status->status.ok()) { return false; diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc index b5b0bce52ef..b8dd8d8dcd1 100644 --- a/tensorflow/core/common_runtime/eager/context.cc +++ b/tensorflow/core/common_runtime/eager/context.cc @@ -273,7 +273,7 @@ void EagerContext::CloseRemoteContexts( int i = 0; for (const auto& worker : remote_contexts) { - eager::EagerClient* client; + core::RefCountPtr client; Status s = remote_eager_workers_->GetClient(worker, &client); client->CloseContextAsync( @@ -449,7 +449,7 @@ Status EagerContext::MaybeRegisterFunctionRemotely(const FunctionDef& fdef) { register_function->mutable_function_def()->mutable_node_def()); for (const auto& target : remote_contexts_) { - eager::EagerClient* eager_client; + core::RefCountPtr eager_client; TF_RETURN_IF_ERROR(remote_eager_workers_->GetClient(target, &eager_client)); eager::EnqueueResponse* response = new eager::EnqueueResponse(); @@ -475,7 +475,7 @@ Status EagerContext::RegisterExistingFunctionsOnRemoteWorkers( // Register multiple functions on selected remote workers. 
uint64 context_id = GetContextId(); for (int i = 0; i < remote_workers.size(); i++) { - eager::EagerClient* eager_client; + core::RefCountPtr eager_client; Status s = remote_eager_workers_->GetClient(remote_workers[i], &eager_client); if (!s.ok()) { @@ -649,12 +649,13 @@ Status GetTaskName(Device* d, string* task_name) { } // namespace #if !defined(IS_MOBILE_PLATFORM) -Status EagerContext::GetClient(Device* device, eager::EagerClient** client) { +Status EagerContext::GetClient(Device* device, + core::RefCountPtr* client) { return GetClient(device->parsed_name(), client); } Status EagerContext::GetClient(const DeviceNameUtils::ParsedName& device_name, - eager::EagerClient** client) { + core::RefCountPtr* client) { if (remote_eager_workers_ == nullptr) { return errors::Internal( "Haven't set up remote eager worker in this eager context yet."); @@ -685,7 +686,7 @@ Status EagerContext::GetClient(const DeviceNameUtils::ParsedName& device_name, } Status EagerContext::GetClient(const string& remote_task, - eager::EagerClient** client) { + core::RefCountPtr* client) { if (remote_eager_workers_ == nullptr) { return errors::Internal( "Haven't set up remote eager worker in this eager context yet."); @@ -934,7 +935,7 @@ Status EagerContext::SetMasterContextState( if (keep_alive_secs_ > 0) { { for (const auto& worker : remote_contexts_) { - eager::EagerClient* client; + core::RefCountPtr client; Status s = remote_eager_workers_->GetClient(worker, &client); diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h index 4a9606e80dd..93fbd8947fe 100644 --- a/tensorflow/core/common_runtime/eager/context.h +++ b/tensorflow/core/common_runtime/eager/context.h @@ -265,10 +265,18 @@ class EagerContext : public core::RefCounted { FunctionLibraryDefinition* FuncLibDef() { return &func_lib_def_; } #if !defined(IS_MOBILE_PLATFORM) - Status GetClient(Device* device, eager::EagerClient** client); + // Assign the EagerClient pointer to `client` based on the given device / task + // name, and increment the refcount of the client. The reference ownership is + // transferred to the caller, and the unref should automatically happen when + // destructing the RefCountPtr object at the caller's side. + // `client` must not be initialized or holding a reference of another object + // before calling this method. + Status GetClient(Device* device, + core::RefCountPtr* client); Status GetClient(const DeviceNameUtils::ParsedName& device_name, - eager::EagerClient** client); - Status GetClient(const string& remote_task, eager::EagerClient** client); + core::RefCountPtr* client); + Status GetClient(const string& remote_task, + core::RefCountPtr* client); uint64 GetContextId(); uint64 GetContextViewId(); diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc index 32fdb21c1b4..32937bfdfc4 100644 --- a/tensorflow/core/common_runtime/eager/execute.cc +++ b/tensorflow/core/common_runtime/eager/execute.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/platform/platform.h" // clang-format on @@ -727,7 +728,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, op->SetDevice(device); } - eager::EagerClient* eager_client = nullptr; + core::RefCountPtr eager_client; uint64 context_id = ctx->GetContextId(); TF_RETURN_IF_ERROR(ctx->GetClient(op->GetDeviceParsedName(), &eager_client)); string remote_task; @@ -860,7 +861,7 @@ Status EagerRemoteExecute(EagerOperation* op, TensorHandle** retvals, << " (is async?: " << executor.Async() << ")."; std::unique_ptr node(new eager::RemoteExecuteNode( - std::move(request), op_device, eager_client, + std::move(request), op_device, eager_client.get(), op->MutableAttrs()->BuildNodeDef(), op->EagerContext()->FuncLibDef(), op->Inputs(), {retvals, num_outputs})); Status s = executor.AddOrExecute(std::move(node)); diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD index bbcc10b029a..6cd525b317d 100644 --- a/tensorflow/core/distributed_runtime/eager/BUILD +++ b/tensorflow/core/distributed_runtime/eager/BUILD @@ -65,6 +65,7 @@ cc_library( deps = [ "//tensorflow/core:eager_service_proto_cc", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", ], ) diff --git a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc index a1cfe5813f1..3f940284396 100644 --- a/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc +++ b/tensorflow/core/distributed_runtime/eager/cluster_function_library_runtime.cc @@ -59,7 +59,7 @@ void EagerClusterFunctionLibraryRuntime::Instantiate( VLOG(1) << "CFLR::Instantiate: " << function_name << " on " << target << " (this: " << this << ")"; - eager::EagerClient* eager_client = nullptr; + core::RefCountPtr eager_client; Device* device; s = ctx_->FindDeviceFromName(target.c_str(), &device); if (!s.ok()) { @@ -97,7 +97,8 @@ void EagerClusterFunctionLibraryRuntime::Instantiate( eager_client->EnqueueAsync(request, response, [this, request, response, handle, released_op, - target, eager_client, done](const Status& s) { + target, eager_client = eager_client.get(), + done](const Status& s) { { mutex_lock l(mu_); *handle = function_data_.size(); diff --git a/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h b/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h index 869345fcdd3..bc1670b9f71 100644 --- a/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h +++ b/tensorflow/core/distributed_runtime/eager/destroy_tensor_handle_node.h @@ -30,45 +30,24 @@ namespace eager { class DestroyTensorHandleNode : public tensorflow::AsyncEagerNode { public: DestroyTensorHandleNode(std::unique_ptr request, - EagerContext* ctx, const string& remote_task, - bool ready) + EagerClient* eager_client, bool ready) : tensorflow::AsyncEagerNode(), request_(std::move(request)), - ctx_(ctx), - remote_task_(remote_task), + eager_client_(eager_client), ready_(ready) { - ctx_->Ref(); + eager_client_->Ref(); } - ~DestroyTensorHandleNode() override { ctx_->Unref(); } + ~DestroyTensorHandleNode() override { eager_client_->Unref(); } void RunAsync(StatusCallback done) override { - auto context_id = request_->context_id(); - if 
(ctx_->GetContextId() != context_id) { - // This means that this tensor was pointing to a remote device, which - // has been changed out from under us. Simply return since there is - // nothing we can do. - done(Status::OK()); - return; - } - - eager::EagerClient* eager_client; - Status status = ctx_->GetClient(remote_task_, &eager_client); - if (!status.ok()) { - LOG_EVERY_N_SEC(INFO, 60) - << "Unable to destroy remote tensor handle because the target " - << remote_task_ << " is no longer available."; - done(Status::OK()); - return; - } - EnqueueResponse* response = new EnqueueResponse; bool ready = ready_; // NOTE(fishx): Don't use StreamingEnqueueAsync here. When a // StreamingEnqueueAsync request fails all following requests will fail as // well. We don't want this request poison following requests since it is // safe to ignore a failing destroy tensor handle request. - eager_client->EnqueueAsync( + eager_client_->EnqueueAsync( request_.get(), response, [response, ready, done](const tensorflow::Status& s) { // Omit the warning if: @@ -96,7 +75,7 @@ class DestroyTensorHandleNode : public tensorflow::AsyncEagerNode { private: std::unique_ptr request_; - EagerContext* ctx_; + EagerClient* eager_client_; const string remote_task_; bool ready_; }; diff --git a/tensorflow/core/distributed_runtime/eager/eager_client.h b/tensorflow/core/distributed_runtime/eager/eager_client.h index 089cf25d9b4..3b083f3cae6 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_client.h +++ b/tensorflow/core/distributed_runtime/eager/eager_client.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_ #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_EAGER_EAGER_CLIENT_H_ +#include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/protobuf/eager_service.pb.h" @@ -25,9 +26,9 @@ namespace eager { // This is a base class that can be implemented by a variety of // transports (e.g. gRPC which for each of the client methods makes an RPC). -class EagerClient { +class EagerClient : public core::RefCounted { public: - virtual ~EagerClient() {} + ~EagerClient() override {} #define CLIENT_METHOD(method) \ virtual void method##Async(const method##Request* request, \ method##Response* response, \ @@ -62,7 +63,13 @@ class EagerClient { class EagerClientCache { public: virtual ~EagerClientCache() {} - virtual Status GetClient(const string& target, EagerClient** client) = 0; + + // If the `target` exists, assign the EagerClient pointer to `client` and + // increment the refcount of the client. The reference ownership is + // transferred to the caller, and the unref should automatically happen when + // destructing the RefCountPtr object from the caller's side. + virtual Status GetClient(const string& target, + core::RefCountPtr* client) = 0; }; } // namespace eager diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index 92e3d2fb3cf..e1a5f341816 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -235,7 +235,6 @@ Status EagerServiceImpl::UpdateContext(const UpdateContextRequest* request, " but received update request at view #", request->context_view_id(), ". 
View id should only be continuously incremented."); } - ctx->ClearCaches(); // TODO(b/143914772): Potential memory leak if rendezvous has pending // tensors for removed / replaced workers. @@ -277,13 +276,25 @@ Status EagerServiceImpl::UpdateContext(const UpdateContextRequest* request, DistributedFunctionLibraryRuntime* cluster_flr = eager::CreateClusterFLR(request->context_id(), ctx, worker_session.get()); - Status s = ctx->UpdateRemoteWorker( - device_mgr, std::move(remote_eager_workers), - worker_session->remote_device_mgr(), remote_workers, - request->context_id(), cluster_flr); - if (!s.ok()) { - VLOG(1) << "EagerContext::UpdateRemoteWorker failed with " << s.ToString(); - return s; + { + // Hold `contexts_mu_` exclusively, wait for all pending nodes to finish + // (implicitly calling WaitForAllPendingNodes inside `ctx->ClearCaches`), + // and update the context state. + // This lock prevents other threads from handling enqueue requests at the + // same time. Each enqueue request will be processed either with context + // state before or after the update, but the exact ordering needs to be + // determined by the client if desired. + mutex_lock lock(contexts_mu_); + ctx->ClearCaches(); + Status s = ctx->UpdateRemoteWorker( + device_mgr, std::move(remote_eager_workers), + worker_session->remote_device_mgr(), remote_workers, + request->context_id(), cluster_flr); + if (!s.ok()) { + VLOG(1) << "EagerContext::UpdateRemoteWorker failed with " + << s.ToString(); + return s; + } } std::vector device_attributes; @@ -408,6 +419,9 @@ Status EagerServiceImpl::Enqueue(const EnqueueRequest* request, TF_RETURN_IF_ERROR(GetServerContext(request->context_id(), &context)); core::ScopedUnref context_unref(context); + // Acquire shared lock to prevent handling enqueue requests while updating + // context (see UpdateContext). + tf_shared_lock lock(contexts_mu_); EagerExecutor& executor = stream_id == kInvalidStreamId ? context->Context()->Executor() diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc index dbf3c6370bc..a2c15daf0b3 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc @@ -103,13 +103,15 @@ class FakeEagerClient : public EagerClient { class DummyEagerClientCache : public EagerClientCache { public: DummyEagerClientCache() : client_(new FakeEagerClient) {} - Status GetClient(const string& target, EagerClient** client) override { - *client = client_.get(); + Status GetClient(const string& target, + core::RefCountPtr* client) override { + client->reset(client_.get()); + client_->Ref(); return Status::OK(); } private: - std::unique_ptr client_; + core::RefCountPtr client_; }; class FakeCache : public TestWorkerCache { @@ -481,9 +483,9 @@ class FunctionWithRemoteInputsTest : public EagerServiceImplTest { TF_ASSERT_OK(eager_service_impl_.GetEagerContext(context_id_, &ctx)); Device* device; TF_ASSERT_OK(ctx->FindDeviceFromName(local_device_.c_str(), &device)); - EagerClient* client; + core::RefCountPtr client; TF_ASSERT_OK(ctx->GetClient(device, &client)); - FakeEagerClient* fake_client = static_cast(client); + FakeEagerClient* fake_client = static_cast(client.get()); fake_client->SetServiceImpl(&eager_service_impl_); // Create an input on local_device for MatMulFunction. 
diff --git a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc index 0dfcd82d737..d0b07a5a97c 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_copy_node.cc @@ -156,7 +156,7 @@ void RemoteCopyNode::StartSend() { remote_op->set_id(ctx_->RemoteMgr()->NextOpId()); // Issue the RPC - eager::EagerClient* eager_client; + core::RefCountPtr eager_client; status = ctx_->GetClient(send_device_, &eager_client); if (!status.ok()) { captured_state_->SetSendStatus(status); @@ -199,7 +199,7 @@ void RemoteCopyNode::RunRemoteRecv(EagerOperation* op, StatusCallback done) { PrepareRemoteOp(remote_op, op); remote_op->set_id(recv_op_id_); - eager::EagerClient* eager_client; + core::RefCountPtr eager_client; Status status = ctx_->GetClient(recv_device_, &eager_client); if (!status.ok()) { captured_state_->dst()->Poison(status); @@ -307,7 +307,7 @@ void RemoteCopyNode::StartRemoteSendTensor(StatusCallback done) { } tensor.AsProtoTensorContent(send_tensor->add_tensors()); - eager::EagerClient* eager_client; + core::RefCountPtr eager_client; s = ctx_->GetClient(recv_device_, &eager_client); if (!s.ok()) { captured_state_->dst()->Poison(s); diff --git a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h index 3736173cd19..b0342fc5056 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_execute_node.h +++ b/tensorflow/core/distributed_runtime/eager/remote_execute_node.h @@ -60,6 +60,7 @@ class RemoteExecuteNode : public AsyncEagerNode { for (auto handle : inputs_) { handle->Ref(); } + eager_client_->Ref(); } ~RemoteExecuteNode() override { @@ -70,6 +71,7 @@ class RemoteExecuteNode : public AsyncEagerNode { for (auto handle : inputs_) { handle->Unref(); } + eager_client_->Unref(); } Status Prepare() override { diff --git a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc index 58741ee2c9a..af63c20a7f4 100644 --- a/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc +++ b/tensorflow/core/distributed_runtime/eager/remote_tensor_handle_data.cc @@ -34,7 +34,7 @@ void DestroyRemoteTensorHandle(EagerContext* ctx, const string& remote_task, return; } - eager::EagerClient* eager_client; + core::RefCountPtr eager_client; Status status = ctx->GetClient(remote_task, &eager_client); if (!status.ok()) { LOG_EVERY_N_SEC(INFO, 60) @@ -52,8 +52,8 @@ void DestroyRemoteTensorHandle(EagerContext* ctx, const string& remote_task, VLOG(3) << "Sending request to delete " << request->DebugString(); std::unique_ptr node( - absl::make_unique(std::move(request), ctx, - remote_task, ready)); + absl::make_unique( + std::move(request), eager_client.get(), ready)); auto& executor = ctx->Executor(); if (executor.Async()) { Status status = executor.AddOrExecute(std::move(node)); diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index 487479af782..921696efbcc 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_state.h" #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h" +#include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/protobuf/eager_service.pb.h" @@ -61,21 +62,68 @@ bool EnableStreaming() { return result; } +// Ref-counted thread to handle callbacks for completed requests a GRPC +// completion queue. The thread might be shared by multiple eager clients, and +// each one of them should hold a reference count to ensure that the thread +// outlives the clients. +// To ensure that every tag in completion queue is processed, this thread also +// holds a reference to itself and always wait until ref count is one to exit. +class GrpcEagerClientThread : public core::RefCounted { + public: + GrpcEagerClientThread() { + // Hold a reference to ensure every completion tag gets processed. + Ref(); + thread_.reset(Env::Default()->StartThread( + ThreadOptions(), "eager_client_thread", [this]() { + void* tag; + bool ok; + while (completion_queue_.Next(&tag, &ok)) { + VLOG(4) << "GrpcEagerClientThread got next tag"; + GrpcClientCQTag* callback_tag = static_cast(tag); + callback_tag->OnCompleted(ok); + VLOG(4) << "GrpcEagerClientThread blocking for next tag"; + if (RefCountIsOne()) { + break; + } + } + VLOG(4) << "GrpcEagerClientThread exiting"; + completion_queue_.Shutdown(); + // `this` holds the final reference so cannot directly Unref here. + // Instead, schedule a separate thread to clean it up. + Env::Default()->SchedClosure([this]() { this->Unref(); }); + })); + } + + ~GrpcEagerClientThread() override {} + + ::grpc::CompletionQueue* completion_queue() { return &completion_queue_; } + + private: + ::grpc::CompletionQueue completion_queue_; + std::unique_ptr thread_; +}; + class GrpcEagerClient : public EagerClient { public: GrpcEagerClient(const tensorflow::SharedGrpcChannelPtr& channel, - ::grpc::CompletionQueue* cq) - : stub_(channel), cq_(cq) {} - ~GrpcEagerClient() override {} + GrpcEagerClientThread* thread) + : stub_(channel), thread_(thread) { + // Hold a reference to make sure the corresponding EagerClientThread + // outlives the client. 
+ thread_->Ref(); + cq_ = thread->completion_queue(); + } + ~GrpcEagerClient() override { thread_->Unref(); } #define CLIENT_METHOD(method) \ void method##Async(const method##Request* request, \ method##Response* response, StatusCallback done) \ override { \ + StatusCallback done_wrapped = callback_wrapper(std::move(done)); \ new RPCState( \ &stub_, cq_, "/tensorflow.eager.EagerService/" #method, *request, \ - response, std::move(done), nullptr, nullptr, /*max_retries=*/0, \ - /*fail_fast=*/true); \ + response, std::move(done_wrapped), /*call_opts=*/nullptr, \ + /*threadpool=*/nullptr, /*max_retries=*/0, /*fail_fast=*/true); \ } CLIENT_METHOD(CreateContext); @@ -89,9 +137,11 @@ class GrpcEagerClient : public EagerClient { void CloseContextAsync(const CloseContextRequest* request, CloseContextResponse* response, StatusCallback done) override { + StatusCallback done_wrapped = callback_wrapper(std::move(done)); new RPCState( &stub_, cq_, "/tensorflow.eager.EagerService/CloseContext", *request, - response, std::move(done), nullptr, nullptr); + response, std::move(done_wrapped), /*call_opts=*/nullptr, + /*threadpool=*/nullptr); VLOG(1) << "Sending RPC to close remote eager context " << request->DebugString(); @@ -110,6 +160,7 @@ class GrpcEagerClient : public EagerClient { void StreamingEnqueueAsync(const EnqueueRequest* request, EnqueueResponse* response, StatusCallback done) override { + StatusCallback done_wrapped = callback_wrapper(std::move(done)); if (EnableStreaming()) { tf_shared_lock l(mu_); auto it = enqueue_dispatchers_.find(request->context_id()); @@ -122,7 +173,7 @@ class GrpcEagerClient : public EagerClient { "/tensorflow.eager.EagerService/StreamingEnqueue")); it = it_and_bool.first; } - it->second.SendNextRequest(*request, response, std::move(done)); + it->second.SendNextRequest(*request, response, std::move(done_wrapped)); } else { Notification n; Status status; @@ -131,29 +182,44 @@ class GrpcEagerClient : public EagerClient { n.Notify(); }); n.WaitForNotification(); - done(status); + done_wrapped(status); } } private: ::grpc::GenericStub stub_; + const GrpcEagerClientThread* thread_; + ::grpc::CompletionQueue* cq_; mutable mutex mu_; std::unordered_map> enqueue_dispatchers_ GUARDED_BY(mu_); + + StatusCallback callback_wrapper(StatusCallback done) { + Ref(); + return [this, done = std::move(done)](const Status& status) { + done(status); + this->Unref(); + }; + } }; class GrpcEagerClientCache : public EagerClientCache { public: explicit GrpcEagerClientCache( std::shared_ptr cache) - : next_round_robin_assignment_(0), cache_(cache), threads_(4) {} + : next_round_robin_assignment_(0), cache_(cache), threads_(4) { + for (int i = 0; i < threads_.size(); i++) { + threads_[i].reset(new GrpcEagerClientThread()); + } + } ~GrpcEagerClientCache() override { threads_.clear(); } - Status GetClient(const string& target, EagerClient** client) override { + Status GetClient(const string& target, + core::RefCountPtr* client) override { auto it = clients_.find(target); if (it == clients_.end()) { tensorflow::SharedGrpcChannelPtr shared = @@ -162,13 +228,14 @@ class GrpcEagerClientCache : public EagerClientCache { return errors::InvalidArgument("Client for target ", target, " not found."); } - auto worker = std::unique_ptr(new GrpcEagerClient( - shared, threads_[AssignClientToThread(target)].completion_queue())); - - it = clients_.emplace(target, std::move(worker)).first; + int assigned_index = AssignClientToThread(target); + GrpcEagerClientThread* thread = threads_[assigned_index].get(); + auto 
worker = new GrpcEagerClient(shared, thread); + it = clients_.emplace(target, worker).first; } - *client = it->second.get(); + it->second->Ref(); + client->reset(it->second.get()); return Status::OK(); } @@ -192,39 +259,9 @@ class GrpcEagerClientCache : public EagerClientCache { return it->second; } - class GrpcEagerClientThread { - public: - GrpcEagerClientThread() { - thread_.reset(Env::Default()->StartThread( - ThreadOptions(), "eager_client_thread", [this]() { - void* tag; - bool ok; - while (completion_queue_.Next(&tag, &ok)) { - VLOG(4) << "GrpcEagerClientThread got next tag"; - GrpcClientCQTag* callback_tag = - static_cast(tag); - callback_tag->OnCompleted(ok); - VLOG(4) << "GrpcEagerClientThread blocking for next tag"; - } - VLOG(4) << "GrpcEagerClientThread exiting"; - })); - } - - ~GrpcEagerClientThread() { - completion_queue_.Shutdown(); - thread_.reset(); - } - - ::grpc::CompletionQueue* completion_queue() { return &completion_queue_; } - - private: - ::grpc::CompletionQueue completion_queue_; - std::unique_ptr thread_; - }; // GrpcEagerClientThread - std::shared_ptr cache_; - std::unordered_map> clients_; - std::vector threads_; + std::unordered_map> clients_; + std::vector> threads_; }; } // namespace diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py index dbcdd4a83d6..19626ec7059 100644 --- a/tensorflow/python/eager/context.py +++ b/tensorflow/python/eager/context.py @@ -461,27 +461,29 @@ class Context(object): def _initialize_logical_devices(self): """Helper to initialize devices.""" # Store list of devices - self._logical_devices = [] - self._context_devices = [] + logical_devices = [] + context_devices = [] device_list = pywrap_tensorflow.TFE_ContextListDevices( self._context_handle) try: self._num_gpus = 0 for i in range(pywrap_tensorflow.TF_DeviceListCount(device_list)): dev_name = pywrap_tensorflow.TF_DeviceListName(device_list, i) - self._context_devices.append(pydev.canonical_name(dev_name)) + context_devices.append(pydev.canonical_name(dev_name)) spec = pydev.DeviceSpec.from_string(dev_name) # If the job is localhost, we assume that the cluster has not yet been # configured and thus clear the job, replica & task. if spec.job == "localhost": spec = spec.replace(job=None, replica=None, task=None) - self._logical_devices.append( + logical_devices.append( LogicalDevice(name=spec.to_string(), device_type=spec.device_type)) dev_type = pywrap_tensorflow.TF_DeviceListType(device_list, i) if dev_type == "GPU": self._num_gpus += 1 finally: + self._logical_devices = logical_devices + self._context_devices = context_devices pywrap_tensorflow.TF_DeleteDeviceList(device_list) def ensure_initialized(self): From cdfe80802bd6100d2a78c569302aa5108b45c058 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Wed, 4 Dec 2019 15:19:06 -0800 Subject: [PATCH 090/383] Update tf.ones_like doc example to make it testable. PiperOrigin-RevId: 283847305 Change-Id: Ifb24351d3659c61842cde78845f085979376a4d6 --- tensorflow/python/ops/array_ops.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index c7156b5346e..c550feedfc2 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -2840,7 +2840,7 @@ def ones_like_v2( input, # pylint: disable=redefined-builtin dtype=None, name=None): - """Creates a tensor with all elements set to one. + """Creates a tensor of all ones that has the same shape as the input. 
Given a single tensor (`tensor`), this operation returns a tensor of the same type and shape as `tensor` with all elements set to 1. Optionally, @@ -2848,10 +2848,11 @@ def ones_like_v2( For example: - ```python - tensor = tf.constant([[1, 2, 3], [4, 5, 6]]) - tf.ones_like(tensor) # [[1, 1, 1], [1, 1, 1]] - ``` + >>> tensor = tf.constant([[1, 2, 3], [4, 5, 6]]) + >>> tf.ones_like(tensor) + Args: input: A `Tensor`. From 8489a5a40c17e5799159742fb8135746c06d0f5d Mon Sep 17 00:00:00 2001 From: Yanhui Liang Date: Wed, 4 Dec 2019 15:19:21 -0800 Subject: [PATCH 091/383] Unify subclass MLP model in Keras tests. PiperOrigin-RevId: 283847367 Change-Id: I43d12d89306c2facb2c430c6c2605b965df58829 --- .../keras/model_subclassing_compiled_test.py | 8 ++--- .../python/keras/model_subclassing_test.py | 20 ++++++------- .../keras/model_subclassing_test_util.py | 29 ++----------------- tensorflow/python/keras/testing_utils.py | 19 +++++++++--- 4 files changed, 31 insertions(+), 45 deletions(-) diff --git a/tensorflow/python/keras/model_subclassing_compiled_test.py b/tensorflow/python/keras/model_subclassing_compiled_test.py index bf27b3bf8a7..54a91bdcc57 100644 --- a/tensorflow/python/keras/model_subclassing_compiled_test.py +++ b/tensorflow/python/keras/model_subclassing_compiled_test.py @@ -44,8 +44,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase): num_samples = 100 input_dim = 50 - model = model_util.SimpleTestModel( - num_classes=num_classes, use_dp=True, use_bn=True) + model = testing_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) model.compile( loss='mse', optimizer='rmsprop', @@ -87,8 +87,8 @@ class ModelSubclassCompiledTest(keras_parameterized.TestCase): input_dim = 50 with self.cached_session(): - model = model_util.SimpleTestModel( - num_classes=num_classes, use_dp=True, use_bn=True) + model = testing_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) model.compile( loss='mse', optimizer='rmsprop', diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py index a4b8ac92b03..5202b44c3a8 100644 --- a/tensorflow/python/keras/model_subclassing_test.py +++ b/tensorflow/python/keras/model_subclassing_test.py @@ -135,8 +135,8 @@ class ModelSubclassingTest(keras_parameterized.TestCase): num_classes = 2 input_dim = 50 - model = model_util.SimpleTestModel( - num_classes=num_classes, use_dp=True, use_bn=True) + model = testing_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) self.assertFalse(model.built, 'Model should not have been built') self.assertFalse(model.weights, ('Model should have no weights since it ' @@ -212,8 +212,8 @@ class ModelSubclassingTest(keras_parameterized.TestCase): input_dim = 50 batch_size = None - model = model_util.SimpleTestModel( - num_classes=num_classes, use_dp=True, use_bn=True) + model = testing_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) self.assertFalse(model.built, 'Model should not have been built') self.assertFalse(model.weights, ('Model should have no weights since it ' @@ -229,8 +229,8 @@ class ModelSubclassingTest(keras_parameterized.TestCase): input_dim = tensor_shape.Dimension(50) batch_size = tensor_shape.Dimension(None) - model = model_util.SimpleTestModel( - num_classes=num_classes, use_dp=True, use_bn=True) + model = testing_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) 
self.assertFalse(model.built, 'Model should not have been built') self.assertFalse(model.weights, ('Model should have no weights since it ' @@ -338,7 +338,8 @@ class ModelSubclassingTest(keras_parameterized.TestCase): self.contents += msg + '\n' # Single-io - model = model_util.SimpleTestModel(num_classes=4, use_bn=True, use_dp=True) + model = testing_utils.SmallSubclassMLP( + num_hidden=32, num_classes=4, use_bn=True, use_dp=True) model._set_inputs(np.ones((3, 4))) # need to build model first print_fn = ToString() model.summary(print_fn=print_fn) @@ -481,7 +482,6 @@ class ModelSubclassingTest(keras_parameterized.TestCase): self.assertEqual(1, len(model.get_updates_for(x))) - class GraphSpecificModelSubclassingTests(test.TestCase): @test_util.run_deprecated_v1 @@ -491,8 +491,8 @@ class GraphSpecificModelSubclassingTests(test.TestCase): input_dim = 50 with self.cached_session(): - model = model_util.SimpleTestModel( - num_classes=num_classes, use_dp=True, use_bn=True) + model = testing_utils.SmallSubclassMLP( + num_hidden=32, num_classes=num_classes, use_dp=True, use_bn=True) model.compile(loss='mse', optimizer='rmsprop') x = array_ops.ones((num_samples, input_dim)) diff --git a/tensorflow/python/keras/model_subclassing_test_util.py b/tensorflow/python/keras/model_subclassing_test_util.py index cf627b984a1..5802d319e6b 100644 --- a/tensorflow/python/keras/model_subclassing_test_util.py +++ b/tensorflow/python/keras/model_subclassing_test_util.py @@ -23,30 +23,6 @@ from tensorflow.python.keras import testing_utils # pylint: disable=missing-docstring,not-callable -class SimpleTestModel(keras.Model): - - def __init__(self, use_bn=False, use_dp=False, num_classes=10): - super(SimpleTestModel, self).__init__(name='test_model') - self.use_bn = use_bn - self.use_dp = use_dp - self.num_classes = num_classes - - self.dense1 = keras.layers.Dense(32, activation='relu') - self.dense2 = keras.layers.Dense(num_classes, activation='softmax') - if self.use_dp: - self.dp = keras.layers.Dropout(0.5) - if self.use_bn: - self.bn = keras.layers.BatchNormalization(axis=-1) - - def call(self, x): - x = self.dense1(x) - if self.use_dp: - x = self.dp(x) - if self.use_bn: - x = self.bn(x) - return self.dense2(x) - - class SimpleConvTestModel(keras.Model): def __init__(self, num_classes=10): @@ -92,9 +68,8 @@ class NestedTestModel1(keras.Model): self.dense1 = keras.layers.Dense(32, activation='relu') self.dense2 = keras.layers.Dense(num_classes, activation='relu') self.bn = keras.layers.BatchNormalization() - self.test_net = SimpleTestModel(num_classes=4, - use_bn=True, - use_dp=True) + self.test_net = testing_utils.SmallSubclassMLP( + num_hidden=32, num_classes=4, use_bn=True, use_dp=True) def call(self, inputs): x = self.dense1(inputs) diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py index aa4059cb50e..2c48434cb68 100644 --- a/tensorflow/python/keras/testing_utils.py +++ b/tensorflow/python/keras/testing_utils.py @@ -416,17 +416,28 @@ def get_small_functional_mlp(num_hidden, num_classes, input_dim): return keras.Model(inputs, outputs) -class _SmallSubclassMLP(keras.Model): +class SmallSubclassMLP(keras.Model): """A subclass model based small MLP.""" - def __init__(self, num_hidden, num_classes): - super(_SmallSubclassMLP, self).__init__() + def __init__(self, num_hidden, num_classes, use_bn=False, use_dp=False): + super(SmallSubclassMLP, self).__init__(name='test_model') + self.use_bn = use_bn + self.use_dp = use_dp + self.layer_a = keras.layers.Dense(num_hidden, 
activation='relu') activation = 'sigmoid' if num_classes == 1 else 'softmax' self.layer_b = keras.layers.Dense(num_classes, activation=activation) + if self.use_dp: + self.dp = keras.layers.Dropout(0.5) + if self.use_bn: + self.bn = keras.layers.BatchNormalization(axis=-1) def call(self, inputs, **kwargs): x = self.layer_a(inputs) + if self.use_dp: + x = self.dp(x) + if self.use_bn: + x = self.bn(x) return self.layer_b(x) @@ -451,7 +462,7 @@ class _SmallSubclassMLPCustomBuild(keras.Model): def get_small_subclass_mlp(num_hidden, num_classes): - return _SmallSubclassMLP(num_hidden, num_classes) + return SmallSubclassMLP(num_hidden, num_classes) def get_small_subclass_mlp_with_custom_build(num_hidden, num_classes): From e2eaad32f64aa55d01a499d33efa1d7ec529b218 Mon Sep 17 00:00:00 2001 From: "T.J. Alumbaugh" Date: Wed, 4 Dec 2019 15:41:06 -0800 Subject: [PATCH 092/383] NeonCpuBackendGemm uses CpuBackedGemm interface instead of Ruy interface PiperOrigin-RevId: 283851982 Change-Id: Id894aa98fffcc8774f30eada4f7e4041fb537fc3 --- .../lite/experimental/ruy/prepacked_cache.cc | 2 +- .../lite/experimental/ruy/prepacked_cache.h | 5 +---- .../internal/optimized/neon_tensor_utils.cc | 18 ++++-------------- 3 files changed, 6 insertions(+), 19 deletions(-) diff --git a/tensorflow/lite/experimental/ruy/prepacked_cache.cc b/tensorflow/lite/experimental/ruy/prepacked_cache.cc index eab1b6acdfd..372693d7670 100644 --- a/tensorflow/lite/experimental/ruy/prepacked_cache.cc +++ b/tensorflow/lite/experimental/ruy/prepacked_cache.cc @@ -38,7 +38,7 @@ void PrepackedCache::Insert(const CacheKey &key, // Calculate size of this new item. const size_t size_bytes = matrix.data_size + matrix.sums_size; - // If we are above the threshold of ejection, eject the LRU entry. + // While we are above the threshold of ejection, eject the LRU entry. while (!cache_.empty() && ((TotalSize() + size_bytes) > ejection_threshold_)) { EjectOne(); diff --git a/tensorflow/lite/experimental/ruy/prepacked_cache.h b/tensorflow/lite/experimental/ruy/prepacked_cache.h index 3f25b451ce1..1306e5f902f 100644 --- a/tensorflow/lite/experimental/ruy/prepacked_cache.h +++ b/tensorflow/lite/experimental/ruy/prepacked_cache.h @@ -69,10 +69,7 @@ enum CachePolicy { kNoCache, kCacheLHSOnGemV }; // The implementation is "low effort" in the following ways: // - we just linearly search for the oldest entry when doing an ejection // - the ejection policy is very simple: if the new size would be above the -// . threshold, we will eject one entry when adding an entry. Therefore, -// there are no guarantees on maximum cache size since one may -// insert an item larger than the ejection threshold (it will be ejected on -// the next insert, but inserts always succeed). +// . threshold, we will eject entries until the size is below the threshold. // Current use cases (RNNs with GEMV operations) indicate that ejection is rare // and memory constraints are tight, so we devote no additional storage to the // LRU mechanism and accept O(n) search to eject oldest entry. 
In practice, diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index 5f75699e2ca..e5a71c7243d 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -957,11 +957,6 @@ void NeonCpuBackendGemm(const int8_t* input, const int32_t* bias, using ::tflite::cpu_backend_gemm::Gemm; using ::tflite::cpu_backend_gemm::GemmParams; using ::tflite::cpu_backend_gemm::MatrixParams; - using ::tflite::cpu_backend_gemm::QuantizationFlavor; - - ruy::Matrix ruy_lhs; - ruy::Matrix ruy_rhs; - ruy::Matrix ruy_dst; MatrixParams lhs_params; lhs_params.order = cpu_backend_gemm::Order::kRowMajor; @@ -978,15 +973,10 @@ void NeonCpuBackendGemm(const int8_t* input, const int32_t* bias, dst_params.rows = n_output; dst_params.cols = n_batch; - cpu_backend_gemm::detail::MakeRuyMatrix(lhs_params, input_to_gate_weights, - &ruy_lhs); - cpu_backend_gemm::detail::MakeRuyMatrix(rhs_params, input, &ruy_rhs); - cpu_backend_gemm::detail::MakeRuyMatrix(dst_params, scratch, &ruy_dst); - - ruy::BasicSpec ruy_spec; - ruy_spec.bias = bias; - ruy::Mul(ruy_lhs, ruy_rhs, ruy_spec, context->ruy_context(), - &ruy_dst); + GemmParams gemm_params; + gemm_params.bias = bias; + cpu_backend_gemm::Gemm(lhs_params, input_to_gate_weights, rhs_params, input, + dst_params, scratch, gemm_params, context); } void NeonMatrixBatchVectorMultiplyAccumulate( From 417a6464ce9bd4d6cb973a517229550d92547324 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 15:45:31 -0800 Subject: [PATCH 093/383] Add a testable example to tf.math.log PiperOrigin-RevId: 283852868 Change-Id: I572496e315be44e4238e58275c0972d2aa4bf9ab --- .../core/api_def/python_api/api_def_Log.pbtxt | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tensorflow/core/api_def/python_api/api_def_Log.pbtxt b/tensorflow/core/api_def/python_api/api_def_Log.pbtxt index b6d2da6d32a..a8b00c696c0 100644 --- a/tensorflow/core/api_def/python_api/api_def_Log.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Log.pbtxt @@ -7,4 +7,18 @@ op { name: "log" deprecation_version: 2 } + description: <>> x = tf.constant([0, 0.5, 1, 5]) +>>> tf.math.log(x) + + +``` + +See: https://en.wikipedia.org/wiki/Logarithm +END } From 48e88283dbed3fb9129d69b867dfb2ae1f7fec2d Mon Sep 17 00:00:00 2001 From: River Riddle Date: Wed, 4 Dec 2019 15:49:09 -0800 Subject: [PATCH 094/383] Add emitOptional(Error|Warning|Remark) functions to simplify emission with an optional location. In some situations a diagnostic may optionally be emitted by the presence of a location, e.g. attribute and type verification. These situations currently require extra 'if(loc) emitError(...); return failure()' wrappers that make verification clunky. These new overloads take an optional location and a list of arguments to the diagnostic, and return a LogicalResult. We take the arguments directly and return LogicalResult instead of returning InFlightDiagnostic because we cannot create a valid diagnostic with a null location. This creates an awkward situation where a user may try to treat the, potentially null, diagnostic as a valid one and encounter crashes when attaching notes/etc. 
Below is an example of how these methods simplify some existing usages: Before: if (loc) emitError(*loc, "this is my diagnostic with argument: ") << 5; return failure(); After: return emitOptionalError(loc, "this is my diagnostic with argument: ", 5); PiperOrigin-RevId: 283853599 Change-Id: Icc28f0257f7a4ffa96e14e61e57c5340c868f17d --- .../mlir/Dialect/QuantOps/QuantTypes.h | 18 ++-- third_party/mlir/include/mlir/IR/Attributes.h | 21 ++-- .../mlir/include/mlir/IR/Diagnostics.h | 24 +++++ .../mlir/include/mlir/IR/StandardTypes.h | 34 +++---- third_party/mlir/include/mlir/IR/Types.h | 10 +- .../lib/Dialect/QuantOps/IR/QuantTypes.cpp | 99 ++++++------------- third_party/mlir/lib/IR/Attributes.cpp | 43 ++++---- third_party/mlir/lib/IR/StandardTypes.cpp | 82 ++++++--------- third_party/mlir/lib/IR/Types.cpp | 14 ++- 9 files changed, 151 insertions(+), 194 deletions(-) diff --git a/third_party/mlir/include/mlir/Dialect/QuantOps/QuantTypes.h b/third_party/mlir/include/mlir/Dialect/QuantOps/QuantTypes.h index b705026ac91..a681d16c3ee 100644 --- a/third_party/mlir/include/mlir/Dialect/QuantOps/QuantTypes.h +++ b/third_party/mlir/include/mlir/Dialect/QuantOps/QuantTypes.h @@ -75,10 +75,10 @@ public: static constexpr unsigned MaxStorageBits = 32; static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *context, unsigned flags, - Type storageType, Type expressedType, - int64_t storageTypeMin, int64_t storageTypeMax); + verifyConstructionInvariants(Optional loc, MLIRContext *context, + unsigned flags, Type storageType, + Type expressedType, int64_t storageTypeMin, + int64_t storageTypeMax); /// Support method to enable LLVM-style type casting. static bool classof(Type type) { @@ -238,10 +238,10 @@ public: /// Verifies construction invariants and issues errors/warnings. static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *context, unsigned flags, - Type storageType, Type expressedType, - int64_t storageTypeMin, int64_t storageTypeMax); + verifyConstructionInvariants(Optional loc, MLIRContext *context, + unsigned flags, Type storageType, + Type expressedType, int64_t storageTypeMin, + int64_t storageTypeMax); }; /// Represents a family of uniform, quantized types. @@ -298,7 +298,7 @@ public: /// Verifies construction invariants and issues errors/warnings. static LogicalResult verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, unsigned flags, + Optional loc, MLIRContext *context, unsigned flags, Type storageType, Type expressedType, double scale, int64_t zeroPoint, int64_t storageTypeMin, int64_t storageTypeMax); diff --git a/third_party/mlir/include/mlir/IR/Attributes.h b/third_party/mlir/include/mlir/IR/Attributes.h index ebff99ddcd5..3968d44dd37 100644 --- a/third_party/mlir/include/mlir/IR/Attributes.h +++ b/third_party/mlir/include/mlir/IR/Attributes.h @@ -321,12 +321,12 @@ public: } /// Verify the construction invariants for a double value. 
- static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, MLIRContext *ctx, - Type type, double value); - static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, MLIRContext *ctx, - Type type, const APFloat &value); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *ctx, Type type, + double value); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *ctx, Type type, + const APFloat &value); }; //===----------------------------------------------------------------------===// @@ -403,10 +403,11 @@ public: StringRef getAttrData() const; /// Verify the construction of an opaque attribute. - static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *context, Identifier dialect, - StringRef attrData, Type type); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *context, + Identifier dialect, + StringRef attrData, + Type type); static bool kindof(unsigned kind) { return kind == StandardAttributes::Opaque; diff --git a/third_party/mlir/include/mlir/IR/Diagnostics.h b/third_party/mlir/include/mlir/IR/Diagnostics.h index 1d284f6ccd7..4baea744a62 100644 --- a/third_party/mlir/include/mlir/IR/Diagnostics.h +++ b/third_party/mlir/include/mlir/IR/Diagnostics.h @@ -481,6 +481,30 @@ InFlightDiagnostic emitWarning(Location loc, const Twine &message); InFlightDiagnostic emitRemark(Location loc); InFlightDiagnostic emitRemark(Location loc, const Twine &message); +/// Overloads of the above emission functions that take an optionally null +/// location. If the location is null, no diagnostic is emitted and a failure is +/// returned. Given that the provided location may be null, these methods take +/// the diagnostic arguments directly instead of relying on the returned +/// InFlightDiagnostic. +template +LogicalResult emitOptionalError(Optional loc, Args &&... args) { + if (loc) + return emitError(*loc).append(std::forward(args)...); + return failure(); +} +template +LogicalResult emitOptionalWarning(Optional loc, Args &&... args) { + if (loc) + return emitWarning(*loc).append(std::forward(args)...); + return failure(); +} +template +LogicalResult emitOptionalRemark(Optional loc, Args &&... args) { + if (loc) + return emitRemark(*loc).append(std::forward(args)...); + return failure(); +} + //===----------------------------------------------------------------------===// // ScopedDiagnosticHandler //===----------------------------------------------------------------------===// diff --git a/third_party/mlir/include/mlir/IR/StandardTypes.h b/third_party/mlir/include/mlir/IR/StandardTypes.h index 2d232897428..f19c2d276fd 100644 --- a/third_party/mlir/include/mlir/IR/StandardTypes.h +++ b/third_party/mlir/include/mlir/IR/StandardTypes.h @@ -102,9 +102,9 @@ public: Location location); /// Verify the construction of an integer type. - static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *context, unsigned width); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *context, + unsigned width); /// Return the bitwidth of this integer type. unsigned getWidth() const; @@ -168,9 +168,9 @@ public: static ComplexType getChecked(Type elementType, Location location); /// Verify the construction of an integer type. 
- static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *context, Type elementType); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *context, + Type elementType); Type getElementType(); @@ -269,10 +269,10 @@ public: Location location); /// Verify the construction of a vector type. - static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *context, ArrayRef shape, - Type elementType); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *context, + ArrayRef shape, + Type elementType); /// Returns true of the given type can be used as an element of a vector type. /// In particular, vectors can consist of integer or float primitives. @@ -328,10 +328,10 @@ public: Location location); /// Verify the construction of a ranked tensor type. - static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *context, ArrayRef shape, - Type elementType); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *context, + ArrayRef shape, + Type elementType); ArrayRef getShape() const; @@ -359,9 +359,9 @@ public: static UnrankedTensorType getChecked(Type elementType, Location location); /// Verify the construction of a unranked tensor type. - static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *context, Type elementType); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *context, + Type elementType); ArrayRef getShape() const { return llvm::None; } diff --git a/third_party/mlir/include/mlir/IR/Types.h b/third_party/mlir/include/mlir/IR/Types.h index b1d522a2511..11af3eb1e66 100644 --- a/third_party/mlir/include/mlir/IR/Types.h +++ b/third_party/mlir/include/mlir/IR/Types.h @@ -56,7 +56,7 @@ struct OpaqueTypeStorage; /// /// * Optional: /// - static LogicalResult verifyConstructionInvariants( -/// llvm::Optional loc, +/// Optional loc, /// MLIRContext *context, /// Args... args) /// * This method is invoked when calling the 'TypeBase::get/getChecked' @@ -250,10 +250,10 @@ public: StringRef getTypeData() const; /// Verify the construction of an opaque type. - static LogicalResult - verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *context, Identifier dialect, - StringRef typeData); + static LogicalResult verifyConstructionInvariants(Optional loc, + MLIRContext *context, + Identifier dialect, + StringRef typeData); static bool kindof(unsigned kind) { return kind == Kind::Opaque; } }; diff --git a/third_party/mlir/lib/Dialect/QuantOps/IR/QuantTypes.cpp b/third_party/mlir/lib/Dialect/QuantOps/IR/QuantTypes.cpp index 421d660a664..bc8290cda16 100644 --- a/third_party/mlir/lib/Dialect/QuantOps/IR/QuantTypes.cpp +++ b/third_party/mlir/lib/Dialect/QuantOps/IR/QuantTypes.cpp @@ -33,28 +33,20 @@ unsigned QuantizedType::getFlags() const { } LogicalResult QuantizedType::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, unsigned flags, + Optional loc, MLIRContext *context, unsigned flags, Type storageType, Type expressedType, int64_t storageTypeMin, int64_t storageTypeMax) { // Verify that the storage type is integral. // This restriction may be lifted at some point in favor of using bf16 // or f16 as exact representations on hardware where that is advantageous. 
auto intStorageType = storageType.dyn_cast(); - if (!intStorageType) { - if (loc) { - emitError(*loc, "storage type must be integral"); - } - return failure(); - } + if (!intStorageType) + return emitOptionalError(loc, "storage type must be integral"); unsigned integralWidth = intStorageType.getWidth(); // Verify storage width. - if (integralWidth == 0 || integralWidth > MaxStorageBits) { - if (loc) { - emitError(*loc, "illegal storage type size: ") << integralWidth; - } - return failure(); - } + if (integralWidth == 0 || integralWidth > MaxStorageBits) + return emitOptionalError(loc, "illegal storage type size: ", integralWidth); // Verify storageTypeMin and storageTypeMax. bool isSigned = @@ -66,11 +58,8 @@ LogicalResult QuantizedType::verifyConstructionInvariants( if (storageTypeMax - storageTypeMin <= 0 || storageTypeMin < defaultIntegerMin || storageTypeMax > defaultIntegerMax) { - if (loc) { - emitError(*loc, "illegal storage min and storage max: (") - << storageTypeMin << ":" << storageTypeMax << ")"; - } - return failure(); + return emitOptionalError(loc, "illegal storage min and storage max: (", + storageTypeMin, ":", storageTypeMax, ")"); } return success(); } @@ -235,7 +224,7 @@ AnyQuantizedType AnyQuantizedType::getChecked(unsigned flags, Type storageType, } LogicalResult AnyQuantizedType::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, unsigned flags, + Optional loc, MLIRContext *context, unsigned flags, Type storageType, Type expressedType, int64_t storageTypeMin, int64_t storageTypeMax) { if (failed(QuantizedType::verifyConstructionInvariants( @@ -247,12 +236,8 @@ LogicalResult AnyQuantizedType::verifyConstructionInvariants( // Verify that the expressed type is floating point. // If this restriction is ever eliminated, the parser/printer must be // extended. - if (expressedType && !expressedType.isa()) { - if (loc) { - emitError(*loc, "expressed type must be floating point"); - } - return failure(); - } + if (expressedType && !expressedType.isa()) + return emitOptionalError(loc, "expressed type must be floating point"); return success(); } @@ -280,7 +265,7 @@ UniformQuantizedType::getChecked(unsigned flags, Type storageType, } LogicalResult UniformQuantizedType::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, unsigned flags, + Optional loc, MLIRContext *context, unsigned flags, Type storageType, Type expressedType, double scale, int64_t zeroPoint, int64_t storageTypeMin, int64_t storageTypeMax) { if (failed(QuantizedType::verifyConstructionInvariants( @@ -291,30 +276,19 @@ LogicalResult UniformQuantizedType::verifyConstructionInvariants( // Uniform quantization requires fully expressed parameters, including // expressed type. - if (!expressedType) { - if (loc) { - emitError(*loc, "uniform quantization requires expressed type"); - } - return failure(); - } + if (!expressedType) + return emitOptionalError(loc, + "uniform quantization requires expressed type"); // Verify that the expressed type is floating point. // If this restriction is ever eliminated, the parser/printer must be // extended. - if (!expressedType.isa()) { - if (loc) { - emitError(*loc, "expressed type must be floating point"); - } - return failure(); - } + if (!expressedType.isa()) + return emitOptionalError(loc, "expressed type must be floating point"); // Verify scale. 
- if (scale <= 0.0 || std::isinf(scale) || std::isnan(scale)) { - if (loc) { - emitError(*loc) << "illegal scale: " << scale; - } - return failure(); - } + if (scale <= 0.0 || std::isinf(scale) || std::isnan(scale)) + return emitOptionalError(loc, "illegal scale: ", scale); return success(); } @@ -348,7 +322,7 @@ UniformQuantizedPerAxisType UniformQuantizedPerAxisType::getChecked( } LogicalResult UniformQuantizedPerAxisType::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, unsigned flags, + Optional loc, MLIRContext *context, unsigned flags, Type storageType, Type expressedType, ArrayRef scales, ArrayRef zeroPoints, int32_t quantizedDimension, int64_t storageTypeMin, int64_t storageTypeMax) { @@ -360,40 +334,25 @@ LogicalResult UniformQuantizedPerAxisType::verifyConstructionInvariants( // Uniform quantization requires fully expressed parameters, including // expressed type. - if (!expressedType) { - if (loc) { - emitError(*loc, "uniform quantization requires expressed type"); - } - return failure(); - } + if (!expressedType) + return emitOptionalError(loc, + "uniform quantization requires expressed type"); // Verify that the expressed type is floating point. // If this restriction is ever eliminated, the parser/printer must be // extended. - if (!expressedType.isa()) { - if (loc) { - emitError(*loc, "expressed type must be floating point"); - } - return failure(); - } + if (!expressedType.isa()) + return emitOptionalError(loc, "expressed type must be floating point"); // Ensure that the number of scales and zeroPoints match. - if (scales.size() != zeroPoints.size()) { - if (loc) { - emitError(*loc, "illegal number of scales and zeroPoints: ") - << scales.size() << ", " << zeroPoints.size(); - } - return failure(); - } + if (scales.size() != zeroPoints.size()) + return emitOptionalError(loc, "illegal number of scales and zeroPoints: ", + scales.size(), ", ", zeroPoints.size()); // Verify scale. for (double scale : scales) { - if (scale <= 0.0 || std::isinf(scale) || std::isnan(scale)) { - if (loc) { - emitError(*loc) << "illegal scale: " << scale; - } - return failure(); - } + if (scale <= 0.0 || std::isinf(scale) || std::isnan(scale)) + return emitOptionalError(loc, "illegal scale: ", scale); } return success(); diff --git a/third_party/mlir/lib/IR/Attributes.cpp b/third_party/mlir/lib/IR/Attributes.cpp index 5d7a4f08d1e..f2f3d41f980 100644 --- a/third_party/mlir/lib/IR/Attributes.cpp +++ b/third_party/mlir/lib/IR/Attributes.cpp @@ -214,35 +214,31 @@ double FloatAttr::getValueAsDouble(APFloat value) { } /// Verify construction invariants. -static LogicalResult verifyFloatTypeInvariants(llvm::Optional loc, +static LogicalResult verifyFloatTypeInvariants(Optional loc, Type type) { - if (!type.isa()) { - if (loc) - emitError(*loc, "expected floating point type"); - return failure(); - } + if (!type.isa()) + return emitOptionalError(loc, "expected floating point type"); return success(); } -LogicalResult FloatAttr::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *ctx, Type type, double value) { +LogicalResult FloatAttr::verifyConstructionInvariants(Optional loc, + MLIRContext *ctx, + Type type, double value) { return verifyFloatTypeInvariants(loc, type); } -LogicalResult -FloatAttr::verifyConstructionInvariants(llvm::Optional loc, - MLIRContext *ctx, Type type, - const APFloat &value) { +LogicalResult FloatAttr::verifyConstructionInvariants(Optional loc, + MLIRContext *ctx, + Type type, + const APFloat &value) { // Verify that the type is correct. 
if (failed(verifyFloatTypeInvariants(loc, type))) return failure(); // Verify that the type semantics match that of the value. if (&type.cast().getFloatSemantics() != &value.getSemantics()) { - if (loc) - emitError(*loc, - "FloatAttr type doesn't match the type implied by its value"); - return failure(); + return emitOptionalError( + loc, "FloatAttr type doesn't match the type implied by its value"); } return success(); } @@ -330,14 +326,13 @@ Identifier OpaqueAttr::getDialectNamespace() const { StringRef OpaqueAttr::getAttrData() const { return getImpl()->attrData; } /// Verify the construction of an opaque attribute. -LogicalResult OpaqueAttr::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, Identifier dialect, - StringRef attrData, Type type) { - if (!Dialect::isValidNamespace(dialect.strref())) { - if (loc) - emitError(*loc) << "invalid dialect namespace '" << dialect << "'"; - return failure(); - } +LogicalResult OpaqueAttr::verifyConstructionInvariants(Optional loc, + MLIRContext *context, + Identifier dialect, + StringRef attrData, + Type type) { + if (!Dialect::isValidNamespace(dialect.strref())) + return emitOptionalError(loc, "invalid dialect namespace '", dialect, "'"); return success(); } diff --git a/third_party/mlir/lib/IR/StandardTypes.cpp b/third_party/mlir/lib/IR/StandardTypes.cpp index 4347856de36..8a4b51f215a 100644 --- a/third_party/mlir/lib/IR/StandardTypes.cpp +++ b/third_party/mlir/lib/IR/StandardTypes.cpp @@ -61,13 +61,12 @@ bool Type::isIntOrFloat() { return isa() || isa(); } constexpr unsigned IntegerType::kMaxWidth; /// Verify the construction of an integer type. -LogicalResult IntegerType::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, unsigned width) { +LogicalResult IntegerType::verifyConstructionInvariants(Optional loc, + MLIRContext *context, + unsigned width) { if (width > IntegerType::kMaxWidth) { - if (loc) - emitError(*loc) << "integer bitwidth is limited to " - << IntegerType::kMaxWidth << " bits"; - return failure(); + return emitOptionalError(loc, "integer bitwidth is limited to ", + IntegerType::kMaxWidth, " bits"); } return success(); } @@ -213,26 +212,21 @@ VectorType VectorType::getChecked(ArrayRef shape, Type elementType, StandardTypes::Vector, shape, elementType); } -LogicalResult VectorType::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, ArrayRef shape, - Type elementType) { - if (shape.empty()) { - if (loc) - emitError(*loc, "vector types must have at least one dimension"); - return failure(); - } +LogicalResult VectorType::verifyConstructionInvariants(Optional loc, + MLIRContext *context, + ArrayRef shape, + Type elementType) { + if (shape.empty()) + return emitOptionalError(loc, + "vector types must have at least one dimension"); - if (!isValidElementType(elementType)) { - if (loc) - emitError(*loc, "vector elements must be int or float type"); - return failure(); - } + if (!isValidElementType(elementType)) + return emitOptionalError(loc, "vector elements must be int or float type"); + + if (any_of(shape, [](int64_t i) { return i <= 0; })) + return emitOptionalError(loc, + "vector types must have positive constant sizes"); - if (any_of(shape, [](int64_t i) { return i <= 0; })) { - if (loc) - emitError(*loc, "vector types must have positive constant sizes"); - return failure(); - } return success(); } @@ -247,11 +241,8 @@ ArrayRef VectorType::getShape() const { return getImpl()->getShape(); } static inline LogicalResult checkTensorElementType(Optional location, 
MLIRContext *context, Type elementType) { - if (!TensorType::isValidElementType(elementType)) { - if (location) - emitError(*location, "invalid tensor element type"); - return failure(); - } + if (!TensorType::isValidElementType(elementType)) + return emitOptionalError(location, "invalid tensor element type"); return success(); } @@ -273,14 +264,11 @@ RankedTensorType RankedTensorType::getChecked(ArrayRef shape, } LogicalResult RankedTensorType::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, ArrayRef shape, + Optional loc, MLIRContext *context, ArrayRef shape, Type elementType) { for (int64_t s : shape) { - if (s < -1) { - if (loc) - emitError(*loc, "invalid tensor dimension size"); - return failure(); - } + if (s < -1) + return emitOptionalError(loc, "invalid tensor dimension size"); } return checkTensorElementType(loc, context, elementType); } @@ -305,7 +293,7 @@ UnrankedTensorType UnrankedTensorType::getChecked(Type elementType, } LogicalResult UnrankedTensorType::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, Type elementType) { + Optional loc, MLIRContext *context, Type elementType) { return checkTensorElementType(loc, context, elementType); } @@ -350,19 +338,14 @@ MemRefType MemRefType::getImpl(ArrayRef shape, Type elementType, auto *context = elementType.getContext(); // Check that memref is formed from allowed types. - if (!elementType.isIntOrFloat() && !elementType.isa()) { - if (location) - emitError(*location, "invalid memref element type"); - return nullptr; - } + if (!elementType.isIntOrFloat() && !elementType.isa()) + return emitOptionalError(location, "invalid memref element type"), + MemRefType(); for (int64_t s : shape) { // Negative sizes are not allowed except for `-1` that means dynamic size. - if (s < -1) { - if (location) - emitError(*location, "invalid memref size"); - return {}; - } + if (s < -1) + return emitOptionalError(location, "invalid memref size"), MemRefType(); } // Check that the structure of the composition is valid, i.e. that each @@ -631,11 +614,8 @@ ComplexType ComplexType::getChecked(Type elementType, Location location) { /// Verify the construction of an integer type. LogicalResult ComplexType::verifyConstructionInvariants( llvm::Optional loc, MLIRContext *context, Type elementType) { - if (!elementType.isa() && !elementType.isa()) { - if (loc) - emitError(*loc, "invalid element type for complex"); - return failure(); - } + if (!elementType.isa() && !elementType.isa()) + return emitOptionalError(loc, "invalid element type for complex"); return success(); } diff --git a/third_party/mlir/lib/IR/Types.cpp b/third_party/mlir/lib/IR/Types.cpp index f1a6d8f11c9..23c80c96aad 100644 --- a/third_party/mlir/lib/IR/Types.cpp +++ b/third_party/mlir/lib/IR/Types.cpp @@ -80,13 +80,11 @@ Identifier OpaqueType::getDialectNamespace() const { StringRef OpaqueType::getTypeData() const { return getImpl()->typeData; } /// Verify the construction of an opaque type. 
-LogicalResult OpaqueType::verifyConstructionInvariants( - llvm::Optional loc, MLIRContext *context, Identifier dialect, - StringRef typeData) { - if (!Dialect::isValidNamespace(dialect.strref())) { - if (loc) - emitError(*loc) << "invalid dialect namespace '" << dialect << "'"; - return failure(); - } +LogicalResult OpaqueType::verifyConstructionInvariants(Optional loc, + MLIRContext *context, + Identifier dialect, + StringRef typeData) { + if (!Dialect::isValidNamespace(dialect.strref())) + return emitOptionalError(loc, "invalid dialect namespace '", dialect, "'"); return success(); } From 6c2b85d24b4afb31f42f670110b23d60fb88c016 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 15:50:51 -0800 Subject: [PATCH 095/383] Update docstring for tf.random.normal. PiperOrigin-RevId: 283853942 Change-Id: I6afec52582af7255738141331528894b49ee165b --- tensorflow/python/ops/random_ops.py | 38 +++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py index f9208cca551..75c5fd5c5d2 100644 --- a/tensorflow/python/ops/random_ops.py +++ b/tensorflow/python/ops/random_ops.py @@ -50,6 +50,24 @@ def random_normal(shape, name=None): """Outputs random values from a normal distribution. + Example that generates a new set of random values every time: + + >>> tf.random.set_seed(5); + >>> tf.random.normal([4], 0, 1, tf.float32) + + + Example that outputs a reproduceable result: + + >>> tf.random.set_seed(5); + >>> tf.random.normal([2,2], 0, 1, tf.float32, seed=1) + + + In this case, we are setting both the global and operation-level seed to + ensure this result is reproduceable. See `tf.random.set_seed` for more + information. + Args: shape: A 1-D integer Tensor or Python array. The shape of the output tensor. mean: A Tensor or Python value of type `dtype`, broadcastable with `stddev`. @@ -59,7 +77,7 @@ def random_normal(shape, dtype: The type of the output. seed: A Python integer. Used to create a random seed for the distribution. See - `tf.compat.v1.set_random_seed` + `tf.random.set_seed` for behavior. name: A name for the operation (optional). @@ -109,7 +127,7 @@ def parameterized_truncated_normal(shape, dtype: The type of the output. seed: A Python integer. Used to create a random seed for the distribution. See - `tf.compat.v1.set_random_seed` + `tf.random.set_seed` for behavior. name: A name for the operation (optional). @@ -160,7 +178,7 @@ def truncated_normal(shape, dtype: The type of the output. seed: A Python integer. Used to create a random seed for the distribution. See - `tf.compat.v1.set_random_seed` + `tf.random.set_seed` for behavior. name: A name for the operation (optional). @@ -325,7 +343,7 @@ def random_shuffle(value, seed=None, name=None): value: A Tensor to be shuffled. seed: A Python integer. Used to create a random seed for the distribution. See - `tf.compat.v1.set_random_seed` + `tf.random.set_seed` for behavior. name: A name for the operation (optional). @@ -354,7 +372,7 @@ def random_crop(value, size, seed=None, name=None): value: Input tensor to crop. size: 1-D tensor with size the rank of `value`. seed: Python integer. Used to create a random seed. See - `tf.compat.v1.set_random_seed` + `tf.random.set_seed` for behavior. name: A name for this operation (optional). @@ -401,7 +419,7 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None): `[i, :]` represents the unnormalized log-probabilities for all classes. num_samples: 0-D. 
Number of independent samples to draw for each row slice. seed: A Python integer. Used to create a random seed for the distribution. - See `tf.compat.v1.set_random_seed` for behavior. + See `tf.random.set_seed` for behavior. name: Optional name for the operation. output_dtype: integer type to use for the output. Defaults to int64. @@ -430,7 +448,7 @@ def categorical(logits, num_samples, dtype=None, seed=None, name=None): num_samples: 0-D. Number of independent samples to draw for each row slice. dtype: integer type to use for the output. Defaults to int64. seed: A Python integer. Used to create a random seed for the distribution. - See `tf.compat.v1.set_random_seed` for behavior. + See `tf.random.set_seed` for behavior. name: Optional name for the operation. Returns: @@ -521,7 +539,7 @@ def random_gamma(shape, `float64`. seed: A Python integer. Used to create a random seed for the distributions. See - `tf.compat.v1.set_random_seed` + `tf.random.set_seed` for behavior. name: Optional name for the operation. @@ -583,7 +601,7 @@ def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None): `int64`. seed: A Python integer. Used to create a random seed for the distributions. See - `tf.compat.v1.set_random_seed` + `tf.random.set_seed` for behavior. name: Optional name for the operation. @@ -622,7 +640,7 @@ def random_poisson_v2(shape, lam, dtype=dtypes.float32, seed=None, name=None): `int64`. seed: A Python integer. Used to create a random seed for the distributions. See - `tf.compat.v1.set_random_seed` + `tf.random.set_seed` for behavior. name: Optional name for the operation. From 834e051ff45c69c10362089940dcfd7353cef628 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Wed, 4 Dec 2019 15:56:41 -0800 Subject: [PATCH 096/383] Read GetModuleConfigForTest for tests going through GetOptimizedModule. PiperOrigin-RevId: 283855146 Change-Id: Id636ec1869d91b3664be6589652cfcea16b95bcf --- tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc index 8b95c17d199..c2dc9125479 100644 --- a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc +++ b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc @@ -98,9 +98,9 @@ void LlvmIrGenTestBase::MatchOptimizedHlo(absl::string_view hlo, StatusOr> LlvmIrGenTestBase::GetOptimizedModule( absl::string_view hlo) { - HloModuleConfig config; - TF_ASSIGN_OR_RETURN(std::unique_ptr module, - ParseAndReturnVerifiedModule(hlo, config)); + TF_ASSIGN_OR_RETURN( + std::unique_ptr module, + ParseAndReturnVerifiedModule(hlo, GetModuleConfigForTest())); return backend().compiler()->RunHloPasses( std::move(module), backend().default_stream_executor(), backend().default_stream_executor()->GetAllocator()); From 74261e62b645b459af31778042104f27a72cd685 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 16:03:27 -0800 Subject: [PATCH 097/383] Add op utils for entering TensorFlow activities. 
PiperOrigin-RevId: 283856744 Change-Id: I36fce752b41ee9b64cbea114eb0a07bca485654a --- tensorflow/core/profiler/utils/BUILD | 15 ++++- tensorflow/core/profiler/utils/op_utils.cc | 47 +++++++++++++ tensorflow/core/profiler/utils/op_utils.h | 78 ++++++++++++++++++++++ 3 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 tensorflow/core/profiler/utils/op_utils.cc create mode 100644 tensorflow/core/profiler/utils/op_utils.h diff --git a/tensorflow/core/profiler/utils/BUILD b/tensorflow/core/profiler/utils/BUILD index edba7b3aa27..73fc56dfd11 100644 --- a/tensorflow/core/profiler/utils/BUILD +++ b/tensorflow/core/profiler/utils/BUILD @@ -36,13 +36,26 @@ cc_library( deps = [ ":math_utils", ":tf_op_utils", - "//tensorflow/core:tflite_portable_logging", + "//tensorflow/core:lib", "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", ], ) +cc_library( + name = "op_utils", + srcs = ["op_utils.cc"], + hdrs = ["op_utils.h"], + deps = [ + ":op_metrics_db_utils", + ":tf_op_utils", + "//tensorflow/core:lib", + "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "tf_op_utils", srcs = ["tf_op_utils.cc"], diff --git a/tensorflow/core/profiler/utils/op_utils.cc b/tensorflow/core/profiler/utils/op_utils.cc new file mode 100644 index 00000000000..3a899e47e87 --- /dev/null +++ b/tensorflow/core/profiler/utils/op_utils.cc @@ -0,0 +1,47 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/profiler/utils/op_utils.h" + +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" + +namespace tensorflow { +namespace profiler { + +void HostOpMetricsDbBuilder::EnterOp(absl::string_view name, + absl::string_view category, uint64 time_ps, + uint64 children_time_ps) { + uint64 self_time_ps = time_ps - children_time_ps; + DCHECK_GE(time_ps, self_time_ps); + OpMetrics* op_metrics = LookupOrInsertNewOpMetrics(/*hlo_module_id=*/0, name); + if (op_metrics->category().empty()) + op_metrics->set_category(category.data(), category.size()); + op_metrics->set_occurrences(op_metrics->occurrences() + 1); + op_metrics->set_time_ps(op_metrics->time_ps() + time_ps); + op_metrics->set_self_time_ps(op_metrics->self_time_ps() + self_time_ps); + db()->set_total_op_time_ps(db()->total_op_time_ps() + self_time_ps); +} + +void HostOpMetricsDbBuilder::UpdateHostInfeedEnqInfo( + uint64 duration_ps, uint64 start_timestamp_ps_diff) { + db()->set_total_host_infeed_enq_duration_ps( + db()->total_host_infeed_enq_duration_ps() + duration_ps); + db()->set_total_host_infeed_enq_start_timestamp_ps_diff( + db()->total_host_infeed_enq_start_timestamp_ps_diff() + + start_timestamp_ps_diff); +} +} // namespace profiler +} // namespace tensorflow diff --git a/tensorflow/core/profiler/utils/op_utils.h b/tensorflow/core/profiler/utils/op_utils.h new file mode 100644 index 00000000000..44bfa508b09 --- /dev/null +++ b/tensorflow/core/profiler/utils/op_utils.h @@ -0,0 +1,78 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_UTILS_OP_UTILS_H_ +#define TENSORFLOW_CORE_PROFILER_UTILS_OP_UTILS_H_ + +#include + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" +#include "tensorflow/core/profiler/utils/tf_op_utils.h" + +namespace tensorflow { +namespace profiler { + +class HostOpMetricsDbBuilder : public OpMetricsDbBuilder { + public: + explicit HostOpMetricsDbBuilder(OpMetricsDb* db) : OpMetricsDbBuilder(db) {} + + // A function that will be called when the end of an OP is + // observed on a trace, where: + // name = the OP name. + // category = the OP category. + // time_ps = the total execution time of the OP in picoseconds, including + // the execution time of its children. + // children_time_ps = the execution time of the children of this OP in + // picoseconds + void EnterOp(absl::string_view name, absl::string_view category, + uint64 time_ps, uint64 children_time_ps); + + // Updates total_host_infeed_enq_duration_ps_ and + // total_host_infeed_enq_duration_ps_. 
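+  // (I.e., it adds duration_ps and start_timestamp_ps_diff to the running
+  // totals total_host_infeed_enq_duration_ps and
+  // total_host_infeed_enq_start_timestamp_ps_diff, as implemented in
+  // op_utils.cc above.)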
+ void UpdateHostInfeedEnqInfo(uint64 duration_ps, + uint64 start_timestamp_ps_diff); +}; + +// Type of a TensorFlow Op activity, which is either beginning or ending an Op. +enum TfActivityType { kTfOpBegin, kTfOpEnd }; + +// Instant activity representing the begin or end of a host-side TF Op. +struct TfActivity { + // The timestamp in picoseconds when this activity happened. + uint64 timestamp_ps; + // The ID of this Op. + uint32 tf_op_id; + // Type of this activity. + TfActivityType activity_type; + // Full TF op name and type of this activity (backed by XEvent::name). + TfOp tf_op; +}; + +// TF Op metrics stored as element in OpStack. +struct TfOpInfo { + explicit TfOpInfo(uint64 ts) : start_timestamp_ps(ts) {} + + // Start timestamp in picoseconds. + uint64 start_timestamp_ps; + // Children duration in picoseconds. + uint64 children_duration_ps = 0; +}; +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_UTILS_OP_UTILS_H_ From 25cf85aef597c7b248d116a38343f11a75f5e6bf Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Wed, 4 Dec 2019 16:03:40 -0800 Subject: [PATCH 098/383] Add int32 support to floor, ceil & rint Though the result is trivial, this avoids the need to call tf.cast if receiving a int32 tensor from another operation. PiperOrigin-RevId: 283856807 Change-Id: I15e83621b5c008042f5f20137a0fa7f17940a3af --- tensorflow/core/kernels/cwise_op_ceil.cc | 3 +- tensorflow/core/kernels/cwise_op_floor.cc | 3 +- tensorflow/core/kernels/cwise_op_rint.cc | 2 +- tensorflow/core/ops/math_ops.cc | 6 ++-- tensorflow/python/ops/math_ops_test.py | 42 +++++++++++++++++++++++ 5 files changed, 50 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc index 4b1847d758c..c6e38a55efd 100644 --- a/tensorflow/core/kernels/cwise_op_ceil.cc +++ b/tensorflow/core/kernels/cwise_op_ceil.cc @@ -16,7 +16,8 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double); +REGISTER4(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double, + int32); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double); diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc index 25210a0fa51..ba0340372f4 100644 --- a/tensorflow/core/kernels/cwise_op_floor.cc +++ b/tensorflow/core/kernels/cwise_op_floor.cc @@ -16,7 +16,8 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double); +REGISTER4(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double, + int32); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double); diff --git a/tensorflow/core/kernels/cwise_op_rint.cc b/tensorflow/core/kernels/cwise_op_rint.cc index f9fe8321947..c6071d02295 100644 --- a/tensorflow/core/kernels/cwise_op_rint.cc +++ b/tensorflow/core/kernels/cwise_op_rint.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER2(UnaryOp, CPU, "Rint", functor::rint, float, double); +REGISTER3(UnaryOp, CPU, "Rint", functor::rint, float, double, int32); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER2(UnaryOp, GPU, "Rint", functor::rint, float, double); #endif diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index ccdcf0b76e6..691f34ff307 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -349,19 +349,19 @@ REGISTER_OP("Sign") REGISTER_OP("Floor") .Input("x: T") .Output("y: T") - .Attr("T: {bfloat16, half, float, double}") + .Attr("T: {bfloat16, half, float, double, int32}") .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("Ceil") .Input("x: T") .Output("y: T") - .Attr("T: {bfloat16, half, float, double}") + .Attr("T: {bfloat16, half, float, double, int32}") .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("Rint") .Input("x: T") .Output("y: T") - .Attr("T: {bfloat16, half, float, double}") + .Attr("T: {bfloat16, half, float, double, int32}") .SetShapeFn(shape_inference::UnchangedShape); // Declares cwise binary operations signature: 't, 't -> 't. diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index f49ba3dd2a3..c1591791cbd 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -179,6 +179,48 @@ class RoundTest(test_util.TensorFlowTestCase): self.assertAllClose(y_tf_np, y_np, atol=1e-2) +@test_util.run_all_in_graph_and_eager_modes +class FloorTest(test_util.TensorFlowTestCase): + + def testFloor(self): + x = np.arange(-5.0, 5.0, .25) + for dtype in [np.float32, np.double, np.int32]: + x_np = np.array(x, dtype=dtype) + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y_tf = math_ops.floor(x_tf) + y_tf_np = self.evaluate(y_tf) + y_np = np.floor(x_np) + self.assertAllClose(y_tf_np, y_np, atol=1e-2) + + +@test_util.run_all_in_graph_and_eager_modes +class CeilTest(test_util.TensorFlowTestCase): + + def testCeil(self): + x = np.arange(-5.0, 5.0, .25) + for dtype in [np.float32, np.double, np.int32]: + x_np = np.array(x, dtype=dtype) + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y_tf = math_ops.ceil(x_tf) + y_tf_np = self.evaluate(y_tf) + y_np = np.ceil(x_np) + self.assertAllClose(y_tf_np, y_np, atol=1e-2) + + +@test_util.run_all_in_graph_and_eager_modes +class RintTest(test_util.TensorFlowTestCase): + + def testRint(self): + x = np.arange(-5.0, 5.0, .25) + for dtype in [np.float32, np.double, np.int32]: + x_np = np.array(x, dtype=dtype) + x_tf = constant_op.constant(x_np, shape=x_np.shape) + y_tf = math_ops.rint(x_tf) + y_tf_np = self.evaluate(y_tf) + y_np = np.rint(x_np) + self.assertAllClose(y_tf_np, y_np, atol=1e-2) + + @test_util.run_all_in_graph_and_eager_modes class ModTest(test_util.TensorFlowTestCase): From 4c1ad6329a0c4bf03ca437aa1cb3dc24e2d76920 Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Wed, 4 Dec 2019 16:08:09 -0800 Subject: [PATCH 099/383] Sparse tensor definition in TFLite. 
PiperOrigin-RevId: 283857867 Change-Id: If52f0176910801681a65b74ef87e9a1956ce7b71 --- tensorflow/lite/BUILD | 1 + tensorflow/lite/c/common.c | 34 ++ tensorflow/lite/c/common.h | 31 ++ tensorflow/lite/c/common_test.cc | 26 ++ tensorflow/lite/core/subgraph.cc | 20 +- tensorflow/lite/core/subgraph.h | 9 +- tensorflow/lite/model.cc | 81 ++++- tensorflow/lite/model.h | 2 + tensorflow/lite/model_test.cc | 72 ++++ tensorflow/lite/schema/schema.fbs | 80 +++++ tensorflow/lite/schema/schema_generated.h | 334 +++++++++++++++++- tensorflow/lite/testdata/sparse_tensor.bin | Bin 0 -> 412 bytes tensorflow/lite/testdata/sparse_tensor.json | 63 ++++ .../benchmark/experimental/c/c_api_types.h | 31 ++ 14 files changed, 769 insertions(+), 15 deletions(-) create mode 100644 tensorflow/lite/testdata/sparse_tensor.bin create mode 100644 tensorflow/lite/testdata/sparse_tensor.json diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 530b27aa7d3..84150546353 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -318,6 +318,7 @@ cc_test( "testdata/2_subgraphs.bin", "testdata/empty_model.bin", "testdata/multi_add_flex.bin", + "testdata/sparse_tensor.bin", "testdata/test_min_runtime.bin", "testdata/test_model.bin", "testdata/test_model_broken.bin", diff --git a/tensorflow/lite/c/common.c b/tensorflow/lite/c/common.c index 524bf8091fe..0b17c049e93 100644 --- a/tensorflow/lite/c/common.c +++ b/tensorflow/lite/c/common.c @@ -103,12 +103,46 @@ void TfLiteQuantizationFree(TfLiteQuantization* quantization) { quantization->type = kTfLiteNoQuantization; } +void TfLiteSparsityFree(TfLiteSparsity* sparsity) { + if (sparsity == NULL) { + return; + } + + if (sparsity->traversal_order) { + TfLiteIntArrayFree(sparsity->traversal_order); + sparsity->traversal_order = NULL; + } + + if (sparsity->block_map) { + TfLiteIntArrayFree(sparsity->block_map); + sparsity->block_map = NULL; + } + + if (sparsity->dim_metadata) { + for (int i = 0; i < sparsity->dim_metadata_size; i++) { + TfLiteDimensionMetadata metadata = sparsity->dim_metadata[i]; + if (metadata.format == kTfLiteDimSparseCSR) { + TfLiteIntArrayFree(metadata.array_segments); + metadata.array_segments = NULL; + TfLiteIntArrayFree(metadata.array_indices); + metadata.array_indices = NULL; + } + } + free(sparsity->dim_metadata); + sparsity->dim_metadata = NULL; + } + + free(sparsity); +} + void TfLiteTensorFree(TfLiteTensor* t) { TfLiteTensorDataFree(t); if (t->dims) TfLiteIntArrayFree(t->dims); t->dims = NULL; TfLiteQuantizationFree(&t->quantization); + TfLiteSparsityFree(t->sparsity); + t->sparsity = NULL; } void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims, diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index b3b0ddc059d..332b9b68881 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -303,6 +303,29 @@ enum { kTfLiteNullBufferHandle = -1, }; +// Storage format of each dimension in a sparse tensor. +typedef enum { + kTfLiteDimDense = 0, + kTfLiteDimSparseCSR, +} TfLiteDimensionType; + +// Metadata to encode each dimension in a sparse tensor. +typedef struct { + TfLiteDimensionType format; + int dense_size; + TfLiteIntArray* array_segments; + TfLiteIntArray* array_indices; +} TfLiteDimensionMetadata; + +// Parameters used to encode a sparse tensor. For detailed explanation of each +// field please refer to lite/schema/schema.fbs. 
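+// Illustrative sketch (hypothetical shape, not taken from schema.fbs): a
+// 4 x 8 matrix traversed row-major, with a dense row dimension and a
+// CSR-encoded column dimension, would roughly use traversal_order = {0, 1},
+// an empty block_map, dim_metadata[0] = {kTfLiteDimDense, dense_size: 4},
+// and dim_metadata[1] = {kTfLiteDimSparseCSR, array_segments: per-row
+// offsets into the stored non-zero values, array_indices: the column index
+// of each stored non-zero}.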
+typedef struct { + TfLiteIntArray* traversal_order; + TfLiteIntArray* block_map; + TfLiteDimensionMetadata* dim_metadata; + int dim_metadata_size; +} TfLiteSparsity; + // An tensor in the interpreter system which is a wrapper around a buffer of // data including a dimensionality (or NULL if not currently defined). typedef struct { @@ -357,6 +380,11 @@ typedef struct { // Quantization information. Replaces params field above. TfLiteQuantization quantization; + + // Parameters used to encode a sparse tensor. + // This is optional. The field is NULL if a tensor is dense. + // WARNING: This is an experimental interface that is subject to change. + TfLiteSparsity* sparsity; } TfLiteTensor; // Free data memory of tensor `t`. @@ -365,6 +393,9 @@ void TfLiteTensorDataFree(TfLiteTensor* t); // Free quantization data. void TfLiteQuantizationFree(TfLiteQuantization* quantization); +// Free sparsity parameters. +void TfLiteSparsityFree(TfLiteSparsity* sparsity); + // Free memory of tensor `t`. void TfLiteTensorFree(TfLiteTensor* t); diff --git a/tensorflow/lite/c/common_test.cc b/tensorflow/lite/c/common_test.cc index 88ac181faf6..65c6ec63b28 100644 --- a/tensorflow/lite/c/common_test.cc +++ b/tensorflow/lite/c/common_test.cc @@ -96,6 +96,7 @@ TEST(Quantization, TestQuantizationFree) { t.allocation_type = kTfLiteArenaRw; t.dims = nullptr; t.quantization.type = kTfLiteAffineQuantization; + t.sparsity = nullptr; auto* params = reinterpret_cast( malloc(sizeof(TfLiteAffineQuantization))); params->scale = TfLiteFloatArrayCreate(3); @@ -104,6 +105,31 @@ TEST(Quantization, TestQuantizationFree) { TfLiteTensorFree(&t); } +TEST(Sparsity, TestSparsityFree) { + TfLiteTensor t; + // Set these values, otherwise TfLiteTensorFree has uninitialized values. + t.allocation_type = kTfLiteArenaRw; + t.dims = nullptr; + + // A dummy CSR sparse matrix. 
+ t.sparsity = static_cast(malloc(sizeof(TfLiteSparsity))); + t.sparsity->traversal_order = TfLiteIntArrayCreate(2); + t.sparsity->block_map = nullptr; + + t.sparsity->dim_metadata = static_cast( + malloc(sizeof(TfLiteDimensionMetadata) * 2)); + t.sparsity->dim_metadata_size = 2; + + t.sparsity->dim_metadata[0].format = kTfLiteDimDense; + t.sparsity->dim_metadata[0].dense_size = 4; + + t.sparsity->dim_metadata[1].format = kTfLiteDimSparseCSR; + t.sparsity->dim_metadata[1].array_segments = TfLiteIntArrayCreate(2); + t.sparsity->dim_metadata[1].array_indices = TfLiteIntArrayCreate(3); + + TfLiteTensorFree(&t); +} + } // namespace tflite int main(int argc, char** argv) { diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index e453ff2ff7e..69c39769593 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -40,6 +40,15 @@ struct TfLiteQuantizationDeleter { using ScopedTfLiteQuantization = std::unique_ptr; +struct TfLiteSparsityDeleter { + void operator()(TfLiteSparsity* s) { + if (s) TfLiteSparsityFree(s); + } +}; + +using ScopedTfLiteSparsity = + std::unique_ptr; + TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node, const TfLiteRegistration& registration, int node_index, const char* message) { @@ -908,9 +917,10 @@ TfLiteStatus Subgraph::GetNodeAndRegistration( TfLiteStatus Subgraph::SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantization quantization, const char* buffer, - size_t bytes, const Allocation* allocation) { + size_t bytes, const Allocation* allocation, TfLiteSparsity* sparsity) { // Ensure quantization cleanup on failure. ScopedTfLiteQuantization scoped_quantization(&quantization); + ScopedTfLiteSparsity scoped_sparsity(sparsity); if (state_ == kStateInvokableAndImmutable) { ReportError( "SetTensorParametersReadOnly is disallowed when graph is immutable."); @@ -919,10 +929,12 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly( TF_LITE_ENSURE(&context_, tensor_index < context_.tensors_size && tensor_index >= 0); + // For most tensors we know exactly how much memory is necessary so we can // ensure the buffer is large enough. However, we need to skip string tensors - // because their sizes change with the contents of the individual strings. - if (type != kTfLiteString) { + // and sparse tensors because their sizes change with the contents. + // TODO(b/145615516): Extend BytesRequired to check sparse tensors. + if (type != kTfLiteString && sparsity == nullptr) { size_t required_bytes; TF_LITE_ENSURE_OK(&context_, BytesRequired(type, dims, rank, &required_bytes)); @@ -939,6 +951,7 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly( if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims); tensor.params = GetLegacyQuantization(quantization); tensor.quantization = *scoped_quantization.release(); + tensor.sparsity = scoped_sparsity.release(); tensor.allocation_type = kTfLiteMmapRo; tensor.allocation = allocation; } else { @@ -950,6 +963,7 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly( // TODO(suharshs): Update TfLiteTensorReset to include the new quantization // if there are other required callers. 
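  // (Aside: the release() calls below hand ownership of the quantization and
  // sparsity structs to the tensor, so the TfLiteQuantizationFree /
  // TfLiteSparsityFree calls in TfLiteTensorFree in common.c reclaim them
  // when the tensor is destroyed.)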
tensor.quantization = *scoped_quantization.release(); + tensor.sparsity = scoped_sparsity.release(); } return kTfLiteOk; } diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index 89a9da7db28..c2572546709 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -94,16 +94,17 @@ class Subgraph { inline TfLiteStatus SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const std::vector& dims, TfLiteQuantization quantization, - const char* buffer, size_t bytes, - const Allocation* allocation = nullptr) { + const char* buffer, size_t bytes, const Allocation* allocation = nullptr, + TfLiteSparsity* sparsity = nullptr) { return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(), dims.data(), quantization, buffer, bytes, - allocation); + allocation, sparsity); } TfLiteStatus SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantization quantization, const char* buffer, - size_t bytes, const Allocation* allocation = nullptr); + size_t bytes, const Allocation* allocation = nullptr, + TfLiteSparsity* sparsity = nullptr); // Set description of inputs/outputs/data/fptrs for node `node_index`. // This variant assumes an external buffer has been allocated of size diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc index d060289a0ee..a4287a57ea0 100644 --- a/tensorflow/lite/model.cc +++ b/tensorflow/lite/model.cc @@ -416,6 +416,77 @@ TfLiteStatus InterpreterBuilder::ParseQuantization( return kTfLiteOk; } +// TODO(b/145614687): Add sparse tensor verification check in +// lite/tools/verifier.cc. +TfLiteStatus InterpreterBuilder::ParseSparsity( + const SparsityParameters* src_sparsity, TfLiteSparsity** sparsity_ptr) { + if (!src_sparsity) { + return kTfLiteOk; + } + + auto* sparsity = + reinterpret_cast(malloc(sizeof(TfLiteSparsity))); + memset(sparsity, 0, sizeof(TfLiteSparsity)); + *sparsity_ptr = sparsity; + + if (src_sparsity->traversal_order()) { + const size_t traversal_order_size = src_sparsity->traversal_order()->size(); + sparsity->traversal_order = TfLiteIntArrayCreate(traversal_order_size); + for (int i = 0; i < traversal_order_size; i++) { + sparsity->traversal_order->data[i] = + src_sparsity->traversal_order()->Get(i); + } + } + + if (src_sparsity->block_map()) { + const size_t block_map_size = src_sparsity->block_map()->size(); + sparsity->block_map = TfLiteIntArrayCreate(block_map_size); + for (int i = 0; i < block_map_size; i++) { + sparsity->block_map->data[i] = src_sparsity->block_map()->Get(i); + } + } + + if (src_sparsity->dim_metadata()) { + const size_t dim_metadata_size = src_sparsity->dim_metadata()->size(); + sparsity->dim_metadata_size = dim_metadata_size; + sparsity->dim_metadata = reinterpret_cast( + malloc(dim_metadata_size * sizeof(TfLiteDimensionMetadata))); + memset(sparsity->dim_metadata, 0, + dim_metadata_size * sizeof(TfLiteDimensionMetadata)); + + for (int i = 0; i < dim_metadata_size; i++) { + const auto* src_metadata = src_sparsity->dim_metadata()->Get(i); + auto* tgt_metadata = &sparsity->dim_metadata[i]; + + tgt_metadata->format = + static_cast(src_metadata->format()); + + if (tgt_metadata->format == kTfLiteDimDense) { + tgt_metadata->dense_size = src_metadata->dense_size(); + } else if (tgt_metadata->format == kTfLiteDimSparseCSR) { + const int array_segments_size = src_metadata->array_segments()->size(); + tgt_metadata->array_segments = + TfLiteIntArrayCreate(array_segments_size); 
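+        // Copy the CSR segment and index arrays out of the flatbuffer into
+        // runtime-owned TfLiteIntArrays, which TfLiteSparsityFree later
+        // releases.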
+ for (int j = 0; j < array_segments_size; j++) { + tgt_metadata->array_segments->data[j] = + src_metadata->array_segments()->Get(j); + } + const int array_indices_size = src_metadata->array_indices()->size(); + tgt_metadata->array_indices = TfLiteIntArrayCreate(array_indices_size); + for (int j = 0; j < array_indices_size; j++) { + tgt_metadata->array_indices->data[j] = + src_metadata->array_indices()->Get(j); + } + } else { + error_reporter_->Report("Unsupported dimension type."); + return kTfLiteError; + } + } + } + + return kTfLiteOk; +} + TfLiteStatus InterpreterBuilder::ParseTensors( const flatbuffers::Vector>* buffers, const flatbuffers::Vector>* tensors, @@ -474,6 +545,13 @@ TfLiteStatus InterpreterBuilder::ParseTensors( continue; } + const auto* src_sparsity = tensor->sparsity(); + TfLiteSparsity* sparsity = nullptr; + if (ParseSparsity(src_sparsity, &sparsity) != kTfLiteOk) { + status = kTfLiteError; + continue; + } + bool is_variable = tensor->is_variable(); if (buffer_ptr) { if (is_variable) { @@ -486,12 +564,13 @@ TfLiteStatus InterpreterBuilder::ParseTensors( if (subgraph->SetTensorParametersReadOnly( i, type, get_name(tensor), dims, quantization, buffer_ptr, - buffer_size, allocation_) != kTfLiteOk) { + buffer_size, allocation_, sparsity) != kTfLiteOk) { error_reporter_->Report("Tensor %d is invalidly specified in schema.\n", i); status = kTfLiteError; } } else { + // TODO(b/144999664): Non-constant sparse tensor is not supported now. if (subgraph->SetTensorParametersReadWrite(i, type, get_name(tensor), dims, quantization, is_variable) != kTfLiteOk) { diff --git a/tensorflow/lite/model.h b/tensorflow/lite/model.h index fafb38ffd10..b8b4b4457da 100644 --- a/tensorflow/lite/model.h +++ b/tensorflow/lite/model.h @@ -223,6 +223,8 @@ class InterpreterBuilder { TfLiteStatus ParseQuantization(const QuantizationParameters* src_quantization, TfLiteQuantization* quantization, const std::vector& dims); + TfLiteStatus ParseSparsity(const SparsityParameters* src_sparsity, + TfLiteSparsity** sparsity); const ::tflite::Model* model_; const OpResolver& op_resolver_; diff --git a/tensorflow/lite/model_test.cc b/tensorflow/lite/model_test.cc index 7dc582b8862..2675715a613 100644 --- a/tensorflow/lite/model_test.cc +++ b/tensorflow/lite/model_test.cc @@ -331,6 +331,78 @@ TEST(BasicFlatBufferModel, TestReadRuntimeVersionFromModel) { ASSERT_EQ(model2->GetMinimumRuntime(), "1.10.0"); } +// The test model has the following tensor encoded in the TACO format: +// [[1, 0, 2, 3], +// [0, 4, 0, 0], +// [0, 0, 5, 0], +// [0, 0, 0, 6]]. +// TACO supports multiple encodings like CSR, CSC, etc. We chose to use the one +// similar to the blocked-CSR format with 2x2 row-major dense blocks. +TEST(BasicFlatBufferModel, TestParseModelWithSparseTensor) { + // The model only has 1 sparse constant tensor. 
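+  // With 2x2 blocks, the 4x4 matrix above is treated as a 2x2 grid of 2x2
+  // blocks, i.e. dims (d0, d1, d2, d3) = (2, 2, 2, 2) where d2/d3 are the
+  // block-internal dimensions. The metadata checked below follows from that
+  // view:
+  //  - traversal_order = (0, 1, 2, 3): row-major over the blocks, then
+  //    row-major inside each block.
+  //  - block_map = (0, 1): block dims d2/d3 subdivide original dims d0/d1.
+  //  - d0 is DENSE with size 2 (two block rows); d1 is SPARSE_CSR with
+  //    array_segments = [0, 2, 3] (block row 0 holds two non-zero blocks,
+  //    block row 1 holds one) and array_indices = [0, 1, 1] (their block
+  //    columns); d2 and d3 are DENSE with size 2 (the block interior).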
+ auto model = FlatBufferModel::BuildFromFile( + "tensorflow/lite/testdata/sparse_tensor.bin"); + ASSERT_TRUE(model); + + std::unique_ptr interpreter(new Interpreter); + ASSERT_EQ(InterpreterBuilder(*model, TrivialResolver())(&interpreter), + kTfLiteOk); + ASSERT_NE(interpreter, nullptr); + ASSERT_EQ(interpreter->tensors_size(), 1); + TfLiteTensor* t1 = interpreter->tensor(0); + ASSERT_EQ(t1->allocation_type, kTfLiteMmapRo); + + TfLiteIntArray* traversal_order = TfLiteIntArrayCreate(4); + traversal_order->data[0] = 0; + traversal_order->data[1] = 1; + traversal_order->data[2] = 2; + traversal_order->data[3] = 3; + ASSERT_TRUE( + TfLiteIntArrayEqual(t1->sparsity->traversal_order, traversal_order)); + TfLiteIntArrayFree(traversal_order); + + TfLiteIntArray* block_map = TfLiteIntArrayCreate(2); + block_map->data[0] = 0; + block_map->data[1] = 1; + ASSERT_TRUE(TfLiteIntArrayEqual(t1->sparsity->block_map, block_map)); + TfLiteIntArrayFree(block_map); + + ASSERT_EQ(t1->sparsity->dim_metadata_size, 4); + + ASSERT_EQ(t1->sparsity->dim_metadata[0].format, kTfLiteDimDense); + ASSERT_EQ(t1->sparsity->dim_metadata[0].dense_size, 2); + ASSERT_EQ(t1->sparsity->dim_metadata[0].array_segments, nullptr); + ASSERT_EQ(t1->sparsity->dim_metadata[0].array_indices, nullptr); + + ASSERT_EQ(t1->sparsity->dim_metadata[1].format, kTfLiteDimSparseCSR); + ASSERT_EQ(t1->sparsity->dim_metadata[1].dense_size, 0); + TfLiteIntArray* array_segments = TfLiteIntArrayCreate(3); + array_segments->data[0] = 0; + array_segments->data[1] = 2; + array_segments->data[2] = 3; + ASSERT_TRUE(TfLiteIntArrayEqual(t1->sparsity->dim_metadata[1].array_segments, + array_segments)); + TfLiteIntArrayFree(array_segments); + + TfLiteIntArray* array_indices = TfLiteIntArrayCreate(3); + array_indices->data[0] = 0; + array_indices->data[1] = 1; + array_indices->data[2] = 1; + ASSERT_TRUE(TfLiteIntArrayEqual(t1->sparsity->dim_metadata[1].array_indices, + array_indices)); + TfLiteIntArrayFree(array_indices); + + ASSERT_EQ(t1->sparsity->dim_metadata[2].format, kTfLiteDimDense); + ASSERT_EQ(t1->sparsity->dim_metadata[2].dense_size, 2); + ASSERT_EQ(t1->sparsity->dim_metadata[2].array_segments, nullptr); + ASSERT_EQ(t1->sparsity->dim_metadata[2].array_indices, nullptr); + + ASSERT_EQ(t1->sparsity->dim_metadata[3].format, kTfLiteDimDense); + ASSERT_EQ(t1->sparsity->dim_metadata[3].dense_size, 2); + ASSERT_EQ(t1->sparsity->dim_metadata[3].array_segments, nullptr); + ASSERT_EQ(t1->sparsity->dim_metadata[3].array_indices, nullptr); +} + // TODO(aselle): Add tests for serialization of builtin op data types. // These tests will occur with the evaluation tests of individual operators, // not here. diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs index f1fbfc655d6..63fd3bbc4d6 100644 --- a/tensorflow/lite/schema/schema.fbs +++ b/tensorflow/lite/schema/schema.fbs @@ -80,6 +80,82 @@ table QuantizationParameters { quantized_dimension:int; } +// Sparse tensors. +// To encode a conceptual n-dimensional dense tensor with dims (d0, ..., dn-1), +// potentially with a k-dimensional block (0 <= k <= n) with dims +// (dn, ..., dn+k-1), the format needs to specify: +// 1. In what order to traverse these dimensions. For example, to store a 2-D +// matrix in row major order, the traversal order would be (d0, d1), +// whereas to store it in column major order, the traversal order would be +// (d1, d0). If the 2-D matrix has a 2-D inner block, the traversal order +// could be (d0, d1, d2, d3). +// 2. 
In the order of (d0, ..., dn-1, dn, ..., dn+k-1), whether each dimension +// is DENSE or SPARSE. +// 3. How each block dimension in (dn, ..., dn+k-1) maps to the original +// tensor dimension in (d0, ..., dn-1). +// 4. Index metadata for each dimension. For a dense dimension, this is just +// the size of that dimension. For a sparse dimension, it's the same as +// the compressed index defined in the Compressed Sparse Row (CSR) format. +// (http://scipy-lectures.org/advanced/scipy_sparse/csr_matrix.html) + +// The storage type for a dimension. Currently we support: +// 1. DENSE: each coordinate in this dimension is stored implicitly. +// 2. SPARSE_CSR: only the coordinates with non-zero elements are stored. The +// compression technique is the same what CSR uses. +// More types like a sparse dimension with a different compression technique +// could be added to the list in the future. +enum DimensionType : byte { + DENSE = 0, + SPARSE_CSR = 1, +} + +table DimensionMetadata { + // Whether each dimension is dense or sparse. + format:DimensionType; + // Index metadata used for each dimension. + // - If format is DimensionType.DENSE then we use the dense_size field to + // store the size of that dimension. Each index in that dimension is + // stored implicitly. + // - If format is DimensionType.SPARSE_CSR then we use array_segments and + // array_indices to encode that dimension. array_segments represents how + // to segment the indices array, each segment corresponds to one element + // in the previous dimension. array_indices represents the index of the + // non-zero elements within this dimension (as those in the CSR matrix + // format, where the first array is row pointers and the second array is + // column indices). + dense_size:int; + array_segments:[int]; + array_indices:[int]; +} + +// Parameters to encode a sparse TfLite tensor. +table SparsityParameters { + // The traversal order of the dimensions defined in the `shape` field of the + // conceptual dense tensor. For a n-dimensional tensors with dims (d0, d1, + // ..., dn-1), + // - if not block sparse, the traversal_order is just a permutation of (d0, + // ..., dn-1). For example, a 2-D matrix stored in row-major order would + // have traversal_order = (d0, d1). + // - if block sparse with a k-dimensional block (0 <= k <= n), the + // traversal_order has n + k elements. The first n elements are still a + // permutation of (d0, ..., dn-1). The lask k elements are a permutation + // of (dn, ..., dn+k-1), defining how to traverse a block internally. For + // example, a 2-D matrix with 2-D blocks, both stored in row-major order + // would have traversal_order = (d0, d1, d2, d3). + traversal_order:[int]; + // For an n-dimensional tensor with a k-dimensional block (0 <= k <= n), + // stores how a block dimension in (dn, ..., dn+k-1) maps to the original + // tensor dimension in (d0, ..., dn). + // It's stored in the order of (dn, ..., dn+k-1). + // If not block-sparse, this field is NULL. + block_map:[int]; + // In the order of (d0, ..., dn-1, dn, ..., dn+k-1), the metadata needed for + // each dimension to locate the non-zero values in the original dense tensor. + // The size of the dim_metadata array = the size of the traversal_order array + // = n + k. + dim_metadata:[DimensionMetadata]; +} + table Tensor { // The tensor shape. The meaning of each entry is operator-specific but // builtin ops use: [batch size, height, width, number of channels] (That's @@ -99,6 +175,10 @@ table Tensor { quantization:QuantizationParameters; // Optional. 
is_variable:bool = false; + + // Parameters to encode a sparse tensor. See the example in + // tensorflow/lite/testdata/sparse_tensor.json. + sparsity:SparsityParameters; // Optional. } // A list of builtin operators. Builtin operators are slightly faster than custom diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h index ea2f1cc0b8b..ae523cc7d5a 100755 --- a/tensorflow/lite/schema/schema_generated.h +++ b/tensorflow/lite/schema/schema_generated.h @@ -28,6 +28,12 @@ struct CustomQuantizationT; struct QuantizationParameters; struct QuantizationParametersT; +struct DimensionMetadata; +struct DimensionMetadataT; + +struct SparsityParameters; +struct SparsityParametersT; + struct Tensor; struct TensorT; @@ -477,6 +483,36 @@ struct QuantizationDetailsUnion { bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type); bool VerifyQuantizationDetailsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); +enum DimensionType { + DimensionType_DENSE = 0, + DimensionType_SPARSE_CSR = 1, + DimensionType_MIN = DimensionType_DENSE, + DimensionType_MAX = DimensionType_SPARSE_CSR +}; + +inline const DimensionType (&EnumValuesDimensionType())[2] { + static const DimensionType values[] = { + DimensionType_DENSE, + DimensionType_SPARSE_CSR + }; + return values; +} + +inline const char * const *EnumNamesDimensionType() { + static const char * const names[] = { + "DENSE", + "SPARSE_CSR", + nullptr + }; + return names; +} + +inline const char *EnumNameDimensionType(DimensionType e) { + if (e < DimensionType_DENSE || e > DimensionType_SPARSE_CSR) return ""; + const size_t index = static_cast(e); + return EnumNamesDimensionType()[index]; +} + enum BuiltinOperator { BuiltinOperator_ADD = 0, BuiltinOperator_AVERAGE_POOL_2D = 1, @@ -2867,6 +2903,206 @@ inline flatbuffers::Offset CreateQuantizationParametersD flatbuffers::Offset CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct DimensionMetadataT : public flatbuffers::NativeTable { + typedef DimensionMetadata TableType; + DimensionType format; + int32_t dense_size; + std::vector array_segments; + std::vector array_indices; + DimensionMetadataT() + : format(DimensionType_DENSE), + dense_size(0) { + } +}; + +struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DimensionMetadataT NativeTableType; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FORMAT = 4, + VT_DENSE_SIZE = 6, + VT_ARRAY_SEGMENTS = 8, + VT_ARRAY_INDICES = 10 + }; + DimensionType format() const { + return static_cast(GetField(VT_FORMAT, 0)); + } + int32_t dense_size() const { + return GetField(VT_DENSE_SIZE, 0); + } + const flatbuffers::Vector *array_segments() const { + return GetPointer *>(VT_ARRAY_SEGMENTS); + } + const flatbuffers::Vector *array_indices() const { + return GetPointer *>(VT_ARRAY_INDICES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FORMAT) && + VerifyField(verifier, VT_DENSE_SIZE) && + VerifyOffset(verifier, VT_ARRAY_SEGMENTS) && + verifier.VerifyVector(array_segments()) && + VerifyOffset(verifier, VT_ARRAY_INDICES) && + verifier.VerifyVector(array_indices()) && + verifier.EndTable(); + } + DimensionMetadataT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) 
const; + void UnPackTo(DimensionMetadataT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DimensionMetadataBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_format(DimensionType format) { + fbb_.AddElement(DimensionMetadata::VT_FORMAT, static_cast(format), 0); + } + void add_dense_size(int32_t dense_size) { + fbb_.AddElement(DimensionMetadata::VT_DENSE_SIZE, dense_size, 0); + } + void add_array_segments(flatbuffers::Offset> array_segments) { + fbb_.AddOffset(DimensionMetadata::VT_ARRAY_SEGMENTS, array_segments); + } + void add_array_indices(flatbuffers::Offset> array_indices) { + fbb_.AddOffset(DimensionMetadata::VT_ARRAY_INDICES, array_indices); + } + explicit DimensionMetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DimensionMetadataBuilder &operator=(const DimensionMetadataBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDimensionMetadata( + flatbuffers::FlatBufferBuilder &_fbb, + DimensionType format = DimensionType_DENSE, + int32_t dense_size = 0, + flatbuffers::Offset> array_segments = 0, + flatbuffers::Offset> array_indices = 0) { + DimensionMetadataBuilder builder_(_fbb); + builder_.add_array_indices(array_indices); + builder_.add_array_segments(array_segments); + builder_.add_dense_size(dense_size); + builder_.add_format(format); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateDimensionMetadataDirect( + flatbuffers::FlatBufferBuilder &_fbb, + DimensionType format = DimensionType_DENSE, + int32_t dense_size = 0, + const std::vector *array_segments = nullptr, + const std::vector *array_indices = nullptr) { + auto array_segments__ = array_segments ? _fbb.CreateVector(*array_segments) : 0; + auto array_indices__ = array_indices ? 
_fbb.CreateVector(*array_indices) : 0; + return tflite::CreateDimensionMetadata( + _fbb, + format, + dense_size, + array_segments__, + array_indices__); +} + +flatbuffers::Offset CreateDimensionMetadata(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SparsityParametersT : public flatbuffers::NativeTable { + typedef SparsityParameters TableType; + std::vector traversal_order; + std::vector block_map; + std::vector> dim_metadata; + SparsityParametersT() { + } +}; + +struct SparsityParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef SparsityParametersT NativeTableType; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TRAVERSAL_ORDER = 4, + VT_BLOCK_MAP = 6, + VT_DIM_METADATA = 8 + }; + const flatbuffers::Vector *traversal_order() const { + return GetPointer *>(VT_TRAVERSAL_ORDER); + } + const flatbuffers::Vector *block_map() const { + return GetPointer *>(VT_BLOCK_MAP); + } + const flatbuffers::Vector> *dim_metadata() const { + return GetPointer> *>(VT_DIM_METADATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_TRAVERSAL_ORDER) && + verifier.VerifyVector(traversal_order()) && + VerifyOffset(verifier, VT_BLOCK_MAP) && + verifier.VerifyVector(block_map()) && + VerifyOffset(verifier, VT_DIM_METADATA) && + verifier.VerifyVector(dim_metadata()) && + verifier.VerifyVectorOfTables(dim_metadata()) && + verifier.EndTable(); + } + SparsityParametersT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SparsityParametersT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SparsityParametersBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_traversal_order(flatbuffers::Offset> traversal_order) { + fbb_.AddOffset(SparsityParameters::VT_TRAVERSAL_ORDER, traversal_order); + } + void add_block_map(flatbuffers::Offset> block_map) { + fbb_.AddOffset(SparsityParameters::VT_BLOCK_MAP, block_map); + } + void add_dim_metadata(flatbuffers::Offset>> dim_metadata) { + fbb_.AddOffset(SparsityParameters::VT_DIM_METADATA, dim_metadata); + } + explicit SparsityParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + SparsityParametersBuilder &operator=(const SparsityParametersBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateSparsityParameters( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> traversal_order = 0, + flatbuffers::Offset> block_map = 0, + flatbuffers::Offset>> dim_metadata = 0) { + SparsityParametersBuilder builder_(_fbb); + builder_.add_dim_metadata(dim_metadata); + builder_.add_block_map(block_map); + builder_.add_traversal_order(traversal_order); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateSparsityParametersDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *traversal_order = nullptr, + const std::vector *block_map = nullptr, + const std::vector> *dim_metadata = nullptr) { + auto traversal_order__ = traversal_order ? 
_fbb.CreateVector(*traversal_order) : 0; + auto block_map__ = block_map ? _fbb.CreateVector(*block_map) : 0; + auto dim_metadata__ = dim_metadata ? _fbb.CreateVector>(*dim_metadata) : 0; + return tflite::CreateSparsityParameters( + _fbb, + traversal_order__, + block_map__, + dim_metadata__); +} + +flatbuffers::Offset CreateSparsityParameters(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct TensorT : public flatbuffers::NativeTable { typedef Tensor TableType; std::vector shape; @@ -2875,6 +3111,7 @@ struct TensorT : public flatbuffers::NativeTable { std::string name; std::unique_ptr quantization; bool is_variable; + std::unique_ptr sparsity; TensorT() : type(TensorType_FLOAT32), buffer(0), @@ -2890,7 +3127,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_BUFFER = 8, VT_NAME = 10, VT_QUANTIZATION = 12, - VT_IS_VARIABLE = 14 + VT_IS_VARIABLE = 14, + VT_SPARSITY = 16 }; const flatbuffers::Vector *shape() const { return GetPointer *>(VT_SHAPE); @@ -2910,6 +3148,9 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { bool is_variable() const { return GetField(VT_IS_VARIABLE, 0) != 0; } + const SparsityParameters *sparsity() const { + return GetPointer(VT_SPARSITY); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) && @@ -2921,6 +3162,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_QUANTIZATION) && verifier.VerifyTable(quantization()) && VerifyField(verifier, VT_IS_VARIABLE) && + VerifyOffset(verifier, VT_SPARSITY) && + verifier.VerifyTable(sparsity()) && verifier.EndTable(); } TensorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -2949,6 +3192,9 @@ struct TensorBuilder { void add_is_variable(bool is_variable) { fbb_.AddElement(Tensor::VT_IS_VARIABLE, static_cast(is_variable), 0); } + void add_sparsity(flatbuffers::Offset sparsity) { + fbb_.AddOffset(Tensor::VT_SPARSITY, sparsity); + } explicit TensorBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -2968,8 +3214,10 @@ inline flatbuffers::Offset CreateTensor( uint32_t buffer = 0, flatbuffers::Offset name = 0, flatbuffers::Offset quantization = 0, - bool is_variable = false) { + bool is_variable = false, + flatbuffers::Offset sparsity = 0) { TensorBuilder builder_(_fbb); + builder_.add_sparsity(sparsity); builder_.add_quantization(quantization); builder_.add_name(name); builder_.add_buffer(buffer); @@ -2986,7 +3234,8 @@ inline flatbuffers::Offset CreateTensorDirect( uint32_t buffer = 0, const char *name = nullptr, flatbuffers::Offset quantization = 0, - bool is_variable = false) { + bool is_variable = false, + flatbuffers::Offset sparsity = 0) { auto shape__ = shape ? _fbb.CreateVector(*shape) : 0; auto name__ = name ? 
_fbb.CreateString(name) : 0; return tflite::CreateTensor( @@ -2996,7 +3245,8 @@ inline flatbuffers::Offset CreateTensorDirect( buffer, name__, quantization, - is_variable); + is_variable, + sparsity); } flatbuffers::Offset CreateTensor(flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -9726,6 +9976,73 @@ inline flatbuffers::Offset CreateQuantizationParameters( _quantized_dimension); } +inline DimensionMetadataT *DimensionMetadata::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new DimensionMetadataT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void DimensionMetadata::UnPackTo(DimensionMetadataT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = format(); _o->format = _e; }; + { auto _e = dense_size(); _o->dense_size = _e; }; + { auto _e = array_segments(); if (_e) { _o->array_segments.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->array_segments[_i] = _e->Get(_i); } } }; + { auto _e = array_indices(); if (_e) { _o->array_indices.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->array_indices[_i] = _e->Get(_i); } } }; +} + +inline flatbuffers::Offset DimensionMetadata::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateDimensionMetadata(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateDimensionMetadata(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DimensionMetadataT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _format = _o->format; + auto _dense_size = _o->dense_size; + auto _array_segments = _o->array_segments.size() ? _fbb.CreateVector(_o->array_segments) : 0; + auto _array_indices = _o->array_indices.size() ? 
_fbb.CreateVector(_o->array_indices) : 0; + return tflite::CreateDimensionMetadata( + _fbb, + _format, + _dense_size, + _array_segments, + _array_indices); +} + +inline SparsityParametersT *SparsityParameters::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new SparsityParametersT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void SparsityParameters::UnPackTo(SparsityParametersT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = traversal_order(); if (_e) { _o->traversal_order.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->traversal_order[_i] = _e->Get(_i); } } }; + { auto _e = block_map(); if (_e) { _o->block_map.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->block_map[_i] = _e->Get(_i); } } }; + { auto _e = dim_metadata(); if (_e) { _o->dim_metadata.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->dim_metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; +} + +inline flatbuffers::Offset SparsityParameters::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateSparsityParameters(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateSparsityParameters(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SparsityParametersT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _traversal_order = _o->traversal_order.size() ? _fbb.CreateVector(_o->traversal_order) : 0; + auto _block_map = _o->block_map.size() ? _fbb.CreateVector(_o->block_map) : 0; + auto _dim_metadata = _o->dim_metadata.size() ? _fbb.CreateVector> (_o->dim_metadata.size(), [](size_t i, _VectorArgs *__va) { return CreateDimensionMetadata(*__va->__fbb, __va->__o->dim_metadata[i].get(), __va->__rehasher); }, &_va ) : 0; + return tflite::CreateSparsityParameters( + _fbb, + _traversal_order, + _block_map, + _dim_metadata); +} + inline TensorT *Tensor::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new TensorT(); UnPackTo(_o, _resolver); @@ -9741,6 +10058,7 @@ inline void Tensor::UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t { auto _e = name(); if (_e) _o->name = _e->str(); }; { auto _e = quantization(); if (_e) _o->quantization = std::unique_ptr(_e->UnPack(_resolver)); }; { auto _e = is_variable(); _o->is_variable = _e; }; + { auto _e = sparsity(); if (_e) _o->sparsity = std::unique_ptr(_e->UnPack(_resolver)); }; } inline flatbuffers::Offset Tensor::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -9757,6 +10075,7 @@ inline flatbuffers::Offset CreateTensor(flatbuffers::FlatBufferBuilder & auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); auto _quantization = _o->quantization ? CreateQuantizationParameters(_fbb, _o->quantization.get(), _rehasher) : 0; auto _is_variable = _o->is_variable; + auto _sparsity = _o->sparsity ? 
CreateSparsityParameters(_fbb, _o->sparsity.get(), _rehasher) : 0; return tflite::CreateTensor( _fbb, _shape, @@ -9764,7 +10083,8 @@ inline flatbuffers::Offset CreateTensor(flatbuffers::FlatBufferBuilder & _buffer, _name, _quantization, - _is_variable); + _is_variable, + _sparsity); } inline Conv2DOptionsT *Conv2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { @@ -12544,7 +12864,7 @@ inline bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const voi auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } - default: return false; + default: return true; } } @@ -12997,7 +13317,7 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } - default: return false; + default: return true; } } diff --git a/tensorflow/lite/testdata/sparse_tensor.bin b/tensorflow/lite/testdata/sparse_tensor.bin new file mode 100644 index 0000000000000000000000000000000000000000..d1445ac648065da9918a1ba72ab8b53374273b5e GIT binary patch literal 412 zcmX|-O%4G;6ohMNKM_P^79tK{Z{q}3mi8iHi_BO!f&*DtSU89SIE7Z<{CJbk-x$r^34h&r zRMgI_rG;MB`uGoOCIuU7THMM+bV)V#`Zn;qjE6x+>Ul3`@0s0 Date: Wed, 4 Dec 2019 16:09:41 -0800 Subject: [PATCH 100/383] Optimize operation ordering to support non-congruent indices. This change adds support for non-congruent indices in the operation ordering within a basic block. This effect of this is that insertions are less likely to cause an invalidation of the ordering within a block. This has a big effect on modules that have very large basic blocks. PiperOrigin-RevId: 283858136 Change-Id: Ic5a15e1d5f9d541d8fb6f79bbfd2018ead71d250 --- third_party/mlir/include/mlir/IR/Operation.h | 20 +++++ third_party/mlir/lib/IR/Block.cpp | 7 +- third_party/mlir/lib/IR/Operation.cpp | 78 ++++++++++++++++++-- 3 files changed, 96 insertions(+), 9 deletions(-) diff --git a/third_party/mlir/include/mlir/IR/Operation.h b/third_party/mlir/include/mlir/IR/Operation.h index 27bc1b17b63..1d9a401cfc0 100644 --- a/third_party/mlir/include/mlir/IR/Operation.h +++ b/third_party/mlir/include/mlir/IR/Operation.h @@ -574,6 +574,26 @@ public: /// handlers that may be listening. InFlightDiagnostic emitRemark(const Twine &message = {}); +private: + //===--------------------------------------------------------------------===// + // Ordering + //===--------------------------------------------------------------------===// + + /// This value represents an invalid index ordering for an operation within a + /// block. + static constexpr unsigned kInvalidOrderIdx = -1; + + /// This value represents the stride to use when computing a new order for an + /// operation. + static constexpr unsigned kOrderStride = 5; + + /// Update the order index of this operation of this operation if necessary, + /// potentially recomputing the order of the parent block. + void updateOrderIfNecessary(); + + /// Returns true if this operation has a valid order. 
+ bool hasValidOrder() { return orderIndex != kInvalidOrderIdx; } + private: Operation(Location location, OperationName name, unsigned numResults, unsigned numSuccessors, unsigned numRegions, diff --git a/third_party/mlir/lib/IR/Block.cpp b/third_party/mlir/lib/IR/Block.cpp index a5013bd86fb..ad68a36f1ee 100644 --- a/third_party/mlir/lib/IR/Block.cpp +++ b/third_party/mlir/lib/IR/Block.cpp @@ -122,7 +122,8 @@ bool Block::verifyOpOrder() { for (auto &i : *this) { // The previous operation must have a smaller order index than the next as // it appears earlier in the list. - if (prev && prev->orderIndex >= i.orderIndex) + if (prev && prev->orderIndex != Operation::kInvalidOrderIdx && + prev->orderIndex >= i.orderIndex) return true; prev = &i; } @@ -133,11 +134,9 @@ bool Block::verifyOpOrder() { void Block::recomputeOpOrder() { parentValidOpOrderPair.setInt(true); - // TODO(riverriddle) Have non-congruent indices to reduce the number of times - // an insert invalidates the list. unsigned orderIndex = 0; for (auto &op : *this) - op.orderIndex = orderIndex++; + op.orderIndex = (orderIndex += Operation::kOrderStride); } //===----------------------------------------------------------------------===// diff --git a/third_party/mlir/lib/IR/Operation.cpp b/third_party/mlir/lib/IR/Operation.cpp index d079033e39b..69b8d056cd5 100644 --- a/third_party/mlir/lib/IR/Operation.cpp +++ b/third_party/mlir/lib/IR/Operation.cpp @@ -366,9 +366,12 @@ InFlightDiagnostic Operation::emitRemark(const Twine &message) { } //===----------------------------------------------------------------------===// -// Other +// Operation Ordering //===----------------------------------------------------------------------===// +constexpr unsigned Operation::kInvalidOrderIdx; +constexpr unsigned Operation::kOrderStride; + /// Given an operation 'other' that is within the same parent block, return /// whether the current operation is before 'other' in the operation list /// of the parent block. @@ -378,12 +381,77 @@ bool Operation::isBeforeInBlock(Operation *other) { assert(block && "Operations without parent blocks have no order."); assert(other && other->block == block && "Expected other operation to have the same parent block."); - // Recompute the parent ordering if necessary. - if (!block->isOpOrderValid()) + // If the order of the block is already invalid, directly recompute the + // parent. + if (!block->isOpOrderValid()) { block->recomputeOpOrder(); + } else { + // Update the order either operation if necessary. + updateOrderIfNecessary(); + other->updateOrderIfNecessary(); + } + return orderIndex < other->orderIndex; } +/// Update the order index of this operation of this operation if necessary, +/// potentially recomputing the order of the parent block. +void Operation::updateOrderIfNecessary() { + assert(block && "expected valid parent"); + + // If the order is valid for this operation there is nothing to do. + if (hasValidOrder()) + return; + Operation *blockFront = &block->front(); + Operation *blockBack = &block->back(); + + // This method is expected to only be invoked on blocks with more than one + // operation. + assert(blockFront != blockBack && "expected more than one operation"); + + // If the operation is at the end of the block. + if (this == blockBack) { + Operation *prevNode = getPrevNode(); + if (!prevNode->hasValidOrder()) + return block->recomputeOpOrder(); + + // Add the stride to the previous operation. 
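+    // E.g. with kOrderStride == 5, an op appended after one whose index is
+    // 20 receives 25, leaving room for later insertions between neighbors.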
+ orderIndex = prevNode->orderIndex + kOrderStride; + return; + } + + // If this is the first operation try to use the next operation to compute the + // ordering. + if (this == blockFront) { + Operation *nextNode = getNextNode(); + if (!nextNode->hasValidOrder()) + return block->recomputeOpOrder(); + // There is no order to give this operation. + if (nextNode->orderIndex == 0) + return block->recomputeOpOrder(); + + // If we can't use the stride, just take the middle value left. This is safe + // because we know there is at least one valid index to assign to. + if (nextNode->orderIndex <= kOrderStride) + orderIndex = (nextNode->orderIndex / 2); + else + orderIndex = kOrderStride; + return; + } + + // Otherwise, this operation is between two others. Place this operation in + // the middle of the previous and next if possible. + Operation *prevNode = getPrevNode(), *nextNode = getNextNode(); + if (!prevNode->hasValidOrder() || !nextNode->hasValidOrder()) + return block->recomputeOpOrder(); + unsigned prevOrder = prevNode->orderIndex, nextOrder = nextNode->orderIndex; + + // Check to see if there is a valid order between the two. + if (prevOrder + 1 == nextOrder) + return block->recomputeOpOrder(); + orderIndex = prevOrder + 1 + ((nextOrder - prevOrder) / 2); +} + //===----------------------------------------------------------------------===// // ilist_traits for Operation //===----------------------------------------------------------------------===// @@ -430,8 +498,8 @@ void llvm::ilist_traits<::mlir::Operation>::addNodeToList(Operation *op) { assert(!op->getBlock() && "already in a operation block!"); op->block = getContainingBlock(); - // Invalidate the block ordering. - op->block->invalidateOpOrder(); + // Invalidate the order on the operation. + op->orderIndex = Operation::kInvalidOrderIdx; } /// This is a trait method invoked when a operation is removed from a block. From 72dcc7d28a0c443728fa1c278771fd2e753fad55 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 16:27:09 -0800 Subject: [PATCH 101/383] Make tf.size() docstring example testable. PiperOrigin-RevId: 283861716 Change-Id: Ib17d226ece32dc4613ced6be3467e2ead68ee5ab --- tensorflow/python/ops/array_ops.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index c550feedfc2..05a1ddc5cea 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -645,10 +645,9 @@ def size_v2(input, out_type=dtypes.int32, name=None): For example: - ```python - t = tf.constant([[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]) - tf.size(t) # 12 - ``` + >>> t = tf.constant([[[1, 1, 1], [2, 2, 2]], [[3, 3, 3], [4, 4, 4]]]) + >>> tf.size(t) + Args: input: A `Tensor` or `SparseTensor`. From 3524e73e09bd70bccb8c8664ba4451a1add6f9a5 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 4 Dec 2019 16:44:00 -0800 Subject: [PATCH 102/383] Add a testable example to tf.math.maximum PiperOrigin-RevId: 283865579 Change-Id: I89e85ebe09a4479898191d495dbc7f2d75b0b271 --- tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt b/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt index 130729ece17..f53382118f3 100644 --- a/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Maximum.pbtxt @@ -6,4 +6,12 @@ op { endpoint { name: "maximum" } + description: <>> x = tf.constant([0., 0., 0., 0.]) +>>> y = tf.constant([-2., 0., 2., 5.]) +>>> tf.math.maximum(x, y) + + +END } From fda7ab091fa547aa3d021c22156ce8938b1076ef Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 16:45:58 -0800 Subject: [PATCH 103/383] Move the debug string from TraceMe name to argument. PiperOrigin-RevId: 283865965 Change-Id: I996b10a3501c57c674bc5fe7d674d36e5d16ea3f --- .../core/distributed_runtime/eager/eager_service_impl.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc index e1a5f341816..3fe219a0290 100644 --- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc +++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc @@ -412,7 +412,8 @@ Status EagerServiceImpl::Enqueue(const EnqueueRequest* request, EnqueueResponse* response, uint64 stream_id) { profiler::TraceMe activity( [&] { - return absl::StrCat("EagerService:Enqueue:", request->DebugString()); + return absl::StrCat( + "EagerService:Enqueue#debug_str=", request->DebugString(), "#"); }, profiler::TraceMeLevel::kInfo); ServerContext* context = nullptr; From ca86f6cfe9e3c852d10ff3d52bb610e95a3d4db5 Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Wed, 4 Dec 2019 16:51:02 -0800 Subject: [PATCH 104/383] Update TFRecordWriter docstring. PiperOrigin-RevId: 283866948 Change-Id: Id9dfa076ed3eb683f48e04dd5222848f90f889bc --- tensorflow/python/lib/io/tf_record.py | 58 ++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py index fed88004ee4..052aabf9288 100644 --- a/tensorflow/python/lib/io/tf_record.py +++ b/tensorflow/python/lib/io/tf_record.py @@ -192,8 +192,64 @@ def tf_record_iterator(path, options=None): class TFRecordWriter(object): """A class to write records to a TFRecords file. + [TFRecords tutorial](https://www.tensorflow.org/tutorials/load_data/tfrecord) + + TFRecords is a binary format which is optimized for high throughput data + retrieval, generally in conjunction with `tf.data`. `TFRecordWriter` is used + to write serialized examples to a file for later consumption. 
The key steps + are: + + Ahead of time: + + - [Convert data into a serialized format]( + https://www.tensorflow.org/tutorials/load_data/tfrecord#tfexample) + - [Write the serialized data to one or more files]( + https://www.tensorflow.org/tutorials/load_data/tfrecord#tfrecord_files_in_python) + + During training or evaluation: + + - [Read serialized examples into memory]( + https://www.tensorflow.org/tutorials/load_data/tfrecord#reading_a_tfrecord_file) + - [Parse (deserialize) examples]( + https://www.tensorflow.org/tutorials/load_data/tfrecord#reading_a_tfrecord_file) + + A minimal example is given below: + + >>> import tempfile + >>> example_path = os.path.join(tempfile.gettempdir(), "example.tfrecords") + >>> np.random.seed(0) + + >>> # Write the records to a file. + ... with tf.io.TFRecordWriter(example_path) as file_writer: + ... for _ in range(4): + ... x, y = np.random.random(), np.random.random() + ... + ... record_bytes = tf.train.Example(features=tf.train.Features(feature={ + ... "x": tf.train.Feature(float_list=tf.train.FloatList(value=[x])), + ... "y": tf.train.Feature(float_list=tf.train.FloatList(value=[y])), + ... })).SerializeToString() + ... file_writer.write(record_bytes) + + >>> # Read the data back out. + >>> def decode_fn(record_bytes): + ... return tf.io.parse_single_example( + ... # Data + ... record_bytes, + ... + ... # Schema + ... {"x": tf.io.FixedLenFeature([], dtype=tf.float32), + ... "y": tf.io.FixedLenFeature([], dtype=tf.float32)} + ... ) + + >>> for batch in tf.data.TFRecordDataset([example_path]).map(decode_fn): + ... print("x = {x:.4f}, y = {y:.4f}".format(**batch)) + x = 0.5488, y = 0.7152 + x = 0.6028, y = 0.5449 + x = 0.4237, y = 0.6459 + x = 0.4376, y = 0.8918 + This class implements `__enter__` and `__exit__`, and can be used - in `with` blocks like a normal file. + in `with` blocks like a normal file. (See the usage example above.) """ # TODO(josh11b): Support appending? From 30b99dd053aad083927faecc5a6eaca398e55410 Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Wed, 4 Dec 2019 16:52:47 -0800 Subject: [PATCH 105/383] Refactor the keras optimizer gradient apply function to reuse a context call and skip name scopes in eager mode where they have no effect. This reduces the Python overhead of applying gradient updates in eager mode. PiperOrigin-RevId: 283867294 Change-Id: I8d61428b79d377c3f0ff724a56aaffdb795865ba --- .../python/keras/optimizer_v2/optimizer_v2.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py index 1e97ae469bb..c2ad67c3103 100644 --- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py +++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py @@ -474,18 +474,17 @@ class OptimizerV2(trackable.Trackable): else: return update_op + eagerly_outside_functions = ops.executing_eagerly_outside_functions() update_ops = [] - with backend.name_scope(name or self._name): + with ops.name_scope(name or self._name, skip_on_eager=True): for grad, var in grads_and_vars: - scope_name = ("update" if ops.executing_eagerly_outside_functions() else - "update_" + var.op.name) # Colocate the update with variables to avoid unnecessary communication # delays. See b/136304694. 
- with backend.name_scope( - scope_name), distribution.extended.colocate_vars_with(var): - update_ops.extend( - distribution.extended.update( - var, apply_grad_to_update_var, args=(grad,), group=False)) + with distribution.extended.colocate_vars_with(var): + with ops.name_scope("update" if eagerly_outside_functions else + "update_" + var.op.name, skip_on_eager=True): + update_ops.extend(distribution.extended.update( + var, apply_grad_to_update_var, args=(grad,), group=False)) any_symbolic = any(isinstance(i, ops.Operation) or tf_utils.is_symbolic_tensor(i) for i in update_ops) From b656428fb17dc3c382f5f587aa4fefea41d4dede Mon Sep 17 00:00:00 2001 From: Rachel Lim Date: Wed, 4 Dec 2019 16:54:49 -0800 Subject: [PATCH 106/383] 1) Remove `autotune_algorithm` from `experimental_options`. Instead, use HILL_CLIMB when buffer size autotuning is disabled and GRADIENT_DESCENT when buffer size autotuning is enabled. 2) Some refactoring: a) s/static_optimization/static_rewrite, because not all our rewrites are 'optimizations', so to speak b) moved logic for determining which autotuning options to apply into `optimization_options.py` PiperOrigin-RevId: 283867649 Change-Id: Ica01a469f8c0039b11db2aa2304a50c700a5ddd7 --- .../benchmarks/autotune_benchmark.py | 252 +++++++----------- .../kernel_tests/optimize_dataset_test.py | 35 ++- .../kernel_tests/prefetch_with_slack_test.py | 8 +- .../experimental/ops/optimization_options.py | 69 +++-- tensorflow/python/data/ops/dataset_ops.py | 95 +++---- ...a.experimental.-optimization-options.pbtxt | 4 - ...a.experimental.-optimization-options.pbtxt | 4 - 7 files changed, 217 insertions(+), 250 deletions(-) diff --git a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py index a6ee0d7dec7..9123aff4df9 100644 --- a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py +++ b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py @@ -23,6 +23,7 @@ import numpy as np from tensorflow.python.client import session from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.util import nest from tensorflow.python.ops import math_ops from tensorflow.python.platform import test @@ -30,149 +31,116 @@ from tensorflow.python.platform import test class AutotuneBenchmark(test.Benchmark): """Benchmarks for autotuning performance knobs.""" - def benchmark_map(self): - a = self._benchmark_map(autotune=False) - b = self._benchmark_map(autotune=True) - c = self._benchmark_map( - autotune=True, algorithm=dataset_ops.AutotuneAlgorithm.GRADIENT_DESCENT) - print("HillClimb vs Default speedup: %f" % (a / b)) - print("GradientDescent vs Default speedup: %f" % (a / c)) - - def _benchmark_map(self, - autotune, - algorithm=dataset_ops.AutotuneAlgorithm.HILL_CLIMB): - k = 1024 * 1024 - dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k), - np.random.rand(4 * k, - 1))).repeat() - dataset = dataset.map( - math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) + def _run_benchmark(self, dataset, autotune, autotune_buffers, + benchmark_iters, benchmark_label): options = dataset_ops.Options() options.experimental_optimization.apply_default_optimizations = False options.experimental_optimization.autotune = autotune - if autotune: - options.experimental_optimization.autotune_algorithm = algorithm.value + options.experimental_optimization.autotune_buffers = autotune_buffers dataset = dataset.with_options(options) iterator = 
dataset_ops.make_one_shot_iterator(dataset) get_next = iterator.get_next() + # Run the op directly to avoid copying the tensor to python. + get_next_op = nest.flatten(get_next)[0].op deltas = [] with session.Session() as sess: for _ in range(5): - sess.run(get_next.op) - for _ in range(10000): + sess.run(get_next_op) + for _ in range(benchmark_iters): start = time.time() - sess.run(get_next.op) + sess.run(get_next_op) end = time.time() deltas.append(end - start) + autotune_string = "_autotune_{}".format( + "parallelism_and_buffer_sizes" + if autotune_buffers else "parallelism_only") + self.report_benchmark( - iters=10000, + iters=benchmark_iters, wall_time=np.median(deltas), - name="map" + (("_autotune_%s" % algorithm.name) if autotune else "")) + name=benchmark_label + (autotune_string if autotune else "")) return np.median(deltas) + def benchmark_map(self): + a = self._benchmark_map(autotune=False) + b = self._benchmark_map(autotune=True, autotune_buffers=False) + c = self._benchmark_map(autotune=True, autotune_buffers=True) + print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) + print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" + .format(a / c)) + + def _benchmark_map(self, autotune, autotune_buffers=False): + k = 1024 * 1024 + dataset = dataset_ops.Dataset.from_tensors( + (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat() + dataset = dataset.map( + math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) + return self._run_benchmark( + dataset, + autotune, + autotune_buffers, + benchmark_iters=10000, + benchmark_label="map") + def benchmark_map_and_batch(self): a = self._benchmark_map_and_batch(autotune=False) - b = self._benchmark_map_and_batch(autotune=True) - c = self._benchmark_map_and_batch( - autotune=True, algorithm=dataset_ops.AutotuneAlgorithm.GRADIENT_DESCENT) - print("HillClimb vs Default speedup: %f" % (a / b)) - print("GradientDescent vs Default speedup: %f" % (a / c)) + b = self._benchmark_map_and_batch(autotune=True, autotune_buffers=False) + c = self._benchmark_map_and_batch(autotune=True, autotune_buffers=True) + print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) + print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" + .format(a / c)) - def _benchmark_map_and_batch( - self, autotune, algorithm=dataset_ops.AutotuneAlgorithm.HILL_CLIMB): + def _benchmark_map_and_batch(self, autotune, autotune_buffers=False): batch_size = 16 k = 1024 * 1024 - dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k), - np.random.rand(4 * k, - 1))).repeat() + dataset = dataset_ops.Dataset.from_tensors( + (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat() dataset = dataset.map( math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) dataset = dataset.batch(batch_size=batch_size) - options = dataset_ops.Options() - options.experimental_optimization.apply_default_optimizations = False - options.experimental_optimization.map_and_batch_fusion = True - options.experimental_optimization.autotune = autotune - if autotune: - options.experimental_optimization.autotune_algorithm = algorithm.value - dataset = dataset.with_options(options) - iterator = dataset_ops.make_one_shot_iterator(dataset) - get_next = iterator.get_next() - - deltas = [] - with session.Session() as sess: - for _ in range(5): - sess.run(get_next.op) - for _ in range(1000): - start = time.time() - sess.run(get_next.op) - end = time.time() - deltas.append(end - start) - - self.report_benchmark( - 
iters=1000, - wall_time=np.median(deltas), - name="map_and_batch" + - (("_autotune_%s" % algorithm.name) if autotune else "")) - return np.median(deltas) + return self._run_benchmark( + dataset, + autotune, + autotune_buffers, + benchmark_iters=1000, + benchmark_label="map_and_batch") def benchmark_interleave(self): a = self._benchmark_interleave(autotune=False) - b = self._benchmark_interleave(autotune=True) - c = self._benchmark_interleave( - autotune=True, algorithm=dataset_ops.AutotuneAlgorithm.GRADIENT_DESCENT) - print("HillClimb vs Default speedup: %f" % (a / b)) - print("GradientDescent vs Default speedup: %f" % (a / c)) + b = self._benchmark_interleave(autotune=True, autotune_buffers=False) + c = self._benchmark_interleave(autotune=True, autotune_buffers=True) + print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) + print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" + .format(a / c)) - def _benchmark_interleave(self, - autotune, - algorithm=dataset_ops.AutotuneAlgorithm.HILL_CLIMB): + def _benchmark_interleave(self, autotune, autotune_buffers=False): k = 1024 * 1024 - dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k), - np.random.rand(4 * k, - 1))).repeat() + dataset = dataset_ops.Dataset.from_tensors( + (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))).repeat() dataset = dataset.map(math_ops.matmul) dataset = dataset_ops.Dataset.range(1).repeat().interleave( lambda _: dataset, cycle_length=10, num_parallel_calls=dataset_ops.AUTOTUNE) - options = dataset_ops.Options() - options.experimental_optimization.apply_default_optimizations = False - options.experimental_optimization.autotune = autotune - if autotune: - options.experimental_optimization.autotune_algorithm = algorithm.value - dataset = dataset.with_options(options) - iterator = dataset_ops.make_one_shot_iterator(dataset) - get_next = iterator.get_next() - - deltas = [] - with session.Session() as sess: - for _ in range(5): - sess.run(get_next.op) - for _ in range(10000): - start = time.time() - sess.run(get_next.op) - end = time.time() - deltas.append(end - start) - - self.report_benchmark( - iters=10000, - wall_time=np.median(deltas), - name="interleave" + - (("_autotune_%s" % algorithm.name) if autotune else "")) - return np.median(deltas) + return self._run_benchmark( + dataset, + autotune, + autotune_buffers, + benchmark_iters=10000, + benchmark_label="interleave") def benchmark_map_and_interleave(self): a = self._benchmark_map_and_interleave(autotune=False) - b = self._benchmark_map_and_interleave(autotune=True) - c = self._benchmark_map_and_interleave( - autotune=True, algorithm=dataset_ops.AutotuneAlgorithm.GRADIENT_DESCENT) - print("HillClimb vs Default speedup: %f" % (a / b)) - print("GradientDescent vs Default speedup: %f" % (a / c)) + b = self._benchmark_map_and_interleave( + autotune=True, autotune_buffers=False) + c = self._benchmark_map_and_interleave(autotune=True, autotune_buffers=True) + print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) + print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" + .format(a / c)) - def _benchmark_map_and_interleave( - self, autotune, algorithm=dataset_ops.AutotuneAlgorithm.HILL_CLIMB): + def _benchmark_map_and_interleave(self, autotune, autotune_buffers=False): k = 1024 * 1024 a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1)) b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1)) @@ -204,42 +172,26 @@ class AutotuneBenchmark(test.Benchmark): dataset = 
dataset_ops.Dataset.zip((dataset, dataset_c)) dataset = dataset.map(f2, num_parallel_calls=dataset_ops.AUTOTUNE) - options = dataset_ops.Options() - options.experimental_optimization.apply_default_optimizations = False - options.experimental_optimization.autotune = autotune - if autotune: - options.experimental_optimization.autotune_algorithm = algorithm.value - dataset = dataset.with_options(options) - iterator = dataset_ops.make_one_shot_iterator(dataset) - get_next = iterator.get_next() - - deltas = [] - with session.Session() as sess: - for _ in range(5): - sess.run(get_next) - for _ in range(10000): - start = time.time() - sess.run(get_next) - end = time.time() - deltas.append(end - start) - - self.report_benchmark( - iters=10000, - wall_time=np.median(deltas), - name="map_and_interleave" + - (("_autotune_%s" % algorithm.name) if autotune else "")) - return np.median(deltas) + return self._run_benchmark( + dataset, + autotune, + autotune_buffers, + benchmark_iters=10000, + benchmark_label="map_and_interleave") def benchmark_map_batch_and_interleave(self): a = self._benchmark_map_batch_and_interleave(autotune=False) - b = self._benchmark_map_batch_and_interleave(autotune=True) + b = self._benchmark_map_batch_and_interleave( + autotune=True, autotune_buffers=False) c = self._benchmark_map_batch_and_interleave( - autotune=True, algorithm=dataset_ops.AutotuneAlgorithm.GRADIENT_DESCENT) - print("HillClimb vs Default speedup: %f" % (a / b)) - print("GradientDescent vs Default speedup: %f" % (a / c)) + autotune=True, autotune_buffers=True) + print("autotune parallelism vs no autotuning speedup: {}".format(a / b)) + print("autotune parallelism and buffer sizes vs no autotuning speedup: {}" + .format(a / c)) - def _benchmark_map_batch_and_interleave( - self, autotune, algorithm=dataset_ops.AutotuneAlgorithm.HILL_CLIMB): + def _benchmark_map_batch_and_interleave(self, + autotune, + autotune_buffers=False): batch_size = 16 k = 1024 * 1024 a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1)) @@ -268,32 +220,12 @@ class AutotuneBenchmark(test.Benchmark): math_ops.matmul, num_parallel_calls=dataset_ops.AUTOTUNE) dataset_c = dataset_c.batch(batch_size=batch_size) dataset = dataset_ops.Dataset.zip((dataset, dataset_c)) - options = dataset_ops.Options() - options.experimental_optimization.apply_default_optimizations = False - options.experimental_optimization.map_and_batch_fusion = True - options.experimental_optimization.autotune = autotune - if autotune: - options.experimental_optimization.autotune_algorithm = algorithm.value - dataset = dataset.with_options(options) - iterator = dataset_ops.make_one_shot_iterator(dataset) - get_next = iterator.get_next() - - deltas = [] - with session.Session() as sess: - for _ in range(5): - sess.run(get_next) - for _ in range(1000): - start = time.time() - sess.run(get_next) - end = time.time() - deltas.append(end - start) - - self.report_benchmark( - iters=1000, - wall_time=np.median(deltas), - name="map_batch_and_interleave" + - (("_autotune_%s" % algorithm.name) if autotune else "")) - return np.median(deltas) + return self._run_benchmark( + dataset, + autotune, + autotune_buffers, + benchmark_iters=1000, + benchmark_label="map_batch_and_interleave") if __name__ == "__main__": diff --git a/tensorflow/python/data/experimental/kernel_tests/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimize_dataset_test.py index 1bd7e320466..397703e1c40 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimize_dataset_test.py +++ 
b/tensorflow/python/data/experimental/kernel_tests/optimize_dataset_test.py @@ -24,6 +24,7 @@ import numpy as np from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import grouping +from tensorflow.python.data.experimental.ops import optimization_options from tensorflow.python.data.experimental.ops import scan_ops from tensorflow.python.data.experimental.ops import testing from tensorflow.python.data.experimental.ops import threadpool @@ -215,11 +216,11 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): optimized_it = dataset_ops.make_initializable_iterator(optimized_dataset) self.assertGreaterEqual(len(w), 1) - expected = ("tf.data static optimizations are not compatible with " - "tf.Variable. The following optimizations will be disabled: %s." - " To enable optimizations, use resource variables instead by " + expected = ("tf.data graph rewrites are not compatible with " + "tf.Variable. The following rewrites will be disabled: %s." + " To enable rewrites, use resource variables instead by " "calling `tf.enable_resource_variables()` at the start of the " - "program." % (", ".join(options._static_optimizations()))) + "program." % (", ".join(options._graph_rewrites()))) self.assertTrue(any([expected in str(warning) for warning in w])) # Check that outputs are the same in the optimized and unoptimized cases, @@ -249,10 +250,10 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): "shuffle_and_repeat_fusion", ] self.assertEqual( - set(options._static_optimizations()), set(expected_optimizations)) + set(options._graph_rewrites()), set(expected_optimizations)) def testOptimizationDisableDefault(self): - """Tests that we can disable all static optimizations enabled by default. + """Tests that we can disable all graph optimizations enabled by default. If the `apply_default_optimizations` optimization options flag is False, only explicitly enabled optimizations will be applied. 
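For readers following the rename from `_static_optimizations()` to `_graph_rewrites()`, a minimal sketch of the user-facing pattern these tests exercise is shown below. It is illustrative only and not part of the patch; the concrete dataset and the printed rewrite list are assumptions made for the example.

```python
# Illustrative sketch only: opt out of the default graph rewrites, enable a
# single one explicitly, and inspect the result through the renamed internal
# helper. On builds predating this series the equivalent accessor was
# `_static_optimizations()`.
import tensorflow as tf

dataset = tf.data.Dataset.range(10).map(lambda x: x + 1)

options = tf.data.Options()
options.experimental_optimization.apply_default_optimizations = False
options.experimental_optimization.noop_elimination = True
dataset = dataset.with_options(options)

print(dataset.options()._graph_rewrites())  # e.g. ["noop_elimination"]
```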
@@ -266,7 +267,27 @@ class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): "noop_elimination", ] self.assertEqual( - set(options._static_optimizations()), set(expected_optimizations)) + set(options._graph_rewrites()), set(expected_optimizations)) + + def testAutotuningDefaults(self): + options = dataset_ops.Options() + + # Check defaults + autotune, algorithm, cpu_budget = options._autotune_settings() + self.assertTrue(autotune) + self.assertEqual(algorithm, + optimization_options._AutotuneAlgorithm.HILL_CLIMB) + self.assertEqual(cpu_budget, 0) + + def testAutotuningBufferSizes(self): + options = dataset_ops.Options() + options.experimental_optimization.autotune_buffers = True + self.assertIn("inject_prefetch", options._graph_rewrites()) + autotune, algorithm, cpu_budget = options._autotune_settings() + self.assertTrue(autotune) + self.assertEqual(algorithm, + optimization_options._AutotuneAlgorithm.GRADIENT_DESCENT) + self.assertEqual(cpu_budget, 0) if __name__ == "__main__": diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py index 5de98189322..abc9eb5f0ad 100644 --- a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py @@ -44,9 +44,9 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator( dataset, ["/cpu:1", "/cpu:2"]) dataset = multi_device_iterator._dataset # pylint: disable=protected-access - self.assertIn("slack", dataset.options()._static_optimizations()) + self.assertIn("slack", dataset.options()._graph_rewrites()) self.assertIn("slack:slack_period:2", - dataset.options()._static_optimization_configs()) + dataset.options()._graph_rewrite_configs()) config = config_pb2.ConfigProto(device_count={"CPU": 3}) with self.test_session(config=config): @@ -67,9 +67,9 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): options = dataset_ops.Options() options.experimental_slack = True dataset = dataset.with_options(options) - self.assertIn("slack", dataset.options()._static_optimizations()) + self.assertIn("slack", dataset.options()._graph_rewrites()) self.assertIn("slack:slack_period:1", - dataset.options()._static_optimization_configs()) + dataset.options()._graph_rewrite_configs()) self.assertDatasetProduces(dataset, range(10)) def testWithPassthroughDataset(self): diff --git a/tensorflow/python/data/experimental/ops/optimization_options.py b/tensorflow/python/data/experimental/ops/optimization_options.py index 57cee3d0e5f..5db4db91c17 100644 --- a/tensorflow/python/data/experimental/ops/optimization_options.py +++ b/tensorflow/python/data/experimental/ops/optimization_options.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import enum + from tensorflow.python.data.util import options from tensorflow.python.util.tf_export import tf_export @@ -24,6 +26,12 @@ from tensorflow.python.util.tf_export import tf_export _ENABLE_AUTOTUNE_BUFFERS_BY_DEFAULT = False +class _AutotuneAlgorithm(enum.Enum): + """Controls what algorithm is used in the autotune implementation.""" + HILL_CLIMB = 0 + GRADIENT_DESCENT = 1 + + @tf_export("data.experimental.MapVectorizationOptions") class MapVectorizationOptions(options.OptionsBase): """Represents options for 
the MapVectorization optimization.""" @@ -44,12 +52,14 @@ class MapVectorizationOptions(options.OptionsBase): "original segment at runtime based on their iterations speed. If None, " "defaults to False.") - def _static_optimizations(self): + def _graph_rewrites(self): if self.enabled: return ["map_vectorization"] return [] - def _static_optimization_configs(self): + def _graph_rewrite_configs(self): + if not self.enabled: + return [] if self.use_choose_fastest: return ["map_vectorization:use_choose_fastest:true"] else: @@ -76,7 +86,7 @@ class OptimizationOptions(options.OptionsBase): name="apply_default_optimizations", ty=bool, docstring= - "Whether to apply default static optimizations. If False, only static " + "Whether to apply default graph optimizations. If False, only graph " "optimizations that have been explicitly enabled will be applied.") autotune = options.create_option( @@ -86,13 +96,6 @@ class OptimizationOptions(options.OptionsBase): "Whether to automatically tune performance knobs. If None, defaults to " "True.") - autotune_algorithm = options.create_option( - name="autotune_algorithm", - ty=int, - docstring= - "When autotuning is enabled (through `autotune`), identifies the " - "algorithm to use for the autotuning optimization.") - autotune_buffers = options.create_option( name="autotune_buffers", ty=bool, @@ -183,8 +186,34 @@ class OptimizationOptions(options.OptionsBase): docstring="Whether to fuse shuffle and repeat transformations. If None, " "defaults to True.") - def _static_optimizations(self): - """Produces the list of enabled static optimizations.""" + def _autotune_buffers(self): + if self.autotune_buffers is not None: + return self.autotune_buffers + # The default setting for autotune_buffers is based on + # _ENABLE_AUTOTUNE_BUFFERS_BY_DEFAULT + return _ENABLE_AUTOTUNE_BUFFERS_BY_DEFAULT + + def _autotune_settings(self): + # Default autotune settings + autotune = True + + # If autotune_buffers is enabled, we use the GRADIENT_DESCENT algorithm by + # default, which is more performant for tuning heterogeneous parameters. + algorithm = ( + _AutotuneAlgorithm.GRADIENT_DESCENT + if self._autotune_buffers() else _AutotuneAlgorithm.HILL_CLIMB) + cpu_budget = 0 # Indicates that all CPU cores should be used by default. + + # Set these options if they are explicitly set by the user. + if self.autotune is False: # pylint: disable=g-bool-id-comparison + autotune = False + if self.autotune_cpu_budget is not None: + cpu_budget = self.autotune_cpu_budget + + return autotune, algorithm, cpu_budget + + def _graph_rewrites(self): + """Produces the list of enabled graph optimizations.""" result = set() all_optimizations = [ "filter_fusion", @@ -215,17 +244,19 @@ class OptimizationOptions(options.OptionsBase): result.add(optimization) if self.map_vectorization is not None: - result.update(self.map_vectorization._static_optimizations()) # pylint: disable=protected-access + result.update(self.map_vectorization._graph_rewrites()) # pylint: disable=protected-access - # The default setting for autotune_buffers is based on - # _ENABLE_AUTOTUNE_BUFFERS_BY_DEFAULT - autotune_buffers = self.autotune_buffers or ( - self.autotune_buffers is None and _ENABLE_AUTOTUNE_BUFFERS_BY_DEFAULT) + autotune_buffers = self._autotune_buffers() if self.autotune is not False and autotune_buffers: # pylint: disable=g-bool-id-comparison + # When autotuning buffer sizes is enabled, we inject a `prefetch` + # transformation after asynchronous dataset ops. 
Only the buffer sizes of + # prefetch transformations will be autotuned, though this is practically + # equivalent to tuning the buffer sizes of the other asynchronous + # transformations. result.add("inject_prefetch") return sorted(list(result)) - def _static_optimization_configs(self): + def _graph_rewrite_configs(self): if self.map_vectorization is not None: - return self.map_vectorization._static_optimization_configs() # pylint: disable=protected-access + return self.map_vectorization._graph_rewrite_configs() # pylint: disable=protected-access return [] diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py index f3367023a7b..06bdfd03eb8 100644 --- a/tensorflow/python/data/ops/dataset_ops.py +++ b/tensorflow/python/data/ops/dataset_ops.py @@ -29,7 +29,6 @@ import numpy as np import six from six.moves import queue as Queue # pylint: disable=redefined-builtin - from tensorflow.core.framework import graph_pb2 from tensorflow.python import tf2 from tensorflow.python.compat import compat @@ -90,17 +89,11 @@ autograph = lazy_loader.LazyLoader( ops.NotDifferentiable("ReduceDataset") - # A constant that can be used to enable auto-tuning. AUTOTUNE = -1 tf_export("data.experimental.AUTOTUNE").export_constant(__name__, "AUTOTUNE") -class AutotuneAlgorithm(enum.Enum): - HILL_CLIMB = 0 - GRADIENT_DESCENT = 1 - - class ExternalStatePolicy(enum.Enum): WARN = 0 IGNORE = 1 @@ -227,9 +220,9 @@ class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor): def. In that case, the state in these ops would be thrown away. strip_device_assignment: If true, non-local (i.e. job and task) device assignment is stripped from ops in the serialized graph. - external_state_policy: The ExternalStatePolicy enum that determines how - we handle input pipelines that depend on external state. By default, - its set to WARN. + external_state_policy: The ExternalStatePolicy enum that determines how we + handle input pipelines that depend on external state. By default, its + set to WARN. Returns: A scalar `tf.Tensor` of `tf.string` type, representing this dataset as a @@ -355,6 +348,8 @@ class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor): dataset = self options = self.options() + + # (1) Apply threading options if options.experimental_threading is not None: t_options = options.experimental_threading if t_options.max_intra_op_parallelism is not None: @@ -363,36 +358,31 @@ class DatasetV2(tracking_base.Trackable, composite_tensor.CompositeTensor): if t_options.private_threadpool_size is not None: dataset = _PrivateThreadPoolDataset(dataset, t_options.private_threadpool_size) + + # (2) Apply graph rewrite options # pylint: disable=protected-access - static_optimizations = options._static_optimizations() - static_optimization_configs = options._static_optimization_configs() + graph_rewrites = options._graph_rewrites() + graph_rewrite_configs = options._graph_rewrite_configs() # pylint: enable=protected-access - if static_optimizations: + if graph_rewrites: if self._has_captured_ref(): warnings.warn( - "tf.data static optimizations are not compatible with tf.Variable. " - "The following optimizations will be disabled: %s. To enable " - "optimizations, use resource variables instead by calling " + "tf.data graph rewrites are not compatible with tf.Variable. " + "The following rewrites will be disabled: %s. To enable " + "rewrites, use resource variables instead by calling " "`tf.enable_resource_variables()` at the start of the program." 
% - ", ".join(static_optimizations)) + ", ".join(graph_rewrites)) else: - dataset = _OptimizeDataset(dataset, static_optimizations, - static_optimization_configs) + dataset = _OptimizeDataset(dataset, graph_rewrites, + graph_rewrite_configs) - autotune = True - algorithm = AutotuneAlgorithm.HILL_CLIMB - cpu_budget = 0 # Indicates that all CPU cores should be used. - if options.experimental_optimization is not None: - if options.experimental_optimization.autotune is False: # pylint: disable=g-bool-id-comparison - autotune = False - if options.experimental_optimization.autotune_algorithm is not None: - algorithm = options.experimental_optimization.autotune_algorithm - if options.experimental_optimization.autotune_cpu_budget is not None: - cpu_budget = options.experimental_optimization.autotune_cpu_budget + # (3) Apply autotune options + autotune, algorithm, cpu_budget = options._autotune_settings() # pylint: disable=protected-access if autotune: dataset = _ModelDataset(dataset, algorithm, cpu_budget) + # (4) Apply stats aggregator options if options.experimental_stats and options.experimental_stats.aggregator: # pylint: disable=line-too-long dataset = _SetStatsAggregatorDataset( # pylint: disable=protected-access dataset, options.experimental_stats.aggregator, @@ -2600,7 +2590,7 @@ def get_legacy_output_types(dataset_or_iterator): class Options(options_lib.OptionsBase): """Represents options for tf.data.Dataset. - An `Options` object can be, for instance, used to control which static + An `Options` object can be, for instance, used to control which graph optimizations to apply or whether to use performance modeling to dynamically tune the parallelism of operations such as `tf.data.Dataset.map` or `tf.data.Dataset.interleave`. @@ -2675,11 +2665,15 @@ class Options(options_lib.OptionsBase): "might be thrown away; FAIL: We fail if any state is being captured.", default_factory=lambda: ExternalStatePolicy.WARN) - def _static_optimizations(self): - """Produces the list of enabled static optimizations.""" - + def _graph_rewrites(self): + """Produces the list of enabled static graph rewrites.""" result = [] - result.extend(self.experimental_optimization._static_optimizations()) # pylint: disable=protected-access + if self.experimental_optimization is not None: + result.extend(self.experimental_optimization._graph_rewrites()) # pylint: disable=protected-access + else: + # Apply default options + result.extend( + optimization_options.OptimizationOptions()._graph_rewrites()) # pylint: disable=protected-access if self.experimental_deterministic is False: result.append("make_sloppy") @@ -2692,12 +2686,11 @@ class Options(options_lib.OptionsBase): result.append("make_stateless") return result - def _static_optimization_configs(self): - """Produces the list of configurations for enabled static optimizations.""" + def _graph_rewrite_configs(self): + """Produces the list of configurations for enabled graph optimizations.""" result = [] if self.experimental_optimization: - result.extend( - self.experimental_optimization._static_optimization_configs()) # pylint: disable=protected-access + result.extend(self.experimental_optimization._graph_rewrite_configs()) # pylint: disable=protected-access if self.experimental_slack: num_devices = self.experimental_distribute.num_devices @@ -2706,6 +2699,13 @@ class Options(options_lib.OptionsBase): result.append("slack:slack_period:%d" % num_devices) return result + def _autotune_settings(self): + if self.experimental_optimization is not None: + return 
self.experimental_optimization._autotune_settings() # pylint: disable=protected-access + + # Return default autotune options + return optimization_options.OptimizationOptions()._autotune_settings() # pylint: disable=protected-access + def merge(self, options): """Merges itself with the given `tf.data.Options`. @@ -4177,20 +4177,11 @@ class _ModelDataset(UnaryUnchangedStructureDataset): def __init__(self, input_dataset, algorithm, cpu_budget): self._input_dataset = input_dataset - # TODO(jsimsa): This check is introduced for forward compatibility and can - # be removed after 7/24/2019. At that point, all servers are expected to - # recognize the `algorithm` attribute. - if algorithm != AutotuneAlgorithm.HILL_CLIMB: - variant_tensor = gen_dataset_ops.model_dataset( - input_dataset._variant_tensor, # pylint: disable=protected-access - algorithm=algorithm, - cpu_budget=cpu_budget, - **self._flat_structure) - else: - variant_tensor = gen_dataset_ops.model_dataset( - input_dataset._variant_tensor, # pylint: disable=protected-access - cpu_budget=cpu_budget, - **self._flat_structure) + variant_tensor = gen_dataset_ops.model_dataset( + input_dataset._variant_tensor, # pylint: disable=protected-access + algorithm=algorithm.value, + cpu_budget=cpu_budget, + **self._flat_structure) super(_ModelDataset, self).__init__(input_dataset, variant_tensor) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt index f7301ff180c..a79d205cf0b 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt @@ -11,10 +11,6 @@ tf_class { name: "autotune" mtype: "" } - member { - name: "autotune_algorithm" - mtype: "" - } member { name: "autotune_buffers" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt index f7301ff180c..a79d205cf0b 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt @@ -11,10 +11,6 @@ tf_class { name: "autotune" mtype: "" } - member { - name: "autotune_algorithm" - mtype: "" - } member { name: "autotune_buffers" mtype: "" From a0931eb1d37189861537198ddea8ba7578fdb145 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 17:03:22 -0800 Subject: [PATCH 107/383] Add a converter from tf_stats.proto to GViz DataTable format. PiperOrigin-RevId: 283869235 Change-Id: If983eb4e963e73274f0104bb76172379d3835fb6 --- .../python/profiler/tf_stats_proto_to_gviz.py | 93 ++++++++++++ .../profiler/tf_stats_proto_to_gviz_test.py | 133 ++++++++++++++++++ 2 files changed, 226 insertions(+) create mode 100644 tensorflow/python/profiler/tf_stats_proto_to_gviz.py create mode 100644 tensorflow/python/profiler/tf_stats_proto_to_gviz_test.py diff --git a/tensorflow/python/profiler/tf_stats_proto_to_gviz.py b/tensorflow/python/profiler/tf_stats_proto_to_gviz.py new file mode 100644 index 00000000000..0c4718912ca --- /dev/null +++ b/tensorflow/python/profiler/tf_stats_proto_to_gviz.py @@ -0,0 +1,93 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Contains utilities for conversion of TF proto types to GViz types. + +Usage: + gviz_data_table = generate_chart_table(stats_table) +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import google_type_annotations +from __future__ import print_function + +import gviz_api + + +def get_chart_table_args(stats_table): + """Creates gviz DataTable object from a a TensorFlow stats table. + + Args: + stats_table: A tf_stats_pb2.TfStatsTable. + + Returns: + Returns a gviz_api.DataTable + """ + + ## Create schema + table_description = [ + ("rank", "number", "Rank"), + ("host_or_device", "string", "Host/device"), + ("type", "string", "Type"), + ("operation", "string", "Operation"), + ("occurrences", "number", "#Occurrences"), + ("total_time", "number", "Total time (us)"), + ("avg_time", "number", "Avg. time (us)"), + ("total_self_time", "number", "Total self-time (us)"), + ("avg_self_time", "number", "Avg. self-time (us)"), + ("device_total_self_time_percent", "number", + "Total self-time on Device (%)"), + ("device_cumulative_total_self_time_percent", "number", + "Cumulative total-self time on Device (%)"), + ("host_total_self_time_percent", "number", "Total self-time on Host (%)"), + ("Host_cumulative_total_self_time_percent", "number", + "Cumulative total-self time on Host (%)"), + ("measured_flop_rate", "number", "Measured GFLOPs/Sec"), + ("measured_memory_bw", "number", "Measured Memory BW (GBytes/Sec)"), + ("operational_intensity", "number", "Operational Intensity (FLOPs/Byte)"), + ("bound_by", "string", "Bound by"), + ] + + data = [] + for record in stats_table.tf_stats_record: + row = [ + record.rank, + record.host_or_device, + record.op_type, + record.op_name, + record.occurrences, + record.total_time_in_us, + record.avg_time_in_us, + record.total_self_time_in_us, + record.avg_self_time_in_us, + record.device_total_self_time_as_fraction, + record.device_cumulative_total_self_time_as_fraction, + record.host_total_self_time_as_fraction, + record.host_cumulative_total_self_time_as_fraction, + record.measured_flop_rate, + record.measured_memory_bw, + record.operational_intensity, + record.bound_by, + ] + + data.append(row) + + return (table_description, data, []) + + +def generate_chart_table(stats_table): + (table_description, data, + custom_properties) = get_chart_table_args(stats_table) + return gviz_api.DataTable(table_description, data, custom_properties) diff --git a/tensorflow/python/profiler/tf_stats_proto_to_gviz_test.py b/tensorflow/python/profiler/tf_stats_proto_to_gviz_test.py new file mode 100644 index 00000000000..ab16867cc1c --- /dev/null +++ b/tensorflow/python/profiler/tf_stats_proto_to_gviz_test.py @@ -0,0 +1,133 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# Lint as: python3 +"""Tests for tf_stats_proto_to_gviz.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import csv +import io + +import gviz_api + +# pylint: disable=g-direct-tensorflow-import +from tensorflow.core.profiler.protobuf import tf_stats_pb2 +from tensorflow.python.platform import test +from tensorflow.python.profiler import tf_stats_proto_to_gviz +# pylint: enable=g-direct-tensorflow-import + + +class ProtoToGvizTest(test.TestCase): + + @staticmethod + def create_empty_stats_table(): + table = tf_stats_pb2.TfStatsTable() + + return table + + @staticmethod + def create_mock_stats_table(): + table = tf_stats_pb2.TfStatsTable() + + record = table.tf_stats_record.add() + record.rank = 100 + record.host_or_device = "Device" + record.op_type = "Compute" + record.op_name = "Compute0" + record.occurrences = 1 + record.total_time_in_us = 0.1799 + record.avg_time_in_us = 0.1799 + record.total_self_time_in_us = 0.1799 + record.avg_self_time_in_us = 0.1799 + record.device_total_self_time_as_fraction = 0.2020 + record.device_cumulative_total_self_time_as_fraction = 0.7980 + record.host_total_self_time_as_fraction = 0 + record.host_cumulative_total_self_time_as_fraction = 0 + record.measured_flop_rate = 1.6666 + record.measured_memory_bw = 2.7777 + record.operational_intensity = 0.6000 + record.bound_by = "Memory" + + record = table.tf_stats_record.add() + record.rank = 200 + record.host_or_device = "Host" + record.op_type = "Loop" + record.op_name = "while" + record.occurrences = 2 + record.total_time_in_us = 0.3 + record.avg_time_in_us = 0.5 + record.total_self_time_in_us = 0.7 + record.avg_self_time_in_us = 0.11 + record.device_total_self_time_as_fraction = 0.13 + record.device_cumulative_total_self_time_as_fraction = 0.17 + record.host_total_self_time_as_fraction = 0.19 + record.host_cumulative_total_self_time_as_fraction = 0.23 + record.measured_flop_rate = 2.9 + record.measured_memory_bw = 3.1 + record.operational_intensity = 0.37 + record.bound_by = "Compute" + + return table + + def test_stats_table_empty(self): + stats_table = ProtoToGvizTest.create_empty_stats_table() + data_table = tf_stats_proto_to_gviz.generate_chart_table(stats_table) + + self.assertEqual(0, data_table.NumberOfRows(), + "Empty table should have 0 rows.") + # "Stats table has 17 columns as defined in tf_stats.proto." + self.assertLen(data_table.columns, 17) + + def test_stats_table_simple(self): + stats_table = ProtoToGvizTest.create_mock_stats_table() + (table_description, data, custom_properties + ) = tf_stats_proto_to_gviz.get_chart_table_args(stats_table) + data_table = gviz_api.DataTable(table_description, data, custom_properties) + + # Data is a list of 2 rows. + self.assertLen(data, 2) + self.assertEqual(2, data_table.NumberOfRows(), "Simple table has 2 rows.") + # Table descriptor is a list of 17 columns. 
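As a usage sketch for the converter introduced above (not part of the patch itself), the typical call pattern is shown below; the record field values are arbitrary illustrative numbers, and `ToCsv()` is the same `gviz_api.DataTable` method the test exercises.

```python
# Illustrative sketch only: build a small TfStatsTable proto and convert it
# into a GViz DataTable for a Google Charts based profiler front end.
from tensorflow.core.profiler.protobuf import tf_stats_pb2
from tensorflow.python.profiler import tf_stats_proto_to_gviz

stats_table = tf_stats_pb2.TfStatsTable()
record = stats_table.tf_stats_record.add()
record.rank = 1
record.host_or_device = "Device"
record.op_type = "Conv2D"
record.op_name = "model/conv1/Conv2D"
record.occurrences = 3
record.total_time_in_us = 12.5

data_table = tf_stats_proto_to_gviz.generate_chart_table(stats_table)
print(data_table.NumberOfRows())  # 1
print(data_table.ToCsv())         # CSV with the column headers defined above
```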
+ self.assertLen(table_description, 17) + # Stats table has 17 columns as defined in tf_stats.proto. + self.assertLen(data_table.columns, 17) + + csv_file = io.StringIO(data_table.ToCsv()) + reader = csv.reader(csv_file) + + for (rr, row_values) in enumerate(reader): + if rr == 0: + for (cc, column_header) in enumerate(row_values): + self.assertEqual(table_description[cc][2], column_header) + else: + for (cc, cell_str) in enumerate(row_values): + raw_value = data[rr - 1][cc] + value_type = table_description[cc][1] + + # Only number and strings are used in our (tf_stats) proto. + self.assertIn(value_type, ["number", "string"]) + + # Encode in similar fashion as DataTable.ToCsv() + expected_value = gviz_api.DataTable.CoerceValue(raw_value, value_type) + self.assertNotIsInstance(expected_value, tuple) + self.assertEqual(expected_value, raw_value) + self.assertEqual(str(expected_value), cell_str) + + +if __name__ == "__main__": + test.main() From 4660bb7bfcefeacf785e49933cd6480fff6f20ea Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 17:06:01 -0800 Subject: [PATCH 108/383] Add a testable example to tf.math.rsqrt PiperOrigin-RevId: 283869732 Change-Id: I87ee6ddc4ceda907dd43f0d00d86297312a817d2 --- tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt b/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt index 3cfbfc1106e..a558be92be1 100644 --- a/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Rsqrt.pbtxt @@ -7,4 +7,13 @@ op { name: "rsqrt" deprecation_version: 2 } + description: <>> x = tf.constant([2., 0., -2.]) +>>> tf.math.rsqrt(x) + + +END } From 84ae116f8a88b365c2cbd81c715e4312f5119de1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 17:07:00 -0800 Subject: [PATCH 109/383] Add a testable example to tf.math.square PiperOrigin-RevId: 283869860 Change-Id: I7e014b8bc41249a56a6a1df8fe5e6b32cf0003ee --- tensorflow/core/api_def/python_api/api_def_Square.pbtxt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt index 0bd2f1bf41b..5e57a335686 100644 --- a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt @@ -6,4 +6,11 @@ op { endpoint { name: "square" } + description: <>> tf.math.square([-2., 0., 3.]) + + +END } From 081e50a7b118542e220a84ed2ce98fdbfd8ac50b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 17:08:26 -0800 Subject: [PATCH 110/383] Add a testable example to tf.math.log1p PiperOrigin-RevId: 283870059 Change-Id: Ic8b0db8bf14e8f1fe43b6d32dac1b3e7deb768bb --- tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt b/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt index e3da451de3f..3950c25169b 100644 --- a/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt @@ -7,4 +7,13 @@ op { name: "log1p" deprecation_version: 2 } + description: <>> x = tf.constant([0, 0.5, 1, 5]) +>>> tf.math.log1p(x) + + +END } From c06d44e1578c29414cd6b5baa78a556c8df4350c Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 4 Dec 2019 17:09:46 -0800 Subject: [PATCH 111/383] Add a testable example to tf.math.minimum PiperOrigin-RevId: 283870282 Change-Id: Ia95032573fbc71b380e1b5b315c7fda318d1d3e2 --- tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt b/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt index 8aded1f154d..e7f90893fce 100644 --- a/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Minimum.pbtxt @@ -6,4 +6,12 @@ op { endpoint { name: "minimum" } + description: <>> x = tf.constant([0., 0., 0., 0.]) +>>> y = tf.constant([-5., -2., 0., 3.]) +>>> tf.math.minimum(x, y) + + +END } From fb25c3bd6a5e7d9275dd73f9abe78d1042dce6b8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 17:27:45 -0800 Subject: [PATCH 112/383] Fix minor warnings. When compiling with -Wall/-Werror, several warnings related to signed/unsigned comparison and an incorrect format string kill the build. Additionally, when compiling under GCC 4.8.x, `max_align_t` is not a member of `std`. This change fixes these minor errors. PiperOrigin-RevId: 283872988 Change-Id: I0f278f554d62dd10c7430ae747ad7678869f726b --- .../lite/experimental/micro/micro_allocator.cc | 17 +++++++++++++++-- .../experimental/micro/micro_interpreter.cc | 4 ++-- .../micro/micro_optional_debug_tools.cc | 9 ++++++++- .../lite/experimental/micro/test_helpers.cc | 2 +- 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/experimental/micro/micro_allocator.cc b/tensorflow/lite/experimental/micro/micro_allocator.cc index 82b3b350c23..48a0901c7ce 100644 --- a/tensorflow/lite/experimental/micro/micro_allocator.cc +++ b/tensorflow/lite/experimental/micro/micro_allocator.cc @@ -42,6 +42,19 @@ struct TensorInfo { // requirement for SIMD extensions. constexpr int kBufferAlignment = 16; +// If building with GCC 4.8.x or lower, `max_align_t` is not a member of `std`. +// If using a newer version of GCC, we import `max_align_t` into the local +// anonymous namespace to be able to use it like the global `max_align_t` from +// the older clib. +#ifdef __GNUC__ +#if __GNUC_PREREQ(4, 9) +using std::max_align_t; +#endif +#else +// We assume other compilers don't have this issue. +using std::max_align_t; +#endif + class MicroBuiltinDataAllocator : public BuiltinDataAllocator { public: explicit MicroBuiltinDataAllocator(SimpleMemoryAllocator* memory_allocator) @@ -51,7 +64,7 @@ class MicroBuiltinDataAllocator : public BuiltinDataAllocator { // Align to an address that is proper for all primitive types, but no more // than the size. return memory_allocator_->AllocateFromTail( - size, std::min(size, alignof(std::max_align_t))); + size, std::min(size, alignof(max_align_t))); } void Deallocate(void* data) override { // Do not deallocate, builtin data needs to be available for the life time @@ -412,7 +425,7 @@ TfLiteStatus MicroAllocator::InitializeRuntimeTensor( // If we've found a buffer, does it have any data? if (auto* array = buffer->data()) { // If it has any data, is the data size larger than zero? - if (size_t array_size = array->size()) { + if (array->size()) { // We've found a buffer with valid data, so update the runtime tensor // data structure to point to it. 
result->data.raw = diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc index 0c7b58aaece..5cc545f1460 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.cc +++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc @@ -21,7 +21,7 @@ limitations under the License. namespace tflite { namespace { -const int kStackDataAllocatorSize = 128; +const size_t kStackDataAllocatorSize = 128; class StackDataAllocator : public BuiltinDataAllocator { public: void* Allocate(size_t size) override { @@ -92,7 +92,7 @@ MicroInterpreter::MicroInterpreter(const Model* model, // NOTE: This requires that the flatbuffer is held in memory which can be // modified by this process. if (!FLATBUFFERS_LITTLEENDIAN) { - for (int t = 0; t < tensors_size(); ++t) { + for (size_t t = 0; t < tensors_size(); ++t) { TfLiteTensor* thisTensor = &context_.tensors[t]; if (thisTensor->allocation_type == kTfLiteMmapRo) CorrectTensorEndianness(thisTensor); diff --git a/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc b/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc index e27317a5443..1f6ce531f05 100644 --- a/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc @@ -14,6 +14,13 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/experimental/micro/micro_optional_debug_tools.h" +// `cinttypes` requires `__STDC_FORMAT_MACROS` to be defined to expose `PRId32`. +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include + #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { @@ -122,7 +129,7 @@ void PrintInterpreterState(MicroInterpreter* interpreter) { printf("Node %3zu Operator Custom Name %s\n", node_index, reg->custom_name); } else { - printf("Node %3zu Operator Builtin Code %3d %s\n", node_index, + printf("Node %3zu Operator Builtin Code %3" PRId32 " %s\n", node_index, reg->builtin_code, EnumNamesBuiltinOperator()[reg->builtin_code]); } printf(" Inputs:"); diff --git a/tensorflow/lite/experimental/micro/test_helpers.cc b/tensorflow/lite/experimental/micro/test_helpers.cc index 03e1d91fce0..a1b9801ffc9 100644 --- a/tensorflow/lite/experimental/micro/test_helpers.cc +++ b/tensorflow/lite/experimental/micro/test_helpers.cc @@ -47,7 +47,7 @@ class StackAllocator : public flatbuffers::Allocator { return *inst; } - static constexpr int kStackAllocatorSize = 4096; + static constexpr size_t kStackAllocatorSize = 4096; private: uint8_t data_backing_[kStackAllocatorSize]; From bb62f32e835441a9d9c76ee1ccdcf9ce5c717fe4 Mon Sep 17 00:00:00 2001 From: Jian Li Date: Wed, 4 Dec 2019 17:28:41 -0800 Subject: [PATCH 113/383] Extend support of cell clip up to 64. 
PiperOrigin-RevId: 283873117 Change-Id: I501710914b15dc35f144827e53b1621c77126c5a --- .../internal/optimized/neon_tensor_utils.cc | 32 +++++++----- .../internal/optimized/neon_tensor_utils.h | 16 ++---- .../optimized/neon_tensor_utils_impl.h | 10 +--- .../internal/optimized/sse_tensor_utils.h | 16 ++---- .../reference/portable_tensor_utils.cc | 49 +++++++++---------- .../reference/portable_tensor_utils.h | 16 ++---- .../reference/portable_tensor_utils_impl.h | 10 +--- .../lite/kernels/internal/tensor_utils.h | 28 ++--------- .../kernels/internal/tensor_utils_test.cc | 6 +-- tensorflow/lite/kernels/lstm.cc | 3 +- tensorflow/lite/kernels/lstm_eval.cc | 10 ++-- 11 files changed, 66 insertions(+), 130 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc index e5a71c7243d..9622f30d2ea 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.cc @@ -1504,19 +1504,25 @@ void NeonApplyTanhImpl(const int16_t* input, int32_t n_batch, int32_t n_input, } } -void NeonApplyTanh0(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - NeonApplyTanhImpl<0>(input, n_batch, n_input, output); -} - -void NeonApplyTanh3(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - NeonApplyTanhImpl<3>(input, n_batch, n_input, output); -} - -void NeonApplyTanh4(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - NeonApplyTanhImpl<4>(input, n_batch, n_input, output); +void NeonApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output) { + assert(integer_bits <= 6); +#define DISPATCH_TANH(i) \ + case i: \ + NeonApplyTanhImpl(input, n_batch, n_input, output); \ + break; + switch (integer_bits) { + DISPATCH_TANH(0); + DISPATCH_TANH(1); + DISPATCH_TANH(2); + DISPATCH_TANH(3); + DISPATCH_TANH(4); + DISPATCH_TANH(5); + DISPATCH_TANH(6); + default: + return; + } +#undef DISPATCH_TANH } void NeonCwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h index 4c7ef2cf3fe..cbb2cab36ac 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils.h @@ -110,19 +110,9 @@ void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, NEON_OR_PORTABLE(ApplySigmoid, input, n_batch, n_input, output); } -void ApplyTanh0(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - NEON_OR_PORTABLE(ApplyTanh0, input, n_batch, n_input, output); -} - -void ApplyTanh3(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - NEON_OR_PORTABLE(ApplyTanh3, input, n_batch, n_input, output); -} - -void ApplyTanh4(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - NEON_OR_PORTABLE(ApplyTanh4, input, n_batch, n_input, output); +void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output) { + NEON_OR_PORTABLE(ApplyTanh, integer_bits, input, n_batch, n_input, output); } void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, diff --git a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h 
b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h index 5b189f761b6..ec98185a7ba 100644 --- a/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/optimized/neon_tensor_utils_impl.h @@ -57,14 +57,8 @@ void NeonApplyLayerNorm(const int16_t* input, const int16_t* layer_norm_weights, void NeonApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, int16_t* output); -void NeonApplyTanh0(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output); - -void NeonApplyTanh3(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output); - -void NeonApplyTanh4(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output); +void NeonApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output); void NeonCwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, int n_input, int shift, int16_t* output); diff --git a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h index 7b08823e1ac..0127645539c 100644 --- a/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/optimized/sse_tensor_utils.h @@ -120,19 +120,9 @@ void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, PortableApplySigmoid(input, n_batch, n_input, output); } -void ApplyTanh0(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - PortableApplyTanh0(input, n_batch, n_input, output); -} - -void ApplyTanh3(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - PortableApplyTanh3(input, n_batch, n_input, output); -} - -void ApplyTanh4(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - PortableApplyTanh4(input, n_batch, n_input, output); +void ApplyTanh(int32_t intger_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output) { + PortableApplyTanh(intger_bits, input, n_batch, n_input, output); } void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc index dcf0df8ebed..1b36144cdff 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.cc @@ -394,22 +394,10 @@ void PortableApplySigmoid(const int16_t* input, int32_t n_batch, } } -void PortableApplyTanh0(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - using F0 = gemmlowp::FixedPoint; - for (int batch = 0; batch < n_batch; ++batch) { - for (int i = 0; i < n_input; ++i) { - const int index = batch * n_input + i; - F0 tanh_input = F0::FromRaw(input[index]); - F0 tanh_output = gemmlowp::tanh(tanh_input); - output[index] = tanh_output.raw(); - } - } -} - -void PortableApplyTanh3(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - using FX = gemmlowp::FixedPoint; +template +void PortableApplyTanhImpl(const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output) { + using FX = gemmlowp::FixedPoint; using F0 = gemmlowp::FixedPoint; for (int batch = 0; batch < n_batch; ++batch) { for (int i = 0; i < n_input; ++i) { @@ -421,18 +409,25 @@ void PortableApplyTanh3(const int16_t* input, int32_t n_batch, int32_t n_input, } } -void PortableApplyTanh4(const int16_t* 
input, int32_t n_batch, int32_t n_input, - int16_t* output) { - using FX = gemmlowp::FixedPoint; - using F0 = gemmlowp::FixedPoint; - for (int batch = 0; batch < n_batch; ++batch) { - for (int i = 0; i < n_input; ++i) { - const int index = batch * n_input + i; - FX tanh_input = FX::FromRaw(input[index]); - F0 tanh_output = gemmlowp::tanh(tanh_input); - output[index] = tanh_output.raw(); - } +void PortableApplyTanh(int32_t integer_bits, const int16_t* input, + int32_t n_batch, int32_t n_input, int16_t* output) { + assert(integer_bits <= 6); +#define DISPATCH_TANH(i) \ + case i: \ + PortableApplyTanhImpl(input, n_batch, n_input, output); \ + break; + switch (integer_bits) { + DISPATCH_TANH(0); + DISPATCH_TANH(1); + DISPATCH_TANH(2); + DISPATCH_TANH(3); + DISPATCH_TANH(4); + DISPATCH_TANH(5); + DISPATCH_TANH(6); + default: + return; } +#undef DISPATCH_TANH } void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h index 068fe3a8593..f3f41f704e3 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils.h @@ -144,19 +144,9 @@ void ApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, PortableApplySigmoid(input, n_batch, n_input, output); } -void ApplyTanh0(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - PortableApplyTanh0(input, n_batch, n_input, output); -} - -void ApplyTanh3(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - PortableApplyTanh3(input, n_batch, n_input, output); -} - -void ApplyTanh4(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output) { - PortableApplyTanh4(input, n_batch, n_input, output); +void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output) { + PortableApplyTanh(integer_bits, input, n_batch, n_input, output); } void CwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, diff --git a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h index fe06f582320..0398edfa181 100644 --- a/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h +++ b/tensorflow/lite/kernels/internal/reference/portable_tensor_utils_impl.h @@ -128,14 +128,8 @@ void PortableApplyLayerNorm(const int16_t* input, void PortableApplySigmoid(const int16_t* input, int32_t n_batch, int32_t n_input, int16_t* output); -void PortableApplyTanh0(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output); - -void PortableApplyTanh3(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output); - -void PortableApplyTanh4(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output); +void PortableApplyTanh(int32_t integer_bits, const int16_t* input, + int32_t n_batch, int32_t n_input, int16_t* output); void PortableCwiseMul(const int16_t* input_1, const int16_t* input_2, int n_batch, int n_input, int shift, int16_t* output); diff --git a/tensorflow/lite/kernels/internal/tensor_utils.h b/tensorflow/lite/kernels/internal/tensor_utils.h index b62cc8b089c..76162e3d976 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils.h +++ b/tensorflow/lite/kernels/internal/tensor_utils.h @@ -233,33 +233,15 @@ void ApplySigmoid(const int16_t* input, int32_t n_batch, 
int32_t n_input, // Apply Tanh to a quantized vector. // Parameters: +// - integer_bits: the integer bits of the input. +// Currently supports 0, 1, 2, 3, 4, 5, 6. // - input: batch vector of size n_batch * n_input; 16 bit. // - n_batch: the number of batches. // - n_input: the size for input and output. // - output: the 16 bit output -// The input is in Q0.15 format and the output is in Q0.15 format. -void ApplyTanh0(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output); - -// Apply Tanh to a quantized vector. -// Parameters: -// - input: batch vector of size n_batch * n_input; 16 bit. -// - n_batch: the number of batches. -// - n_input: the size for input and output. -// - output: the 16 bit output -// The input is in Q3.12 format and the output is in Q0.15 format. -void ApplyTanh3(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output); - -// Apply Tanh to a quantized vector. -// Parameters: -// - input: batch vector of size n_batch * n_input; 16 bit. -// - n_batch: the number of batches. -// - n_input: the size for input and output. -// - output: the 16 bit output -// The input is in Q4.11 format and the output is in Q0.15 format. -void ApplyTanh4(const int16_t* input, int32_t n_batch, int32_t n_input, - int16_t* output); +// The input is in Qm.15-m format and the output is in Q0.15 format. +void ApplyTanh(int32_t integer_bits, const int16_t* input, int32_t n_batch, + int32_t n_input, int16_t* output); // Element-wise multiplication of two quantized vectors. // Parameters: diff --git a/tensorflow/lite/kernels/internal/tensor_utils_test.cc b/tensorflow/lite/kernels/internal/tensor_utils_test.cc index cf31bf046b2..e6b76ee19a9 100644 --- a/tensorflow/lite/kernels/internal/tensor_utils_test.cc +++ b/tensorflow/lite/kernels/internal/tensor_utils_test.cc @@ -524,7 +524,7 @@ TEST(uKernels, QuantTanh0Test) { 653, -29, -53, 1058, -52, -164, -149, -635, 201, -1297, }; std::vector output(4 * 15, 0); - ApplyTanh0(input.data(), 4, 15, output.data()); + ApplyTanh(0, input.data(), 4, 15, output.data()); const std::vector expected_output = { -136, 904, -176, -40, 260, 292, 8, 28, -44, -1304, -120, 120, -24, 112, 376, -576, -308, 88, -544, 544, @@ -547,7 +547,7 @@ TEST(uKernels, QuantTanh3Test) { 653, -29, -53, 1058, -52, -164, -149, -635, 201, -1297, }; std::vector output(4 * 15, 0); - ApplyTanh3(input.data(), 4, 15, output.data()); + ApplyTanh(3, input.data(), 4, 15, output.data()); const std::vector expected_output = { -1156, 7076, -1412, -276, 2104, 2308, 64, 220, -288, -10132, -964, 1016, -120, 844, 2944, -4640, -2392, 736, -4352, 4352, @@ -568,7 +568,7 @@ TEST(uKernels, QuantTanh4Test) { -26, -36, 9, -73, 25, 14, -2, -1, 29, -10, -12, -18, -29, 51, -92, }; std::vector output(4 * 15, 0); - ApplyTanh4(input.data(), 4, 15, output.data()); + ApplyTanh(4, input.data(), 4, 15, output.data()); const std::vector expected_output = { -76, 2596, -496, -76, 856, 1436, 24, 36, -64, -672, -120, 456, 0, 752, 2400, -412, -576, 148, -1168, 400, diff --git a/tensorflow/lite/kernels/lstm.cc b/tensorflow/lite/kernels/lstm.cc index 0072faba358..678864cec60 100644 --- a/tensorflow/lite/kernels/lstm.cc +++ b/tensorflow/lite/kernels/lstm.cc @@ -289,8 +289,7 @@ TfLiteStatus PopulateQuantizedLstmParams( &context->tensors[op_data->cell_state_tensor_index]; TF_LITE_ENSURE(context, CheckedLog2(cell_state->params.scale, &cell_scale)); - // TODO(jianlijianli): remove this check once kernel has better tanh support. 
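The arithmetic behind the relaxed `cell_scale <= -9` check just below, and behind the patch title "Extend support of cell clip up to 64", can be sketched as follows; this is illustrative reasoning added for the reader, not code from the patch.

```python
# Illustrative sketch only: the int16 cell state is interpreted as a
# Qm.(15-m) fixed-point value, so the integer-bit count passed to ApplyTanh
# is m = 15 + cell_scale. Allowing cell_scale down to -9 therefore allows up
# to 6 integer bits, i.e. cell magnitudes (and clip values) up to 2**6 = 64.
for cell_scale in (-15, -11, -9):
    integer_bits = 15 + cell_scale
    print(cell_scale, integer_bits, 2 ** integer_bits)
# -15 -> 0 integer bits -> clip up to 1
# -11 -> 4 integer bits -> clip up to 16
#  -9 -> 6 integer bits -> clip up to 64
```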
- TF_LITE_ENSURE(context, cell_scale == -11 || cell_scale == -15); + TF_LITE_ENSURE(context, cell_scale <= -9); quantized_lstm_param->cell_scale = cell_scale; input_scale = input->params.scale; diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index ba631a6ee24..6e1dc28ce34 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -1077,7 +1077,7 @@ inline void LstmStepQuantized( n_batch, n_cell, scratch_2_ptr); } - tensor_utils::ApplyTanh3(scratch_2_ptr, n_batch, n_cell, scratch_2_ptr); + tensor_utils::ApplyTanh(3, scratch_2_ptr, n_batch, n_cell, scratch_2_ptr); // Ouptut gate. tensor_utils::MatrixBatchVectorMultiplyAccumulate( @@ -1139,12 +1139,8 @@ inline void LstmStepQuantized( tensor_utils::CwiseClipping(cell_ptr, quantized_cell_clip, n_batch, n_cell); } - // TODO(jianlijianli): swtich to a tempalte. - if (cell_scale == -11) { - tensor_utils::ApplyTanh4(cell_ptr, n_batch, n_cell, scratch_0_ptr); - } else if (cell_scale == -15) { - tensor_utils::ApplyTanh0(cell_ptr, n_batch, n_cell, scratch_0_ptr); - } + tensor_utils::ApplyTanh(15 + cell_scale, cell_ptr, n_batch, n_cell, + scratch_0_ptr); tensor_utils::CwiseMul(scratch_3_ptr, scratch_0_ptr, effective_hidden_scale_a, effective_hidden_scale_b, n_batch, n_cell, hidden_zp, From 5e7654c3003884bf6618eb383a44f2d41ff85118 Mon Sep 17 00:00:00 2001 From: Yash Katariya Date: Wed, 4 Dec 2019 17:29:23 -0800 Subject: [PATCH 114/383] Make convert_to_tensor usage example testable. PiperOrigin-RevId: 283873221 Change-Id: I9dd2ebd779cb44ceb8ab539faca2b682c876538c --- tensorflow/python/framework/ops.py | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 5209e30c7a2..0b645102d1c 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -1206,18 +1206,26 @@ def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None): objects. It accepts `Tensor` objects, numpy arrays, Python lists, and Python scalars. For example: - ```python - import numpy as np + >>> def my_func(arg): + ... arg = tf.convert_to_tensor(arg, dtype=tf.float32) + ... return arg - def my_func(arg): - arg = tf.convert_to_tensor(arg, dtype=tf.float32) - return tf.matmul(arg, arg) + arg - - # The following calls are equivalent. - value_1 = my_func(tf.constant([[1.0, 2.0], [3.0, 4.0]])) - value_2 = my_func([[1.0, 2.0], [3.0, 4.0]]) - value_3 = my_func(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)) - ``` + >>> # The following calls are equivalent. + >>> value_1 = my_func(tf.constant([[1.0, 2.0], [3.0, 4.0]])) + >>> print(value_1) + tf.Tensor( + [[1. 2.] + [3. 4.]], shape=(2, 2), dtype=float32) + >>> value_2 = my_func([[1.0, 2.0], [3.0, 4.0]]) + >>> print(value_2) + tf.Tensor( + [[1. 2.] + [3. 4.]], shape=(2, 2), dtype=float32) + >>> value_3 = my_func(np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)) + >>> print(value_3) + tf.Tensor( + [[1. 2.] + [3. 4.]], shape=(2, 2), dtype=float32) This function can be useful when composing a new operation in Python (such as `my_func` in the example above). 
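One behavior worth illustrating alongside the new doctests, sketched here as an editorial example rather than patch content, is that `dtype_hint` is only a soft preference and is ignored when it cannot be honored:

```python
# Illustrative sketch only: dtype_hint nudges the conversion toward a dtype
# but has no effect when that conversion is not possible.
import tensorflow as tf

t = tf.convert_to_tensor(1, dtype_hint=tf.float32)       # honored -> float32
u = tf.convert_to_tensor("text", dtype_hint=tf.float32)  # ignored -> string
print(t.dtype, u.dtype)
```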
All standard Python op From da7ad11b6b630a418af1d5538dca468fb66d703e Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Wed, 4 Dec 2019 17:29:57 -0800 Subject: [PATCH 115/383] Improve documentation of tf.math.ceil PiperOrigin-RevId: 283873309 Change-Id: Icad7b01267ae61e0a115975ec9062c3fabd29c08 --- .../api_def/python_api/api_def_Ceil.pbtxt | 8 +----- tensorflow/python/ops/math_ops.py | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/api_def/python_api/api_def_Ceil.pbtxt b/tensorflow/core/api_def/python_api/api_def_Ceil.pbtxt index 331bb9cbf55..5d24634b328 100644 --- a/tensorflow/core/api_def/python_api/api_def_Ceil.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Ceil.pbtxt @@ -1,10 +1,4 @@ op { graph_op_name: "Ceil" - endpoint { - name: "math.ceil" - } - endpoint { - name: "ceil" - deprecation_version: 2 - } + visibility: HIDDEN } diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 078219e2f23..527fc850c5f 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4251,3 +4251,30 @@ def ndtri(x, name=None): """ with ops.name_scope(name, "ndtri", [x]): return gen_math_ops.ndtri(x) + + +@tf_export("math.ceil", v1=["math.ceil", "ceil"]) +@deprecation.deprecated_endpoints("ceil") +@dispatch.add_dispatch_support +def ceil(x, name=None): + """Return the ceiling of the input, element-wise. + + For example: + + >>> tf.math.ceil([-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]) + + + Args: + x: A `tf.Tensor`. Must be one of the following types: `bfloat16`, `half`, + `float32`, `float64`. `int32` + name: A name for the operation (optional). + + Returns: + A `tf.Tensor`. Has the same type as `x`. + + @compatibility(numpy) + Equivalent to np.ceil + @end_compatibility + """ + return gen_math_ops.ceil(x, name) From 21e57f996cce4e9b05deccaf2cf7dcbeb24f323e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 17:32:17 -0800 Subject: [PATCH 116/383] Update tf.identity docstring PiperOrigin-RevId: 283873712 Change-Id: I19b4f7571e5300a4b01aa3a416c61c5f2b7f1f8c --- tensorflow/python/ops/array_ops.py | 32 +++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 05a1ddc5cea..c7f717018a2 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -245,18 +245,32 @@ def fill(dims, value, name=None): @tf_export("identity") @dispatch.add_dispatch_support def identity(input, name=None): # pylint: disable=redefined-builtin - r"""Return a tensor with the same shape and contents as input. + r"""Return a Tensor with the same shape and contents as input. + + The return value is not the same Tensor as the original, but contains the same + values. This operation is fast when used on the same device. For example: - ```python - import tensorflow as tf - val0 = tf.ones((1,), dtype=tf.float32) - a = tf.atan2(val0, val0) - a_identity = tf.identity(a) - print(a.numpy()) #[0.7853982] - print(a_identity.numpy()) #[0.7853982] - ``` + >>> a = tf.constant([0.78]) + >>> a_identity = tf.identity(a) + >>> a.numpy() + array([0.78], dtype=float32) + >>> a_identity.numpy() + array([0.78], dtype=float32) + + Calling `tf.identity` on a variable will make a Tensor that represents the + value of that variable at the time it is called. This is equivalent to calling + `.read_value()`. 
+ + >>> a = tf.Variable(5) + >>> a_identity = tf.identity(a) + >>> a.assign_add(1) + + >>> a.numpy() + 6 + >>> a_identity.numpy() + 5 Args: input: A `Tensor`. From f255728114187cd2ff1ab2f96a24168b79af7627 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 17:35:39 -0800 Subject: [PATCH 117/383] Update docstring for tf.tensor_scatter_nd_update PiperOrigin-RevId: 283874210 Change-Id: I0daa221ad93fa7d8c93f154a0a1d39369a625a7d --- .../api_def_TensorScatterUpdate.pbtxt | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt index c0e62302129..3cd2a9e9580 100644 --- a/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt @@ -62,17 +62,11 @@ tensor with 8 elements. In Python, this scatter operation would look like this: -```python - indices = tf.constant([[4], [3], [1], [7]]) - updates = tf.constant([9, 10, 11, 12]) - tensor = tf.ones([8], dtype=tf.int32) - updated = tf.tensor_scatter_nd_update(tensor, indices, updates) - print(updated) -``` - -The resulting tensor would look like this: - - [1, 11, 1, 10, 9, 1, 1, 12] + >>> indices = tf.constant([[4], [3], [1], [7]]) + >>> updates = tf.constant([9, 10, 11, 12]) + >>> tensor = tf.ones([8], dtype=tf.int32) + >>> print(tf.tensor_scatter_nd_update(tensor, indices, updates)) + tf.Tensor([ 1 11 1 10 9 1 1 12], shape=(8,), dtype=int32) We can also, insert entire slices of a higher rank tensor all at once. For example, if we wanted to insert two slices in the first dimension of a @@ -80,23 +74,29 @@ rank-3 tensor with two matrices of new values. In Python, this scatter operation would look like this: -```python - indices = tf.constant([[0], [2]]) - updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6], - [7, 7, 7, 7], [8, 8, 8, 8]], - [[5, 5, 5, 5], [6, 6, 6, 6], - [7, 7, 7, 7], [8, 8, 8, 8]]]) - tensor = tf.ones([4, 4, 4],dtype=tf.int32) - updated = tf.tensor_scatter_nd_update(tensor, indices, updates) - print(updated) -``` - -The resulting tensor would look like this: - - [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], - [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]], - [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]], - [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]] + >>> indices = tf.constant([[0], [2]]) + >>> updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6], + ... [7, 7, 7, 7], [8, 8, 8, 8]], + ... [[5, 5, 5, 5], [6, 6, 6, 6], + ... [7, 7, 7, 7], [8, 8, 8, 8]]]) + >>> tensor = tf.ones([4, 4, 4], dtype=tf.int32) + >>> print(tf.tensor_scatter_nd_update(tensor, indices, updates).numpy()) + [[[5 5 5 5] + [6 6 6 6] + [7 7 7 7] + [8 8 8 8]] + [[1 1 1 1] + [1 1 1 1] + [1 1 1 1] + [1 1 1 1]] + [[5 5 5 5] + [6 6 6 6] + [7 7 7 7] + [8 8 8 8]] + [[1 1 1 1] + [1 1 1 1] + [1 1 1 1] + [1 1 1 1]]] Note that on CPU, if an out of bound index is found, an error is returned. On GPU, if an out of bound index is found, the index is ignored. From 18a94a75ffc6ba4a8f4e466f6d7726ae2cb336a3 Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Wed, 4 Dec 2019 17:40:26 -0800 Subject: [PATCH 118/383] Fix the formatting of BatchNormalization equations. 
PiperOrigin-RevId: 283875007 Change-Id: I153c4f501f3ccc53963267467dd93014566ed121 --- .../python/keras/layers/normalization.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py index 1e3e2efa651..467b6c6eef3 100644 --- a/tensorflow/python/keras/layers/normalization.py +++ b/tensorflow/python/keras/layers/normalization.py @@ -144,23 +144,23 @@ class BatchNormalizationBase(Layer): Normalization equations: Consider the intermediate activations \(x\) of a mini-batch of size - \(m\): + \\(m\\): We can compute the mean and variance of the batch - \({\mu_B} = \frac{1}{m} \sum_{i=1}^{m} {x_i}\) + \\({\mu_B} = \frac{1}{m} \sum_{i=1}^{m} {x_i}\\) - \({\sigma_B^2} = \frac{1}{m} \sum_{i=1}^{m} ({x_i} - {\mu_B})^2\) + \\({\sigma_B^2} = \frac{1}{m} \sum_{i=1}^{m} ({x_i} - {\mu_B})^2\\) - and then compute a normalized \(x\), including a small factor - \({\epsilon}\) for numerical stability. + and then compute a normalized \\(x\\), including a small factor + \\({\epsilon}\\) for numerical stability. - \(\hat{x_i} = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}\) + \\(\hat{x_i} = \frac{x_i - \mu_B}{\sqrt{\sigma_B^2 + \epsilon}}\\) - And finally \(\hat{x}\) is linearly transformed by \({\gamma}\) - and \({\beta}\), which are learned parameters: + And finally \\(\hat{x}\) is linearly transformed by \({\gamma}\\) + and \\({\beta}\\), which are learned parameters: - \({y_i} = {\gamma * \hat{x_i} + \beta}\) + \\({y_i} = {\gamma * \hat{x_i} + \beta}\\) References: - [Batch Normalization: Accelerating Deep Network Training by Reducing From db4e53c270615b1df36713aef961f17cf29998e4 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 18:04:03 -0800 Subject: [PATCH 119/383] Updated docs for `tf.fill()`. PiperOrigin-RevId: 283878586 Change-Id: I8f3b385540df90f3ffdc048cd327367219c71975 --- tensorflow/python/ops/array_ops.py | 44 ++++++++++++++++-------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index c7f717018a2..dd058e82223 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -210,32 +210,34 @@ def fill(dims, value, name=None): For example: - ``` - # Output tensor has shape [2, 3]. - fill([2, 3], 9) ==> [[9, 9, 9] - [9, 9, 9]] - ``` + # Output tensor with shape [2, 3]. + >>> tf.fill([2, 3], 9) + - `tf.fill` differs from `tf.constant` in a few ways: - - * `tf.fill` only supports scalar contents, whereas `tf.constant` supports - Tensor values. - * `tf.fill` creates an Op in the computation graph that constructs the - actual - Tensor value at runtime. This is in contrast to `tf.constant` which embeds - the entire Tensor into the graph with a `Const` node. - * Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes - based on other runtime Tensors, unlike `tf.constant`. + `tf.fill` evaluates at graph runtime and supports dynamic shapes based on + other runtime `tf.Tensors`, unlike `tf.constant(value, shape=dims)`, which + embeds the value as a `Const` node. Args: - dims: A `Tensor`. Must be one of the following types: `int32`, `int64`. 1-D. - Represents the shape of the output tensor. - value: A `Tensor`. 0-D (scalar). Value to fill the returned tensor. - @compatibility(numpy) Equivalent to np.full @end_compatibility - name: A name for the operation (optional). + dims: A 1-D sequence of non-negative numbers. 
Represents the shape of the + output `tf.Tensor`. Entries should be of type: `int32`, `int64`. + value: A value to fill the returned `tf.Tensor`. + name: Optional string. The name of the output `tf.Tensor`. Returns: - A `Tensor`. Has the same type as `value`. + A `tf.Tensor` with shape `dims` and the same dtype as `value`. + + Raises: + InvalidArgumentError: `dims` contains negative entries. + NotFoundError: `dims` contains non-integer entries. + + @compatibility(numpy) + Similar to `np.full`. In `numpy`, more parameters are supported. Passing a + number argument as the shape (`np.full(5, value)`) is valid in `numpy` for + specifying a 1-D shaped result, while TensorFlow does not support this syntax. + @end_compatibility """ result = gen_array_ops.fill(dims, value, name=name) tensor_util.maybe_set_static_shape(result, dims) From 6d5a4a87a458a8fee63b28f87e84e1b37496652a Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Wed, 4 Dec 2019 18:05:04 -0800 Subject: [PATCH 120/383] Improvement documentation of tf.math.exp PiperOrigin-RevId: 283878760 Change-Id: I807119167cf4117806b5bd27727d419550d5bb56 --- .../core/api_def/python_api/api_def_Exp.pbtxt | 7 +-- tensorflow/python/ops/math_ops.py | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt index 38a9078d9f6..4c89cd7afcc 100644 --- a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt @@ -1,9 +1,4 @@ op { graph_op_name: "Exp" - endpoint { - name: "math.exp" - } - endpoint { - name: "exp" - } + visibility: HIDDEN } diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 527fc850c5f..36c6bd86370 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4278,3 +4278,52 @@ def ceil(x, name=None): @end_compatibility """ return gen_math_ops.ceil(x, name) + + +# pylint: disable=g-docstring-has-escape +@tf_export("math.exp", "exp") +@dispatch.add_dispatch_support +def exp(x, name=None): + """Computes exponential of x element-wise. \\(y = e^x\\). + + This function computes the exponential of the input tensor element-wise. + i.e. `math.exp(x)` or \\(e^x\\), where `x` is the input tensor. + \\(e\\) denotes Euler's number and is approximately equal to 2.718281. + Output is positive for any real input. + + >>> x = tf.constant(2.0) + >>> tf.math.exp(x) + + + >>> x = tf.constant([2.0, 8.0]) + >>> tf.math.exp(x) + + + For complex numbers, the exponential value is calculated as + \\(e^{x+iy}={e^x}{e^{iy}}={e^x}{\cos(y)+i\sin(y)}\\) + + For `1+1j` the value would be computed as: + \\(e^1{\cos(1)+i\sin(1)} = 2.7182817 \times (0.5403023+0.84147096j)\\) + + >>> x = tf.constant(1 + 1j) + >>> tf.math.exp(x) + + + Args: + x: A `tf.Tensor`. Must be one of the following types: `bfloat16`, `half`, + `float32`, `float64`, `complex64`, `complex128`. + name: A name for the operation (optional). + + Returns: + A `tf.Tensor`. Has the same type as `x`. + + @compatibility(numpy) + Equivalent to np.exp + @end_compatibility + """ + return gen_math_ops.exp(x, name) + + +# pylint: enable=g-docstring-has-escape From b21d30f6aa4a5d62721222683e16003390717bb4 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 4 Dec 2019 18:18:39 -0800 Subject: [PATCH 121/383] Explicitly export files needed by other packages PiperOrigin-RevId: 283880548 Change-Id: I802964f92f069ebb964dc7b4d3ae8e8baa5e0bea --- tensorflow/lite/c/BUILD | 5 ++++- tensorflow/lite/delegates/gpu/BUILD | 2 ++ tensorflow/lite/java/BUILD | 2 ++ tensorflow/lite/tools/benchmark/BUILD | 2 ++ tensorflow/lite/tools/benchmark/experimental/c/BUILD | 5 +++++ tensorflow/python/kernel_tests/proto/BUILD | 5 ++++- 6 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/c/BUILD b/tensorflow/lite/c/BUILD index 629320370cb..b3e231e8cb3 100644 --- a/tensorflow/lite/c/BUILD +++ b/tensorflow/lite/c/BUILD @@ -137,7 +137,10 @@ cc_library( ) # For use with library targets that can't use relative paths. -exports_files(["common.h"]) +exports_files([ + "c_api.h", + "common.h", +]) # Test the C extension API code. cc_test( diff --git a/tensorflow/lite/delegates/gpu/BUILD b/tensorflow/lite/delegates/gpu/BUILD index 4cfbeff2081..7faa83ae5ab 100644 --- a/tensorflow/lite/delegates/gpu/BUILD +++ b/tensorflow/lite/delegates/gpu/BUILD @@ -6,6 +6,8 @@ package( licenses = ["notice"], # Apache 2.0 ) +exports_files(["metal_delegate.h"]) + # Primary purpose of this config is to replace ::util::Status with our custom # light implementation ::tflite::gpu::StatusLite to reduce binary size. Besides # that, certain features that were hard to communicate without full open source diff --git a/tensorflow/lite/java/BUILD b/tensorflow/lite/java/BUILD index 08972941950..4cd3da9f843 100644 --- a/tensorflow/lite/java/BUILD +++ b/tensorflow/lite/java/BUILD @@ -11,6 +11,8 @@ package( licenses = ["notice"], # Apache 2.0 ) +exports_files(["src/testdata/add.bin"]) + JAVA_SRCS = glob([ "src/main/java/org/tensorflow/lite/*.java", "src/main/java/org/tensorflow/lite/annotations/*.java", diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD index 4514ca8a2d3..97d021c6326 100644 --- a/tensorflow/lite/tools/benchmark/BUILD +++ b/tensorflow/lite/tools/benchmark/BUILD @@ -9,6 +9,8 @@ package( licenses = ["notice"], # Apache 2.0 ) +exports_files(["logging.h"]) + common_copts = ["-Wall"] + tflite_copts() cc_library( diff --git a/tensorflow/lite/tools/benchmark/experimental/c/BUILD b/tensorflow/lite/tools/benchmark/experimental/c/BUILD index 28bbd3fdfe6..2bd26e8e127 100644 --- a/tensorflow/lite/tools/benchmark/experimental/c/BUILD +++ b/tensorflow/lite/tools/benchmark/experimental/c/BUILD @@ -12,6 +12,11 @@ package_group( ], ) +exports_files( + ["benchmark_c_api.h"], + visibility = ["//tensorflow/lite/tools/benchmark/experimental/c:benchmark"], +) + cc_library( name = "benchmark_c_api", srcs = ["benchmark_c_api.cc"], diff --git a/tensorflow/python/kernel_tests/proto/BUILD b/tensorflow/python/kernel_tests/proto/BUILD index 75100a3ff38..389612c9827 100644 --- a/tensorflow/python/kernel_tests/proto/BUILD +++ b/tensorflow/python/kernel_tests/proto/BUILD @@ -10,7 +10,10 @@ package( licenses = ["notice"], # Apache 2.0 ) -exports_files(["LICENSE"]) +exports_files([ + "LICENSE", + "test_example.proto", +]) tf_py_test( name = "decode_proto_op_test", From 7a3343391171377e45952dcc19923ad134cfa0c3 Mon Sep 17 00:00:00 2001 From: Haoyu Zhang Date: Wed, 4 Dec 2019 18:26:01 -0800 Subject: [PATCH 122/383] Fix Kokoro build. 
PiperOrigin-RevId: 283881484 Change-Id: I13edeb9da81c385eea9b7b402a572f2501f3ce6b --- .../core/distributed_runtime/rpc/eager/grpc_eager_client.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc index 921696efbcc..5ad48118ae9 100644 --- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc +++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc @@ -230,8 +230,9 @@ class GrpcEagerClientCache : public EagerClientCache { } int assigned_index = AssignClientToThread(target); GrpcEagerClientThread* thread = threads_[assigned_index].get(); - auto worker = new GrpcEagerClient(shared, thread); - it = clients_.emplace(target, worker).first; + core::RefCountPtr worker( + new GrpcEagerClient(shared, thread)); + it = clients_.emplace(target, std::move(worker)).first; } it->second->Ref(); From 5e94ded1de5a514474f4a7ec6de285198fdb458e Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 4 Dec 2019 18:29:37 -0800 Subject: [PATCH 123/383] Optimize int8 transpose_conv PiperOrigin-RevId: 283881873 Change-Id: I568f5db0ba3663cb17208af41d30c2179e2e485c --- tensorflow/lite/kernels/internal/BUILD | 1 + .../optimized/integer_ops/transpose_conv.h | 105 +++++++++++++++++ .../internal/optimized/optimized_ops.h | 111 ++++++++++++++++++ tensorflow/lite/kernels/transpose_conv.cc | 42 +++++-- 4 files changed, 247 insertions(+), 12 deletions(-) create mode 100644 tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD index d71b36547f2..e82d3c16b31 100644 --- a/tensorflow/lite/kernels/internal/BUILD +++ b/tensorflow/lite/kernels/internal/BUILD @@ -227,6 +227,7 @@ cc_library( "optimized/integer_ops/mul.h", "optimized/integer_ops/pooling.h", "optimized/integer_ops/softmax.h", + "optimized/integer_ops/transpose_conv.h", "optimized/optimized_ops.h", ], copts = tflite_copts(), diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h new file mode 100644 index 00000000000..4d24ff65250 --- /dev/null +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h @@ -0,0 +1,105 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_TRANSPOSE_CONV_H_ +#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_TRANSPOSE_CONV_H_ + +#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" + +namespace tflite { +namespace optimized_integer_ops { + +// TransposeConvV2 expect the weights in HWOI order. 
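
The filter layout is worth spelling out: the kernel expects HWOI-ordered weights, which the caller prepares by permuting the filter tensor. A small NumPy illustration of that permutation, assuming the usual OHWI (out_channels, height, width, in_channels) source layout; the axis order here is an assumption for illustration, not taken from the kernel code.

```python
import numpy as np

# Assuming an OHWI-ordered filter, the HWOI order used by TransposeConvV2
# is a plain axis permutation.
w_ohwi = np.arange(2 * 3 * 3 * 4).reshape(2, 3, 3, 4)
w_hwoi = np.transpose(w_ohwi, (1, 2, 0, 3))
print(w_hwoi.shape)  # (3, 3, 2, 4)
```
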
+inline void TransposeConvV2( + const ConvParams& params, const int32* output_multiplier, + const int32* output_shift, const RuntimeShape& input_shape, + const int8_t* input_data, const RuntimeShape& hwoi_ordered_filter_shape, + const int8_t* hwoi_ordered_filter_data, const RuntimeShape& output_shape, + int8_t* output_data, const RuntimeShape& col2im_shape, int32_t* col2im_data, + int32_t* scratch_data, CpuBackendContext* cpu_backend_context) { + gemmlowp::ScopedProfilingLabel label("TransposeConvV2/int8"); + TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4); + TFLITE_DCHECK_EQ(hwoi_ordered_filter_shape.DimensionsCount(), 4); + const int batch_size = input_shape.Dims(0); + TFLITE_DCHECK(col2im_data); + TFLITE_DCHECK(hwoi_ordered_filter_data); + + const int input_image_size = input_shape.Dims(1) * input_shape.Dims(2); + const int output_height = output_shape.Dims(1); + const int output_width = output_shape.Dims(2); + const int output_image_size = output_height * output_width; + const int input_depth = + MatchingDim(input_shape, 3, hwoi_ordered_filter_shape, 3); + const int output_depth = + MatchingDim(output_shape, 3, hwoi_ordered_filter_shape, 2); + const int input_offset = input_image_size * input_depth; + const int output_offset = output_image_size * output_depth; + + const int filter_height = hwoi_ordered_filter_shape.Dims(0); + const int filter_width = hwoi_ordered_filter_shape.Dims(1); + const int padding_top = params.padding_values.height; + const int padding_bottom = + params.padding_values.height + params.padding_values.height_offset; + const int padding_left = params.padding_values.width; + const int padding_right = + params.padding_values.width + params.padding_values.width_offset; + const int stride_height = params.stride_height; + const int stride_width = params.stride_width; + + const int hwoi_ordered_filter_total_size = + filter_height * filter_width * output_depth; + + cpu_backend_gemm::MatrixParams lhs_params; + lhs_params.order = cpu_backend_gemm::Order::kRowMajor; + lhs_params.rows = hwoi_ordered_filter_total_size; + lhs_params.cols = input_depth; + // Since our weight is symmetric quantized, the zp will always be 0. 
+ lhs_params.zero_point = 0; + + int32_t* scratch_data_p = scratch_data; + std::fill_n(scratch_data, output_offset * batch_size, static_cast(0)); + for (int i = 0; i < batch_size; ++i) { + cpu_backend_gemm::MatrixParams rhs_params; + rhs_params.order = cpu_backend_gemm::Order::kColMajor; + rhs_params.rows = input_depth; + rhs_params.cols = input_image_size; + rhs_params.zero_point = -params.input_offset; + + cpu_backend_gemm::MatrixParams dst_params; + dst_params.order = cpu_backend_gemm::Order::kColMajor; + dst_params.rows = hwoi_ordered_filter_total_size; + dst_params.cols = input_image_size; + + cpu_backend_gemm::GemmParams gemm_params; + cpu_backend_gemm::Gemm(lhs_params, hwoi_ordered_filter_data, rhs_params, + input_data + input_offset * i, dst_params, + col2im_data, gemm_params, cpu_backend_context); + + optimized_ops::Col2im( + col2im_data, output_depth, output_height, output_width, filter_height, + filter_width, padding_top, padding_left, padding_bottom, padding_right, + stride_height, stride_width, scratch_data_p); + + scratch_data_p += output_offset; + } + + optimized_ops::Quantize(output_multiplier, output_shift, output_depth, + output_shape.FlatSize(), params.output_offset, + scratch_data, output_data); +} + +} // namespace optimized_integer_ops +} // namespace tflite + +#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_TRANSPOSE_CONV_H_ diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index 26005e069a7..b5ee08dd7f2 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -5617,6 +5617,117 @@ inline void Quantize(int32_t multiplier, int32_t shift, int32_t total_size, } } +// TODO(b/145632530): Refactor other quantize per-channel to use this one. +inline void Quantize(const int32_t* multiplier, const int32_t* shift, + int32_t channel_size, int32_t total_size, + int32_t output_zp, int32_t* scratch, int8_t* output) { + gemmlowp::ScopedProfilingLabel label("Quantize/int8"); + + const int32_t output_min = std::numeric_limits::min(); + const int32_t output_max = std::numeric_limits::max(); + + // Here we're trying to quantize the raw accumulators: + // output_channels + // data data data data data + // rows data data data data data + // data data data data data + // .... + // + // In order to minimize the reload of the multipliers & shifts, once we load + // the multipliers & shifts, we load & quantize the raw accumualtrs for every + // row. 
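
As a plain-Python reference for what this per-channel requantization computes (the scalar fall-back path at the end of the function), one might write something like the sketch below; ordinary float math stands in for MultiplyByQuantizedMultiplier, so it is only approximate around rounding.

```python
import numpy as np

def quantize_per_channel_ref(scratch, multiplier, shift, output_zp):
    # scratch: int32 accumulators laid out as [rows, channels].
    # multiplier/shift: one requantization factor per channel, with an
    # effective scale of multiplier[c] * 2**(shift[c] - 31).
    rows, channels = scratch.shape
    out = np.empty((rows, channels), dtype=np.int8)
    for c in range(channels):
        scale = multiplier[c] * 2.0 ** (shift[c] - 31)
        acc = np.round(scratch[:, c] * scale) + output_zp
        out[:, c] = np.clip(acc, -128, 127).astype(np.int8)
    return out

acc = np.array([[20000, -35000], [4096, 123]], dtype=np.int32)
print(quantize_per_channel_ref(acc, multiplier=[1610612736, 1073741824],
                               shift=[-3, -4], output_zp=0))
```
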
+#ifdef USE_NEON + const int32x4_t output_offset_vec = vdupq_n_s32(output_zp); + const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min); + const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max); + const int32x4_t ones = vdupq_n_s32(1); + const int32x4_t minus_ones = vdupq_n_s32(-1); + const int32x4_t zeros = vdupq_n_s32(0); +#endif + + TFLITE_DCHECK_EQ(total_size % channel_size, 0); + const int32_t rows = total_size / channel_size; + + int c = 0; + + while (c < channel_size) { + int target_output_depth = channel_size; +#ifdef USE_NEON + using gemmlowp::RoundingDivideByPOT; + for (; c <= channel_size - 4; c += 4) { + int32x4_t out_shift = vld1q_s32(shift + c); + const bool out_shift_all_less_than_zero = + (vgetq_lane_s32(out_shift, 0) < 0) && + (vgetq_lane_s32(out_shift, 1) < 0) && + (vgetq_lane_s32(out_shift, 2) < 0) && + (vgetq_lane_s32(out_shift, 3) < 0); + const bool out_shift_all_greater_equal_than_zero = + (vgetq_lane_s32(out_shift, 0) >= 0) && + (vgetq_lane_s32(out_shift, 1) >= 0) && + (vgetq_lane_s32(out_shift, 2) >= 0) && + (vgetq_lane_s32(out_shift, 3) >= 0); + if (!out_shift_all_less_than_zero && + !out_shift_all_greater_equal_than_zero) { + // Fallback to general path. + // Then go ahead for next 4. + target_output_depth = c + 4; + break; + } + int32x4_t out_mul = vld1q_s32(multiplier + c); + for (int n = 0; n < rows; ++n) { + int loc = n * channel_size + c; + int32x4_t acc = vld1q_s32(scratch + loc); + if (out_shift_all_less_than_zero) { // output_shift all < 0 case. + acc = vqrdmulhq_s32(acc, out_mul); + int32x4_t negative_out_shift = vmulq_n_s32(out_shift, -1); + int32x4_t mask = + vaddq_s32(vshlq_s32(ones, negative_out_shift), minus_ones); + int32x4_t remainder = vandq_s32(acc, mask); + int32x4_t shifted_right_mask = vshlq_s32(mask, minus_ones); + int32x4_t temp = + vandq_s32(vreinterpretq_s32_u32(vcltq_s32(acc, zeros)), ones); + int32x4_t threshold = vaddq_s32(shifted_right_mask, temp); + temp = vandq_s32( + vreinterpretq_s32_u32(vcgtq_s32(remainder, threshold)), ones); + int32x4_t shifted_right_acc = vshlq_s32(acc, out_shift); + acc = vaddq_s32(shifted_right_acc, temp); + } else { // output_shift all > 0 case. + int32x4_t multiplier_power_of_two = vshlq_s32(ones, out_shift); + acc = vmulq_s32(acc, multiplier_power_of_two); + acc = vqrdmulhq_s32(acc, out_mul); + } + // Add the output offset. + acc = vaddq_s32(acc, output_offset_vec); + // Apply the activation function. + acc = vmaxq_s32(acc, output_activation_min_vec); + acc = vminq_s32(acc, output_activation_max_vec); + // Saturating cast to int8 and store to destination. + const int16x4_t acc_s16 = vqmovn_s32(acc); + const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16); + const int8x8_t res_s8 = vqmovn_s16(res_s16); + vst1_lane_s8(output + loc + 0, res_s8, 0); + vst1_lane_s8(output + loc + 1, res_s8, 1); + vst1_lane_s8(output + loc + 2, res_s8, 2); + vst1_lane_s8(output + loc + 3, res_s8, 3); + } + } + +#endif // USE_NEON + // Handle leftover values, one by one. This is very slow. + for (; c < target_output_depth; c++) { + for (int n = 0; n < rows; ++n) { + int loc = n * channel_size + c; + int32 acc = scratch[loc]; + acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]); + acc += output_zp; + acc = std::max(acc, output_min); + acc = std::min(acc, output_max); + output[loc] = static_cast(acc); + } + } + } +} + // TransposeConvV2 expect the weights in HWOI order. 
inline void TransposeConvV2( const ConvParams& params, const RuntimeShape& input_shape, diff --git a/tensorflow/lite/kernels/transpose_conv.cc b/tensorflow/lite/kernels/transpose_conv.cc index 0c62c305c0f..114b9ae48f4 100644 --- a/tensorflow/lite/kernels/transpose_conv.cc +++ b/tensorflow/lite/kernels/transpose_conv.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ + #include #include #include @@ -23,6 +24,8 @@ limitations under the License. #include "tensorflow/lite/c/common.h" #include "tensorflow/lite/kernels/cpu_backend_context.h" #include "tensorflow/lite/kernels/eigen_support.h" +// NOLINTNEXTLINE - This header file should't go to the top. +#include "tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h" #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" // NOLINTNEXTLINE - This header file should't go to the top. #include "tensorflow/lite/kernels/internal/reference/integer_ops/transpose_conv.h" @@ -422,6 +425,7 @@ void EvalQuantized(TfLiteContext* context, } } +template void EvalQuantizedPerChannel(TfLiteContext* context, const TfLiteTransposeConvParams* params, OpData* data, const TfLiteTensor* input, @@ -444,15 +448,29 @@ void EvalQuantizedPerChannel(TfLiteContext* context, op_params.quantized_activation_min = data->output_activation_min; op_params.quantized_activation_max = data->output_activation_max; - // TODO(b/143380105): Need to add optimized kernel for int8 quantized - // transpose conv. - reference_integer_ops::TransposeConv( - op_params, data->per_channel_output_multiplier.data(), - data->per_channel_output_shift.data(), GetTensorShape(input), - GetTensorData(input), GetTensorShape(weights), - GetTensorData(weights), GetTensorShape(output), - GetTensorData(output), GetTensorShape(col2im), - GetTensorData(col2im), GetTensorData(scratch_buffer)); + switch (kernel_type) { + case kReference: { + reference_integer_ops::TransposeConv( + op_params, data->per_channel_output_multiplier.data(), + data->per_channel_output_shift.data(), GetTensorShape(input), + GetTensorData(input), GetTensorShape(weights), + GetTensorData(weights), GetTensorShape(output), + GetTensorData(output), GetTensorShape(col2im), + GetTensorData(col2im), GetTensorData(scratch_buffer)); + break; + } + case kGenericOptimized: { + optimized_integer_ops::TransposeConvV2( + op_params, data->per_channel_output_multiplier.data(), + data->per_channel_output_shift.data(), GetTensorShape(input), + GetTensorData(input), GetTensorShape(transposed_weights), + GetTensorData(transposed_weights), GetTensorShape(output), + GetTensorData(output), GetTensorShape(col2im), + GetTensorData(col2im), GetTensorData(scratch_buffer), + CpuBackendContext::GetFromContext(context)); + break; + } + } } template @@ -535,9 +553,9 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { if (data->weights_are_transposed && !IsConstantTensor(weights)) { ResizeAndTransposeWeights(context, weights, transposed_weights); } - EvalQuantizedPerChannel(context, params, data, input, weights, - transposed_weights, col2im, output, - scratch_buffer); + EvalQuantizedPerChannel(context, params, data, input, + weights, transposed_weights, col2im, + output, scratch_buffer); break; } default: From 6f96b26d9db356b76c37cef6de757db02bb4c0a7 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Wed, 4 Dec 
2019 18:32:25 -0800 Subject: [PATCH 124/383] Python 3 Migration. //tensorflow/(compiler|contrib|python) PiperOrigin-RevId: 283882221 Change-Id: I8451df5bc5781efe3ea947afd7df4bd82a798469 --- tensorflow/compiler/xla/python/BUILD | 1 - tensorflow/python/feature_column/BUILD | 8 +------- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index 409d954748c..cdbe69d617e 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -26,7 +26,6 @@ py_test( name = "xla_client_test", srcs = ["xla_client_test.py"], main = "xla_client_test.py", - python_version = "PY3", srcs_version = "PY2AND3", tags = ["no_oss"], # TODO(phawkins): This test passes, but requires --config=monolithic. deps = [ diff --git a/tensorflow/python/feature_column/BUILD b/tensorflow/python/feature_column/BUILD index ca58ad5730b..38c3657ef58 100644 --- a/tensorflow/python/feature_column/BUILD +++ b/tensorflow/python/feature_column/BUILD @@ -110,7 +110,6 @@ tf_py_test( additional_deps = [ ":feature_column_test_main_lib", ], - python_version = "PY3", tags = [ "no_cuda_on_cpu_tap", "no_pip", @@ -124,7 +123,6 @@ tf_py_test( additional_deps = [ ":feature_column_test_main_lib", ], - python_version = "PY3", tags = ["no_pip"], ) @@ -161,7 +159,6 @@ tf_py_test( name = "feature_column_v2_test", srcs = ["feature_column_v2_test.py"], additional_deps = [":feature_column_v2_test_main_lib"], - python_version = "PY3", shard_count = 5, tags = [ "no_cuda_on_cpu_tap", @@ -176,7 +173,6 @@ tf_py_test( additional_deps = [ ":feature_column_v2_test_main_lib", ], - python_version = "PY3", tags = ["no_pip"], ) @@ -239,13 +235,12 @@ tf_py_test( "//tensorflow/python:sparse_tensor", "//tensorflow/python:training", ], - python_version = "PY3", ) py_test( name = "sequence_feature_column_integration_test", srcs = ["sequence_feature_column_integration_test.py"], - python_version = "PY3", + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -268,5 +263,4 @@ tf_py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:util", ], - python_version = "PY3", ) From 5666233dc7ec23509d3596e8145e13466c828ce5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 18:43:25 -0800 Subject: [PATCH 125/383] Add int32 support to floor, ceil & rint Though the result is trivial, this avoids the need to call tf.cast if receiving a int32 tensor from another operation. PiperOrigin-RevId: 283883439 Change-Id: I351206bd165fbf681f0231886ec131522ddf83ed --- tensorflow/core/kernels/cwise_op_ceil.cc | 3 +- tensorflow/core/kernels/cwise_op_floor.cc | 3 +- tensorflow/core/kernels/cwise_op_rint.cc | 2 +- tensorflow/core/ops/math_ops.cc | 6 ++-- tensorflow/python/ops/math_ops_test.py | 42 ----------------------- 5 files changed, 6 insertions(+), 50 deletions(-) diff --git a/tensorflow/core/kernels/cwise_op_ceil.cc b/tensorflow/core/kernels/cwise_op_ceil.cc index c6e38a55efd..4b1847d758c 100644 --- a/tensorflow/core/kernels/cwise_op_ceil.cc +++ b/tensorflow/core/kernels/cwise_op_ceil.cc @@ -16,8 +16,7 @@ limitations under the License. 
#include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double, - int32); +REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double); diff --git a/tensorflow/core/kernels/cwise_op_floor.cc b/tensorflow/core/kernels/cwise_op_floor.cc index ba0340372f4..25210a0fa51 100644 --- a/tensorflow/core/kernels/cwise_op_floor.cc +++ b/tensorflow/core/kernels/cwise_op_floor.cc @@ -16,8 +16,7 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER4(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double, - int32); +REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double); diff --git a/tensorflow/core/kernels/cwise_op_rint.cc b/tensorflow/core/kernels/cwise_op_rint.cc index c6071d02295..f9fe8321947 100644 --- a/tensorflow/core/kernels/cwise_op_rint.cc +++ b/tensorflow/core/kernels/cwise_op_rint.cc @@ -16,7 +16,7 @@ limitations under the License. #include "tensorflow/core/kernels/cwise_ops_common.h" namespace tensorflow { -REGISTER3(UnaryOp, CPU, "Rint", functor::rint, float, double, int32); +REGISTER2(UnaryOp, CPU, "Rint", functor::rint, float, double); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER2(UnaryOp, GPU, "Rint", functor::rint, float, double); #endif diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc index 691f34ff307..ccdcf0b76e6 100644 --- a/tensorflow/core/ops/math_ops.cc +++ b/tensorflow/core/ops/math_ops.cc @@ -349,19 +349,19 @@ REGISTER_OP("Sign") REGISTER_OP("Floor") .Input("x: T") .Output("y: T") - .Attr("T: {bfloat16, half, float, double, int32}") + .Attr("T: {bfloat16, half, float, double}") .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("Ceil") .Input("x: T") .Output("y: T") - .Attr("T: {bfloat16, half, float, double, int32}") + .Attr("T: {bfloat16, half, float, double}") .SetShapeFn(shape_inference::UnchangedShape); REGISTER_OP("Rint") .Input("x: T") .Output("y: T") - .Attr("T: {bfloat16, half, float, double, int32}") + .Attr("T: {bfloat16, half, float, double}") .SetShapeFn(shape_inference::UnchangedShape); // Declares cwise binary operations signature: 't, 't -> 't. 
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index c1591791cbd..f49ba3dd2a3 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -179,48 +179,6 @@ class RoundTest(test_util.TensorFlowTestCase): self.assertAllClose(y_tf_np, y_np, atol=1e-2) -@test_util.run_all_in_graph_and_eager_modes -class FloorTest(test_util.TensorFlowTestCase): - - def testFloor(self): - x = np.arange(-5.0, 5.0, .25) - for dtype in [np.float32, np.double, np.int32]: - x_np = np.array(x, dtype=dtype) - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y_tf = math_ops.floor(x_tf) - y_tf_np = self.evaluate(y_tf) - y_np = np.floor(x_np) - self.assertAllClose(y_tf_np, y_np, atol=1e-2) - - -@test_util.run_all_in_graph_and_eager_modes -class CeilTest(test_util.TensorFlowTestCase): - - def testCeil(self): - x = np.arange(-5.0, 5.0, .25) - for dtype in [np.float32, np.double, np.int32]: - x_np = np.array(x, dtype=dtype) - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y_tf = math_ops.ceil(x_tf) - y_tf_np = self.evaluate(y_tf) - y_np = np.ceil(x_np) - self.assertAllClose(y_tf_np, y_np, atol=1e-2) - - -@test_util.run_all_in_graph_and_eager_modes -class RintTest(test_util.TensorFlowTestCase): - - def testRint(self): - x = np.arange(-5.0, 5.0, .25) - for dtype in [np.float32, np.double, np.int32]: - x_np = np.array(x, dtype=dtype) - x_tf = constant_op.constant(x_np, shape=x_np.shape) - y_tf = math_ops.rint(x_tf) - y_tf_np = self.evaluate(y_tf) - y_np = np.rint(x_np) - self.assertAllClose(y_tf_np, y_np, atol=1e-2) - - @test_util.run_all_in_graph_and_eager_modes class ModTest(test_util.TensorFlowTestCase): From 0f85db356db4c3b6b32160f20d833faa01e74580 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 19:13:06 -0800 Subject: [PATCH 126/383] Sparse tensor definition in TFLite. 
PiperOrigin-RevId: 283886690 Change-Id: I66789d53ac2c7bdd85762cec14ec95748d0259f2 --- tensorflow/lite/BUILD | 1 - tensorflow/lite/c/common.c | 34 -- tensorflow/lite/c/common.h | 31 -- tensorflow/lite/c/common_test.cc | 26 -- tensorflow/lite/core/subgraph.cc | 20 +- tensorflow/lite/core/subgraph.h | 9 +- tensorflow/lite/model.cc | 81 +---- tensorflow/lite/model.h | 2 - tensorflow/lite/model_test.cc | 72 ---- tensorflow/lite/schema/schema.fbs | 80 ----- tensorflow/lite/schema/schema_generated.h | 334 +----------------- tensorflow/lite/testdata/sparse_tensor.bin | Bin 412 -> 0 bytes tensorflow/lite/testdata/sparse_tensor.json | 63 ---- .../benchmark/experimental/c/c_api_types.h | 31 -- 14 files changed, 15 insertions(+), 769 deletions(-) delete mode 100644 tensorflow/lite/testdata/sparse_tensor.bin delete mode 100644 tensorflow/lite/testdata/sparse_tensor.json diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 84150546353..530b27aa7d3 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -318,7 +318,6 @@ cc_test( "testdata/2_subgraphs.bin", "testdata/empty_model.bin", "testdata/multi_add_flex.bin", - "testdata/sparse_tensor.bin", "testdata/test_min_runtime.bin", "testdata/test_model.bin", "testdata/test_model_broken.bin", diff --git a/tensorflow/lite/c/common.c b/tensorflow/lite/c/common.c index 0b17c049e93..524bf8091fe 100644 --- a/tensorflow/lite/c/common.c +++ b/tensorflow/lite/c/common.c @@ -103,46 +103,12 @@ void TfLiteQuantizationFree(TfLiteQuantization* quantization) { quantization->type = kTfLiteNoQuantization; } -void TfLiteSparsityFree(TfLiteSparsity* sparsity) { - if (sparsity == NULL) { - return; - } - - if (sparsity->traversal_order) { - TfLiteIntArrayFree(sparsity->traversal_order); - sparsity->traversal_order = NULL; - } - - if (sparsity->block_map) { - TfLiteIntArrayFree(sparsity->block_map); - sparsity->block_map = NULL; - } - - if (sparsity->dim_metadata) { - for (int i = 0; i < sparsity->dim_metadata_size; i++) { - TfLiteDimensionMetadata metadata = sparsity->dim_metadata[i]; - if (metadata.format == kTfLiteDimSparseCSR) { - TfLiteIntArrayFree(metadata.array_segments); - metadata.array_segments = NULL; - TfLiteIntArrayFree(metadata.array_indices); - metadata.array_indices = NULL; - } - } - free(sparsity->dim_metadata); - sparsity->dim_metadata = NULL; - } - - free(sparsity); -} - void TfLiteTensorFree(TfLiteTensor* t) { TfLiteTensorDataFree(t); if (t->dims) TfLiteIntArrayFree(t->dims); t->dims = NULL; TfLiteQuantizationFree(&t->quantization); - TfLiteSparsityFree(t->sparsity); - t->sparsity = NULL; } void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims, diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index 332b9b68881..b3b0ddc059d 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -303,29 +303,6 @@ enum { kTfLiteNullBufferHandle = -1, }; -// Storage format of each dimension in a sparse tensor. -typedef enum { - kTfLiteDimDense = 0, - kTfLiteDimSparseCSR, -} TfLiteDimensionType; - -// Metadata to encode each dimension in a sparse tensor. -typedef struct { - TfLiteDimensionType format; - int dense_size; - TfLiteIntArray* array_segments; - TfLiteIntArray* array_indices; -} TfLiteDimensionMetadata; - -// Parameters used to encode a sparse tensor. For detailed explanation of each -// field please refer to lite/schema/schema.fbs. 
-typedef struct { - TfLiteIntArray* traversal_order; - TfLiteIntArray* block_map; - TfLiteDimensionMetadata* dim_metadata; - int dim_metadata_size; -} TfLiteSparsity; - // An tensor in the interpreter system which is a wrapper around a buffer of // data including a dimensionality (or NULL if not currently defined). typedef struct { @@ -380,11 +357,6 @@ typedef struct { // Quantization information. Replaces params field above. TfLiteQuantization quantization; - - // Parameters used to encode a sparse tensor. - // This is optional. The field is NULL if a tensor is dense. - // WARNING: This is an experimental interface that is subject to change. - TfLiteSparsity* sparsity; } TfLiteTensor; // Free data memory of tensor `t`. @@ -393,9 +365,6 @@ void TfLiteTensorDataFree(TfLiteTensor* t); // Free quantization data. void TfLiteQuantizationFree(TfLiteQuantization* quantization); -// Free sparsity parameters. -void TfLiteSparsityFree(TfLiteSparsity* sparsity); - // Free memory of tensor `t`. void TfLiteTensorFree(TfLiteTensor* t); diff --git a/tensorflow/lite/c/common_test.cc b/tensorflow/lite/c/common_test.cc index 65c6ec63b28..88ac181faf6 100644 --- a/tensorflow/lite/c/common_test.cc +++ b/tensorflow/lite/c/common_test.cc @@ -96,7 +96,6 @@ TEST(Quantization, TestQuantizationFree) { t.allocation_type = kTfLiteArenaRw; t.dims = nullptr; t.quantization.type = kTfLiteAffineQuantization; - t.sparsity = nullptr; auto* params = reinterpret_cast( malloc(sizeof(TfLiteAffineQuantization))); params->scale = TfLiteFloatArrayCreate(3); @@ -105,31 +104,6 @@ TEST(Quantization, TestQuantizationFree) { TfLiteTensorFree(&t); } -TEST(Sparsity, TestSparsityFree) { - TfLiteTensor t; - // Set these values, otherwise TfLiteTensorFree has uninitialized values. - t.allocation_type = kTfLiteArenaRw; - t.dims = nullptr; - - // A dummy CSR sparse matrix. 
- t.sparsity = static_cast(malloc(sizeof(TfLiteSparsity))); - t.sparsity->traversal_order = TfLiteIntArrayCreate(2); - t.sparsity->block_map = nullptr; - - t.sparsity->dim_metadata = static_cast( - malloc(sizeof(TfLiteDimensionMetadata) * 2)); - t.sparsity->dim_metadata_size = 2; - - t.sparsity->dim_metadata[0].format = kTfLiteDimDense; - t.sparsity->dim_metadata[0].dense_size = 4; - - t.sparsity->dim_metadata[1].format = kTfLiteDimSparseCSR; - t.sparsity->dim_metadata[1].array_segments = TfLiteIntArrayCreate(2); - t.sparsity->dim_metadata[1].array_indices = TfLiteIntArrayCreate(3); - - TfLiteTensorFree(&t); -} - } // namespace tflite int main(int argc, char** argv) { diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index 69c39769593..e453ff2ff7e 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -40,15 +40,6 @@ struct TfLiteQuantizationDeleter { using ScopedTfLiteQuantization = std::unique_ptr; -struct TfLiteSparsityDeleter { - void operator()(TfLiteSparsity* s) { - if (s) TfLiteSparsityFree(s); - } -}; - -using ScopedTfLiteSparsity = - std::unique_ptr; - TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node, const TfLiteRegistration& registration, int node_index, const char* message) { @@ -917,10 +908,9 @@ TfLiteStatus Subgraph::GetNodeAndRegistration( TfLiteStatus Subgraph::SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantization quantization, const char* buffer, - size_t bytes, const Allocation* allocation, TfLiteSparsity* sparsity) { + size_t bytes, const Allocation* allocation) { // Ensure quantization cleanup on failure. ScopedTfLiteQuantization scoped_quantization(&quantization); - ScopedTfLiteSparsity scoped_sparsity(sparsity); if (state_ == kStateInvokableAndImmutable) { ReportError( "SetTensorParametersReadOnly is disallowed when graph is immutable."); @@ -929,12 +919,10 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly( TF_LITE_ENSURE(&context_, tensor_index < context_.tensors_size && tensor_index >= 0); - // For most tensors we know exactly how much memory is necessary so we can // ensure the buffer is large enough. However, we need to skip string tensors - // and sparse tensors because their sizes change with the contents. - // TODO(b/145615516): Extend BytesRequired to check sparse tensors. - if (type != kTfLiteString && sparsity == nullptr) { + // because their sizes change with the contents of the individual strings. + if (type != kTfLiteString) { size_t required_bytes; TF_LITE_ENSURE_OK(&context_, BytesRequired(type, dims, rank, &required_bytes)); @@ -951,7 +939,6 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly( if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims); tensor.params = GetLegacyQuantization(quantization); tensor.quantization = *scoped_quantization.release(); - tensor.sparsity = scoped_sparsity.release(); tensor.allocation_type = kTfLiteMmapRo; tensor.allocation = allocation; } else { @@ -963,7 +950,6 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly( // TODO(suharshs): Update TfLiteTensorReset to include the new quantization // if there are other required callers. 
tensor.quantization = *scoped_quantization.release(); - tensor.sparsity = scoped_sparsity.release(); } return kTfLiteOk; } diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index c2572546709..89a9da7db28 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -94,17 +94,16 @@ class Subgraph { inline TfLiteStatus SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const std::vector& dims, TfLiteQuantization quantization, - const char* buffer, size_t bytes, const Allocation* allocation = nullptr, - TfLiteSparsity* sparsity = nullptr) { + const char* buffer, size_t bytes, + const Allocation* allocation = nullptr) { return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(), dims.data(), quantization, buffer, bytes, - allocation, sparsity); + allocation); } TfLiteStatus SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantization quantization, const char* buffer, - size_t bytes, const Allocation* allocation = nullptr, - TfLiteSparsity* sparsity = nullptr); + size_t bytes, const Allocation* allocation = nullptr); // Set description of inputs/outputs/data/fptrs for node `node_index`. // This variant assumes an external buffer has been allocated of size diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc index a4287a57ea0..d060289a0ee 100644 --- a/tensorflow/lite/model.cc +++ b/tensorflow/lite/model.cc @@ -416,77 +416,6 @@ TfLiteStatus InterpreterBuilder::ParseQuantization( return kTfLiteOk; } -// TODO(b/145614687): Add sparse tensor verification check in -// lite/tools/verifier.cc. -TfLiteStatus InterpreterBuilder::ParseSparsity( - const SparsityParameters* src_sparsity, TfLiteSparsity** sparsity_ptr) { - if (!src_sparsity) { - return kTfLiteOk; - } - - auto* sparsity = - reinterpret_cast(malloc(sizeof(TfLiteSparsity))); - memset(sparsity, 0, sizeof(TfLiteSparsity)); - *sparsity_ptr = sparsity; - - if (src_sparsity->traversal_order()) { - const size_t traversal_order_size = src_sparsity->traversal_order()->size(); - sparsity->traversal_order = TfLiteIntArrayCreate(traversal_order_size); - for (int i = 0; i < traversal_order_size; i++) { - sparsity->traversal_order->data[i] = - src_sparsity->traversal_order()->Get(i); - } - } - - if (src_sparsity->block_map()) { - const size_t block_map_size = src_sparsity->block_map()->size(); - sparsity->block_map = TfLiteIntArrayCreate(block_map_size); - for (int i = 0; i < block_map_size; i++) { - sparsity->block_map->data[i] = src_sparsity->block_map()->Get(i); - } - } - - if (src_sparsity->dim_metadata()) { - const size_t dim_metadata_size = src_sparsity->dim_metadata()->size(); - sparsity->dim_metadata_size = dim_metadata_size; - sparsity->dim_metadata = reinterpret_cast( - malloc(dim_metadata_size * sizeof(TfLiteDimensionMetadata))); - memset(sparsity->dim_metadata, 0, - dim_metadata_size * sizeof(TfLiteDimensionMetadata)); - - for (int i = 0; i < dim_metadata_size; i++) { - const auto* src_metadata = src_sparsity->dim_metadata()->Get(i); - auto* tgt_metadata = &sparsity->dim_metadata[i]; - - tgt_metadata->format = - static_cast(src_metadata->format()); - - if (tgt_metadata->format == kTfLiteDimDense) { - tgt_metadata->dense_size = src_metadata->dense_size(); - } else if (tgt_metadata->format == kTfLiteDimSparseCSR) { - const int array_segments_size = src_metadata->array_segments()->size(); - tgt_metadata->array_segments = - TfLiteIntArrayCreate(array_segments_size); 
- for (int j = 0; j < array_segments_size; j++) { - tgt_metadata->array_segments->data[j] = - src_metadata->array_segments()->Get(j); - } - const int array_indices_size = src_metadata->array_indices()->size(); - tgt_metadata->array_indices = TfLiteIntArrayCreate(array_indices_size); - for (int j = 0; j < array_indices_size; j++) { - tgt_metadata->array_indices->data[j] = - src_metadata->array_indices()->Get(j); - } - } else { - error_reporter_->Report("Unsupported dimension type."); - return kTfLiteError; - } - } - } - - return kTfLiteOk; -} - TfLiteStatus InterpreterBuilder::ParseTensors( const flatbuffers::Vector>* buffers, const flatbuffers::Vector>* tensors, @@ -545,13 +474,6 @@ TfLiteStatus InterpreterBuilder::ParseTensors( continue; } - const auto* src_sparsity = tensor->sparsity(); - TfLiteSparsity* sparsity = nullptr; - if (ParseSparsity(src_sparsity, &sparsity) != kTfLiteOk) { - status = kTfLiteError; - continue; - } - bool is_variable = tensor->is_variable(); if (buffer_ptr) { if (is_variable) { @@ -564,13 +486,12 @@ TfLiteStatus InterpreterBuilder::ParseTensors( if (subgraph->SetTensorParametersReadOnly( i, type, get_name(tensor), dims, quantization, buffer_ptr, - buffer_size, allocation_, sparsity) != kTfLiteOk) { + buffer_size, allocation_) != kTfLiteOk) { error_reporter_->Report("Tensor %d is invalidly specified in schema.\n", i); status = kTfLiteError; } } else { - // TODO(b/144999664): Non-constant sparse tensor is not supported now. if (subgraph->SetTensorParametersReadWrite(i, type, get_name(tensor), dims, quantization, is_variable) != kTfLiteOk) { diff --git a/tensorflow/lite/model.h b/tensorflow/lite/model.h index b8b4b4457da..fafb38ffd10 100644 --- a/tensorflow/lite/model.h +++ b/tensorflow/lite/model.h @@ -223,8 +223,6 @@ class InterpreterBuilder { TfLiteStatus ParseQuantization(const QuantizationParameters* src_quantization, TfLiteQuantization* quantization, const std::vector& dims); - TfLiteStatus ParseSparsity(const SparsityParameters* src_sparsity, - TfLiteSparsity** sparsity); const ::tflite::Model* model_; const OpResolver& op_resolver_; diff --git a/tensorflow/lite/model_test.cc b/tensorflow/lite/model_test.cc index 2675715a613..7dc582b8862 100644 --- a/tensorflow/lite/model_test.cc +++ b/tensorflow/lite/model_test.cc @@ -331,78 +331,6 @@ TEST(BasicFlatBufferModel, TestReadRuntimeVersionFromModel) { ASSERT_EQ(model2->GetMinimumRuntime(), "1.10.0"); } -// The test model has the following tensor encoded in the TACO format: -// [[1, 0, 2, 3], -// [0, 4, 0, 0], -// [0, 0, 5, 0], -// [0, 0, 0, 6]]. -// TACO supports multiple encodings like CSR, CSC, etc. We chose to use the one -// similar to the blocked-CSR format with 2x2 row-major dense blocks. -TEST(BasicFlatBufferModel, TestParseModelWithSparseTensor) { - // The model only has 1 sparse constant tensor. 
- auto model = FlatBufferModel::BuildFromFile( - "tensorflow/lite/testdata/sparse_tensor.bin"); - ASSERT_TRUE(model); - - std::unique_ptr interpreter(new Interpreter); - ASSERT_EQ(InterpreterBuilder(*model, TrivialResolver())(&interpreter), - kTfLiteOk); - ASSERT_NE(interpreter, nullptr); - ASSERT_EQ(interpreter->tensors_size(), 1); - TfLiteTensor* t1 = interpreter->tensor(0); - ASSERT_EQ(t1->allocation_type, kTfLiteMmapRo); - - TfLiteIntArray* traversal_order = TfLiteIntArrayCreate(4); - traversal_order->data[0] = 0; - traversal_order->data[1] = 1; - traversal_order->data[2] = 2; - traversal_order->data[3] = 3; - ASSERT_TRUE( - TfLiteIntArrayEqual(t1->sparsity->traversal_order, traversal_order)); - TfLiteIntArrayFree(traversal_order); - - TfLiteIntArray* block_map = TfLiteIntArrayCreate(2); - block_map->data[0] = 0; - block_map->data[1] = 1; - ASSERT_TRUE(TfLiteIntArrayEqual(t1->sparsity->block_map, block_map)); - TfLiteIntArrayFree(block_map); - - ASSERT_EQ(t1->sparsity->dim_metadata_size, 4); - - ASSERT_EQ(t1->sparsity->dim_metadata[0].format, kTfLiteDimDense); - ASSERT_EQ(t1->sparsity->dim_metadata[0].dense_size, 2); - ASSERT_EQ(t1->sparsity->dim_metadata[0].array_segments, nullptr); - ASSERT_EQ(t1->sparsity->dim_metadata[0].array_indices, nullptr); - - ASSERT_EQ(t1->sparsity->dim_metadata[1].format, kTfLiteDimSparseCSR); - ASSERT_EQ(t1->sparsity->dim_metadata[1].dense_size, 0); - TfLiteIntArray* array_segments = TfLiteIntArrayCreate(3); - array_segments->data[0] = 0; - array_segments->data[1] = 2; - array_segments->data[2] = 3; - ASSERT_TRUE(TfLiteIntArrayEqual(t1->sparsity->dim_metadata[1].array_segments, - array_segments)); - TfLiteIntArrayFree(array_segments); - - TfLiteIntArray* array_indices = TfLiteIntArrayCreate(3); - array_indices->data[0] = 0; - array_indices->data[1] = 1; - array_indices->data[2] = 1; - ASSERT_TRUE(TfLiteIntArrayEqual(t1->sparsity->dim_metadata[1].array_indices, - array_indices)); - TfLiteIntArrayFree(array_indices); - - ASSERT_EQ(t1->sparsity->dim_metadata[2].format, kTfLiteDimDense); - ASSERT_EQ(t1->sparsity->dim_metadata[2].dense_size, 2); - ASSERT_EQ(t1->sparsity->dim_metadata[2].array_segments, nullptr); - ASSERT_EQ(t1->sparsity->dim_metadata[2].array_indices, nullptr); - - ASSERT_EQ(t1->sparsity->dim_metadata[3].format, kTfLiteDimDense); - ASSERT_EQ(t1->sparsity->dim_metadata[3].dense_size, 2); - ASSERT_EQ(t1->sparsity->dim_metadata[3].array_segments, nullptr); - ASSERT_EQ(t1->sparsity->dim_metadata[3].array_indices, nullptr); -} - // TODO(aselle): Add tests for serialization of builtin op data types. // These tests will occur with the evaluation tests of individual operators, // not here. diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs index 63fd3bbc4d6..f1fbfc655d6 100644 --- a/tensorflow/lite/schema/schema.fbs +++ b/tensorflow/lite/schema/schema.fbs @@ -80,82 +80,6 @@ table QuantizationParameters { quantized_dimension:int; } -// Sparse tensors. -// To encode a conceptual n-dimensional dense tensor with dims (d0, ..., dn-1), -// potentially with a k-dimensional block (0 <= k <= n) with dims -// (dn, ..., dn+k-1), the format needs to specify: -// 1. In what order to traverse these dimensions. For example, to store a 2-D -// matrix in row major order, the traversal order would be (d0, d1), -// whereas to store it in column major order, the traversal order would be -// (d1, d0). If the 2-D matrix has a 2-D inner block, the traversal order -// could be (d0, d1, d2, d3). -// 2. 
In the order of (d0, ..., dn-1, dn, ..., dn+k-1), whether each dimension -// is DENSE or SPARSE. -// 3. How each block dimension in (dn, ..., dn+k-1) maps to the original -// tensor dimension in (d0, ..., dn-1). -// 4. Index metadata for each dimension. For a dense dimension, this is just -// the size of that dimension. For a sparse dimension, it's the same as -// the compressed index defined in the Compressed Sparse Row (CSR) format. -// (http://scipy-lectures.org/advanced/scipy_sparse/csr_matrix.html) - -// The storage type for a dimension. Currently we support: -// 1. DENSE: each coordinate in this dimension is stored implicitly. -// 2. SPARSE_CSR: only the coordinates with non-zero elements are stored. The -// compression technique is the same what CSR uses. -// More types like a sparse dimension with a different compression technique -// could be added to the list in the future. -enum DimensionType : byte { - DENSE = 0, - SPARSE_CSR = 1, -} - -table DimensionMetadata { - // Whether each dimension is dense or sparse. - format:DimensionType; - // Index metadata used for each dimension. - // - If format is DimensionType.DENSE then we use the dense_size field to - // store the size of that dimension. Each index in that dimension is - // stored implicitly. - // - If format is DimensionType.SPARSE_CSR then we use array_segments and - // array_indices to encode that dimension. array_segments represents how - // to segment the indices array, each segment corresponds to one element - // in the previous dimension. array_indices represents the index of the - // non-zero elements within this dimension (as those in the CSR matrix - // format, where the first array is row pointers and the second array is - // column indices). - dense_size:int; - array_segments:[int]; - array_indices:[int]; -} - -// Parameters to encode a sparse TfLite tensor. -table SparsityParameters { - // The traversal order of the dimensions defined in the `shape` field of the - // conceptual dense tensor. For a n-dimensional tensors with dims (d0, d1, - // ..., dn-1), - // - if not block sparse, the traversal_order is just a permutation of (d0, - // ..., dn-1). For example, a 2-D matrix stored in row-major order would - // have traversal_order = (d0, d1). - // - if block sparse with a k-dimensional block (0 <= k <= n), the - // traversal_order has n + k elements. The first n elements are still a - // permutation of (d0, ..., dn-1). The lask k elements are a permutation - // of (dn, ..., dn+k-1), defining how to traverse a block internally. For - // example, a 2-D matrix with 2-D blocks, both stored in row-major order - // would have traversal_order = (d0, d1, d2, d3). - traversal_order:[int]; - // For an n-dimensional tensor with a k-dimensional block (0 <= k <= n), - // stores how a block dimension in (dn, ..., dn+k-1) maps to the original - // tensor dimension in (d0, ..., dn). - // It's stored in the order of (dn, ..., dn+k-1). - // If not block-sparse, this field is NULL. - block_map:[int]; - // In the order of (d0, ..., dn-1, dn, ..., dn+k-1), the metadata needed for - // each dimension to locate the non-zero values in the original dense tensor. - // The size of the dim_metadata array = the size of the traversal_order array - // = n + k. - dim_metadata:[DimensionMetadata]; -} - table Tensor { // The tensor shape. The meaning of each entry is operator-specific but // builtin ops use: [batch size, height, width, number of channels] (That's @@ -175,10 +99,6 @@ table Tensor { quantization:QuantizationParameters; // Optional. 
is_variable:bool = false; - - // Parameters to encode a sparse tensor. See the example in - // tensorflow/lite/testdata/sparse_tensor.json. - sparsity:SparsityParameters; // Optional. } // A list of builtin operators. Builtin operators are slightly faster than custom diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h index ae523cc7d5a..ea2f1cc0b8b 100755 --- a/tensorflow/lite/schema/schema_generated.h +++ b/tensorflow/lite/schema/schema_generated.h @@ -28,12 +28,6 @@ struct CustomQuantizationT; struct QuantizationParameters; struct QuantizationParametersT; -struct DimensionMetadata; -struct DimensionMetadataT; - -struct SparsityParameters; -struct SparsityParametersT; - struct Tensor; struct TensorT; @@ -483,36 +477,6 @@ struct QuantizationDetailsUnion { bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type); bool VerifyQuantizationDetailsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); -enum DimensionType { - DimensionType_DENSE = 0, - DimensionType_SPARSE_CSR = 1, - DimensionType_MIN = DimensionType_DENSE, - DimensionType_MAX = DimensionType_SPARSE_CSR -}; - -inline const DimensionType (&EnumValuesDimensionType())[2] { - static const DimensionType values[] = { - DimensionType_DENSE, - DimensionType_SPARSE_CSR - }; - return values; -} - -inline const char * const *EnumNamesDimensionType() { - static const char * const names[] = { - "DENSE", - "SPARSE_CSR", - nullptr - }; - return names; -} - -inline const char *EnumNameDimensionType(DimensionType e) { - if (e < DimensionType_DENSE || e > DimensionType_SPARSE_CSR) return ""; - const size_t index = static_cast(e); - return EnumNamesDimensionType()[index]; -} - enum BuiltinOperator { BuiltinOperator_ADD = 0, BuiltinOperator_AVERAGE_POOL_2D = 1, @@ -2903,206 +2867,6 @@ inline flatbuffers::Offset CreateQuantizationParametersD flatbuffers::Offset CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); -struct DimensionMetadataT : public flatbuffers::NativeTable { - typedef DimensionMetadata TableType; - DimensionType format; - int32_t dense_size; - std::vector array_segments; - std::vector array_indices; - DimensionMetadataT() - : format(DimensionType_DENSE), - dense_size(0) { - } -}; - -struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef DimensionMetadataT NativeTableType; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_FORMAT = 4, - VT_DENSE_SIZE = 6, - VT_ARRAY_SEGMENTS = 8, - VT_ARRAY_INDICES = 10 - }; - DimensionType format() const { - return static_cast(GetField(VT_FORMAT, 0)); - } - int32_t dense_size() const { - return GetField(VT_DENSE_SIZE, 0); - } - const flatbuffers::Vector *array_segments() const { - return GetPointer *>(VT_ARRAY_SEGMENTS); - } - const flatbuffers::Vector *array_indices() const { - return GetPointer *>(VT_ARRAY_INDICES); - } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyField(verifier, VT_FORMAT) && - VerifyField(verifier, VT_DENSE_SIZE) && - VerifyOffset(verifier, VT_ARRAY_SEGMENTS) && - verifier.VerifyVector(array_segments()) && - VerifyOffset(verifier, VT_ARRAY_INDICES) && - verifier.VerifyVector(array_indices()) && - verifier.EndTable(); - } - DimensionMetadataT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) 
const; - void UnPackTo(DimensionMetadataT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; - static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); -}; - -struct DimensionMetadataBuilder { - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_format(DimensionType format) { - fbb_.AddElement(DimensionMetadata::VT_FORMAT, static_cast(format), 0); - } - void add_dense_size(int32_t dense_size) { - fbb_.AddElement(DimensionMetadata::VT_DENSE_SIZE, dense_size, 0); - } - void add_array_segments(flatbuffers::Offset> array_segments) { - fbb_.AddOffset(DimensionMetadata::VT_ARRAY_SEGMENTS, array_segments); - } - void add_array_indices(flatbuffers::Offset> array_indices) { - fbb_.AddOffset(DimensionMetadata::VT_ARRAY_INDICES, array_indices); - } - explicit DimensionMetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - DimensionMetadataBuilder &operator=(const DimensionMetadataBuilder &); - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreateDimensionMetadata( - flatbuffers::FlatBufferBuilder &_fbb, - DimensionType format = DimensionType_DENSE, - int32_t dense_size = 0, - flatbuffers::Offset> array_segments = 0, - flatbuffers::Offset> array_indices = 0) { - DimensionMetadataBuilder builder_(_fbb); - builder_.add_array_indices(array_indices); - builder_.add_array_segments(array_segments); - builder_.add_dense_size(dense_size); - builder_.add_format(format); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateDimensionMetadataDirect( - flatbuffers::FlatBufferBuilder &_fbb, - DimensionType format = DimensionType_DENSE, - int32_t dense_size = 0, - const std::vector *array_segments = nullptr, - const std::vector *array_indices = nullptr) { - auto array_segments__ = array_segments ? _fbb.CreateVector(*array_segments) : 0; - auto array_indices__ = array_indices ? 
_fbb.CreateVector(*array_indices) : 0; - return tflite::CreateDimensionMetadata( - _fbb, - format, - dense_size, - array_segments__, - array_indices__); -} - -flatbuffers::Offset CreateDimensionMetadata(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); - -struct SparsityParametersT : public flatbuffers::NativeTable { - typedef SparsityParameters TableType; - std::vector traversal_order; - std::vector block_map; - std::vector> dim_metadata; - SparsityParametersT() { - } -}; - -struct SparsityParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { - typedef SparsityParametersT NativeTableType; - enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { - VT_TRAVERSAL_ORDER = 4, - VT_BLOCK_MAP = 6, - VT_DIM_METADATA = 8 - }; - const flatbuffers::Vector *traversal_order() const { - return GetPointer *>(VT_TRAVERSAL_ORDER); - } - const flatbuffers::Vector *block_map() const { - return GetPointer *>(VT_BLOCK_MAP); - } - const flatbuffers::Vector> *dim_metadata() const { - return GetPointer> *>(VT_DIM_METADATA); - } - bool Verify(flatbuffers::Verifier &verifier) const { - return VerifyTableStart(verifier) && - VerifyOffset(verifier, VT_TRAVERSAL_ORDER) && - verifier.VerifyVector(traversal_order()) && - VerifyOffset(verifier, VT_BLOCK_MAP) && - verifier.VerifyVector(block_map()) && - VerifyOffset(verifier, VT_DIM_METADATA) && - verifier.VerifyVector(dim_metadata()) && - verifier.VerifyVectorOfTables(dim_metadata()) && - verifier.EndTable(); - } - SparsityParametersT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; - void UnPackTo(SparsityParametersT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; - static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); -}; - -struct SparsityParametersBuilder { - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_traversal_order(flatbuffers::Offset> traversal_order) { - fbb_.AddOffset(SparsityParameters::VT_TRAVERSAL_ORDER, traversal_order); - } - void add_block_map(flatbuffers::Offset> block_map) { - fbb_.AddOffset(SparsityParameters::VT_BLOCK_MAP, block_map); - } - void add_dim_metadata(flatbuffers::Offset>> dim_metadata) { - fbb_.AddOffset(SparsityParameters::VT_DIM_METADATA, dim_metadata); - } - explicit SparsityParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb) - : fbb_(_fbb) { - start_ = fbb_.StartTable(); - } - SparsityParametersBuilder &operator=(const SparsityParametersBuilder &); - flatbuffers::Offset Finish() { - const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); - return o; - } -}; - -inline flatbuffers::Offset CreateSparsityParameters( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> traversal_order = 0, - flatbuffers::Offset> block_map = 0, - flatbuffers::Offset>> dim_metadata = 0) { - SparsityParametersBuilder builder_(_fbb); - builder_.add_dim_metadata(dim_metadata); - builder_.add_block_map(block_map); - builder_.add_traversal_order(traversal_order); - return builder_.Finish(); -} - -inline flatbuffers::Offset CreateSparsityParametersDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector *traversal_order = nullptr, - const std::vector *block_map = nullptr, - const std::vector> *dim_metadata = nullptr) { - auto traversal_order__ = traversal_order ? 
_fbb.CreateVector(*traversal_order) : 0; - auto block_map__ = block_map ? _fbb.CreateVector(*block_map) : 0; - auto dim_metadata__ = dim_metadata ? _fbb.CreateVector>(*dim_metadata) : 0; - return tflite::CreateSparsityParameters( - _fbb, - traversal_order__, - block_map__, - dim_metadata__); -} - -flatbuffers::Offset CreateSparsityParameters(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); - struct TensorT : public flatbuffers::NativeTable { typedef Tensor TableType; std::vector shape; @@ -3111,7 +2875,6 @@ struct TensorT : public flatbuffers::NativeTable { std::string name; std::unique_ptr quantization; bool is_variable; - std::unique_ptr sparsity; TensorT() : type(TensorType_FLOAT32), buffer(0), @@ -3127,8 +2890,7 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_BUFFER = 8, VT_NAME = 10, VT_QUANTIZATION = 12, - VT_IS_VARIABLE = 14, - VT_SPARSITY = 16 + VT_IS_VARIABLE = 14 }; const flatbuffers::Vector *shape() const { return GetPointer *>(VT_SHAPE); @@ -3148,9 +2910,6 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { bool is_variable() const { return GetField(VT_IS_VARIABLE, 0) != 0; } - const SparsityParameters *sparsity() const { - return GetPointer(VT_SPARSITY); - } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) && @@ -3162,8 +2921,6 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_QUANTIZATION) && verifier.VerifyTable(quantization()) && VerifyField(verifier, VT_IS_VARIABLE) && - VerifyOffset(verifier, VT_SPARSITY) && - verifier.VerifyTable(sparsity()) && verifier.EndTable(); } TensorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -3192,9 +2949,6 @@ struct TensorBuilder { void add_is_variable(bool is_variable) { fbb_.AddElement(Tensor::VT_IS_VARIABLE, static_cast(is_variable), 0); } - void add_sparsity(flatbuffers::Offset sparsity) { - fbb_.AddOffset(Tensor::VT_SPARSITY, sparsity); - } explicit TensorBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -3214,10 +2968,8 @@ inline flatbuffers::Offset CreateTensor( uint32_t buffer = 0, flatbuffers::Offset name = 0, flatbuffers::Offset quantization = 0, - bool is_variable = false, - flatbuffers::Offset sparsity = 0) { + bool is_variable = false) { TensorBuilder builder_(_fbb); - builder_.add_sparsity(sparsity); builder_.add_quantization(quantization); builder_.add_name(name); builder_.add_buffer(buffer); @@ -3234,8 +2986,7 @@ inline flatbuffers::Offset CreateTensorDirect( uint32_t buffer = 0, const char *name = nullptr, flatbuffers::Offset quantization = 0, - bool is_variable = false, - flatbuffers::Offset sparsity = 0) { + bool is_variable = false) { auto shape__ = shape ? _fbb.CreateVector(*shape) : 0; auto name__ = name ? 
_fbb.CreateString(name) : 0; return tflite::CreateTensor( @@ -3245,8 +2996,7 @@ inline flatbuffers::Offset CreateTensorDirect( buffer, name__, quantization, - is_variable, - sparsity); + is_variable); } flatbuffers::Offset CreateTensor(flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -9976,73 +9726,6 @@ inline flatbuffers::Offset CreateQuantizationParameters( _quantized_dimension); } -inline DimensionMetadataT *DimensionMetadata::UnPack(const flatbuffers::resolver_function_t *_resolver) const { - auto _o = new DimensionMetadataT(); - UnPackTo(_o, _resolver); - return _o; -} - -inline void DimensionMetadata::UnPackTo(DimensionMetadataT *_o, const flatbuffers::resolver_function_t *_resolver) const { - (void)_o; - (void)_resolver; - { auto _e = format(); _o->format = _e; }; - { auto _e = dense_size(); _o->dense_size = _e; }; - { auto _e = array_segments(); if (_e) { _o->array_segments.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->array_segments[_i] = _e->Get(_i); } } }; - { auto _e = array_indices(); if (_e) { _o->array_indices.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->array_indices[_i] = _e->Get(_i); } } }; -} - -inline flatbuffers::Offset DimensionMetadata::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const flatbuffers::rehasher_function_t *_rehasher) { - return CreateDimensionMetadata(_fbb, _o, _rehasher); -} - -inline flatbuffers::Offset CreateDimensionMetadata(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT *_o, const flatbuffers::rehasher_function_t *_rehasher) { - (void)_rehasher; - (void)_o; - struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DimensionMetadataT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; - auto _format = _o->format; - auto _dense_size = _o->dense_size; - auto _array_segments = _o->array_segments.size() ? _fbb.CreateVector(_o->array_segments) : 0; - auto _array_indices = _o->array_indices.size() ? 
_fbb.CreateVector(_o->array_indices) : 0; - return tflite::CreateDimensionMetadata( - _fbb, - _format, - _dense_size, - _array_segments, - _array_indices); -} - -inline SparsityParametersT *SparsityParameters::UnPack(const flatbuffers::resolver_function_t *_resolver) const { - auto _o = new SparsityParametersT(); - UnPackTo(_o, _resolver); - return _o; -} - -inline void SparsityParameters::UnPackTo(SparsityParametersT *_o, const flatbuffers::resolver_function_t *_resolver) const { - (void)_o; - (void)_resolver; - { auto _e = traversal_order(); if (_e) { _o->traversal_order.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->traversal_order[_i] = _e->Get(_i); } } }; - { auto _e = block_map(); if (_e) { _o->block_map.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->block_map[_i] = _e->Get(_i); } } }; - { auto _e = dim_metadata(); if (_e) { _o->dim_metadata.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->dim_metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; -} - -inline flatbuffers::Offset SparsityParameters::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher) { - return CreateSparsityParameters(_fbb, _o, _rehasher); -} - -inline flatbuffers::Offset CreateSparsityParameters(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher) { - (void)_rehasher; - (void)_o; - struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SparsityParametersT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; - auto _traversal_order = _o->traversal_order.size() ? _fbb.CreateVector(_o->traversal_order) : 0; - auto _block_map = _o->block_map.size() ? _fbb.CreateVector(_o->block_map) : 0; - auto _dim_metadata = _o->dim_metadata.size() ? _fbb.CreateVector> (_o->dim_metadata.size(), [](size_t i, _VectorArgs *__va) { return CreateDimensionMetadata(*__va->__fbb, __va->__o->dim_metadata[i].get(), __va->__rehasher); }, &_va ) : 0; - return tflite::CreateSparsityParameters( - _fbb, - _traversal_order, - _block_map, - _dim_metadata); -} - inline TensorT *Tensor::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new TensorT(); UnPackTo(_o, _resolver); @@ -10058,7 +9741,6 @@ inline void Tensor::UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t { auto _e = name(); if (_e) _o->name = _e->str(); }; { auto _e = quantization(); if (_e) _o->quantization = std::unique_ptr(_e->UnPack(_resolver)); }; { auto _e = is_variable(); _o->is_variable = _e; }; - { auto _e = sparsity(); if (_e) _o->sparsity = std::unique_ptr(_e->UnPack(_resolver)); }; } inline flatbuffers::Offset Tensor::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -10075,7 +9757,6 @@ inline flatbuffers::Offset CreateTensor(flatbuffers::FlatBufferBuilder & auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); auto _quantization = _o->quantization ? CreateQuantizationParameters(_fbb, _o->quantization.get(), _rehasher) : 0; auto _is_variable = _o->is_variable; - auto _sparsity = _o->sparsity ? 
CreateSparsityParameters(_fbb, _o->sparsity.get(), _rehasher) : 0; return tflite::CreateTensor( _fbb, _shape, @@ -10083,8 +9764,7 @@ inline flatbuffers::Offset CreateTensor(flatbuffers::FlatBufferBuilder & _buffer, _name, _quantization, - _is_variable, - _sparsity); + _is_variable); } inline Conv2DOptionsT *Conv2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { @@ -12864,7 +12544,7 @@ inline bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const voi auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } - default: return true; + default: return false; } } @@ -13317,7 +12997,7 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } - default: return true; + default: return false; } } diff --git a/tensorflow/lite/testdata/sparse_tensor.bin b/tensorflow/lite/testdata/sparse_tensor.bin deleted file mode 100644 index d1445ac648065da9918a1ba72ab8b53374273b5e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 412 zcmX|-O%4G;6ohMNKM_P^79tK{Z{q}3mi8iHi_BO!f&*DtSU89SIE7Z<{CJbk-x$r^34h&r zRMgI_rG;MB`uGoOCIuU7THMM+bV)V#`Zn;qjE6x+>Ul3`@0s0 Date: Thu, 5 Dec 2019 03:40:35 +0000 Subject: [PATCH 127/383] Fix `invalid syntax` error when `import carla` is present This fix tries to address the issue raised in 34828 where `import carla` followed by `import tensorflow` caused the following: ``` SyntaxError: invalid syntax ``` The issue is that, when `import carla` is invoked, I/O operation for `std::ostringstream s` might fail, which caused the conversion of AttrValue to string as empty. This PR check `s.good()` to make sure the I/O operation is OK, and, fallback to normal conversion if locale-neutral I/O operation fails. This PR fixes 34828. Signed-off-by: Yong Tang --- tensorflow/python/framework/python_op_gen_internal.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/framework/python_op_gen_internal.cc b/tensorflow/python/framework/python_op_gen_internal.cc index a94e6d8e1ab..05102db0189 100644 --- a/tensorflow/python/framework/python_op_gen_internal.cc +++ b/tensorflow/python/framework/python_op_gen_internal.cc @@ -449,7 +449,12 @@ string AttrValueToPython(const string& type, const AttrValue& value, std::ostringstream s; s.imbue(std::locale::classic()); s << std::setprecision(FLT_DIG) << value.f(); - return s.str(); + // If there is no I/O error for `std::ostringstream s` return s.str(), + // otherwise fallback to strings::StrCat(value.f()). + if (s.good()) { + return s.str(); + } + return strings::StrCat(value.f()); } } else if (type == "bool") { return value.b() ? "True" : "False"; From 3bf3392e223e01a6ed3dc2b3aa9958083b2860e2 Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Wed, 4 Dec 2019 20:01:41 -0800 Subject: [PATCH 128/383] Improvement documentation of tf.math.exp PiperOrigin-RevId: 283891642 Change-Id: I91a73e9d0f6c4a4d64aac9327b23dbf1171a2158 --- .../core/api_def/python_api/api_def_Exp.pbtxt | 7 ++- tensorflow/python/ops/math_ops.py | 49 ------------------- 2 files changed, 6 insertions(+), 50 deletions(-) diff --git a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt index 4c89cd7afcc..38a9078d9f6 100644 --- a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt @@ -1,4 +1,9 @@ op { graph_op_name: "Exp" - visibility: HIDDEN + endpoint { + name: "math.exp" + } + endpoint { + name: "exp" + } } diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 36c6bd86370..527fc850c5f 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4278,52 +4278,3 @@ def ceil(x, name=None): @end_compatibility """ return gen_math_ops.ceil(x, name) - - -# pylint: disable=g-docstring-has-escape -@tf_export("math.exp", "exp") -@dispatch.add_dispatch_support -def exp(x, name=None): - """Computes exponential of x element-wise. \\(y = e^x\\). - - This function computes the exponential of the input tensor element-wise. - i.e. `math.exp(x)` or \\(e^x\\), where `x` is the input tensor. - \\(e\\) denotes Euler's number and is approximately equal to 2.718281. - Output is positive for any real input. - - >>> x = tf.constant(2.0) - >>> tf.math.exp(x) - - - >>> x = tf.constant([2.0, 8.0]) - >>> tf.math.exp(x) - - - For complex numbers, the exponential value is calculated as - \\(e^{x+iy}={e^x}{e^{iy}}={e^x}{\cos(y)+i\sin(y)}\\) - - For `1+1j` the value would be computed as: - \\(e^1{\cos(1)+i\sin(1)} = 2.7182817 \times (0.5403023+0.84147096j)\\) - - >>> x = tf.constant(1 + 1j) - >>> tf.math.exp(x) - - - Args: - x: A `tf.Tensor`. Must be one of the following types: `bfloat16`, `half`, - `float32`, `float64`, `complex64`, `complex128`. - name: A name for the operation (optional). - - Returns: - A `tf.Tensor`. Has the same type as `x`. - - @compatibility(numpy) - Equivalent to np.exp - @end_compatibility - """ - return gen_math_ops.exp(x, name) - - -# pylint: enable=g-docstring-has-escape From dbad943a6390c181a6cbaca727a1ee4ca3e22126 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 21:02:12 -0800 Subject: [PATCH 129/383] Minor formatting fixes to tf.transpose. PiperOrigin-RevId: 283897740 Change-Id: I1195ed04055844074ea860fb3173ae1df8a8319c --- tensorflow/python/ops/array_ops.py | 86 +++++++++++++++++------------- 1 file changed, 48 insertions(+), 38 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index dd058e82223..20249958486 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -1954,16 +1954,17 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): @tf_export("transpose", v1=[]) def transpose_v2(a, perm=None, conjugate=False, name="transpose"): - """Transposes `a`. + """Transposes `a`, where `a` is a Tensor. - Permutes the dimensions according to `perm`. + Permutes the dimensions according to the value of `perm`. - The returned tensor's dimension i will correspond to the input dimension - `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is - the rank of the input tensor. 
Hence by default, this operation performs a - regular matrix transpose on 2-D input Tensors. If conjugate is True and - `a.dtype` is either `complex64` or `complex128` then the values of `a` - are conjugated and transposed. + The returned tensor's dimension `i` will correspond to the input dimension + `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is the rank + of the input tensor. Hence by default, this operation performs a regular + matrix transpose on 2-D input Tensors. + + If conjugate is `True` and `a.dtype` is either `complex64` or `complex128` + then the values of `a` are conjugated and transposed. @compatibility(numpy) In `numpy` transposes are memory-efficient constant time operations as they @@ -1975,43 +1976,52 @@ def transpose_v2(a, perm=None, conjugate=False, name="transpose"): For example: - ```python - x = tf.constant([[1, 2, 3], [4, 5, 6]]) - tf.transpose(x) # [[1, 4] - # [2, 5] - # [3, 6]] + >>> x = tf.constant([[1, 2, 3], [4, 5, 6]]) + >>> tf.transpose(x) + - # Equivalently - tf.transpose(x, perm=[1, 0]) # [[1, 4] - # [2, 5] - # [3, 6]] + Equivalently, you could call `tf.transpose(x, perm=[1, 0])`. - # If x is complex, setting conjugate=True gives the conjugate transpose - x = tf.constant([[1 + 1j, 2 + 2j, 3 + 3j], - [4 + 4j, 5 + 5j, 6 + 6j]]) - tf.transpose(x, conjugate=True) # [[1 - 1j, 4 - 4j], - # [2 - 2j, 5 - 5j], - # [3 - 3j, 6 - 6j]] + If `x` is complex, setting conjugate=True gives the conjugate transpose: - # 'perm' is more useful for n-dimensional tensors, for n > 2 - x = tf.constant([[[ 1, 2, 3], - [ 4, 5, 6]], - [[ 7, 8, 9], - [10, 11, 12]]]) + >>> x = tf.constant([[1 + 1j, 2 + 2j, 3 + 3j], + ... [4 + 4j, 5 + 5j, 6 + 6j]]) + >>> tf.transpose(x, conjugate=True) + - # Take the transpose of the matrices in dimension-0 - # (this common operation has a shorthand `linalg.matrix_transpose`) - tf.transpose(x, perm=[0, 2, 1]) # [[[1, 4], - # [2, 5], - # [3, 6]], - # [[7, 10], - # [8, 11], - # [9, 12]]] - ``` + 'perm' is more useful for n-dimensional tensors where n > 2: + + >>> x = tf.constant([[[ 1, 2, 3], + ... [ 4, 5, 6]], + ... [[ 7, 8, 9], + ... [10, 11, 12]]]) + + As above, simply calling `tf.transpose` will default to `perm=[2,1,0]`. + + To take the transpose of the matrices in dimension-0 (such as when you are + transposing matrices where 0 is the batch dimesnion), you would set + `perm=[0,2,1]`. + + >>> tf.transpose(x, perm=[0, 2, 1]) + + + Note: This has a shorthand `linalg.matrix_transpose`): Args: a: A `Tensor`. - perm: A permutation of the dimensions of `a`. + perm: A permutation of the dimensions of `a`. This should be a vector. conjugate: Optional bool. Setting it to `True` is mathematically equivalent to tf.math.conj(tf.transpose(input)). name: A name for the operation (optional). From fa2e8d3080b5397070def0aa929d763e5dfe68a8 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Wed, 4 Dec 2019 21:08:12 -0800 Subject: [PATCH 130/383] Add simple 0/1 arithmetic optimizations Roll-forward without divide optimization PiperOrigin-RevId: 283898690 Change-Id: If0ab826d4971a9569922f127e4b416fc0fd98b0d --- tensorflow/python/ops/math_ops.py | 28 +++++++++++++++---- tensorflow/python/ops/math_ops_test.py | 37 ++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 527fc850c5f..d890d4266ba 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -341,12 +341,18 @@ def divide(x, y, name=None): # override names. 
Use a dummy class to track the runtime division behavior return DivideDelegateWithName(x, name) / y else: + # We could short-circuit when y is 1, but we'd still have to cast to float, + # hence it doesn't seem to be worth optimizing. return x / y @tf_export("math.multiply", "multiply") @dispatch.add_dispatch_support -def multiply(x, y, name=None): +def multiply(x, y, name=None): # pylint: disable=missing-docstring + # Do an is comparison here since this is cheaper than isinstance or __eq__ + if y is 1: # pylint: disable=literal-comparison + return x + return gen_math_ops.mul(x, y, name) @@ -358,16 +364,28 @@ multiply.__doc__ = gen_math_ops.mul.__doc__.replace("Multiply", "tf.multiply") "2016-12-30", "`tf.mul(x, y)` is deprecated, please use `tf.multiply(x, y)` or `x * y`") def _mul(x, y, name=None): - return gen_math_ops.mul(x, y, name) + return multiply(x, y, name=name) _mul.__doc__ = ( gen_math_ops.mul.__doc__ + ("" if _mul.__doc__ is None else _mul.__doc__)) +def add_v2(x, y, name=None): + # Do an is comparison here since this is cheaper than isinstance or __eq__ + if y is 0: # pylint: disable=literal-comparison + return x + + return gen_math_ops.add_v2(x, y, name=name) + + @tf_export("math.subtract", "subtract") @dispatch.add_dispatch_support def subtract(x, y, name=None): + # Do an is comparison here since this is cheaper than isinstance or __eq__ + if y is 0: # pylint: disable=literal-comparison + return x + return gen_math_ops.sub(x, y, name) @@ -379,7 +397,7 @@ subtract.__doc__ = gen_math_ops.sub.__doc__.replace("`Sub`", "`tf.subtract`") "2016-12-30", "`tf.sub(x, y)` is deprecated, please use `tf.subtract(x, y)` or `x - y`") def _sub(x, y, name=None): - return gen_math_ops.sub(x, y, name) + return subtract(x, y, name) _sub.__doc__ = ( @@ -1207,7 +1225,7 @@ def _add_dispatch(x, y, name=None): if x.dtype == dtypes.string: return gen_math_ops.add(x, y, name=name) else: - return gen_math_ops.add_v2(x, y, name=name) + return add_v2(x, y, name=name) def _mul_dispatch(x, y, name=None): @@ -1233,7 +1251,7 @@ _OverrideBinaryOperatorHelper(gen_sparse_ops.sparse_dense_cwise_mul, "mul", sparse_tensor.SparseTensor) _OverrideBinaryOperatorHelper(_add_dispatch, "add") -_OverrideBinaryOperatorHelper(gen_math_ops.sub, "sub") +_OverrideBinaryOperatorHelper(subtract, "sub") _OverrideBinaryOperatorHelper(_mul_dispatch, "mul") _OverrideBinaryOperatorHelper(_div_python2, "div") _OverrideBinaryOperatorHelper(_truediv_python3, "truediv") diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index f49ba3dd2a3..87ab39b97fd 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -689,5 +689,42 @@ class RangeTest(test_util.TensorFlowTestCase): self.assertAllEqual(values, self.evaluate(tensor)) +@test_util.run_all_in_graph_and_eager_modes +class ScalarOptimizationTest(test_util.TensorFlowTestCase): + + def testAddZero(self): + x = constant_op.constant(1) + y = math_ops.add_v2(x, 0) + self.assertAllEqual(x, y) + self.assertIs(x, y) + + # Optimization not applied + y = math_ops.add_v2(x, constant_op.constant(0)) + self.assertAllEqual(x, y) + self.assertIsNot(x, y) + + def testSubtractZero(self): + x = constant_op.constant(1) + y = math_ops.subtract(x, 0) + self.assertAllEqual(x, y) + self.assertIs(x, y) + + # Optimization not applied + y = math_ops.subtract(x, constant_op.constant(0)) + self.assertAllEqual(x, y) + self.assertIsNot(x, y) + + def testMultiplyOne(self): + x = constant_op.constant(1) + y = math_ops.multiply(x, 1) + 
self.assertAllEqual(x, y) + self.assertIs(x, y) + + # Optimization not applied + y = math_ops.multiply(x, constant_op.constant(1)) + self.assertAllEqual(x, y) + self.assertIsNot(x, y) + + if __name__ == "__main__": googletest.main() From 0071950c073eb78e8935ad680f557e11f52b76a0 Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Wed, 4 Dec 2019 21:13:39 -0800 Subject: [PATCH 131/383] Fix the eigen archive download path PiperOrigin-RevId: 283899224 Change-Id: Ibd2ca25f9339de143e17569d9296c7f23ae4135c --- tensorflow/lite/tools/make/download_dependencies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/make/download_dependencies.sh b/tensorflow/lite/tools/make/download_dependencies.sh index 4b4df1e9f9d..cea13f8d9dd 100755 --- a/tensorflow/lite/tools/make/download_dependencies.sh +++ b/tensorflow/lite/tools/make/download_dependencies.sh @@ -29,7 +29,7 @@ if [ ! -f $BZL_FILE_PATH ]; then exit 1; fi -EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.tensorflow | head -n1)" +EIGEN_URL="$(grep -o 'http.*github.com/eigenteam/eigen-git-mirror/archive/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.tensorflow | head -n1)" GEMMLOWP_URL="$(grep -o 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" From 5a74d6f0d7929fa73c41362df18c01ac57ec1788 Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Wed, 4 Dec 2019 21:17:41 -0800 Subject: [PATCH 132/383] [TF:MLIR] Extend the graph to MLIR importer to convert function calls to LegacyCallOp. Add LegacyCallOp to the Tensorflow dialect. The MLIR importer converts graph function calls without any attribute other than the _disable_call_shape_inference attribute to LegacyCallOp while the MLIR exporter converts LegacyCallOp back to graph function calls. Fix affected tests. Add test cases. 
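For reference, a minimal sketch (not part of this change; the node and function
names mirror the new graph-function-call.pbtxt test) of the kind of GraphDef
call node the importer now maps to a tf.LegacyCall. NodeDef is a protocol
buffer, so only generated proto setters are used:

    #include "tensorflow/core/framework/node_def.pb.h"

    // Illustrative only: builds a call node that is eligible for LegacyCall
    // conversion (its op name is a library function and
    // _disable_call_shape_inference is its sole attribute).
    tensorflow::NodeDef MakeEligibleCallNode() {
      tensorflow::NodeDef call_node;
      call_node.set_name("func_call");     // graph node name
      call_node.set_op("test_func_name");  // op name == called function's name
      call_node.add_input("x");            // data input
      (*call_node.mutable_attr())["_disable_call_shape_inference"].set_b(true);
      return call_node;
    }
    // Import produces:
    //   "tf.LegacyCall"(%x) {_disable_call_shape_inference = true, f = @test_func_name0}
    // and the exporter maps the tf.LegacyCall back to an equivalent call node.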
PiperOrigin-RevId: 283899541 Change-Id: I8c5685b1217ac3d79c76ce589be6b05a4ec52f97 --- tensorflow/compiler/mlir/tensorflow/BUILD | 2 + .../compiler/mlir/tensorflow/ir/tf_ops.td | 36 ++++++++++ .../graph-custom-operation.pbtxt | 2 +- .../graphdef2mlir/graph-function-call.pbtxt | 65 +++++++++++++++++++ .../graph-function-name-bug.pbtxt | 4 +- .../tests/graphdef2mlir/graph-library.pbtxt | 6 +- .../tests/mlir2graphdef/tf-legacy-call.mlir | 26 ++++++++ .../tensorflow/translate/export_graphdef.cc | 25 +++++-- .../mlir/tensorflow/translate/import_model.cc | 54 ++++++++++++--- .../mlir/tensorflow/utils/export_utils.cc | 39 +++++++---- .../mlir/tensorflow/utils/export_utils.h | 11 ++++ 11 files changed, 236 insertions(+), 34 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-legacy-call.mlir diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 5484988d0f5..24031c3c4cd 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -368,12 +368,14 @@ cc_library( ":convert_tensor", ":convert_type", ":mangling_util", + ":tensorflow", "//tensorflow/compiler/xla:status_macros", "//tensorflow/core:core_cpu", "//tensorflow/core:framework", "//tensorflow/core:graph", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/platform:protobuf", "//tensorflow/stream_executor/lib", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/memory", diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 8d975e909bb..9b6196cda5b 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -196,6 +196,42 @@ retained with length 1. TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; } +def TF_LegacyCallOp : TF_Op<"LegacyCall", + [CallOpInterface, NoSideEffect]> { + let summary = + "returns `f(inputs)`, where `f` is a function."; + + let description = [{ + The LegacyCall operation represents a direct call to a function that is + within the same symbol scope as the call and is mapped to a GraphDef node + with the function name as the op name. Unlike a PartitionedCall which + represents asynchronously executing a function across multiple devices, a + LegacyCall represents a function call with the only attribute + _diable_call_shape_inference. + }]; + + let arguments = (ins + Variadic:$args, + + FlatSymbolRefAttr:$f, + DefaultValuedAttr:$_disable_call_shape_inference + ); + + let results = (outs + Variadic:$output + ); + + let extraClassDeclaration = [{ + // Gets the argument operands to the called function. + operand_range getArgOperands() { return args(); } + + // Returns the callee of this operation. 
+ CallInterfaceCallable getCallableForCallee() { + return getAttrOfType("f"); + } + }]; +} + def TF_PartitionedCallOp : TF_Op<"PartitionedCall", [CallOpInterface, NoSideEffect]> { let summary = diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt index 9ce15315832..207d6676f61 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-custom-operation.pbtxt @@ -54,5 +54,5 @@ versions { # the names are matching between the function definition and the uses / call # site (a numerical suffix may be appended). -# CHECK: "tf.foo0"( +# CHECK: "tf.LegacyCall"(%outputs) {_disable_call_shape_inference = false, f = @foo0} # CHECK: func @foo0 diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt new file mode 100644 index 00000000000..f0a7a574ae3 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-call.pbtxt @@ -0,0 +1,65 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=x -tf-input-data-types=DT_INT32 -tf-input-shapes=10 -tf-output-arrays=func_call -o - | FileCheck %s + +node { + name: "x" + op: "Const" + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + dim { + size: 1 + } + } + int_val: 1 + } + } + } +} +node { + name: "func_call" + op: "test_func_name" + input: "x" + attr { + key: "_disable_call_shape_inference" + value { + b: true + } + } +} +library { + function { + signature { + name: "test_func_name" + input_arg { + name: "a_0" + type: DT_INT32 + } + output_arg { + name: "a" + type: DT_INT32 + } + } + ret { + key: "a" + value: "a_0" + } + attr { + key: "_disable_call_shape_inference" + value { + b: true + } + } + } +} + +# CHECK: func @main +# CHECK: "tf.LegacyCall"(%arg0) {_disable_call_shape_inference = true, f = @test_func_name0} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt index dcdbe67ccb6..563007f4305 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-function-name-bug.pbtxt @@ -121,8 +121,8 @@ versions { # Verify that functions from the library are properly imported. # CHECK-LABEL: func @main() { -# CHECK: "tf.foo110"() -# CHECK: "tf.foo111"() +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @foo110} +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @foo111} # CHECK-LABEL: func @foo110() { # CHECK-LABEL: func @foo111() { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt index 17b2655aa5d..b65984227f6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-library.pbtxt @@ -39,10 +39,10 @@ versions { # Verify that functions from the library are properly imported. 
# CHECK-LABEL: func @main() { -# CHECK: "tf.foo0"() -# CHECK: "tf.bar0"() +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @foo0} +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @bar0} # CHECK-LABEL: func @foo0() { -# CHECK: "tf.bar0"() +# CHECK: "tf.LegacyCall"() {_disable_call_shape_inference = false, f = @bar0} # CHECK-LABEL: func @bar0() { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-legacy-call.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-legacy-call.mlir new file mode 100644 index 00000000000..6c83b45295e --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-legacy-call.mlir @@ -0,0 +1,26 @@ +// RUN: tf-mlir-translate -mlir-to-graphdef %s -o - | FileCheck %s + +func @main() { + tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.Const"() {device = "", dtype = "tfdtype$DT_INT32", name = "Constant", value = dense<0> : tensor} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.LegacyCall"(%outputs) {f = @foo0} : (tensor) -> tensor + tf_executor.fetch + } + return +} +func @foo0(%arg0: tensor<*xi32>) -> tensor<*xi32> { + %0 = tf_executor.graph { + tf_executor.fetch %arg0 : tensor<*xi32> + } + return %0 : tensor<*xi32> +} + +// CHECK: node { +// CHECK: name: "_tf.LegacyCall" +// CHECK-NEXT: op: "foo0" + +// CHECK: library { +// CHECK-NEXT: function { +// CHECK-NEXT: signature { +// CHECK-NEXT: name: "foo0" + diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index bac3ea22973..58242e62f1c 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -535,6 +535,18 @@ StatusOr> Exporter::Convert( arg, index, graph_as_function && !input_names.empty() ? input_names[index] : "")); } + + auto convert_called_function = [&](llvm::StringRef name) { + auto func = + function.getParentOfType().lookupSymbol( + name); + if (func != nullptr) { + TF_RETURN_IF_ERROR(ConvertLibFunction(configs, tf_dialect, func, flib)); + TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(*flib)); + } + return Status::OK(); + }; + // Adds nodes for operations. for (Operation& inst : block) { auto op_name = GetTensorFlowOpName(inst.getName().getStringRef()); @@ -544,13 +556,12 @@ StatusOr> Exporter::Convert( // definition library // TODO(prakalps): If two functions have cyclic dependence, this will // introduce an infinite loop. 
- auto func = - function.getParentOfType().lookupSymbol( - op_name.ValueOrDie()); - if (func != nullptr) { - TF_RETURN_IF_ERROR(ConvertLibFunction(configs, tf_dialect, func, flib)); - TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(*flib)); - } + TF_RETURN_IF_ERROR(convert_called_function(op_name.ValueOrDie().str())); + } + + if (IsLegacyCallInstruction(&inst)) { + TF_RETURN_IF_ERROR(convert_called_function( + inst.getAttrOfType("f").getLeafReference())); } for (auto type : inst.getResultTypes()) { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index da2e6a67445..965f8c811d0 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -97,6 +97,9 @@ using stream_executor::port::StatusOr; namespace { +const char* disable_call_shape_inference_attribute_name = + "_disable_call_shape_inference"; + // This class is used to generate new MLIR function name strings that are both // unique in the TF function library `flib_` and unique among the name strings // generated by the class object during its lifetime. @@ -246,11 +249,14 @@ class ImporterBase { llvm::SmallVector* attributes); // Helper to create either a tf_executor operation or a TF operation wrapped - // in an island. + // in an island. When convert_to_legacy_call is true, converts the operation + // representing a call to a library function with a name represented in + // node_type_name to LegacyCallOp. mlir::Operation* createOperation( - const Node& node, llvm::StringRef op_name, + const Node& node, llvm::StringRef node_type_name, const mlir::OperationState& result, - const llvm::SmallVectorImpl& control_operands); + const llvm::SmallVectorImpl& control_operands, + bool convert_to_legacy_call = false); // Converts one NodeDef from the input GraphDef into an Operation and // inserts it into the MLIR module using builder_. @@ -1210,9 +1216,10 @@ std::string ImporterBase::GetLocationStr(const Node& node, } mlir::Operation* ImporterBase::createOperation( - const Node& node, llvm::StringRef op_name, + const Node& node, llvm::StringRef node_type_name, const mlir::OperationState& result, - const llvm::SmallVectorImpl& control_operands) { + const llvm::SmallVectorImpl& control_operands, + bool convert_to_legacy_call) { // For the tf.executor specific operations (not wrapped in an island), we // have an extra returned value for the control result, and we concatenate // control and non-control operands. @@ -1274,7 +1281,27 @@ mlir::Operation* ImporterBase::createOperation( mlir::OpBuilder island_builder(&island.GetBody()); // Create the operation inside the island now. 
- mlir::Operation* inner_op = island_builder.createOperation(result); + mlir::Operation* inner_op; + if (convert_to_legacy_call) { + bool disable_call_shape_inference = false; + for (const auto& name_and_value : node.attrs()) { + const auto& attr_name = name_and_value.first; + const AttrValue& attr_value = name_and_value.second; + if (strcmp(attr_name.c_str(), + disable_call_shape_inference_attribute_name) == 0 && + attr_value.value_case() == AttrValue::kB) { + disable_call_shape_inference = attr_value.b(); + } + } + + mlir::BoolAttr attribute = + builder_.getBoolAttr(disable_call_shape_inference); + inner_op = island_builder.create( + result.location, result.types, result.operands, + island_builder.getSymbolRefAttr(node_type_name), attribute); + } else { + inner_op = island_builder.createOperation(result); + } // Add the terminator for the island mlir::SmallVector ret_vals(inner_op->getResults()); @@ -1293,9 +1320,11 @@ Status ImporterBase::ConvertNode(const Node& node) { // create the MLIR function and insert it to the module if it doesn't exist. std::string node_type_name = node.type_string(); const auto* func_def = graph_flib_.Find(node_type_name); + bool convert_to_legacy_call = false; if (func_def) { TF_RETURN_IF_ERROR(ConvertLibFunction(node_type_name)); node_type_name = (*tf_name_to_mlir_name_)[node_type_name]; + convert_to_legacy_call = true; } auto get_full_op_name = [&](const std::string& op_name) { @@ -1380,6 +1409,14 @@ Status ImporterBase::ConvertNode(const Node& node) { for (const auto& name_and_value : node.attrs()) { const auto& attr_name = name_and_value.first; const AttrValue& attr_value = name_and_value.second; + // LegacyCall can only represent _diable_call_shape_inference attribute. + // If a call has other attributes, can't convert it to LegacyCall. + if (convert_to_legacy_call && + (strcmp(attr_name.c_str(), + disable_call_shape_inference_attribute_name) || + attr_value.value_case() != AttrValue::kB)) { + convert_to_legacy_call = false; + } if (attr_value.value_case() == AttrValue::kFunc) { // Attribute iteration order is not defined for protocol buffer Map. // Process function attributes separately in the lexicographical order to @@ -1423,9 +1460,8 @@ Status ImporterBase::ConvertNode(const Node& node) { } // Register the mapping between the TF node and the newly created operation. - node_values_[node.id()] = - createOperation(node, op_name, result, control_operands); - + node_values_[node.id()] = createOperation( + node, node_type_name, result, control_operands, convert_to_legacy_call); return Status::OK(); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index 69b309f0632..e35b7130de8 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -34,6 +34,7 @@ limitations under the License. #include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "mlir/Support/DebugStringHelper.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -253,21 +254,30 @@ StatusOr> GetOperationNodeDef( // Note: we do not use NodeBuilder or NodeDefBuilder as that would require // mapping back from the inputs to the input arguments. 
- // Some control flow ops in TensorFlow Graph have their respective "Ref" ops - // as well. For example there is Enter and RefEnter op. RefEnter forwards - // the input ref buffer to output. However both Enter and RefEnter are - // mapped to tf_executor::EnterOp during import and then to _tf.Enter op in - // control dialect. Check if it is a Ref op to correctly map to the TensorFlow - // Graph op. llvm::SmallString<64> op_name; - if (IsRefTypeControlOp(inst)) op_name = "Ref"; - - TF_ASSIGN_OR_RETURN(auto tf_name, - GetTensorFlowOpName(inst->getName().getStringRef())); - op_name.append(tf_name); + if (IsLegacyCallInstruction(inst)) { + // The op_name is the name of the function. + op_name.append( + inst->getAttrOfType("f").getLeafReference()); + // Remove the attribute from the instruction as it is already converted to + // op_name. + auto attr_id = mlir::Identifier::get("f", inst->getContext()); + inst->removeAttr(attr_id); + } else { + // Some control flow ops in TensorFlow Graph have their respective "Ref" ops + // as well. For example there is Enter and RefEnter op. RefEnter forwards + // the input ref buffer to output. However both Enter and RefEnter are + // mapped to tf_executor::EnterOp during import and then to _tf.Enter op in + // control dialect. Check if it is a Ref op to correctly map to the + // TensorFlow Graph op. + if (IsRefTypeControlOp(inst)) op_name = "Ref"; + TF_ASSIGN_OR_RETURN(auto tf_name, + GetTensorFlowOpName(inst->getName().getStringRef())); + op_name.append(tf_name); + } + node_def->set_name(name.str()); node_def->set_op(op_name.str()); - node_def->set_name(name); // Add inputs to the NodeDef based on the number of operands. This is required // as later when edges are added to the Node using Graph::AddEdge the @@ -454,4 +464,9 @@ Status SetSizeAttribute(absl::string_view name, size_t size, return Status::OK(); } +bool IsLegacyCallInstruction(mlir::Operation* inst) { + return llvm::dyn_cast(inst) || + inst->getName().getStringRef().compare("_tf.LegacyCall") == 0; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h index 8d813b53bd8..df176762c07 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -73,5 +73,16 @@ Status SetShapeAttribute(absl::string_view name, mlir::ShapedType shape, // If the attribute already exists with a different value, returns an error. Status SetSizeAttribute(absl::string_view name, size_t size, AttrValueMap* values); + +// Returns true if the given instruction is an mlir::TF::LegacyCallOp or the +// result of such an operation transformed by the +// ExecutorToControlDialectConversion pass. +// +// TODO(b/145706023): When the ExecutorToControlDialectConversion pass runs +// before the exporter, it mutates an mlir::TF::LegacyCallOp instruction to +// an instruction with a different operation name. As such, this routine checks +// both forms of a LegacyCall instruction. We only need to check for +// mlir::TF::LegacyCallOp when the ticket is resolved. +bool IsLegacyCallInstruction(mlir::Operation* inst); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_EXPORTER_UTILS_H_ From e74eba4ced41b9a8178880a9748d382b19b9eb7a Mon Sep 17 00:00:00 2001 From: Renjie Liu Date: Wed, 4 Dec 2019 21:38:28 -0800 Subject: [PATCH 133/383] Refactor quantize per-channel function for raw accumulators. 
PiperOrigin-RevId: 283901695 Change-Id: I1e9b4a4a438778efbbf1a48f76082d0aea0efc89 --- .../optimized/integer_ops/depthwise_conv.h | 95 +------------------ .../optimized/integer_ops/transpose_conv.h | 5 +- .../internal/optimized/optimized_ops.h | 7 +- 3 files changed, 11 insertions(+), 96 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h index ac731ad152b..1ece0146a34 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv.h @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" #include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h" #include "tensorflow/lite/kernels/internal/optimized/integer_ops/depthwise_conv_3x3_filter.h" +#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" #include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h" #include "tensorflow/lite/kernels/internal/types.h" @@ -1768,97 +1769,11 @@ inline void DepthwiseConvGeneral( // the final 8bit form and store them. gemmlowp::ScopedProfilingLabel label("downquantize+store"); const int num_output_values = output_depth * num_output_pixels; - int c = 0; - while (c < output_depth) { - int target_output_depth = output_depth; -#ifdef USE_NEON - using gemmlowp::RoundingDivideByPOT; - const int32x4_t output_offset_vec = vdupq_n_s32(output_offset); - const int32x4_t output_activation_min_vec = - vdupq_n_s32(output_activation_min); - const int32x4_t output_activation_max_vec = - vdupq_n_s32(output_activation_max); - const int32x4_t ones = vdupq_n_s32(1); - const int32x4_t minus_ones = vdupq_n_s32(-1); - const int32x4_t zeros = vdupq_n_s32(0); - - for (; c <= output_depth - 4; c += 4) { - int32x4_t out_shift = vld1q_s32(output_shift + c); - const bool out_shift_all_less_than_zero = - (vgetq_lane_s32(out_shift, 0) < 0) && - (vgetq_lane_s32(out_shift, 1) < 0) && - (vgetq_lane_s32(out_shift, 2) < 0) && - (vgetq_lane_s32(out_shift, 3) < 0); - const bool out_shift_all_greater_equal_than_zero = - (vgetq_lane_s32(out_shift, 0) >= 0) && - (vgetq_lane_s32(out_shift, 1) >= 0) && - (vgetq_lane_s32(out_shift, 2) >= 0) && - (vgetq_lane_s32(out_shift, 3) >= 0); - if (!out_shift_all_less_than_zero && - !out_shift_all_greater_equal_than_zero) { - // Fallback to general path. - // Then go ahead for next 4. - target_output_depth = c + 4; - break; - } - int32x4_t out_mul = vld1q_s32(output_multiplier + c); - for (int n = 0; n < num_output_pixels; ++n) { - int loc = n * output_depth + c; - int32x4_t acc = vld1q_s32(acc_buffer + loc); - if (out_shift_all_less_than_zero) { // output_shift all < 0 case. - acc = vqrdmulhq_s32(acc, out_mul); - // TODO(renjieliu): Optimize this path, also consider inverse - // output_shift since most models have output_shift < 0. 
- int32x4_t negative_out_shift = vmulq_n_s32(out_shift, -1); - int32x4_t mask = - vaddq_s32(vshlq_s32(ones, negative_out_shift), minus_ones); - int32x4_t remainder = vandq_s32(acc, mask); - int32x4_t shifted_right_mask = vshlq_s32(mask, minus_ones); - int32x4_t temp = vandq_s32( - vreinterpretq_s32_u32(vcltq_s32(acc, zeros)), ones); - int32x4_t threshold = vaddq_s32(shifted_right_mask, temp); - temp = vandq_s32( - vreinterpretq_s32_u32(vcgtq_s32(remainder, threshold)), - ones); - int32x4_t shifted_right_acc = vshlq_s32(acc, out_shift); - acc = vaddq_s32(shifted_right_acc, temp); - } else { // output_shift all > 0 case. - int32x4_t multiplier_power_of_two = vshlq_s32(ones, out_shift); - acc = vmulq_s32(acc, multiplier_power_of_two); - acc = vqrdmulhq_s32(acc, out_mul); - } - // Add the output offset. - acc = vaddq_s32(acc, output_offset_vec); - // Apply the activation function. - acc = vmaxq_s32(acc, output_activation_min_vec); - acc = vminq_s32(acc, output_activation_max_vec); - // Saturating cast to int8 and store to destination. - const int16x4_t acc_s16 = vqmovn_s32(acc); - const int16x8_t res_s16 = vcombine_s16(acc_s16, acc_s16); - const int8x8_t res_s8 = vqmovn_s16(res_s16); - vst1_lane_s8(output_ptr + loc + 0, res_s8, 0); - vst1_lane_s8(output_ptr + loc + 1, res_s8, 1); - vst1_lane_s8(output_ptr + loc + 2, res_s8, 2); - vst1_lane_s8(output_ptr + loc + 3, res_s8, 3); - } - } - -#endif // USE_NEON - // Handle leftover values, one by one. This is very slow. - for (; c < target_output_depth; c++) { - for (int n = 0; n < num_output_pixels; ++n) { - int loc = n * output_depth + c; - int32 acc = acc_buffer[loc]; - acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[c], - output_shift[c]); - acc += output_offset; - acc = std::max(acc, output_activation_min); - acc = std::min(acc, output_activation_max); - output_ptr[loc] = static_cast(acc); - } - } - } + optimized_ops::Quantize(output_multiplier, output_shift, output_depth, + num_output_values, output_offset, + output_activation_min, output_activation_max, + acc_buffer, output_ptr); output_ptr += num_output_values; } diff --git a/tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h b/tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h index 4d24ff65250..2001bf648e4 100644 --- a/tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h +++ b/tensorflow/lite/kernels/internal/optimized/integer_ops/transpose_conv.h @@ -94,9 +94,12 @@ inline void TransposeConvV2( scratch_data_p += output_offset; } + const int32_t output_min = std::numeric_limits::min(); + const int32_t output_max = std::numeric_limits::max(); + optimized_ops::Quantize(output_multiplier, output_shift, output_depth, output_shape.FlatSize(), params.output_offset, - scratch_data, output_data); + output_min, output_max, scratch_data, output_data); } } // namespace optimized_integer_ops diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h index b5ee08dd7f2..6236116ad95 100644 --- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h +++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h @@ -5617,15 +5617,12 @@ inline void Quantize(int32_t multiplier, int32_t shift, int32_t total_size, } } -// TODO(b/145632530): Refactor other quantize per-channel to use this one. 
inline void Quantize(const int32_t* multiplier, const int32_t* shift, int32_t channel_size, int32_t total_size, - int32_t output_zp, int32_t* scratch, int8_t* output) { + int32_t output_zp, int32_t output_min, int32_t output_max, + int32_t* scratch, int8_t* output) { gemmlowp::ScopedProfilingLabel label("Quantize/int8"); - const int32_t output_min = std::numeric_limits::min(); - const int32_t output_max = std::numeric_limits::max(); - // Here we're trying to quantize the raw accumulators: // output_channels // data data data data data From 089514d0ee9c403fd5e7fff7dca229f3e02723ef Mon Sep 17 00:00:00 2001 From: Yu-Cheng Ling Date: Wed, 4 Dec 2019 21:43:34 -0800 Subject: [PATCH 134/383] Add an `--experimental_legacy_converter` CLI argument in tflite_convert. PiperOrigin-RevId: 283902395 Change-Id: I2631f89125127cad52147f11cab26203a1936127 --- tensorflow/lite/python/tflite_convert.py | 32 +++++++++++++++---- tensorflow/lite/python/tflite_convert_test.py | 21 ++++++++++++ 2 files changed, 46 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py index 59e43be807a..5a3e9961e5a 100644 --- a/tensorflow/lite/python/tflite_convert.py +++ b/tensorflow/lite/python/tflite_convert.py @@ -205,9 +205,10 @@ def _convert_tf1_model(flags): if flags.conversion_summary_dir: converter.conversion_summary_dir = flags.conversion_summary_dir - # TODO(b/145312675): Enable the new converter by default. It requires to - # add a new command line argument like `experimental_legacy_converter`. - converter.experimental_new_converter = flags.experimental_new_converter + if flags.experimental_new_converter: + converter.experimental_new_converter = True + if flags.experimental_legacy_converter: + converter.experimental_new_converter = False # Convert model. output_data = converter.convert() @@ -231,9 +232,10 @@ def _convert_tf2_model(flags): model = keras.models.load_model(flags.keras_model_file) converter = lite.TFLiteConverterV2.from_keras_model(model) - # TODO(b/145312675): Enable the new converter by default. It requires to - # add a new command line argument like `experimental_legacy_converter`. - converter.experimental_new_converter = flags.experimental_new_converter + if flags.experimental_new_converter: + converter.experimental_new_converter = True + if flags.experimental_legacy_converter: + converter.experimental_new_converter = False # Convert the model. tflite_model = converter.convert() @@ -308,6 +310,10 @@ def _check_tf1_flags(flags, unparsed): "--experimental_new_converter") if flags.custom_opdefs and not flags.allow_custom_ops: raise ValueError("--custom_opdefs must be used with --allow_custom_ops") + if flags.experimental_new_converter and flags.experimental_legacy_converter: + raise ValueError( + "--experimental_new_converter and experimental_legacy_converter " + "cannot be used together") def _check_tf2_flags(flags): @@ -322,6 +328,10 @@ def _check_tf2_flags(flags): if not flags.keras_model_file and not flags.saved_model_dir: raise ValueError("one of the arguments --saved_model_dir " "--keras_model_file is required") + if flags.experimental_new_converter and flags.experimental_legacy_converter: + raise ValueError( + "--experimental_new_converter and experimental_legacy_converter " + "cannot be used together") def _get_tf1_flags(parser): @@ -554,12 +564,20 @@ def _get_parser(use_v2_converter): else: _get_tf1_flags(parser) - # Enable MLIR-TFLite converter. 
+ # Note: When neither of the following command line argument is passed, + # it will use the default behavior defined in `lite.py`. + # Enable MLIR-based TFLite converter. parser.add_argument( "--experimental_new_converter", action="store_true", help=("Experimental flag, subject to change. Enables MLIR-based " "conversion instead of TOCO conversion.")) + # Explicitly disable the MLIR-based TFLite converter. + parser.add_argument( + "--experimental_legacy_converter", + action="store_true", + help=("Experimental flag, subject to change. Disable MLIR-based " + "conversion and use the legacy converter.")) return parser diff --git a/tensorflow/lite/python/tflite_convert_test.py b/tensorflow/lite/python/tflite_convert_test.py index 610f5c5e98b..298b606cfe7 100644 --- a/tensorflow/lite/python/tflite_convert_test.py +++ b/tensorflow/lite/python/tflite_convert_test.py @@ -101,6 +101,27 @@ class TfLiteConvertV1Test(TestModels): self._run(flags_str, should_succeed=True) os.remove(graph_def_file) + # Run `tflite_convert` explicitly with the legacy converter. + # Before the new converter is enabled by default, this flag has no real + # effects. + def testFrozenGraphDefWithLegacyConverter(self): + with ops.Graph().as_default(): + in_tensor = array_ops.placeholder( + shape=[1, 16, 16, 3], dtype=dtypes.float32) + _ = in_tensor + in_tensor + sess = session.Session() + + # Write graph to file. + graph_def_file = self._getFilepath('model.pb') + write_graph(sess.graph_def, '', graph_def_file, False) + sess.close() + + flags_str = ('--graph_def_file={0} --input_arrays={1} ' + '--output_arrays={2} --experimental_legacy_converter'.format( + graph_def_file, 'Placeholder', 'add')) + self._run(flags_str, should_succeed=True) + os.remove(graph_def_file) + def testFrozenGraphDefNonPlaceholder(self): with ops.Graph().as_default(): in_tensor = random_ops.random_normal(shape=[1, 16, 16, 3], name='random') From f7dfb32849c481a6fbd099017587cc9ab18275f2 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Wed, 4 Dec 2019 22:14:25 -0800 Subject: [PATCH 135/383] Add support for non 0 index output tensors as feeds in Graph/GraphDef -> TF MLIR importer. This adds support for non 0 index output tensors as feeds while preserving the existing behavior of replacing nodes for output tensors as feeds for single output nodes. Similar to single output nodes as feeds, multiple output nodes will have their feed output tensors replaced with individual Placeholder nodes, along with their uses. Some special handling has been added to remap those new Placeholder nodes back to feeds and fetches. Command line flags remain the same, but feeds in TensorId format (node:index) is now supported. 
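To make the new feed handling concrete: a feed given in TensorId form names a specific output of a node, and when that feed is not output 0 of a single-output node, the importer introduces a dedicated Placeholder whose name is derived from the node name and output index. The sketch below is a simplified Python illustration of that naming scheme, not code from this patch; it ignores control-input (`^node`) syntax and the helper names are hypothetical, but the `<node>_<index>` base name with a numeric suffix on collision mirrors the `GetUniqueNodeName` helper added to import_model.cc.

```python
def parse_tensor_name(name):
    # "z:1" -> ("z", 1); a bare "z" means output 0, i.e. ("z", 0).
    node, _, index = name.partition(":")
    return node, int(index) if index else 0


def placeholder_name_for_feed(node_name, output_index, existing_names):
    # Feeds on output 0 of a single-output node keep the node's own name
    # (the node itself is replaced); other feed outputs get a Placeholder
    # named "<node>_<index>", with "_<n>" appended until it is unique.
    base = "{}_{}".format(node_name, output_index)
    candidate, count = base, 0
    while candidate in existing_names:
        candidate = "{}_{}".format(base, count)
        count += 1
    return candidate


# Example: feeding z:1 and z:2 of the three-output IdentityN node "z" (as in
# the new multi-output-feeds.pbtxt test) introduces Placeholders "z_1" and
# "z_2", and fetches of z:1/z:2 are remapped to those Placeholders.
```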
PiperOrigin-RevId: 283906640 Change-Id: I8bd9fe3b9b277b3a1ae259b62360b55ad887b4c9 --- .../graphdef2mlir/multi-output-feeds.pbtxt | 300 ++++++++++++++++++ .../mlir/tensorflow/translate/import_model.cc | 276 +++++++++++----- 2 files changed, 501 insertions(+), 75 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multi-output-feeds.pbtxt diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multi-output-feeds.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multi-output-feeds.pbtxt new file mode 100644 index 00000000000..b28e2818730 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/multi-output-feeds.pbtxt @@ -0,0 +1,300 @@ +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-input-arrays=z:1,z:2 -tf-input-shapes=':' -tf-output-arrays=z:2,z:1,a:0 -o - | FileCheck %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-prune-unused-nodes -tf-input-arrays=z:1,z:2 -tf-input-shapes=':' -tf-output-arrays=z:2,z:1,a:0 -o - | FileCheck --check-prefix=PRUNE %s --dump-input=fail +# RUN: tf-mlir-translate -graphdef-to-mlir %s -tf-prune-unused-nodes -tf-input-arrays=z:1,z:2 -tf-input-shapes=':' -tf-output-arrays=z:0,a:0 -o - | FileCheck --check-prefix=PRESERVE %s --dump-input=fail + +# Generated in Python via +# ``` +# import tensorflow as tf +# +# with tf.compat.v1.Graph().as_default() as g: +# w = tf.constant(2.0) +# x = tf.constant(3.0) +# y = tf.constant(4.0) +# var = tf.Variable(2.0) +# var_add = var.assign_add(3.0) +# with g.control_dependencies([var_add]): +# z0, z1, z2 = tf.identity_n((w, x, y)) +# +# a = tf.add(z1, z2) +# ``` + +node { + name: "w" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 2.0 + } + } + } +} +node { + name: "x" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 3.0 + } + } + } +} +node { + name: "y" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 4.0 + } + } + } +} +node { + name: "var/initial_value" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 2.0 + } + } + } +} +node { + name: "var" + op: "VariableV2" + attr { + key: "container" + value { + s: "" + } + } + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "shape" + value { + shape { + } + } + } + attr { + key: "shared_name" + value { + s: "" + } + } +} +node { + name: "var/Assign" + op: "Assign" + input: "var" + input: "var/initial_value" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@var" + } + } + } + attr { + key: "use_locking" + value { + b: true + } + } + attr { + key: "validate_shape" + value { + b: true + } + } +} +node { + name: "var/read" + op: "Identity" + input: "var" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@var" + } + } + } +} +node { + name: "var_add/value" + op: "Const" + attr { + key: "dtype" + value { + type: DT_FLOAT + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_FLOAT + tensor_shape { + } + float_val: 3.0 + } + } + } +} +node { + name: "var_add" + 
op: "AssignAdd" + input: "var" + input: "var_add/value" + attr { + key: "T" + value { + type: DT_FLOAT + } + } + attr { + key: "_class" + value { + list { + s: "loc:@var" + } + } + } + attr { + key: "use_locking" + value { + b: false + } + } +} +node { + name: "z" + op: "IdentityN" + input: "w" + input: "x" + input: "y" + input: "^var_add" + attr { + key: "T" + value { + list { + type: DT_FLOAT + type: DT_FLOAT + type: DT_FLOAT + } + } + } +} +node { + name: "a" + op: "Add" + input: "z:1" + input: "z:2" + attr { + key: "T" + value { + type: DT_FLOAT + } + } +} +versions { + producer: 230 +} + +# Test non zero index output tensors as feeds. Original ops where their outputs +# are replaced with feeds are preserved and args and rets are lifted to the +# function. Rets that happen to coincide with a feed should have its value be +# of the feed. +# +# CHECK: func @main(%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> (tensor, tensor, tensor) +# CHECK: attributes {tf.entry_function = {inputs = "z:1,z:2", outputs = "z:2,z:1,a:0"}} +# CHECK: %{{.*}}, %[[ASSIGN_ADD_CTRL:.*]] = tf_executor.island wraps "tf.AssignAdd" +# CHECK: %{{.*}}, %{{.*}} = tf_executor.island(%[[ASSIGN_ADD_CTRL]]) wraps "tf.IdentityN" +# CHECK: %[[ADD:.*]], %{{.*}} = tf_executor.island wraps "tf.Add"(%[[ARG_0]], %[[ARG_1]]) +# CHECK: tf_executor.fetch %[[ARG_1]], %[[ARG_0]], %[[ADD]] + +# Test when non zero index output tensors are feeds, remaining ops that are +# unreachable are pruned if pruning is enabled. +# +# PRUNE: func @main(%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> (tensor, tensor, tensor) +# PRUNE: attributes {tf.entry_function = {inputs = "z:1,z:2", outputs = "z:2,z:1,a:0"}} +# PRUNE-NOT: "tf.Const" +# PRUNE-NOT: "tf.VariableV2" +# PRUNE-NOT: "tf.Assign" +# PRUNE-NOT: "tf.Identity" +# PRUNE-NOT: "tf.AssignAdd" +# PRUNE-NOT: "tf.IdentityN" +# PRUNE: %[[ADD:.*]], %{{.*}} = tf_executor.island wraps "tf.Add"(%[[ARG_0]], %[[ARG_1]]) +# PRUNE: tf_executor.fetch %[[ARG_1]], %[[ARG_0]], %[[ADD]] + +# Test when non zero index output tensors are feeds, remaining ops that are +# unreachable are preserved if pruning is not enabled. +# +# PRESERVE: func @main(%[[ARG_0:.*]]: tensor, %[[ARG_1:.*]]: tensor) -> (tensor, tensor) +# PRESERVE: attributes {tf.entry_function = {inputs = "z:1,z:2", outputs = "z:0,a:0"}} +# PRESERVE: %{{.*}}, %[[ASSIGN_ADD_CTRL:.*]] = tf_executor.island wraps "tf.AssignAdd" +# PRESERVE: %[[IDENTITY_N:.*]]:3, %{{.*}} = tf_executor.island(%[[ASSIGN_ADD_CTRL]]) wraps "tf.IdentityN" +# PRESERVE: %[[ADD:.*]], %{{.*}} = tf_executor.island wraps "tf.Add"(%[[ARG_0]], %[[ARG_1]]) +# PRESERVE: tf_executor.fetch %[[IDENTITY_N]]#0, %[[ADD]] diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 965f8c811d0..247e5f59de2 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -16,8 +16,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include +#include #include #include +#include +#include #include "absl/algorithm/container.h" #include "absl/container/flat_hash_map.h" @@ -303,19 +306,24 @@ class ImporterBase { // Gets the location information string for the given node. std::string GetLocationStr(const Node& node, bool includeNodeName = false); - // Inserts a placeholder node in the graph to replace the input node. 
Replaces - // all the output edges of the input_node with the placeholder node, and - // removes the input_node from the graph. The new node has the same name as - // the input_node, so Nodespecs do not need any modification. + // Inserts a placeholder node in the graph to replace a feed output tensor, + // and returns the new placeholder node and a boolean indicating if the + // original input node was removed from the graph. Uses of the feed output + // tensor are replaced with this placeholder node. If the feed output tensor + // is of a single output node, the control dependencies are forwarded to the + // the placeholder node, and the original node will be removed. // Note: This modifies the graph, and so any list of ordered nodes needs to be // reconstructed. - StatusOr ReplaceWithPlaceholderNode(const TensorShapeProto& shape, - DataType dtype, Node* input_node); + StatusOr> CreatePlaceholderNodeForFeed( + const TensorShapeProto& shape, DataType dtype, Node* node, int index, + const std::unordered_map& node_name_map); // Gets the input and output nodes corresponding to the specified input and // output nodes in specs_. If there are no input or output nodes specified, - // nodes will be empty - Status GetInputOutputNodes(std::unordered_set* nodes); + // nodes will be empty. + Status GetInputOutputNodes( + const std::unordered_map& node_name_map, + std::unordered_set* nodes); // The input graph with backedges removed. The removed backedges are stored // in the back_edge_helper. @@ -345,6 +353,10 @@ class ImporterBase { NodeValueMap node_values_; std::unique_ptr shape_refiner_; NameUniquifier* function_name_uniquifier_; + + protected: + // Maps feed as TensorId to new Placeholder node name. + absl::flat_hash_map remapped_feeds_; }; // Returns true if the node with given name has a non primary output that is @@ -425,6 +437,49 @@ Status PreprocessGraphDef(const GraphImportConfig* specs, GraphDef* graph_def) { return Status::OK(); } +// Mapping from node name to feed (index and ArrayInfo). Node name must outlive +// this map. +using FeedsByNode = absl::flat_hash_map< + absl::string_view, + absl::flat_hash_map*>>; + +// Creates from a `GraphImportConfig::InputArrays` a mapping from a feeds output +// tensor name to index and ArrayInfo. Keys and values are backed by +// `GraphImportConfig::InputArrays`. +StatusOr GetFeedsByNode( + const GraphImportConfig::InputArrays& inputs) { + FeedsByNode feeds_by_node; + feeds_by_node.reserve(inputs.size()); + + for (const auto& input : inputs) { + TensorId tensor = ParseTensorName(input.first); + if (tensor.index() < 0) + return errors::FailedPrecondition( + "Feed output tensor must be a data output '", tensor.ToString(), "'"); + + auto& node = feeds_by_node[tensor.node()]; + if (!node.insert({tensor.index(), &input}).second) + return errors::FailedPrecondition( + "Multiple feeds for the same output tensor '", tensor.ToString(), + "'"); + } + + return feeds_by_node; +} + +// Creates a unique name for a node that will be replacing a feed output tensor. 
+std::string GetUniqueNodeName( + absl::string_view node_name, int index, + const std::unordered_map& node_name_map) { + std::string new_node_name_base = absl::StrCat(node_name, "_", index); + int count = 0; + std::string new_node_name = new_node_name_base; + while (node_name_map.find(new_node_name) != node_name_map.end()) { + new_node_name = absl::StrCat(new_node_name_base, "_", count++); + } + return new_node_name; +} + Status ImporterBase::RemoveBackedges(const Graph& graph) { // TODO(fengliuai): Converting to GraphDef and back is the easiest way to // clone a graph. @@ -465,37 +520,54 @@ Status ImporterBase::RemoveBackedges(const Graph& graph) { return Status::OK(); } -StatusOr ImporterBase::ReplaceWithPlaceholderNode( - const TensorShapeProto& shape, DataType dtype, Node* input_node) { +StatusOr> ImporterBase::CreatePlaceholderNodeForFeed( + const TensorShapeProto& shape, DataType dtype, Node* node, int index, + const std::unordered_map& node_name_map) { + DCHECK_LT(index, node->num_outputs()); + const bool update_inplace = node->num_outputs() == 1 && index == 0; + std::string new_node_name = + update_inplace ? node->name() + : GetUniqueNodeName(node->name(), index, node_name_map); + Node* placeholder_node; - NodeBuilder builder(input_node->name(), "Placeholder"); + NodeBuilder builder(new_node_name, "Placeholder"); builder.Attr("shape", shape); builder.Attr("dtype", dtype); TF_RETURN_IF_ERROR(builder.Finalize(graph_.get(), &placeholder_node)); - while (!input_node->out_edges().empty()) { - const Edge* oe = *input_node->out_edges().begin(); - // UpdateEdge cannot be used with control edges. - if (oe->src_output() == Graph::kControlSlot) { - graph_->AddControlEdge(placeholder_node, oe->dst()); - graph_->RemoveControlEdge(oe); - continue; + // Update edges from original feed with Placeholder node. + std::vector data_edges; + std::vector control_edges; + for (const tensorflow::Edge* edge : node->out_edges()) { + if (edge->src_output() == index) { + data_edges.push_back(edge); + } else if (update_inplace && edge->IsControlEdge()) { + control_edges.push_back(edge); } - - TF_RETURN_IF_ERROR( - graph_->UpdateEdge(placeholder_node, 0, oe->dst(), oe->dst_input())); } - graph_->RemoveNode(input_node); + for (const auto* edge : data_edges) { + TF_RETURN_IF_ERROR(graph_->UpdateEdge(placeholder_node, 0, edge->dst(), + edge->dst_input())); + } - return placeholder_node; + for (const auto* edge : control_edges) { + graph_->AddControlEdge(placeholder_node, edge->dst()); + graph_->RemoveControlEdge(edge); + } + + if (update_inplace) { + graph_->RemoveNode(node); + } + + return std::pair(placeholder_node, update_inplace); } Status ImporterBase::GetInputOutputNodes( + const std::unordered_map& node_name_map, std::unordered_set* nodes) { - auto node_name_map = graph_->BuildNodeNameIndex(); - auto add_node = [&](const string& name) { - auto it = node_name_map.find(name); + auto add_node = [&](absl::string_view name) { + auto it = node_name_map.find(std::string(name)); if (it == node_name_map.end()) { return errors::FailedPrecondition( absl::StrCat("Graph does not contain node: ", name)); @@ -504,13 +576,25 @@ Status ImporterBase::GetInputOutputNodes( return Status::OK(); }; + // Remap feeds and fetches to newly created Placeholder nodes. 
for (const auto& input : specs_.inputs) { - TF_RETURN_IF_ERROR(add_node(input.first)); + TensorId tensor = ParseTensorName(input.first); + auto remapped_it = remapped_feeds_.find(tensor); + if (remapped_it != remapped_feeds_.end()) { + TF_RETURN_IF_ERROR(add_node(remapped_it->second)); + } else { + TF_RETURN_IF_ERROR(add_node(tensor.node())); + } } for (const auto& output : specs_.outputs) { - auto output_node_name = std::string(ParseTensorName(output).first); - TF_RETURN_IF_ERROR(add_node(output_node_name)); + TensorId tensor = ParseTensorName(output); + auto remapped_it = remapped_feeds_.find(tensor); + if (remapped_it != remapped_feeds_.end()) { + TF_RETURN_IF_ERROR(add_node(remapped_it->second)); + } else { + TF_RETURN_IF_ERROR(add_node(tensor.node())); + } } return Status::OK(); @@ -526,6 +610,9 @@ Status ImporterBase::AddNodesToShapeRefiner() { shape_refiner_->set_require_shape_inference_fns(false); shape_refiner_->set_function_library_for_shape_inference(&graph_flib_); + TF_ASSIGN_OR_RETURN(auto feeds_by_node, GetFeedsByNode(specs_.inputs)); + auto node_name_map = graph_->BuildNodeNameIndex(); + // First add all nodes to the refiner. for (Node* node : ordered_nodes_) { // We need to use a TensorFlow node to teach the shape refiner that user @@ -539,28 +626,49 @@ Status ImporterBase::AddNodesToShapeRefiner() { // it to replace the original input node, so the shape refiner can // successfully propagate the user's input type and shape to the rest of the // graph. - auto it = specs_.inputs.find(node->name()); - if (it != specs_.inputs.end()) { - auto node_name = node->op_def().name(); - if (node_name != "Placeholder" && node_name != "LegacyFedInput" && - node_name != FunctionLibraryDefinition::kArgOp) { - // We do not handle the case where the input node has multiple outputs - if (node->num_outputs() > 1) { - return errors::FailedPrecondition(absl::StrCat( - "Input arrays can only have op with single output. Node op:", - node_name)); + bool node_added_to_shape_refiner = false; + auto it = feeds_by_node.find(node->name()); + if (it != feeds_by_node.end()) { + auto op_name = node->op_def().name(); + if (op_name != "Placeholder" && op_name != "LegacyFedInput" && + op_name != FunctionLibraryDefinition::kArgOp) { + for (const auto& output_tensor : it->second) { + const int index = output_tensor.first; + const ArrayInfo& array_info = output_tensor.second->second; + + DataType dtype = array_info.imported_dtype; + // Uses the existing output type if it isn't specified by the user. + if (dtype == DT_INVALID) { + dtype = node->output_type(0); + } + + TF_ASSIGN_OR_RETURN( + auto placeholder_node_and_removed, + CreatePlaceholderNodeForFeed(array_info.shape, dtype, node, index, + node_name_map)); + + Node* placeholder_node = placeholder_node_and_removed.first; + if (placeholder_node_and_removed.second) { + // Original node has been removed from the graph. + node = placeholder_node; + node_added_to_shape_refiner = true; + } + remapped_feeds_[{it->first, index}] = placeholder_node->name(); + node_name_map[placeholder_node->name()] = placeholder_node; + // Add the new placeholder node to the shape refiner. + TF_RETURN_WITH_CONTEXT_IF_ERROR( + shape_refiner_->AddNode(placeholder_node), + GetLocationStr(*placeholder_node)); } - // For single output nodes, replace them with Placeholder node. - DataType dtype = it->second.imported_dtype; - // Uses the existing output type if it isn't specified by the user. 
- if (dtype == DT_INVALID) { - dtype = node->output_type(0); - } - TF_ASSIGN_OR_RETURN( - node, ReplaceWithPlaceholderNode(it->second.shape, dtype, node)); } else { - node->AddAttr("shape", it->second.shape); - DataType dtype = it->second.imported_dtype; + auto index_it = it->second.find(0); + if (index_it == it->second.end()) { + return errors::FailedPrecondition( + "Missing feed output tensor at index 0 for node '", node->name(), + "'"); + } + node->AddAttr("shape", index_it->second->second.shape); + DataType dtype = index_it->second->second.imported_dtype; // Uses the existing output type if it isn't specified by the user. if (dtype == DT_INVALID) { dtype = node->output_type(0); @@ -568,9 +676,11 @@ Status ImporterBase::AddNodesToShapeRefiner() { node->AddAttr("dtype", dtype); } } - // Adds the node to the shape refiner. - TF_RETURN_WITH_CONTEXT_IF_ERROR(shape_refiner_->AddNode(node), - GetLocationStr(*node)); + if (!node_added_to_shape_refiner) { + // Add the node to the shape refiner if the node hasn't been removed. + TF_RETURN_WITH_CONTEXT_IF_ERROR(shape_refiner_->AddNode(node), + GetLocationStr(*node)); + } auto set_shape_from_list_attr = [&](const AttrValue* attr) { auto& list = attr->list(); @@ -631,7 +741,7 @@ Status ImporterBase::AddNodesToShapeRefiner() { // Prune nodes in the graph that are not reachable from the output. if (specs_.prune_unused_nodes) { std::unordered_set prune_start; - TF_RETURN_IF_ERROR(GetInputOutputNodes(&prune_start)); + TF_RETURN_IF_ERROR(GetInputOutputNodes(node_name_map, &prune_start)); if (!prune_start.empty()) { if (PruneForReverseReachability(graph_.get(), prune_start)) { VLOG(1) << "Pruned unused nodes in graphdef"; @@ -1703,36 +1813,52 @@ StatusOr GraphDefImporter::InferMainFunctionType( const GraphImportConfig& specs, mlir::MLIRContext* context, absl::InlinedVector* arg_nodes, absl::InlinedVector* ret_nodes) { - // Finds out all the input nodes and output nodes. - absl::flat_hash_set output_node_names; - for (const auto& output_tensor : specs.outputs) { - output_node_names.insert(ParseTensorName(output_tensor).node()); + // Find all the input nodes and output nodes. + // Feeds have been remapped to single output nodes (Placeholder), so an exact + // name match is sufficient. + absl::flat_hash_map inputs; + for (auto input_and_idx : llvm::enumerate(specs.inputs)) { + TensorId tensor = ParseTensorName(input_and_idx.value().first); + auto remapped_it = remapped_feeds_.find(tensor); + if (remapped_it != remapped_feeds_.end()) { + inputs.insert({remapped_it->second, input_and_idx.index()}); + } else { + inputs.insert({tensor.node(), input_and_idx.index()}); + } } - if (!specs.inputs.empty() || !specs.outputs.empty()) { - arg_nodes->resize(specs.inputs.size()); - ret_nodes->resize(specs.outputs.size()); + + absl::flat_hash_set output_node_names; + std::vector outputs; + output_node_names.reserve(specs.outputs.size()); + for (const auto& output : specs.outputs) { + TensorId tensor = ParseTensorName(output); + auto remapped_it = remapped_feeds_.find(tensor); + if (remapped_it != remapped_feeds_.end()) { + output_node_names.insert(remapped_it->second); + outputs.push_back({remapped_it->second, 0}); + } else { + output_node_names.insert(tensor.node()); + outputs.push_back(tensor); + } + } + + if (!inputs.empty() || !outputs.empty()) { + arg_nodes->resize(inputs.size()); + ret_nodes->resize(outputs.size()); for (Node* n : GetOrderedNodes()) { // Handle inputs/arguments. 
- auto input_it = specs.inputs.find(n->name()); - if (input_it != specs.inputs.end()) { - (*arg_nodes)[std::distance(specs.inputs.begin(), input_it)] = {n, 0}; + auto input_it = inputs.find(n->name()); + if (input_it != inputs.end()) { + (*arg_nodes)[input_it->second] = {n, 0}; } // Handle outputs/returns. if (output_node_names.contains(n->name())) { - for (int i = 0, e = specs.outputs.size(); i != e; ++i) { - std::pair name_and_port = - absl::StrSplit(specs.outputs[i], ':'); - auto name = name_and_port.first; - if (name != n->name()) continue; - int port = 0; - if (!name_and_port.second.empty() && - !absl::SimpleAtoi(name_and_port.second, &port)) { - return errors::InvalidArgument("Invalid port specification: ", - specs.outputs[i]); - } - (*ret_nodes)[i] = {n, port}; + for (int i = 0, e = outputs.size(); i != e; ++i) { + TensorId tensor = outputs[i]; + if (n->name() != tensor.node()) continue; + (*ret_nodes)[i] = {n, tensor.index()}; } } } From a7e16bbcad88a3b2613367c7c26c3b3edc5cbc7c Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Wed, 4 Dec 2019 22:32:28 -0800 Subject: [PATCH 136/383] Add/Fix docs for metrics. PiperOrigin-RevId: 283908162 Change-Id: Id4b08ab53a77751ec679dd1182e9a3772b53d6ce --- tensorflow/python/keras/metrics.py | 766 +++++++++++++++++------------ 1 file changed, 458 insertions(+), 308 deletions(-) diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py index db8f897ec67..2fd79fc0f2e 100644 --- a/tensorflow/python/keras/metrics.py +++ b/tensorflow/python/keras/metrics.py @@ -88,8 +88,8 @@ class Metric(base_layer.Layer): model.add(tf.keras.layers.Dense(64, activation='relu')) model.add(tf.keras.layers.Dense(10, activation='softmax')) - model.compile(optimizer=tf.compat.v1.train.RMSPropOptimizer(0.01), - loss=tf.keras.losses.categorical_crossentropy, + model.compile(optimizer=tf.keras.optimizers.RMSprop(0.01), + loss=tf.keras.losses.CategoricalCrossentropy(), metrics=[tf.keras.metrics.CategoricalAccuracy()]) data = np.random.random((1000, 32)) @@ -97,9 +97,8 @@ class Metric(base_layer.Layer): dataset = tf.data.Dataset.from_tensor_slices((data, labels)) dataset = dataset.batch(32) - dataset = dataset.repeat() - model.fit(dataset, epochs=10, steps_per_epoch=30) + model.fit(dataset, epochs=10) ``` To be implemented by subclasses: @@ -112,7 +111,7 @@ class Metric(base_layer.Layer): Example subclass implementation: - ``` + ```python class BinaryTruePositives(tf.keras.metrics.Metric): def __init__(self, name='binary_true_positives', **kwargs): @@ -389,11 +388,10 @@ class Sum(Reduce): Usage: - ```python - m = tf.keras.metrics.Sum() - m.update_state([1, 3, 5, 7]) - print('Final result: ', m.result().numpy()) # Final result: 16.0 - ``` + >>> m = tf.keras.metrics.Sum() + >>> _ = m.update_state([1, 3, 5, 7]) + >>> m.result().numpy() + 16.0 Usage with tf.keras API: @@ -465,8 +463,8 @@ class MeanRelativeError(Mean): """Computes the mean relative error by normalizing with the given values. This metric creates two local variables, `total` and `count` that are used to - compute the mean relative absolute error. This average is weighted by - `sample_weight`, and it is ultimately returned as `mean_relative_error`: + compute the mean relative error. This is weighted by `sample_weight`, and + it is ultimately returned as `mean_relative_error`: an idempotent operation that simply divides `total` by `count`. If `sample_weight` is `None`, weights default to 1. 
@@ -474,15 +472,14 @@ class MeanRelativeError(Mean): Usage: - ```python - m = tf.keras.metrics.MeanRelativeError(normalizer=[1, 3, 2, 3]) - m.update_state([1, 3, 2, 3], [2, 4, 6, 8]) + >>> m = tf.keras.metrics.MeanRelativeError(normalizer=[1, 3, 2, 3]) + >>> _ = m.update_state([1, 3, 2, 3], [2, 4, 6, 8]) - # metric = mean(|y_pred - y_true| / normalizer) - # = mean([1, 1, 4, 5] / [1, 3, 2, 3]) = mean([1, 1/3, 2, 5/3]) - # = 5/4 = 1.25 - print('Final result: ', m.result().numpy()) # Final result: 1.25 - ``` + >>> # metric = mean(|y_pred - y_true| / normalizer) + >>> # = mean([1, 1, 4, 5] / [1, 3, 2, 3]) = mean([1, 1/3, 2, 5/3]) + >>> # = 5/4 = 1.25 + >>> m.result().numpy() + 1.25 Usage with tf.keras API: @@ -567,11 +564,17 @@ class MeanMetricWrapper(Mean): `y_true` and `y_pred` should have the same shape. Args: - y_true: The ground truth values. - y_pred: The predicted values. - sample_weight: Optional weighting of each example. Defaults to 1. Can be - a `Tensor` whose rank is either 0, or the same rank as `y_true`, - and must be broadcastable to `y_true`. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + sample_weight: Optional `sample_weight` acts as a + coefficient for the metric. If a scalar is provided, then the metric is + simply scaled by the given value. If `sample_weight` is a tensor of size + `[batch_size]`, then the metric for each sample of the batch is rescaled + by the corresponding element in the `sample_weight` vector. If the shape + of `sample_weight` is `[batch_size, d0, .. dN-1]` (or can be broadcasted + to this shape), then each metric element of `y_pred` is scaled by the + corresponding value of `sample_weight`. (Note on `dN-1`: all metric + functions reduce by 1 dimension, usually the last axis (-1)). Returns: Update op. @@ -598,11 +601,7 @@ class MeanMetricWrapper(Mean): @keras_export('keras.metrics.Accuracy') class Accuracy(MeanMetricWrapper): - """Calculates how often predictions matches labels. - - For example, if `y_true` is [1, 2, 3, 4] and `y_pred` is [0, 2, 3, 4] - then the accuracy is 3/4 or .75. If the weights were specified as - [1, 1, 0, 0] then the accuracy would be 1/2 or .5. + """Calculates how often predictions equals labels. This metric creates two local variables, `total` and `count` that are used to compute the frequency with which `y_pred` matches `y_true`. This frequency is @@ -615,11 +614,13 @@ class Accuracy(MeanMetricWrapper): Usage: >>> m = tf.keras.metrics.Accuracy() - >>> _ = m.update_state([1, 2, 3, 4], [0, 2, 3, 4]) + >>> _ = m.update_state([[1], [2], [3], [4]], [[0], [2], [3], [4]]) >>> m.result().numpy() 0.75 + >>> m.reset_states() - >>> _ = m.update_state([1, 2, 3, 4], [0, 2, 3, 4], sample_weight=[1, 1, 0, 0]) + >>> _ = m.update_state([[1], [2], [3], [4]], [[0], [2], [3], [4]], + ... sample_weight=[1, 1, 0, 0]) >>> m.result().numpy() 0.5 @@ -637,11 +638,7 @@ class Accuracy(MeanMetricWrapper): @keras_export('keras.metrics.BinaryAccuracy') class BinaryAccuracy(MeanMetricWrapper): - """Calculates how often predictions matches labels. - - For example, if `y_true` is [1, 1, 0, 0] and `y_pred` is [0.98, 1, 0, 0.6] - then the binary accuracy is 3/4 or .75. If the weights were specified as - [1, 0, 0, 1] then the binary accuracy would be 1/2 or .5. + """Calculates how often predictions matches binary labels. This metric creates two local variables, `total` and `count` that are used to compute the frequency with which `y_pred` matches `y_true`. 
This frequency is @@ -653,11 +650,16 @@ class BinaryAccuracy(MeanMetricWrapper): Usage: - ```python - m = tf.keras.metrics.BinaryAccuracy() - m.update_state([1, 1, 0, 0], [0.98, 1, 0, 0.6]) - print('Final result: ', m.result().numpy()) # Final result: 0.75 - ``` + >>> m = tf.keras.metrics.BinaryAccuracy() + >>> _ = m.update_state([[1], [1], [0], [0]], [[0.98], [1], [0], [0.6]]) + >>> m.result().numpy() + 0.75 + + >>> m.reset_states() + >>> _ = m.update_state([[1], [1], [0], [0]], [[0.98], [1], [0], [0.6]], + ... sample_weight=[1, 0, 0, 1]) + >>> m.result().numpy() + 0.5 Usage with tf.keras API: @@ -682,12 +684,9 @@ class BinaryAccuracy(MeanMetricWrapper): @keras_export('keras.metrics.CategoricalAccuracy') class CategoricalAccuracy(MeanMetricWrapper): - """Calculates how often predictions matches labels. + """Calculates how often predictions matches one-hot labels. - For example, if `y_true` is [[0, 0, 1], [0, 1, 0]] and `y_pred` is - [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5. - If the weights were specified as [0.7, 0.3] then the categorical accuracy - would be .3. You can provide logits of classes as `y_pred`, since argmax of + You can provide logits of classes as `y_pred`, since argmax of logits and probabilities are same. This metric creates two local variables, `total` and `count` that are used to @@ -709,6 +708,13 @@ class CategoricalAccuracy(MeanMetricWrapper): >>> m.result().numpy() 0.5 + >>> m.reset_states() + >>> _ = m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], + ... [0.05, 0.95, 0]], + ... sample_weight=[0.7, 0.3]) + >>> m.result().numpy() + 0.3 + Usage with tf.keras API: ```python @@ -735,10 +741,7 @@ class CategoricalAccuracy(MeanMetricWrapper): class SparseCategoricalAccuracy(MeanMetricWrapper): """Calculates how often predictions matches integer labels. - For example, if `y_true` is [[2], [1]] and `y_pred` is - [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5. - If the weights were specified as [0.7, 0.3] then the categorical accuracy - would be .3. You can provide logits of classes as `y_pred`, since argmax of + You can provide logits of classes as `y_pred`, since argmax of logits and probabilities are same. This metric creates two local variables, `total` and `count` that are used to @@ -756,6 +759,12 @@ class SparseCategoricalAccuracy(MeanMetricWrapper): >>> m.result().numpy() 0.5 + >>> m.reset_states() + >>> _ = m.update_state([[2], [1]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]], + ... sample_weight=[0.7, 0.3]) + >>> m.result().numpy() + 0.3 + Usage with tf.keras API: ```python @@ -778,11 +787,18 @@ class TopKCategoricalAccuracy(MeanMetricWrapper): Usage: - ```python - m = tf.keras.metrics.TopKCategoricalAccuracy() - m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]) - print('Final result: ', m.result().numpy()) # Final result: 1.0 - ``` + >>> m = tf.keras.metrics.TopKCategoricalAccuracy(k=1) + >>> _ = m.update_state([[0, 0, 1], [0, 1, 0]], + ... [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]) + >>> m.result().numpy() + 0.5 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 0, 1], [0, 1, 0]], + ... [[0.1, 0.9, 0.8], [0.05, 0.95, 0]], + ... 
sample_weight=[0.7, 0.3]) + >>> m.result().numpy() + 0.3 Usage with tf.keras API: @@ -811,11 +827,16 @@ class SparseTopKCategoricalAccuracy(MeanMetricWrapper): Usage: - ```python - m = tf.keras.metrics.SparseTopKCategoricalAccuracy() - m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]) - print('Final result: ', m.result().numpy()) # Final result: 1.0 - ``` + >>> m = tf.keras.metrics.SparseTopKCategoricalAccuracy(k=1) + >>> _ = m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]]) + >>> m.result().numpy() + 0.5 + + >>> m.reset_states() + >>> _ = m.update_state([2, 1], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]], + ... sample_weight=[0.7, 0.3]) + >>> m.result().numpy() + 0.3 Usage with tf.keras API: @@ -912,10 +933,6 @@ class _ConfusionMatrixConditionCount(Metric): class FalsePositives(_ConfusionMatrixConditionCount): """Calculates the number of false positives. - For example, if `y_true` is [0, 1, 0, 0] and `y_pred` is [0, 0, 1, 1] - then the false positives value is 2. If the weights were specified as - [0, 0, 1, 0] then the false positives value would be 1. - If `sample_weight` is given, calculates the sum of the weights of false positives. This metric creates one local variable, `accumulator` that is used to keep track of the number of false positives. @@ -925,11 +942,15 @@ class FalsePositives(_ConfusionMatrixConditionCount): Usage: - ```python - m = tf.keras.metrics.FalsePositives() - m.update_state([0, 1, 0, 0], [0, 0, 1, 1]) - print('Final result: ', m.result().numpy()) # Final result: 2 - ``` + >>> m = tf.keras.metrics.FalsePositives() + >>> _ = m.update_state([0, 1, 0, 0], [0, 0, 1, 1]) + >>> m.result().numpy() + 2.0 + + >>> m.reset_states() + >>> _ = m.update_state([0, 1, 0, 0], [0, 0, 1, 1], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -962,10 +983,6 @@ class FalsePositives(_ConfusionMatrixConditionCount): class FalseNegatives(_ConfusionMatrixConditionCount): """Calculates the number of false negatives. - For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [0, 1, 0, 0] - then the false negatives value is 2. If the weights were specified as - [0, 0, 1, 0] then the false negatives value would be 1. - If `sample_weight` is given, calculates the sum of the weights of false negatives. This metric creates one local variable, `accumulator` that is used to keep track of the number of false negatives. @@ -975,11 +992,15 @@ class FalseNegatives(_ConfusionMatrixConditionCount): Usage: - ```python - m = tf.keras.metrics.FalseNegatives() - m.update_state([0, 1, 1, 1], [0, 1, 0, 0]) - print('Final result: ', m.result().numpy()) # Final result: 2 - ``` + >>> m = tf.keras.metrics.FalseNegatives() + >>> _ = m.update_state([0, 1, 1, 1], [0, 1, 0, 0]) + >>> m.result().numpy() + 2.0 + + >>> m.reset_states() + >>> _ = m.update_state([0, 1, 1, 1], [0, 1, 0, 0], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -1012,10 +1033,6 @@ class FalseNegatives(_ConfusionMatrixConditionCount): class TrueNegatives(_ConfusionMatrixConditionCount): """Calculates the number of true negatives. - For example, if `y_true` is [0, 1, 0, 0] and `y_pred` is [1, 1, 0, 0] - then the true negatives value is 2. If the weights were specified as - [0, 0, 1, 0] then the true negatives value would be 1. - If `sample_weight` is given, calculates the sum of the weights of true negatives. This metric creates one local variable, `accumulator` that is used to keep track of the number of true negatives. 
@@ -1025,11 +1042,15 @@ class TrueNegatives(_ConfusionMatrixConditionCount): Usage: - ```python - m = tf.keras.metrics.TrueNegatives() - m.update_state([0, 1, 0, 0], [1, 1, 0, 0]) - print('Final result: ', m.result().numpy()) # Final result: 2 - ``` + >>> m = tf.keras.metrics.TrueNegatives() + >>> _ = m.update_state([0, 1, 0, 0], [1, 1, 0, 0]) + >>> m.result().numpy() + 2.0 + + >>> m.reset_states() + >>> _ = m.update_state([0, 1, 0, 0], [1, 1, 0, 0], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -1062,10 +1083,6 @@ class TrueNegatives(_ConfusionMatrixConditionCount): class TruePositives(_ConfusionMatrixConditionCount): """Calculates the number of true positives. - For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [1, 0, 1, 1] - then the true positives value is 2. If the weights were specified as - [0, 0, 1, 0] then the true positives value would be 1. - If `sample_weight` is given, calculates the sum of the weights of true positives. This metric creates one local variable, `true_positives` that is used to keep track of the number of true positives. @@ -1075,11 +1092,15 @@ class TruePositives(_ConfusionMatrixConditionCount): Usage: - ```python - m = tf.keras.metrics.TruePositives() - m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) - print('Final result: ', m.result().numpy()) # Final result: 2 - ``` + >>> m = tf.keras.metrics.TruePositives() + >>> _ = m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) + >>> m.result().numpy() + 2.0 + + >>> m.reset_states() + >>> _ = m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -1112,10 +1133,6 @@ class TruePositives(_ConfusionMatrixConditionCount): class Precision(Metric): """Computes the precision of the predictions with respect to the labels. - For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [1, 0, 1, 1] - then the precision value is 2/(2+1) ie. 0.66. If the weights were specified as - [0, 0, 1, 0] then the precision value would be 1. - The metric creates two local variables, `true_positives` and `false_positives` that are used to compute the precision. This value is ultimately returned as `precision`, an idempotent operation that simply divides `true_positives` @@ -1135,11 +1152,15 @@ class Precision(Metric): Usage: - ```python - m = tf.keras.metrics.Precision() - m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) - print('Final result: ', m.result().numpy()) # Final result: 0.66 - ``` + >>> m = tf.keras.metrics.Precision() + >>> _ = m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) + >>> m.result().numpy() + 0.6666667 + + >>> m.reset_states() + >>> _ = m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -1239,10 +1260,6 @@ class Precision(Metric): class Recall(Metric): """Computes the recall of the predictions with respect to the labels. - For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [1, 0, 1, 1] - then the recall value is 2/(2+1) ie. 0.66. If the weights were specified as - [0, 0, 1, 0] then the recall value would be 1. - This metric creates two local variables, `true_positives` and `false_negatives`, that are used to compute the recall. 
This value is ultimately returned as `recall`, an idempotent operation that simply divides @@ -1261,11 +1278,15 @@ class Recall(Metric): Usage: - ```python - m = tf.keras.metrics.Recall() - m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) - print('Final result: ', m.result().numpy()) # Final result: 0.66 - ``` + >>> m = tf.keras.metrics.Recall() + >>> _ = m.update_state([0, 1, 1, 1], [1, 0, 1, 1]) + >>> m.result().numpy() + 0.6666667 + + >>> m.reset_states() + >>> _ = m.update_state([0, 1, 1, 1], [1, 0, 1, 1], sample_weight=[0, 0, 1, 0]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -1452,11 +1473,16 @@ class SensitivityAtSpecificity(SensitivitySpecificityBase): Usage: - ```python - m = tf.keras.metrics.SensitivityAtSpecificity(0.4, num_thresholds=1) - m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) - print('Final result: ', m.result().numpy()) # Final result: 0.5 - ``` + >>> m = tf.keras.metrics.SensitivityAtSpecificity(0.4, num_thresholds=1) + >>> _ = m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + >>> m.result().numpy() + 0.5 + + >>> m.reset_states() + >>> _ = m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], + ... sample_weight=[1, 0, 0, 1]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -1533,11 +1559,16 @@ class SpecificityAtSensitivity(SensitivitySpecificityBase): Usage: - ```python - m = tf.keras.metrics.SpecificityAtSensitivity(0.8, num_thresholds=1) - m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) - print('Final result: ', m.result().numpy()) # Final result: 1.0 - ``` + >>> m = tf.keras.metrics.SpecificityAtSensitivity(0.8, num_thresholds=1) + >>> _ = m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + >>> m.result().numpy() + 1.0 + + >>> m.reset_states() + >>> _ = m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], + ... sample_weight=[1, 0, 0, 1]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -1606,11 +1637,16 @@ class PrecisionAtRecall(SensitivitySpecificityBase): Usage: - ```python - m = tf.keras.metrics.PrecisionAtRecall(0.8, num_thresholds=1) - m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) - print('Final result: ', m.result().numpy()) # Final result: 1.0 - ``` + >>> m = tf.keras.metrics.PrecisionAtRecall(0.8, num_thresholds=1) + >>> _ = m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + >>> m.result().numpy() + 1.0 + + >>> m.reset_states() + >>> _ = m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], + ... sample_weight=[1, 0, 0, 1]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -1696,17 +1732,20 @@ class AUC(Metric): Usage: - ```python - m = tf.keras.metrics.AUC(num_thresholds=3) - m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + >>> m = tf.keras.metrics.AUC(num_thresholds=3) + >>> _ = m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9]) + >>> # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7] + >>> # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] + >>> # recall = [1, 0.5, 0], fp_rate = [1, 0, 0] + >>> # auc = ((((1+0.5)/2)*(1-0))+ (((0.5+0)/2)*(0-0))) = 0.75 + >>> m.result().numpy() + 0.75 - # threshold values are [0 - 1e-7, 0.5, 1 + 1e-7] - # tp = [2, 1, 0], fp = [2, 0, 0], fn = [0, 1, 2], tn = [0, 2, 2] - # recall = [1, 0.5, 0], fp_rate = [1, 0, 0] - # auc = ((((1+0.5)/2)*(1-0))+ (((0.5+0)/2)*(0-0))) = 0.75 - - print('Final result: ', m.result().numpy()) # Final result: 0.75 - ``` + >>> m.reset_states() + >>> _ = m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9], + ... 
sample_weight=[1, 0, 0, 1]) + >>> m.result().numpy() + 1.0 Usage with tf.keras API: @@ -2079,24 +2118,26 @@ class CosineSimilarity(MeanMetricWrapper): cosine similarity = (a . b) / ||a|| ||b|| [Cosine Similarity](https://en.wikipedia.org/wiki/Cosine_similarity) - For example, if `y_true` is [0, 1, 1], and `y_pred` is [1, 0, 1], the cosine - similarity is 0.5. - This metric keeps the average cosine similarity between `predictions` and `labels` over a stream of data. Usage: - ```python - m = tf.keras.metrics.CosineSimilarity(axis=1) - m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]]) - # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]] - # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]] - # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]] - # result = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1)) - = ((0. + 0.) + (0.5 + 0.5)) / 2 - print('Final result: ', m.result().numpy()) # Final result: 0.5 - ``` + >>> # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]] + >>> # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]] + >>> # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]] + >>> # result = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1)) + >>> # = ((0. + 0.) + (0.5 + 0.5)) / 2 + >>> m = tf.keras.metrics.CosineSimilarity(axis=1) + >>> _ = m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]]) + >>> m.result().numpy() + 0.49999997 + + >>> m.reset_states() + >>> _ = m.update_state([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]], + ... sample_weight=[0.3, 0.7]) + >>> m.result().numpy() + 0.6999999 Usage with tf.keras API: @@ -2126,21 +2167,25 @@ class CosineSimilarity(MeanMetricWrapper): class MeanAbsoluteError(MeanMetricWrapper): """Computes the mean absolute error between the labels and predictions. - For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.] - the mean absolute error is 3/4 (0.75). - Usage: - >>> m = MeanAbsoluteError() - >>> _ = m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.]) + >>> m = tf.keras.metrics.MeanAbsoluteError() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) >>> m.result().numpy() - 0.75 + 0.25 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.5 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsoluteError()]) + model.compile( + 'sgd', loss='mse', metrics=[tf.keras.metrics.MeanAbsoluteError()]) ``` """ @@ -2153,13 +2198,16 @@ class MeanAbsoluteError(MeanMetricWrapper): class MeanAbsolutePercentageError(MeanMetricWrapper): """Computes the mean absolute percentage error between `y_true` and `y_pred`. - For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.] - the mean absolute percentage error is 5e+08. - Usage: >>> m = tf.keras.metrics.MeanAbsolutePercentageError() - >>> _ = m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.]) + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 250000000.0 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... 
sample_weight=[1, 0]) >>> m.result().numpy() 500000000.0 @@ -2167,7 +2215,10 @@ class MeanAbsolutePercentageError(MeanMetricWrapper): ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.MeanAbsolutePercentageError()]) + model.compile( + 'sgd', + loss='mse', + metrics=[tf.keras.metrics.MeanAbsolutePercentageError()]) ``` """ @@ -2180,22 +2231,25 @@ class MeanAbsolutePercentageError(MeanMetricWrapper): class MeanSquaredError(MeanMetricWrapper): """Computes the mean squared error between `y_true` and `y_pred`. - For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.] - the mean squared error is 3/4 (0.75). - Usage: - ```python - m = tf.keras.metrics.MeanSquaredError() - m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.]) - print('Final result: ', m.result().numpy()) # Final result: 0.75 - ``` + >>> m = tf.keras.metrics.MeanSquaredError() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.25 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.5 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredError()]) + model.compile( + 'sgd', loss='mse', metrics=[tf.keras.metrics.MeanSquaredError()]) ``` """ @@ -2208,22 +2262,27 @@ class MeanSquaredError(MeanMetricWrapper): class MeanSquaredLogarithmicError(MeanMetricWrapper): """Computes the mean squared logarithmic error between `y_true` and `y_pred`. - For example, if `y_true` is [0., 0., 1., 1.], and `y_pred` is [1., 1., 1., 0.] - the mean squared logarithmic error is 0.36034. - Usage: - ```python - m = tf.keras.metrics.MeanSquaredLogarithmicError() - m.update_state([0., 0., 1., 1.], [1., 1., 1., 0.]) - print('Final result: ', m.result().numpy()) # Final result: 0.36034 - ``` + >>> m = tf.keras.metrics.MeanSquaredLogarithmicError() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.12011322 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.24022643 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.MeanSquaredLogarithmicError()]) + model.compile( + 'sgd', + loss='mse', + metrics=[tf.keras.metrics.MeanSquaredLogarithmicError()]) ``` """ @@ -2239,25 +2298,24 @@ class Hinge(MeanMetricWrapper): `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are provided we will convert them to -1 or 1. - For example, if `y_true` is [-1., 1., 1.], and `y_pred` is [0.6, -0.7, -0.5] - the hinge metric value is 1.6. - Usage: - ```python - m = tf.keras.metrics.Hinge() - m.update_state([-1., 1., 1.], [0.6, -0.7, -0.5]) + >>> m = tf.keras.metrics.Hinge() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 1.3 - # result = max(0, 1-y_true * y_pred) = [1.6 + 1.7 + 1.5] / 3 - - print('Final result: ', m.result().numpy()) # Final result: 1.6 - ``` + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... 
sample_weight=[1, 0]) + >>> m.result().numpy() + 1.1 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.Hinge()]) + model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Hinge()]) ``` """ @@ -2272,25 +2330,27 @@ class SquaredHinge(MeanMetricWrapper): `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are provided we will convert them to -1 or 1. - For example, if `y_true` is [-1., 1., 1.], and `y_pred` is [0.6, -0.7, -0.5] - the squared hinge metric value is 2.6. - Usage: - ```python - m = tf.keras.metrics.SquaredHinge() - m.update_state([-1., 1., 1.], [0.6, -0.7, -0.5]) + >>> m = tf.keras.metrics.SquaredHinge() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 1.86 - # result = max(0, 1-y_true * y_pred) = [1.6^2 + 1.7^2 + 1.5^2] / 3 - - print('Final result: ', m.result().numpy()) # Final result: 2.6 - ``` + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 1.46 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.SquaredHinge()]) + model.compile( + 'sgd', + loss='mse', + metrics=[tf.keras.metrics.SquaredHinge()]) ``` """ @@ -2302,22 +2362,27 @@ class SquaredHinge(MeanMetricWrapper): class CategoricalHinge(MeanMetricWrapper): """Computes the categorical hinge metric between `y_true` and `y_pred`. - For example, if `y_true` is [0., 1., 1.], and `y_pred` is [1., 0., 1.] - the categorical hinge metric value is 1.0. - Usage: - ```python - m = tf.keras.metrics.CategoricalHinge() - m.update_state([0., 1., 1.], [1., 0., 1.]) - print('Final result: ', m.result().numpy()) # Final result: 1.0 - ``` + >>> m = tf.keras.metrics.CategoricalHinge() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 1.4000001 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 1.2 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.CategoricalHinge()]) + model.compile( + 'sgd', + loss='mse', + metrics=[tf.keras.metrics.CategoricalHinge()]) ``` """ @@ -2331,17 +2396,25 @@ class RootMeanSquaredError(Mean): Usage: - ```python - m = tf.keras.metrics.RootMeanSquaredError() - m.update_state([2., 4., 6.], [1., 3., 2.]) - print('Final result: ', m.result().numpy()) # Final result: 2.449 - ``` + >>> m = tf.keras.metrics.RootMeanSquaredError() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.5 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... 
sample_weight=[1, 0]) + >>> m.result().numpy() + 0.70710677 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.RootMeanSquaredError()]) + model.compile( + 'sgd', + loss='mse', + metrics=[tf.keras.metrics.RootMeanSquaredError()]) ``` """ @@ -2381,17 +2454,22 @@ class LogCoshError(MeanMetricWrapper): Usage: - ```python - m = tf.keras.metrics.LogCoshError() - m.update_state([0., 1., 1.], [1., 0., 1.]) - print('Final result: ', m.result().numpy()) # Final result: 0.289 - ``` + >>> m = tf.keras.metrics.LogCoshError() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.10844523 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.21689045 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.LogCoshError()]) + model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.LogCoshError()]) ``` """ @@ -2407,17 +2485,22 @@ class Poisson(MeanMetricWrapper): Usage: - ```python - m = tf.keras.metrics.Poisson() - m.update_state([1, 9, 2], [4, 8, 12]) - print('Final result: ', m.result().numpy()) # Final result: -4.63 - ``` + >>> m = tf.keras.metrics.Poisson() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]]) + >>> m.result().numpy() + 0.49999997 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[1, 1], [0, 0]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.99999994 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.Poisson()]) + model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Poisson()]) ``` """ @@ -2433,17 +2516,22 @@ class KLDivergence(MeanMetricWrapper): Usage: - ```python - m = tf.keras.metrics.KLDivergence() - m.update_state([.4, .9, .2], [.5, .8, .12]) - print('Final result: ', m.result().numpy()) # Final result: -0.043 - ``` + >>> m = tf.keras.metrics.KLDivergence() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 0.45814306 + + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.9162892 Usage with tf.keras API: ```python model = tf.keras.Model(inputs, outputs) - model.compile('sgd', metrics=[tf.keras.metrics.KLDivergence()]) + model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.KLDivergence()]) ``` """ @@ -2468,17 +2556,21 @@ class MeanIoU(Metric): Usage: - ```python - m = tf.keras.metrics.MeanIoU(num_classes=2) - m.update_state([0, 0, 1, 1], [0, 1, 0, 1]) + >>> # cm = [[1, 1], + >>> # [1, 1]] + >>> # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] + >>> # iou = true_positives / (sum_row + sum_col - true_positives)) + >>> # result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 = 0.33 + >>> m = tf.keras.metrics.MeanIoU(num_classes=2) + >>> _ = m.update_state([0, 0, 1, 1], [0, 1, 0, 1]) + >>> m.result().numpy() + 0.33333334 - # cm = [[1, 1], - [1, 1]] - # sum_row = [2, 2], sum_col = [2, 2], true_positives = [1, 1] - # iou = true_positives / (sum_row + sum_col - true_positives)) - # result = (1 / (2 + 2 - 1) + 1 / (2 + 2 - 1)) / 2 = 0.33 - print('Final result: ', m.result().numpy()) # Final result: 0.33 - ``` + >>> m.reset_states() + >>> _ = m.update_state([0, 0, 1, 1], [0, 1, 0, 1], + ... 
sample_weight=[0.3, 0.3, 0.3, 0.1]) + >>> m.result().numpy() + 0.23809525 Usage with tf.keras API: @@ -2536,8 +2628,10 @@ class MeanIoU(Metric): if y_true.shape.ndims > 1: y_true = array_ops.reshape(y_true, [-1]) - if sample_weight is not None and sample_weight.shape.ndims > 1: - sample_weight = array_ops.reshape(sample_weight, [-1]) + if sample_weight is not None: + sample_weight = math_ops.cast(sample_weight, self._dtype) + if sample_weight.shape.ndims > 1: + sample_weight = array_ops.reshape(sample_weight, [-1]) # Accumulate the prediction to current confusion matrix. current_cm = confusion_matrix.confusion_matrix( @@ -2592,14 +2686,15 @@ class MeanTensor(Metric): Usage: - ```python - m = tf.keras.metrics.MeanTensor() - m.update_state([0, 1, 2, 3]) - m.update_state([4, 5, 6, 7]) - print('Result: ', m.result().numpy()) # Result: [2, 3, 4, 5] - m.update_state([12, 10, 8, 6], sample_weights= [0, 0.2, 0.5, 1]) - print('Result: ', m.result().numpy()) # Result: [2, 3.636, 4.8, 5.333] - ``` + >>> m = tf.keras.metrics.MeanTensor() + >>> _ = m.update_state([0, 1, 2, 3]) + >>> _ = m.update_state([4, 5, 6, 7]) + >>> m.result().numpy() + array([2., 3., 4., 5.], dtype=float32) + + >>> _ = m.update_state([12, 10, 8, 6], sample_weight= [0, 0.2, 0.5, 1]) + >>> m.result().numpy() + array([2. , 3.6363635, 4.8 , 5.3333335], dtype=float32) """ def __init__(self, name='mean_tensor', dtype=None): @@ -2701,22 +2796,16 @@ class BinaryCrossentropy(MeanMetricWrapper): Usage: - ```python - m = tf.keras.metrics.BinaryCrossentropy() - m.update_state([1., 0., 1., 0.], [1., 1., 1., 0.]) + >>> m = tf.keras.metrics.BinaryCrossentropy() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> m.result().numpy() + 0.81492424 - # EPSILON = 1e-7, y = y_true, y` = y_pred, Y_MAX = 0.9999999 - # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) - # y` = [Y_MAX, Y_MAX, Y_MAX, EPSILON] - - # Metric = -(y log(y` + EPSILON) + (1 - y) log(1 - y` + EPSILON)) - # = [-log(Y_MAX + EPSILON), -log(1 - Y_MAX + EPSILON), - # -log(Y_MAX + EPSILON), -log(1)] - # = [(0 + 15.33) / 2, (0 + 0) / 2] - # Reduced metric = 7.665 / 2 - - print('Final result: ', m.result().numpy()) # Final result: 3.833 - ``` + >>> m.reset_states() + >>> _ = m.update_state([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> m.result().numpy() + 0.9162905 Usage with tf.keras API: @@ -2766,22 +2855,25 @@ class CategoricalCrossentropy(MeanMetricWrapper): Usage: - ```python - m = tf.keras.metrics.CategoricalCrossentropy() - m.update_state([[0, 1, 0], [0, 0, 1]], - [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + >>> # EPSILON = 1e-7, y = y_true, y` = y_pred + >>> # y` = clip_ops.clip_by_value(output, EPSILON, 1. - EPSILON) + >>> # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + >>> # xent = -sum(y * log(y'), axis = -1) + >>> # = -((log 0.95), (log 0.1)) + >>> # = [0.051, 2.302] + >>> # Reduced xent = (0.051 + 2.302) / 2 + >>> m = tf.keras.metrics.CategoricalCrossentropy() + >>> _ = m.update_state([[0, 1, 0], [0, 0, 1]], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + >>> m.result().numpy() + 1.1769392 - # EPSILON = 1e-7, y = y_true, y` = y_pred - # y` = clip_ops.clip_by_value(output, EPSILON, 1. 
- EPSILON) - # y` = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - - # xent = -sum(y * log(y'), axis = -1) - # = -((log 0.95), (log 0.1)) - # = [0.051, 2.302] - # Reduced xent = (0.051 + 2.302) / 2 - - print('Final result: ', m.result().numpy()) # Final result: 1.176 - ``` + >>> m.reset_states() + >>> _ = m.update_state([[0, 1, 0], [0, 0, 1]], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], + ... sample_weight=tf.constant([0.3, 0.7])) + >>> m.result().numpy() + 1.6271976 Usage with tf.keras API: @@ -2835,26 +2927,28 @@ class SparseCategoricalCrossentropy(MeanMetricWrapper): Usage: - ```python - m = tf.keras.metrics.SparseCategoricalCrossentropy() - m.update_state( - [1, 2], - [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + >>> # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]] + >>> # logits = log(y_pred) + >>> # softmax = exp(logits) / sum(exp(logits), axis=-1) + >>> # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] + >>> # xent = -sum(y * log(softmax), 1) + >>> # log(softmax) = [[-2.9957, -0.0513, -16.1181], + >>> # [-2.3026, -0.2231, -2.3026]] + >>> # y_true * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]] + >>> # xent = [0.0513, 2.3026] + >>> # Reduced xent = (0.0513 + 2.3026) / 2 + >>> m = tf.keras.metrics.SparseCategoricalCrossentropy() + >>> _ = m.update_state([1, 2], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + >>> m.result().numpy() + 1.1769392 - # y_true = one_hot(y_true) = [[0, 1, 0], [0, 0, 1]] - # logits = log(y_pred) - # softmax = exp(logits) / sum(exp(logits), axis=-1) - # softmax = [[0.05, 0.95, EPSILON], [0.1, 0.8, 0.1]] - - # xent = -sum(y * log(softmax), 1) - # log(softmax) = [[-2.9957, -0.0513, -16.1181], [-2.3026, -0.2231, -2.3026]] - # y_true * log(softmax) = [[0, -0.0513, 0], [0, 0, -2.3026]] - - # xent = [0.0513, 2.3026] - # Reduced xent = (0.0513 + 2.3026) / 2 - - print('Final result: ', m.result().numpy()) # Final result: 1.176 - ``` + >>> m.reset_states() + >>> _ = m.update_state([1, 2], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], + ... sample_weight=tf.constant([0.3, 0.7])) + >>> m.result().numpy() + 1.6271976 Usage with tf.keras API: @@ -2958,6 +3052,17 @@ def accuracy(y_true, y_pred): @keras_export('keras.metrics.binary_accuracy') def binary_accuracy(y_true, y_pred, threshold=0.5): + """Calculates how often predictions matches binary labels. + + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + threshold: (Optional) Float representing the threshold for deciding whether + prediction values are 1 or 0. + + Returns: + Binary accuracy values. shape = `[batch_size, d0, .. dN-1]` + """ threshold = math_ops.cast(threshold, y_pred.dtype) y_pred = math_ops.cast(y_pred > threshold, y_pred.dtype) return K.mean(math_ops.equal(y_true, y_pred), axis=-1) @@ -2965,6 +3070,18 @@ def binary_accuracy(y_true, y_pred, threshold=0.5): @keras_export('keras.metrics.categorical_accuracy') def categorical_accuracy(y_true, y_pred): + """Calculates how often predictions matches one-hot labels. + + You can provide logits of classes as `y_pred`, since argmax of + logits and probabilities are same. + + Args: + y_true: One-hot ground truth values. + y_pred: The prediction values. + + Returns: + Categorical accuracy values. 
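+
+  For example (illustrative values):
+
+  ```python
+  y_true = [[0, 0, 1], [0, 1, 0]]
+  y_pred = [[0.1, 0.9, 0.8], [0.05, 0.95, 0.0]]
+  # argmax(y_true) = [2, 1] and argmax(y_pred) = [1, 1], so the per-sample
+  # accuracy values are [0., 1.].
+  acc = tf.keras.metrics.categorical_accuracy(y_true, y_pred)
+  ```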
+ """ return math_ops.cast( math_ops.equal( math_ops.argmax(y_true, axis=-1), math_ops.argmax(y_pred, axis=-1)), @@ -2973,6 +3090,18 @@ def categorical_accuracy(y_true, y_pred): @keras_export('keras.metrics.sparse_categorical_accuracy') def sparse_categorical_accuracy(y_true, y_pred): + """Calculates how often predictions matches integer labels. + + You can provide logits of classes as `y_pred`, since argmax of + logits and probabilities are same. + + Args: + y_true: Integer ground truth values. + y_pred: The prediction values. + + Returns: + Sparse categorical accuracy values. + """ y_pred_rank = ops.convert_to_tensor(y_pred).shape.ndims y_true_rank = ops.convert_to_tensor(y_true).shape.ndims # If the shape of y_true is (num_samples, 1), squeeze to (num_samples,) @@ -2991,6 +3120,17 @@ def sparse_categorical_accuracy(y_true, y_pred): @keras_export('keras.metrics.top_k_categorical_accuracy') def top_k_categorical_accuracy(y_true, y_pred, k=5): + """Computes how often targets are in the top `K` predictions. + + Args: + y_true: The ground truth values. + y_pred: The prediction values. + k: (Optional) Number of top elements to look at for computing accuracy. + Defaults to 5. + + Returns: + Top K categorical accuracy value. + """ return math_ops.cast( nn.in_top_k(y_pred, math_ops.argmax(y_true, axis=-1), k), K.floatx()) @@ -3022,7 +3162,17 @@ def sparse_top_k_categorical_accuracy(y_true, y_pred, k=5): def cosine_proximity(y_true, y_pred, axis=-1): - """Computes the cosine similarity between labels and predictions.""" + """Computes the cosine similarity between labels and predictions. + + Args: + y_true: The ground truth values. + y_pred: The prediction values. + axis: (Optional) Defaults to -1. The dimension along which the cosine + similarity is computed. + + Returns: + Cosine similarity value. + """ y_true = nn.l2_normalize(y_true, axis=axis) y_pred = nn.l2_normalize(y_pred, axis=axis) return math_ops.reduce_sum(y_true * y_pred, axis=axis) From 1f1c2eb8f30525ddcb819d6fc6941c7a84f33e4c Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Wed, 4 Dec 2019 22:34:40 -0800 Subject: [PATCH 137/383] Prepare tensorflow to start using rules_cc for all of its c++ build rules. This allows tensorflow to start experimenting with rules_cc's cc_shared_library implementation. 
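With the indirection in place, dependent BUILD files are expected to pull the
C++ rules through the platform layer rather than loading @rules_cc directly.
A minimal sketch of a consumer (target and file names are illustrative):

    load("//tensorflow/core/platform:rules_cc.bzl", "cc_library")

    cc_library(
        name = "example",
        srcs = ["example.cc"],
    )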
This change is part of the Tensorflow Build Improvements RFC: https://github.com/tensorflow/community/pull/179 PiperOrigin-RevId: 283908432 Change-Id: Id497dfee6211650f0e10ccb1332ac625af2f23fd --- tensorflow/core/platform/default/rules_cc.bzl | 19 +++++++++++++++++++ tensorflow/core/platform/rules_cc.bzl | 16 ++++++++++++++++ tensorflow/workspace.bzl | 10 ++++++++++ 3 files changed, 45 insertions(+) create mode 100644 tensorflow/core/platform/default/rules_cc.bzl create mode 100644 tensorflow/core/platform/rules_cc.bzl diff --git a/tensorflow/core/platform/default/rules_cc.bzl b/tensorflow/core/platform/default/rules_cc.bzl new file mode 100644 index 00000000000..803da0c5612 --- /dev/null +++ b/tensorflow/core/platform/default/rules_cc.bzl @@ -0,0 +1,19 @@ +"""This forwards all of rules_cc's relevant rules under a common file""" + +load( + "@rules_cc//cc:defs.bzl", + _cc_binary = "cc_binary", + _cc_import = "cc_import", + _cc_library = "cc_library", + _cc_test = "cc_test", +) +load( + "@rules_cc//examples:experimental_cc_shared_library.bzl", + _cc_shared_library = "cc_shared_library", +) + +cc_binary = _cc_binary +cc_import = _cc_import +cc_library = _cc_library +cc_shared_library = _cc_shared_library +cc_test = _cc_test diff --git a/tensorflow/core/platform/rules_cc.bzl b/tensorflow/core/platform/rules_cc.bzl new file mode 100644 index 00000000000..e1331fab9fc --- /dev/null +++ b/tensorflow/core/platform/rules_cc.bzl @@ -0,0 +1,16 @@ +"""Provides an indirection layer to bazel cc_rules""" + +load( + "//tensorflow/core/platform:google/rules_cc.bzl", + _cc_binary = "cc_binary", + _cc_import = "cc_import", + _cc_library = "cc_library", + _cc_shared_library = "cc_shared_library", + _cc_test = "cc_test", +) + +cc_binary = _cc_binary +cc_import = _cc_import +cc_library = _cc_library +cc_shared_library = _cc_shared_library +cc_test = _cc_test diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 4c73d2e2c94..0a3c7fe4d89 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -848,6 +848,16 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) + tf_http_archive( + name = "rules_cc", + sha256 = "cf3b76a90c86c0554c5b10f4b160f05af71d252026b71362c4674e2fb9936cf9", + strip_prefix = "rules_cc-01d4a48911d5e7591ecb1c06d3b8af47fe872371", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/bazelbuild/rules_cc/archive/01d4a48911d5e7591ecb1c06d3b8af47fe872371.zip", + "https://github.com/bazelbuild/rules_cc/archive/01d4a48911d5e7591ecb1c06d3b8af47fe872371.zip", + ], + ) + tf_http_archive( name = "build_bazel_rules_android", sha256 = "cd06d15dd8bb59926e4d65f9003bfc20f9da4b2519985c27e190cddc8b7a7806", From 7f7bc2c1a13157aa52fc67c3c4a37263786694ee Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 4 Dec 2019 23:12:35 -0800 Subject: [PATCH 138/383] internal BUILD file cleanup. PiperOrigin-RevId: 283911755 Change-Id: Ie9b298ada8b1b697d48c9abe8b876490309ded39 --- tensorflow/core/BUILD | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 7f042072f49..107a726ea60 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2775,7 +2775,7 @@ tf_cuda_library( deps = [":framework_lite"], ) -# TODO(josh11b): Is this needed, or can we just use ":protos_all"? +# TODO(josh11b): Is this needed, or can we just use ":protos_all_cc"? 
cc_library( name = "protos_cc", visibility = ["//visibility:public"], From 2534ce1676294325de05b5afaa58a78b39c47783 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Wed, 4 Dec 2019 23:45:01 -0800 Subject: [PATCH 139/383] Add MLIRIR as a dependency to LLVM and related dialects Fixes https://github.com/tensorflow/mlir/issues/289 PiperOrigin-RevId: 283914472 Change-Id: Id5fff4822e4a08310e50f2b4a8bb78056bb4cc53 --- third_party/mlir/lib/Dialect/LLVMIR/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/mlir/lib/Dialect/LLVMIR/CMakeLists.txt b/third_party/mlir/lib/Dialect/LLVMIR/CMakeLists.txt index 40bcb572e56..8c53e2dcf33 100644 --- a/third_party/mlir/lib/Dialect/LLVMIR/CMakeLists.txt +++ b/third_party/mlir/lib/Dialect/LLVMIR/CMakeLists.txt @@ -5,7 +5,7 @@ add_llvm_library(MLIRLLVMIR ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR ) add_dependencies(MLIRLLVMIR MLIRLLVMOpsIncGen MLIRLLVMConversionsIncGen LLVMAsmParser LLVMCore LLVMSupport) -target_link_libraries(MLIRLLVMIR LLVMAsmParser LLVMCore LLVMSupport) +target_link_libraries(MLIRLLVMIR LLVMAsmParser LLVMCore LLVMSupport MLIRIR) add_llvm_library(MLIRNVVMIR IR/NVVMDialect.cpp @@ -14,7 +14,7 @@ add_llvm_library(MLIRNVVMIR ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR ) add_dependencies(MLIRNVVMIR MLIRNVVMOpsIncGen MLIRNVVMConversionsIncGen LLVMAsmParser LLVMCore LLVMSupport) -target_link_libraries(MLIRNVVMIR LLVMAsmParser LLVMCore LLVMSupport) +target_link_libraries(MLIRNVVMIR LLVMAsmParser LLVMCore LLVMSupport MLIRIR) add_llvm_library(MLIRROCDLIR IR/ROCDLDialect.cpp @@ -23,4 +23,4 @@ add_llvm_library(MLIRROCDLIR ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/LLVMIR ) add_dependencies(MLIRROCDLIR MLIRROCDLOpsIncGen MLIRROCDLConversionsIncGen LLVMAsmParser LLVMCore LLVMSupport) -target_link_libraries(MLIRROCDLIR LLVMAsmParser LLVMCore LLVMSupport) +target_link_libraries(MLIRROCDLIR LLVMAsmParser LLVMCore LLVMSupport MLIRIR) From d3af7187f21ee3395a50c83fd43234430ff6246a Mon Sep 17 00:00:00 2001 From: Anjali Sridhar Date: Thu, 5 Dec 2019 00:55:04 -0800 Subject: [PATCH 140/383] Remove redundant doc string which seems to be a remnent from previous cleanup. PiperOrigin-RevId: 283921901 Change-Id: I34c6fd2bec097234d16231e596bdd0018fd7af78 --- tensorflow/python/distribute/distribute_lib.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 4a2a8af1840..df9eccc8038 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -652,8 +652,7 @@ class Strategy(object): `.shard` operation to the end of the processing pipeline. This will cause the entire preprocessing pipeline for all the data to be run on every worker, and each worker will do redundant work. We will print a warning - if this method of sharding is selected. In this case, consider using - `experimental_distribute_datasets_from_function` instead. + if this method of sharding is selected. You can disable dataset sharding across workers using the `auto_shard` option in `tf.data.experimental.DistributeOptions`. From 149c9681592c9915c385c3576725624c32a89734 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 01:01:05 -0800 Subject: [PATCH 141/383] Migrate tensorflow/core:protos_all_go_proto dependencies to tensorflow/core/framework:*_go_proto targets where appropriate. 
PiperOrigin-RevId: 283922635 Change-Id: Iff617716be25710f0bdbbd5d02be7044f81febdd --- tensorflow/go/genop/internal/api_def_map.go | 9 ++++--- tensorflow/go/genop/internal/genop.go | 29 +++++++++++---------- tensorflow/go/genop/internal/genop_test.go | 11 ++++---- tensorflow/go/saved_model.go | 2 +- tensorflow/go/signature.go | 2 +- tensorflow/go/signature_test.go | 2 +- 6 files changed, 29 insertions(+), 26 deletions(-) diff --git a/tensorflow/go/genop/internal/api_def_map.go b/tensorflow/go/genop/internal/api_def_map.go index 0bbd88b61c3..ad28df399f6 100644 --- a/tensorflow/go/genop/internal/api_def_map.go +++ b/tensorflow/go/genop/internal/api_def_map.go @@ -31,7 +31,8 @@ import ( "unsafe" "github.com/golang/protobuf/proto" - pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework" + adpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework/api_def_go_proto" + odpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework/op_def_go_proto" ) // Encapsulates a collection of API definitions. @@ -50,7 +51,7 @@ type apiDefMap struct { // https://www.tensorflow.org/code/tensorflow/core/framework/op_def.proto // for OpList proto definition). -func newAPIDefMap(oplist *pb.OpList) (*apiDefMap, error) { +func newAPIDefMap(oplist *odpb.OpList) (*apiDefMap, error) { // Create a buffer containing the serialized OpList. opdefSerialized, err := proto.Marshal(oplist) if err != nil { @@ -97,7 +98,7 @@ func (m *apiDefMap) Put(data string) error { // Returns ApiDef proto instance for the TensorFlow operation // named `opname`. -func (m *apiDefMap) Get(opname string) (*pb.ApiDef, error) { +func (m *apiDefMap) Get(opname string) (*adpb.ApiDef, error) { cname := C.CString(opname) defer C.free(unsafe.Pointer(cname)) status := C.TF_NewStatus() @@ -113,7 +114,7 @@ func (m *apiDefMap) Get(opname string) (*pb.ApiDef, error) { } var ( - apidef = new(pb.ApiDef) + apidef = new(adpb.ApiDef) size = int(apidefBuf.length) // A []byte backed by C memory. // See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go index 1c05715a1a2..15c125e3cf7 100644 --- a/tensorflow/go/genop/internal/genop.go +++ b/tensorflow/go/genop/internal/genop.go @@ -47,7 +47,8 @@ import ( "unsafe" "github.com/golang/protobuf/proto" - pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework" + adpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework/api_def_go_proto" + odpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework/op_def_go_proto" ) // GenerateFunctionsForRegisteredOps writes a Go source code file to w @@ -69,11 +70,11 @@ func GenerateFunctionsForRegisteredOps( return generateFunctionsForOps(w, ops, apimap) } -func registeredOps() (*pb.OpList, *apiDefMap, error) { +func registeredOps() (*odpb.OpList, *apiDefMap, error) { buf := C.TF_GetAllOpList() defer C.TF_DeleteBuffer(buf) var ( - list = new(pb.OpList) + list = new(odpb.OpList) size = int(buf.length) // A []byte backed by C memory. 
// See: https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices @@ -104,7 +105,7 @@ func updateAPIDefs(m *apiDefMap, dir string) error { return nil } -func generateFunctionsForOps(w io.Writer, ops *pb.OpList, apimap *apiDefMap) error { +func generateFunctionsForOps(w io.Writer, ops *odpb.OpList, apimap *apiDefMap) error { thisPackage := reflect.TypeOf(tmplArgs{}).PkgPath() if err := tmplHeader.Execute(w, thisPackage); err != nil { return err @@ -129,7 +130,7 @@ func generateFunctionsForOps(w io.Writer, ops *pb.OpList, apimap *apiDefMap) err return nil } -func generateFunctionForOp(w io.Writer, op *pb.OpDef, apidef *pb.ApiDef) error { +func generateFunctionForOp(w io.Writer, op *odpb.OpDef, apidef *adpb.ApiDef) error { if strings.HasPrefix(op.Name, "_") { // Internal operation return nil } @@ -355,8 +356,8 @@ func {{.Op.Name}} ) type attrWrapper struct { - op *pb.OpDef_AttrDef - api *pb.ApiDef_Attr + op *odpb.OpDef_AttrDef + api *adpb.ApiDef_Attr } func (a *attrWrapper) Name() string { return a.api.Name } @@ -369,8 +370,8 @@ func (a *attrWrapper) Minimum() int64 { return a.op.Minimum } func (a *attrWrapper) DefaultValue() interface{} { return a.api.DefaultValue } type argWrapper struct { - op *pb.OpDef_ArgDef - api *pb.ApiDef_Arg + op *odpb.OpDef_ArgDef + api *adpb.ApiDef_Arg } func (a *argWrapper) Name() string { return a.api.Name } @@ -379,8 +380,8 @@ func (a *argWrapper) Description() string { return a.api.Description } func (a *argWrapper) IsListArg() bool { return isListArg(a.op) } type tmplArgs struct { - Op *pb.OpDef - APIDef *pb.ApiDef + Op *odpb.OpDef + APIDef *adpb.ApiDef // Op.Attr is split into two categories // (1) Required: These must be specified by the client and are thus // included in the function signature. @@ -394,7 +395,7 @@ type tmplArgs struct { OutArgs []*argWrapper } -func newTmplArgs(op *pb.OpDef, apidef *pb.ApiDef) (*tmplArgs, error) { +func newTmplArgs(op *odpb.OpDef, apidef *adpb.ApiDef) (*tmplArgs, error) { ret := tmplArgs{Op: op, APIDef: apidef} // Setup InArgs field @@ -552,11 +553,11 @@ func identifier(s string) string { return s } -func isListArg(argdef *pb.OpDef_ArgDef) bool { +func isListArg(argdef *odpb.OpDef_ArgDef) bool { return argdef.TypeListAttr != "" || argdef.NumberAttr != "" } -func isListAttr(attrdef *pb.OpDef_AttrDef) bool { +func isListAttr(attrdef *odpb.OpDef_AttrDef) bool { list, _ := parseTFType(attrdef.Type) return list } diff --git a/tensorflow/go/genop/internal/genop_test.go b/tensorflow/go/genop/internal/genop_test.go index acce6dea67c..51f3878f919 100644 --- a/tensorflow/go/genop/internal/genop_test.go +++ b/tensorflow/go/genop/internal/genop_test.go @@ -22,13 +22,14 @@ import ( "testing" "github.com/golang/protobuf/proto" - pb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework" + adpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework/api_def_go_proto" + odpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework/op_def_go_proto" ) // Creates an ApiDef based on opdef and applies overrides // from apidefText (ApiDef text proto). 
-func GetAPIDef(t *testing.T, opdef *pb.OpDef, apidefText string) *pb.ApiDef { - opdefList := &pb.OpList{Op: []*pb.OpDef{opdef}} +func GetAPIDef(t *testing.T, opdef *odpb.OpDef, apidefText string) *adpb.ApiDef { + opdefList := &odpb.OpList{Op: []*odpb.OpDef{opdef}} apimap, err := newAPIDefMap(opdefList) if err != nil { t.Fatal(err) @@ -538,8 +539,8 @@ func TestOp(scope *Scope, bb tf.Output, aa tf.Output, optional ...TestOpAttr) (c for _, test := range testdata { t.Run(test.tag, func(t *testing.T) { - var opdef pb.OpDef - var apidef *pb.ApiDef + var opdef odpb.OpDef + var apidef *adpb.ApiDef var buf bytes.Buffer if err := proto.UnmarshalText(test.opdef, &opdef); err != nil { t.Fatal(err) diff --git a/tensorflow/go/saved_model.go b/tensorflow/go/saved_model.go index 5ccb22388d9..90fe3655585 100644 --- a/tensorflow/go/saved_model.go +++ b/tensorflow/go/saved_model.go @@ -22,7 +22,7 @@ import ( "github.com/golang/protobuf/proto" - tfpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework" + tfpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core" ) // #include diff --git a/tensorflow/go/signature.go b/tensorflow/go/signature.go index 2a4842be6aa..67d62577125 100644 --- a/tensorflow/go/signature.go +++ b/tensorflow/go/signature.go @@ -16,7 +16,7 @@ limitations under the License. package tensorflow -import tfpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework" +import tfpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core" // #include "tensorflow/c/c_api.h" import "C" diff --git a/tensorflow/go/signature_test.go b/tensorflow/go/signature_test.go index 37d1650d9ed..a13bbb15aab 100644 --- a/tensorflow/go/signature_test.go +++ b/tensorflow/go/signature_test.go @@ -20,7 +20,7 @@ import ( "fmt" "testing" - tfpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework" + tfpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core" ) func TestSignatureFromProto(t *testing.T) { From 7dfc119b802aff529b7f141738e2394b626c0c47 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 01:02:47 -0800 Subject: [PATCH 142/383] compat: Update forward compatibility horizon to 2019-12-05 PiperOrigin-RevId: 283923158 Change-Id: I94c9f00690031746370f269c9879e4a3f162d9f3 --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 71427c9c237..1152256b463 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 12, 4) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 12, 5) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From d1fb203f18d94cb8179d3529f367e8079ded50ff Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Thu, 5 Dec 2019 01:07:04 -0800 Subject: [PATCH 143/383] Add simple transformations to complete lowering of fused XLA operations. The transformations are at a very basic pattern stage and need generalization but give a good roadmap of the kind of transformations that are required. PiperOrigin-RevId: 283924132 Change-Id: I3f529f16b923ee8679ea1c9f4fff272f9606b660 --- .../compiler/xla/service/mlir_gpu/BUILD | 5 + .../xla/service/mlir_gpu/kernel_lowering.cc | 217 +++++++++++++++++- .../xla/service/mlir_gpu/mlir_compiler.cc | 4 + .../mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc | 150 ++++++------ 4 files changed, 292 insertions(+), 84 deletions(-) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/BUILD b/tensorflow/compiler/xla/service/mlir_gpu/BUILD index e5b6138257b..f7d0aa6b669 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/BUILD +++ b/tensorflow/compiler/xla/service/mlir_gpu/BUILD @@ -139,6 +139,9 @@ cc_library( hdrs = ["kernel_lowering.h"], deps = [ "//tensorflow/compiler/mlir/xla:hlo", + "//tensorflow/compiler/mlir/xla:hlo_legalize_to_lhlo", + "//tensorflow/compiler/mlir/xla:lhlo", + "//tensorflow/compiler/mlir/xla:lhlo_fuse_linalg", "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_affine", "//tensorflow/compiler/mlir/xla:lhlo_legalize_to_linalg", "//tensorflow/compiler/mlir/xla:xla_dialect_registration", @@ -157,10 +160,12 @@ cc_library( "@local_config_mlir//:Linalg", "@local_config_mlir//:LinalgDialectRegistration", "@local_config_mlir//:LoopDialectRegistration", + "@local_config_mlir//:LoopOps", "@local_config_mlir//:LoopsToGPUPass", "@local_config_mlir//:NVVMDialect", "@local_config_mlir//:Pass", "@local_config_mlir//:StandardDialectRegistration", + "@local_config_mlir//:StandardOps", "@local_config_mlir//:Transforms", ], ) diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 7cbbb3ec44e..87042f51ac0 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -25,32 +25,234 @@ limitations under the License. 
#include "mlir/Dialect/GPU/Passes.h" // TF:local_config_mlir #include "mlir/Dialect/LLVMIR/LLVMDialect.h" // TF:local_config_mlir #include "mlir/Dialect/LLVMIR/NVVMDialect.h" // TF:local_config_mlir +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" // TF:local_config_mlir #include "mlir/Dialect/Linalg/Passes.h" // TF:local_config_mlir +#include "mlir/Dialect/LoopOps/LoopOps.h" // TF:local_config_mlir +#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/BlockAndValueMapping.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir #include "mlir/IR/Module.h" // TF:local_config_mlir #include "mlir/IR/OperationSupport.h" // TF:local_config_mlir +#include "mlir/IR/PatternMatch.h" // TF:local_config_mlir +#include "mlir/IR/Region.h" // TF:local_config_mlir #include "mlir/Pass/Pass.h" // TF:local_config_mlir #include "mlir/Pass/PassManager.h" // TF:local_config_mlir #include "mlir/Transforms/DialectConversion.h" // TF:local_config_mlir #include "mlir/Transforms/Passes.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/xla/ir/lhlo_ops.h" #include "tensorflow/compiler/mlir/xla/transforms/passes.h" +#include "tensorflow/compiler/mlir/xla/transforms/rewriters.h" #include "tensorflow/compiler/xla/util.h" namespace xla { namespace mlir_gpu { +namespace { + +using ::mlir::xla_lhlo::FusionOp; + +// Following are some small transformations that are required to clean up code +// after lowering from linalg to loops. + +// A simple pass that applies lowering of HLO to LHLO only within Fusion +// operations. This is needed, as FusionOp is not closed from above and hence +// nested pass managers can not be applied. +struct FusionToLhloConverter + : public mlir::FunctionPass { + void runOnFunction() override { + auto& ctx = getContext(); + mlir::OwningRewritePatternList patterns; + mlir::ConversionTarget target(ctx); + target.addLegalDialect<::mlir::xla_lhlo::XlaLhloDialect>(); + + ::mlir::xla_hlo::populateHLOToLHLOConversionPattern(&ctx, &patterns); + getFunction().walk([&](FusionOp op) { + if (failed(applyPartialConversion(op, target, patterns, nullptr))) { + signalPassFailure(); + } + }); + } +}; + +// Replaces a FusionOp by the operations contained in its region. +struct FusionOpRemover : public mlir::FunctionPass { + void runOnFunction() override { + getFunction().walk([&](FusionOp op) { + mlir::OpBuilder builder(op); + // FusionOp has a single region with a single block, so we can just walk + // over it and clone operations to the outside. + mlir::BlockAndValueMapping mapping; + for (auto& nested_op : op.region().front().without_terminator()) { + auto clone = builder.clone(nested_op, mapping); + for (auto pair : + llvm::zip(nested_op.getResults(), clone->getResults())) { + mapping.map(std::get<0>(pair), std::get<1>(pair)); + } + } + op.erase(); + }); + } +}; + +// Rewrite the single-trip loops we get out of linalg into just their bodies. +// TODO(herhut): Make this a general pattern. 
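+// For illustration (loop dialect syntax approximate), a loop such as
+//   loop.for %iv = %c0 to %c1 step %c1 { <body> }
+// executes exactly once, so <body> can be cloned directly in front of the
+// loop with %iv mapped to the lower bound, and the loop itself erased.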
+struct SingleTripLoopRemoval + : public mlir::FunctionPass { + void runOnFunction() override { + auto getConstantValue = [](mlir::Value* value) -> llvm::Optional { + auto definingOp = value->getDefiningOp(); + if (!definingOp) return llvm::None; + auto constantOp = llvm::dyn_cast(definingOp); + if (!constantOp) return llvm::None; + auto integer = constantOp.getValue().dyn_cast(); + if (!integer) return llvm::None; + return integer.getInt(); + }; + getFunction().walk([&](mlir::loop::ForOp forOp) { + auto lower = getConstantValue(forOp.lowerBound()); + auto upper = getConstantValue(forOp.upperBound()); + auto step = getConstantValue(forOp.step()); + if (!lower || !upper || !step) return; + if ((lower.getValue() < upper.getValue()) && + (lower.getValue() + step.getValue() >= upper.getValue())) { + // This loop has a single trip, so we can move the body in front. + mlir::BlockAndValueMapping mapping; + mlir::OpBuilder b(forOp); + mapping.map(forOp.getInductionVar(), forOp.lowerBound()); + for (auto& nested_op : forOp.getBody()->without_terminator()) { + auto clone = b.clone(nested_op, mapping); + for (auto pair : + llvm::zip(nested_op.getResults(), clone->getResults())) { + mapping.map(std::get<0>(pair), std::get<1>(pair)); + } + } + forOp.erase(); + } + }); + } +}; + +// Simple pass that replaces a load that immediately follows a store to the +// same address with the stored value. This needs generalization. +struct StoreForwardingPass : mlir::FunctionPass { + void runOnFunction() override { + getFunction().walk([&](mlir::LoadOp loadOp) { + auto block = loadOp.getOperation()->getBlock(); + auto iterator = std::find_if(block->rbegin(), block->rend(), + [&loadOp](mlir::Operation& other) { + return &other == loadOp.getOperation(); + }); + if (++iterator == block->rend()) return; + mlir::StoreOp storeOp = llvm::dyn_cast(&*(iterator)); + if (!storeOp) return; + // Check both store to the same value. + if (storeOp.memref() != loadOp.memref()) return; + auto storeIndices = storeOp.getIndices(); + auto loadIndices = loadOp.getIndices(); + if (!std::equal(storeIndices.begin(), storeIndices.end(), + loadIndices.begin(), loadIndices.end())) { + return; + } + loadOp.replaceAllUsesWith(storeOp.getValueToStore()); + loadOp.erase(); + }); + }; +}; + +// Simple pass that removes temporary buffers that are only written to but +// never read from or that are read but the read value is not used. +// Needs an analysis that proves that loads and stores are side-effect free +// (in bounds, no aliasing, etc.). +struct DeadTempBufferRemoval : mlir::FunctionPass { + bool operationConsideredDead(mlir::Operation* op) { + for (auto result : op->getResults()) { + if (!llvm::all_of(result->getUsers(), [&](mlir::Operation* op) { + // Store and Dealloc is OK. + if (llvm::isa(op) || + llvm::isa(op)) { + return true; + } + // Load without uses is also ok. + if (auto loadOp = llvm::dyn_cast(op)) { + return loadOp.use_empty(); + } + // Subview is ok if it is dead itself. + if (llvm::isa(op)) { + return operationConsideredDead(op); + } + return false; + })) { + return false; + } + } + return true; + } + + void recursiveErase(mlir::Operation* op) { + for (auto result : op->getResults()) { + for (auto user : llvm::make_early_inc_range(result->getUsers())) { + recursiveErase(user); + } + } + op->erase(); + } + + void runOnFunction() override { + getFunction().walk([&](mlir::AllocOp allocOp) { + if (!operationConsideredDead(allocOp)) { + return; + } + + // TODO(herhut): There should be a generic helper for this. 
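+      // recursiveErase removes the allocation together with all of its
+      // transitive users (stores, dead loads, subviews and the dealloc), so
+      // nothing is left referencing the buffer.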
+ recursiveErase(allocOp); + }); + } +}; + +// Neat little helper pass to dump the IR inbetween passes. +struct DumpPass : public mlir::ModulePass { + void runOnModule() override { +#if DEBUG + getModule().dump(); +#endif + } +}; + +} // namespace Status LowerLHLOToGPU(mlir::ModuleOp module) { mlir::PassManager pm(module.getContext()); - // Transform element-wise operations to LinAlg. + // First, lower bodies of fusion operations from hlo to lhlo. + pm.addPass(absl::make_unique()); + // Next, we can strip the outer fusion operation. + pm.addPass(absl::make_unique()); + // Transform lhlo operations to LinAlg. pm.addPass(::mlir::xla_lhlo::createLegalizeToLinalgPass()); - // Go from affine to normal loops. + // Fuse linalg operations. This will yield a single tiled loop nest where + // the inner loops are single trip. + pm.addPass(::mlir::xla_lhlo::createLhloFuseLinalg()); + pm.addPass(absl::make_unique()); + // Go from linalg to normal loops. pm.addPass(::mlir::linalg::createConvertLinalgToLoopsPass()); - // Lower affine to ordinary loops. - pm.addPass(::mlir::createLowerAffinePass()); - // Move constants out of the loop. - pm.addPass(::mlir::createLoopInvariantCodeMotionPass()); + pm.addPass(absl::make_unique()); + // Canonicalize the code to simplify index computations. + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); + pm.addPass(absl::make_unique()); + // The innermost loops will be single-trip. + pm.addPass(absl::make_unique()); + pm.addPass(absl::make_unique()); + // Run CSE to ensure that loads and stores to the same subview get + // recognized as such. + pm.addNestedPass<::mlir::FuncOp>(::mlir::createCSEPass()); + pm.addPass(absl::make_unique()); + // Forward stores to buffers to loads. + pm.addPass(absl::make_unique()); + pm.addPass(absl::make_unique()); + // Remove now unused temporary buffers. + pm.addPass(absl::make_unique()); + pm.addPass(absl::make_unique()); // Coalesce generated loops to have 1d loops. pm.addPass(::mlir::createLoopCoalescingPass()); // Transform the now 1d loops to gpu launches. @@ -65,6 +267,7 @@ Status LowerLHLOToGPU(mlir::ModuleOp module) { if (failed(pm.run(module))) { return InternalError("Lowering to GPU kernels failed."); } + return Status::OK(); } @@ -73,7 +276,7 @@ Status LowerKernelBodiesToNVVM(mlir::ModuleOp module) { ::mlir::PassManager pm(module.getContext(), /*verifyPasses=*/false); // Rewrite kernel functions to LLVM IR. - auto &kernelPm = pm.nest<::mlir::ModuleOp>(); + auto& kernelPm = pm.nest<::mlir::ModuleOp>(); kernelPm.addPass(::mlir::createLowerGpuOpsToNVVMOpsPass()); // Some basic cleanup. kernelPm.addNestedPass<::mlir::FuncOp>(::mlir::createCanonicalizerPass()); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc index b035a8ddcb5..92f7e5a08ac 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/mlir_compiler.cc @@ -522,6 +522,10 @@ StatusOr> MlirCompiler::RunBackend( auto llvmModule = mlir::translateModuleToNVVMIR(*kernel_module); + if (!llvmModule) { + return InternalError("Translation to LLVM failed"); + } + llvmModule->setModuleIdentifier(emission_context.getHloModule()->name()); // TODO(herhut): Why is this needed and does not come from the template? 
llvmModule->setDataLayout(gpu::nvptx::kDataLayout); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc index 3ad958dfe6d..e63f9484ddc 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc @@ -113,41 +113,20 @@ ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { ;CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[ARG0]], %[[ARG1]], %[[ARG2]] ;CHECK: } ;CHECK: func @add_kernel(%[[ARG0]]: [[TYPE]], %[[ARG1]]: [[TYPE]], %[[ARG2]]: [[TYPE]] -;CHECK: load %[[ARG0]][[INDEX:.*]] -;CHECK: load %[[ARG1]][[INDEX]] -;CHECK: store %{{.*}}, %[[ARG2]][[INDEX]] +;CHECK-DAG: std.subview %[[ARG0]]{{\[}}[[INDEX:.*]]] +;CHECK-DAG: std.subview %[[ARG1]]{{\[}}[[INDEX]]] +;CHECK-DAG: std.subview %[[ARG2]]{{\[}}[[INDEX]]] +;CHECK: %[[VAL1:.*]] = load %{{.*\[}}[[INDEX:.*]]] +;CHECK: %[[VAL2:.*]] = load %{{.*\[}}[[INDEX]]] +;CHECK: %[[RES:.*]] = addf %[[VAL1]], %[[VAL2]] +;CHECK: store %[[RES]], %{{.*\[}}[[INDEX]]] )", LoweringStage::GPU); } -TEST_F(LhloGenTest, AddInLVVMDialect) { - CompileAndVerifyIr(R"( -HloModule Add - -ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { - %x = f32[2,2]{1,0} parameter(0) - %y = f32[2,2]{1,0} parameter(1) - ROOT %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) -})", - R"( -;CHECK: func @add_kernel(%[[ARG0:.*]]: [[TYPE:!llvm<.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]] -;CHECK: %[[LD0:.*]] = llvm.load %[[ARG0]] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }*"> -;CHECK: %[[LD1:.*]] = llvm.load %[[ARG1]] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }*"> -;CHECK: %[[LD2:.*]] = llvm.load %[[ARG2]] : !llvm<"{ float*, float*, i64, [2 x i64], [2 x i64] }*"> -;CHECK: %[[PTR0:.*]] = llvm.extractvalue %[[LD0]][1] -;CHECK: %[[GEP0:.*]] = llvm.getelementptr %[[PTR0]] -;CHECK: %[[VAL0:.*]] = llvm.load %[[GEP0]] -;CHECK: %[[PTR1:.*]] = llvm.extractvalue %[[LD1]][1] -;CHECK: %[[GEP1:.*]] = llvm.getelementptr %[[PTR1]] -;CHECK: %[[VAL1:.*]] = llvm.load %[[GEP1]] -;CHECK: %[[VAL2:.*]] = llvm.fadd %[[VAL0]], %[[VAL1]] -;CHECK: %[[PTR2:.*]] = llvm.extractvalue %[[LD2]][1] -;CHECK: %[[GEP2:.*]] = llvm.getelementptr %[[PTR2]] -;CHECK: llvm.store %[[VAL2]], %[[GEP2]] - )", - LoweringStage::LLVM); -} - +// This test verifies that the kernel signature is amended correctly. The actual +// body of the generated function does not matter, it is already checked at the +// GPU level above. 
TEST_F(LhloGenTest, AddAsKernel) { CompileAndVerifyIr(R"( HloModule Add @@ -219,20 +198,6 @@ ENTRY %Add (x: f32[2,2], y: f32[2,2]) -> f32[2,2] { ;CHECK: llvm.store %{{.*}}, %[[GEP2ST0]] ;CHECK: %[[GEP2ST1:.*]] = llvm.getelementptr %[[DESC2]] ;CHECK: llvm.store %{{.*}}, %[[GEP2ST1]] - -;CHECK: %[[VL0:.*]] = llvm.load %[[DESC0]] -;CHECK: %[[VL1:.*]] = llvm.load %[[DESC1]] -;CHECK: %[[VL2:.*]] = llvm.load %[[DESC2]] -;CHECK: %[[EV0:.*]] = llvm.extractvalue %[[VL0]][1] -;CHECK: %[[VGEP0:.*]] = llvm.getelementptr %[[EV0]] -;CHECK: %[[VAL0:.*]] = llvm.load %[[VGEP0]] -;CHECK: %[[EV1:.*]] = llvm.extractvalue %[[VL1]][1] -;CHECK: %[[VGEP1:.*]] = llvm.getelementptr %[[EV1]] -;CHECK: %[[VAL1:.*]] = llvm.load %[[VGEP1]] -;CHECK: %[[VAL2:.*]] = llvm.fadd %[[VAL0]], %[[VAL1]] -;CHECK: %[[EV2:.*]] = llvm.extractvalue %[[VL2]][1] -;CHECK: %[[SGEP:.*]] = llvm.getelementptr %[[EV2]] -;CHECK: llvm.store %[[VAL2]], %[[SGEP]] )", LoweringStage::KERNEL); } @@ -262,43 +227,74 @@ ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { )"); } -TEST_F(LhloGenTest, FusedReduce) { +TEST_F(LhloGenTest, AddMultiplyGPU) { CompileAndVerifyIr(R"( -HloModule FusedReduce +HloModule AddMultiply -%add (x: f32[], y: f32[]) -> f32[] { - %x = f32[] parameter(0) - %y = f32[] parameter(1) - ROOT %add = f32[] add(f32[] %x, f32[] %y) -} - -%fused_computation (param: f32[100,10]) -> f32[10] { - %param = f32[100,10] parameter(0) - %constant = f32[] constant(0) - ROOT %reduce = f32[10]{0} reduce(f32[100,10]{1,0} %param, f32[] %constant), dimensions={0}, to_apply=%add -} - -ENTRY %FusedReduce (x: f32[100,10]) -> f32[10] { - %x = f32[100,10] parameter(0) - ROOT %fusion = f32[10]{0} fusion(f32[100,10]{1,0} %x), kind=kInput, calls=%fused_computation -} -)", +ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { + %x = f32[2,2]{1,0} parameter(0) + %y = f32[2,2]{1,0} parameter(1) + %z = f32[2,2]{1,0} parameter(2) + %add = f32[2,2]{1,0} add(f32[2,2]{1,0} %x, f32[2,2]{1,0} %y) + ROOT %mul = f32[2,2]{1,0} multiply(f32[2,2]{1,0} %add, f32[2,2]{1,0} %z) +})", R"( -;CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[RTYPE:.*]]) -;CHECK: "xla_lhlo.fusion"() ( { -;CHECK: %[[REF0:.*]] = tensor_load %arg0 : [[TYPE]] -;CHECK: %[[CT0:.*]] = xla_hlo.constant dense<0.000000e+00> -;CHECK: %[[RED:.*]] = "xla_hlo.reduce"(%0, %1) ( { -;CHECK: ^bb0(%[[BARG0:.*]]: [[ETYPE:.*]], %[[BARG1:.*]]: [[ETYPE]]) -;CHECK: %[[ADD:.*]] = xla_hlo.add %[[BARG0]], %[[BARG1]] : [[ETYPE]] -;CHECK: "xla_hlo.return"(%[[ADD]]) -;CHECK: }) -;CHECK: tensor_store %[[RED]], %[[RESULT]] : [[RTYPE]] -;CHECK: "xla_lhlo.terminator"() -;CHECK-NEXT: }) - )"); +;CHECK: func @fusion_kernel(%[[ARG0:.*]]: [[TYPE:.*]], %[[ARG1:.*]]: [[TYPE]], %[[ARG2:.*]]: [[TYPE]], %[[RESULT:.*]]: [[TYPE]]) +;CHECK-DAG: std.subview %[[ARG0]]{{\[}}[[INDEX:.*]]] +;CHECK-DAG: std.subview %[[ARG1]]{{\[}}[[INDEX]]] +;CHECK-DAG: std.subview %[[ARG2]]{{\[}}[[INDEX]]] +;CHECK-DAG: std.subview %[[RESULT]]{{\[}}[[INDEX]]] +;CHECK: %[[V0:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] +;CHECK: %[[V1:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] +;CHECK: %[[ADD:.*]] = addf %[[V0]], %[[V1]] +;CHECK: %[[V2:.*]] = load %{{.*\[}}[[CSTIDX:.*]]] +;CHECK: %[[MUL:.*]] = mulf %[[ADD]], %[[V2]] +;CHECK: store %[[MUL]], %{{.*\[}}[[CSTIDX:.*]]] +;CHECK-NEXT: return + )", + LoweringStage::GPU); } +// TODO(herhut): Re-enable once we can lower hlo_reduce to proper lhlo_reduce. 
+// TEST_F(LhloGenTest, FusedReduce) { +// CompileAndVerifyIr(R"( +// HloModule FusedReduce +// +// %add (x: f32[], y: f32[]) -> f32[] { +// %x = f32[] parameter(0) +// %y = f32[] parameter(1) +// ROOT %add = f32[] add(f32[] %x, f32[] %y) +// } +// +// %fused_computation (param: f32[100,10]) -> f32[10] { +// %param = f32[100,10] parameter(0) +// %constant = f32[] constant(0) +// ROOT %reduce = f32[10]{0} reduce(f32[100,10]{1,0} %param, f32[] %constant), +// dimensions={0}, to_apply=%add +// } +// +// ENTRY %FusedReduce (x: f32[100,10]) -> f32[10] { +// %x = f32[100,10] parameter(0) +// ROOT %fusion = f32[10]{0} fusion(f32[100,10]{1,0} %x), kind=kInput, +// calls=%fused_computation +// } +// )", +// R"( +// ;CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[RTYPE:.*]]) +// ;CHECK: "xla_lhlo.fusion"() ( { +// ;CHECK: %[[REF0:.*]] = tensor_load %arg0 : [[TYPE]] +// ;CHECK: %[[CT0:.*]] = xla_hlo.constant dense<0.000000e+00> +// ;CHECK: %[[RED:.*]] = "xla_hlo.reduce"(%0, %1) ( { +// ;CHECK: ^bb0(%[[BARG0:.*]]: [[ETYPE:.*]], %[[BARG1:.*]]: [[ETYPE]]) +// ;CHECK: %[[ADD:.*]] = xla_hlo.add %[[BARG0]], %[[BARG1]] : [[ETYPE]] +// ;CHECK: "xla_hlo.return"(%[[ADD]]) +// ;CHECK: }) +// ;CHECK: tensor_store %[[RED]], %[[RESULT]] : [[RTYPE]] +// ;CHECK: "xla_lhlo.terminator"() +// ;CHECK-NEXT: }) +// )"); +// } + TEST_F(LhloGenTest, Broadcast) { CompileAndVerifyIr(R"( HloModule Broadcast From 2c2f30c7d4d4dd90672cf384e4d0f4e6be5bff0c Mon Sep 17 00:00:00 2001 From: Khanh LeViet Date: Thu, 5 Dec 2019 01:37:28 -0800 Subject: [PATCH 144/383] Fixed broken doc link. PiperOrigin-RevId: 283927478 Change-Id: Ia43343523b227de5b2e4bcb3b64bd22caabb3ead --- tensorflow/lite/g3doc/guide/ios.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/g3doc/guide/ios.md b/tensorflow/lite/g3doc/guide/ios.md index fc997bccf9d..dab8e0f0ca2 100644 --- a/tensorflow/lite/g3doc/guide/ios.md +++ b/tensorflow/lite/g3doc/guide/ios.md @@ -7,7 +7,7 @@ example: image classification example For an explanation of the source code, you should also read -[TensorFlow Lite iOS image classification](https://www.tensorflow.org/lite/models/image_classification/ios). +[TensorFlow Lite iOS image classification](https://github.com/tensorflow/examples/blob/master/lite/examples/image_classification/ios/EXPLORE_THE_CODE.md). This example app uses [image classification](https://www.tensorflow.org/lite/models/image_classification/overview) From d364d465d7926f8cdbe4991f96418f16a1ffa92a Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 5 Dec 2019 01:37:29 -0800 Subject: [PATCH 145/383] Open source hlo_module_loader. This is in preparation of open sourcing another tool. 
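A rough usage sketch of the new helper (the input path is illustrative and
error handling is elided):

    #include "tensorflow/compiler/xla/tools/hlo_module_loader.h"

    // The format is inferred from the file extension (hlo/txt, pb, or pbtxt).
    xla::StatusOr<std::unique_ptr<xla::HloModule>> module_or =
        xla::LoadModuleFromFile("/tmp/module.hlo");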
PiperOrigin-RevId: 283927480 Change-Id: I0f38a0e6a1fcdded1b0e1c28ff62d07e51bb1cc9 --- tensorflow/compiler/xla/tools/BUILD | 27 ++++ .../compiler/xla/tools/hlo_module_loader.cc | 125 ++++++++++++++++++ .../compiler/xla/tools/hlo_module_loader.h | 79 +++++++++++ .../xla/tools/hlo_module_loader_test.cc | 48 +++++++ 4 files changed, 279 insertions(+) create mode 100644 tensorflow/compiler/xla/tools/hlo_module_loader.cc create mode 100644 tensorflow/compiler/xla/tools/hlo_module_loader.h create mode 100644 tensorflow/compiler/xla/tools/hlo_module_loader_test.cc diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index da20d28ea81..d18cf667848 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -252,3 +252,30 @@ sh_test( srcs = ["interactive_graphviz_test.sh"], data = [":interactive_graphviz"], ) + +cc_library( + name = "hlo_module_loader", + srcs = ["hlo_module_loader.cc"], + hdrs = ["hlo_module_loader.h"], + deps = [ + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/core:lib", + "//tensorflow/core:regexp_internal", + "@com_google_absl//absl/strings", + "@com_google_protobuf//:protobuf_headers", + ], +) + +tf_cc_test( + name = "hlo_module_loader_test", + srcs = ["hlo_module_loader_test.cc"], + deps = [ + ":hlo_module_loader", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + "//tensorflow/core:test", + ], +) diff --git a/tensorflow/compiler/xla/tools/hlo_module_loader.cc b/tensorflow/compiler/xla/tools/hlo_module_loader.cc new file mode 100644 index 00000000000..8eb170b25e5 --- /dev/null +++ b/tensorflow/compiler/xla/tools/hlo_module_loader.cc @@ -0,0 +1,125 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Emits an HLO module in a text form suitable for diffing. 
+ +#include "tensorflow/compiler/xla/tools/hlo_module_loader.h" + +#include +#include +#include + +#include "google/protobuf/text_format.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_join.h" +#include "absl/strings/str_split.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/regexp.h" + +namespace xla { +namespace { + +Status OverrideConfig(const hlo_module_loader_details::Config& ovr_config, + HloModuleConfig* config) { + config->set_replica_count(ovr_config.num_replicas); + return Status::OK(); +} + +} // namespace + +string StripLogHeaders(const string& hlo_string) { + // I0521 12:04:45.883483 1509 service.cc:186] ... + static RE2* matcher = new RE2( + "[IWEF]\\d{4} " + "\\d{2}:\\d{2}:\\d{2}\\.\\d+\\s+\\d+\\s+[^:]+:\\d+\\]\\s?(.*)"); + absl::string_view matches[4]; + std::vector lines = absl::StrSplit(hlo_string, '\n'); + for (auto& line : lines) { + if (matcher->Match(line, 0, line.size(), RE2::ANCHOR_START, matches, 4)) { + line = string(matches[1]); + } + } + return absl::StrJoin(lines, "\n", [](string* out, const string& line) { + absl::StrAppend(out, line); + }); +} + +StatusOr> LoadModuleFromData( + const string& data, const string& format, + hlo_module_loader_details::Config ovr_config, + const std::function& config_modifier_hook) { + DebugOptions debug_options = GetDebugOptionsFromFlags(); + std::unique_ptr module; + if (format == "hlo" || format == "txt") { + string hlo_string = StripLogHeaders(data); + HloModuleConfig config; + config.set_debug_options(debug_options); + TF_RETURN_IF_ERROR(OverrideConfig(ovr_config, &config)); + if (config_modifier_hook) { + config_modifier_hook(&config); + } + TF_ASSIGN_OR_RETURN(module, + ParseAndReturnUnverifiedModule(hlo_string, config)); + } else { + HloSnapshot proto; + if (format == "pb") { + if (!proto.ParseFromString(data) && + !proto.mutable_hlo()->ParseFromString(data)) { + return InvalidArgument("Failed to parse input as HLO protobuf binary"); + } + } else if (format == "pbtxt") { + if (!proto2::TextFormat::ParseFromString(data, &proto) && + !proto2::TextFormat::ParseFromString(data, proto.mutable_hlo())) { + return InvalidArgument("Failed to parse input as HLO protobuf text"); + } + } else { + return InvalidArgument( + "Invalid format from file extension: '%s'. 
Expected: hlo, txt, pb, " + "or pbtxt", + format); + } + TF_ASSIGN_OR_RETURN(HloModuleConfig config, + HloModule::CreateModuleConfigFromProto( + proto.hlo().hlo_module(), debug_options)); + TF_RETURN_IF_ERROR(OverrideConfig(ovr_config, &config)); + if (config_modifier_hook) { + config_modifier_hook(&config); + } + TF_ASSIGN_OR_RETURN( + module, HloModule::CreateFromProto(proto.hlo().hlo_module(), config)); + } + return std::move(module); +} + +StatusOr> LoadModuleFromFile( + const string& path, hlo_module_loader_details::Config ovr_config, + string format, + const std::function& config_modifier_hook) { + string data; + if (format.empty()) { + format = string(tensorflow::io::Extension(path)); + } + TF_RETURN_IF_ERROR( + tensorflow::ReadFileToString(tensorflow::Env::Default(), path, &data)); + return LoadModuleFromData(data, format, ovr_config, config_modifier_hook); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/tools/hlo_module_loader.h b/tensorflow/compiler/xla/tools/hlo_module_loader.h new file mode 100644 index 00000000000..8e174cef08f --- /dev/null +++ b/tensorflow/compiler/xla/tools/hlo_module_loader.h @@ -0,0 +1,79 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_HLO_MODULE_LOADER_H_ +#define TENSORFLOW_COMPILER_XLA_TOOLS_HLO_MODULE_LOADER_H_ + +#include +#include + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { +namespace hlo_module_loader_details { + +struct Config { + Config() {} + int64 num_replicas = 1; +}; + +} // namespace hlo_module_loader_details + +// Given a string composed by multiple lines, strip the log headers, if present +// at the beginning of each line. +string StripLogHeaders(const string& hlo_string); + +// Loads an HLO module from a string. +// The data can have the followings formats: +// 1) A binary of text proto file, the proto should be in xla.HloProto type. It +// can be a binary proto (format must be "pb"), or a text proto (format must +// be "pbtxt"). +// 2) A hlo text dump, the string should be in HloModule::ToString() format +// (format must be "txt" or "hlo"). The input data can also contain log +// headers, which will be stripped. +// The ovr_config data can be used to override certain fields of the +// HloModuleConfig. +// The HloModuleConfig is passed to config_modifier_hook for custom +// modifications before use. +StatusOr> LoadModuleFromData( + const string& data, const string& format, + hlo_module_loader_details::Config ovr_config = + hlo_module_loader_details::Config(), + const std::function& config_modifier_hook = {}); + +// Loads an HLO module from file. +// The file can be one of the followings: +// 1) A binary of text proto file, the proto should be in xla.HloProto type. It +// can be a binary proto (with .pb extension), or a text proto (with a .pbtxt +// extension). 
+// 2) A hlo text dump, the string should be in HloModule::ToString() format +// (with a .hlo or .txt extension). A text file can also contain log headers, +// which will be stripped. +// If the format is specified (not empty), it overrides the one guessed from the +// file extension. The ovr_config data can be used to override certain fields of +// the HloModuleConfig. +// The HloModuleConfig is passed to config_modifier_hook for custom +// modifications before use. +StatusOr> LoadModuleFromFile( + const string& path, + hlo_module_loader_details::Config ovr_config = + hlo_module_loader_details::Config(), + string format = "", + const std::function& config_modifier_hook = {}); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_TOOLS_HLO_MODULE_LOADER_H_ diff --git a/tensorflow/compiler/xla/tools/hlo_module_loader_test.cc b/tensorflow/compiler/xla/tools/hlo_module_loader_test.cc new file mode 100644 index 00000000000..e88d03e6b33 --- /dev/null +++ b/tensorflow/compiler/xla/tools/hlo_module_loader_test.cc @@ -0,0 +1,48 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/tools/hlo_module_loader.h" + +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace { + +class HloModuleLoaderTest : public HloTestBase {}; + +TEST_F(HloModuleLoaderTest, StripsLogHeaders) { + const string& hlo_string = R"( +I0521 12:04:45.883483 1509 service.cc:186] HloModule test_log_stripping +I0521 12:04:45.883483 1509 service.cc:186] +I0521 12:04:45.883483 1509 service.cc:186] ENTRY entry { +I0521 12:04:45.883483 1509 service.cc:186] p0 = f32[4]{0} parameter(0) +I0521 12:04:45.883483 1509 service.cc:186] p1 = f32[4]{0} parameter(1) +I0521 12:04:45.883483 1509 service.cc:186] add = f32[4]{0} add(p0, p1) +I0521 12:04:45.883483 1509 service.cc:186] ROOT rooty = (f32[4]{0}, f32[4]{0}) tuple(p1, add) +I0521 12:04:45.883483 1509 service.cc:186] } +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr hlo_module, + LoadModuleFromData(hlo_string, "txt")); + EXPECT_NE(FindInstruction(hlo_module.get(), "p0"), nullptr); + EXPECT_NE(FindInstruction(hlo_module.get(), "p1"), nullptr); + EXPECT_NE(FindInstruction(hlo_module.get(), "add"), nullptr); + EXPECT_NE(FindInstruction(hlo_module.get(), "rooty"), nullptr); +} + +} // namespace +} // namespace xla From 8057c58eef5b848b53f6cbe8f3c57d69113f7733 Mon Sep 17 00:00:00 2001 From: Tiezhen WANG Date: Thu, 5 Dec 2019 01:43:32 -0800 Subject: [PATCH 146/383] TFLM: nit: use more robust way to initialize context_; This is future proof once we have more than one constructors. 
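A minimal sketch of the pattern (hypothetical class, not the actual TFLM
code): with a default member initializer, every present and future
constructor value-initializes the POD struct, instead of relying on each
constructor's initializer list to remember to do so.

  struct Context { int flags; void* impl; };  // stand-in for a POD struct

  class Interpreter {
   public:
    Interpreter() {}                        // context_ is value-initialized here
    explicit Interpreter(int /*flags*/) {}  // and here, with no extra code
   private:
    Context context_ = {};  // zero-initializes every member of the POD
  };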
PiperOrigin-RevId: 283928214 Change-Id: Ic23d112b9d29351921ade6ee6735199eca71aa25 --- tensorflow/lite/experimental/micro/micro_interpreter.cc | 1 - tensorflow/lite/experimental/micro/micro_interpreter.h | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc index 5cc545f1460..7185d643514 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.cc +++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc @@ -68,7 +68,6 @@ MicroInterpreter::MicroInterpreter(const Model* model, : model_(model), op_resolver_(op_resolver), error_reporter_(error_reporter), - context_(), allocator_(&context_, model_, tensor_arena, tensor_arena_size, error_reporter_), tensors_allocated_(false) { diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.h b/tensorflow/lite/experimental/micro/micro_interpreter.h index 3e8a969874a..f34e29e06ad 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.h +++ b/tensorflow/lite/experimental/micro/micro_interpreter.h @@ -119,7 +119,8 @@ class MicroInterpreter { const Model* model_; const OpResolver& op_resolver_; ErrorReporter* error_reporter_; - TfLiteContext context_; + // Explicitly initialize TfLiteContext POD struct. + TfLiteContext context_ = {}; MicroAllocator allocator_; bool tensors_allocated_; From 389fa0598c5a1d0ffdfefa8ce24aab7d5d0f8864 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Thu, 5 Dec 2019 01:45:39 -0800 Subject: [PATCH 147/383] Add legalization of HLO reduce to LHLO reduce. PiperOrigin-RevId: 283928453 Change-Id: Ib4d878e41473fe41c1ef20f269542aa0f248b723 --- .../xla/transforms/hlo_legalize_to_lhlo.cc | 85 ++++++++++++++++++- .../xla/service/mlir_gpu/kernel_lowering.cc | 2 +- .../mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc | 77 +++++++++-------- 3 files changed, 121 insertions(+), 43 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc index 58d5b7aa02b..af5fb599dca 100644 --- a/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/xla/transforms/hlo_legalize_to_lhlo.cc @@ -18,6 +18,7 @@ limitations under the License. #include "absl/memory/memory.h" #include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/BlockAndValueMapping.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir #include "mlir/IR/Location.h" // TF:local_config_mlir @@ -38,13 +39,19 @@ namespace { constexpr StringRef kTempBufferAttr = "temp"; -Value* GetTensorStoreMemRef(Value* value) { +Value* GetTensorStoreOrReturnMemRef(Value* value) { for (const auto& user : value->getUsers()) { if (auto tensor_store = dyn_cast(user)) { if (tensor_store.getOperand(0) == value) { return tensor_store.getOperand(1); } } + if (auto return_op = dyn_cast(user)) { + if (return_op.getOperand(0) == value) { + auto block = return_op.getOperation()->getBlock(); + return *block->args_rbegin(); + } + } } return nullptr; } @@ -88,8 +95,8 @@ Value* InsertAllocAndDealloc(Location loc, Value* result, /// function to store that values held in the tensor. 
Value* GetBufferForResultValue(Location loc, Value* result, ConversionPatternRewriter* rewriter) { - if (auto tensor_store_memref = GetTensorStoreMemRef(result)) { - return tensor_store_memref; + if (auto existing_memref = GetTensorStoreOrReturnMemRef(result)) { + return existing_memref; } return InsertAllocAndDealloc(loc, result, rewriter); } @@ -122,6 +129,62 @@ class HloToLhloOpConverter : public ConversionPattern { } }; +struct HloToLHloReduceConverter + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + PatternMatchResult matchAndRewrite( + xla_hlo::ReduceOp op, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + auto loc = op.getLoc(); + // TODO(b/137624192) Implement variadic reduce. + if (op.getNumResults() != 1) return matchFailure(); + if (op.getParentRegion()->getBlocks().size() != 1) { + emitError(loc, + "tensor to buffer conversion expects a single block in the " + "region containing the operation"); + } + const auto& original_results = op.getResults(); + SmallVector buffer_args(operands.begin(), operands.end()); + for (auto result : original_results) { + buffer_args.push_back(GetBufferForResultValue(loc, result, &rewriter)); + } + auto new_op = rewriter.create( + loc, llvm::None, buffer_args, op.getAttrs()); + + // Copy over the operations inside the region. + rewriter.inlineRegionBefore(op.body(), new_op.body(), new_op.body().end()); + + // Create new block arguments with correct type. + auto& entry_block = new_op.body().front(); + int original_arg_count = entry_block.getNumArguments(); + for (int i = 0; i < original_arg_count; ++i) { + auto old_arg = entry_block.getArgument(i); + auto old_type = old_arg->getType().cast(); + auto new_type = + MemRefType::get(old_type.getShape(), old_type.getElementType()); + auto new_arg = entry_block.addArgument(new_type); + rewriter.replaceUsesOfBlockArgument(old_arg, new_arg); + } + // Add an argument for the result. + entry_block.addArgument( + entry_block.getArgument(original_arg_count)->getType()); + // Remove the old arguments. + for (int i = original_arg_count - 1; i >= 0; --i) { + entry_block.eraseArgument(i); + } + // Insert terminator at the end. + rewriter.setInsertionPointToEnd(&entry_block); + rewriter.create(loc); + + rewriter.replaceOp(op, ArrayRef(buffer_args).slice(operands.size()), + llvm::to_vector<4>(original_results)); + + return matchSuccess(); + } +}; + class HloToLhloTensorLoadConverter : public ConversionPattern { public: explicit HloToLhloTensorLoadConverter(MLIRContext* context) @@ -135,6 +198,7 @@ class HloToLhloTensorLoadConverter : public ConversionPattern { } }; +// TODO(b/137624192): Rewrite into a copy and elide copy if possible. class HloToLhloTensorStoreConverter : public ConversionPattern { public: explicit HloToLhloTensorStoreConverter(MLIRContext* context) @@ -148,6 +212,19 @@ class HloToLhloTensorStoreConverter : public ConversionPattern { } }; +// TODO(b/137624192): Rewrite into a copy and elide copy if possible. +class HloToLhloReturnConverter : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + PatternMatchResult matchAndRewrite( + xla_hlo::ReturnOp op, ArrayRef operands, + ConversionPatternRewriter& rewriter) const final { + rewriter.eraseOp(op); + return matchSuccess(); + } +}; + // Lowers from HLO dialect to LHLO dialect allocating/deallocating temporary // buffers if necessary. 
// @@ -215,6 +292,7 @@ void populateHLOToLHLOConversionPattern(MLIRContext* context, xla_lhlo::BroadcastInDimOp>, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, @@ -229,6 +307,7 @@ void populateHLOToLHLOConversionPattern(MLIRContext* context, HloToLhloOpConverter, HloToLhloOpConverter, HloToLhloOpConverter, + HloToLHloReduceConverter, HloToLhloReturnConverter, HloToLhloTensorLoadConverter, HloToLhloTensorStoreConverter >(context); // clang-format on diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index 87042f51ac0..c749af3a1c3 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -65,8 +65,8 @@ struct FusionToLhloConverter mlir::OwningRewritePatternList patterns; mlir::ConversionTarget target(ctx); target.addLegalDialect<::mlir::xla_lhlo::XlaLhloDialect>(); - ::mlir::xla_hlo::populateHLOToLHLOConversionPattern(&ctx, &patterns); + getFunction().walk([&](FusionOp op) { if (failed(applyPartialConversion(op, target, patterns, nullptr))) { signalPassFailure(); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc index e63f9484ddc..e3b736dc6c6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/tests/mlir_gpu_lhlo_gen_test.cc @@ -255,45 +255,44 @@ ENTRY %AddMultiply (x: f32[2,2], y: f32[2,2], z: f32[2,2]) -> f32[2,2] { LoweringStage::GPU); } -// TODO(herhut): Re-enable once we can lower hlo_reduce to proper lhlo_reduce. 
-// TEST_F(LhloGenTest, FusedReduce) { -// CompileAndVerifyIr(R"( -// HloModule FusedReduce -// -// %add (x: f32[], y: f32[]) -> f32[] { -// %x = f32[] parameter(0) -// %y = f32[] parameter(1) -// ROOT %add = f32[] add(f32[] %x, f32[] %y) -// } -// -// %fused_computation (param: f32[100,10]) -> f32[10] { -// %param = f32[100,10] parameter(0) -// %constant = f32[] constant(0) -// ROOT %reduce = f32[10]{0} reduce(f32[100,10]{1,0} %param, f32[] %constant), -// dimensions={0}, to_apply=%add -// } -// -// ENTRY %FusedReduce (x: f32[100,10]) -> f32[10] { -// %x = f32[100,10] parameter(0) -// ROOT %fusion = f32[10]{0} fusion(f32[100,10]{1,0} %x), kind=kInput, -// calls=%fused_computation -// } -// )", -// R"( -// ;CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[RTYPE:.*]]) -// ;CHECK: "xla_lhlo.fusion"() ( { -// ;CHECK: %[[REF0:.*]] = tensor_load %arg0 : [[TYPE]] -// ;CHECK: %[[CT0:.*]] = xla_hlo.constant dense<0.000000e+00> -// ;CHECK: %[[RED:.*]] = "xla_hlo.reduce"(%0, %1) ( { -// ;CHECK: ^bb0(%[[BARG0:.*]]: [[ETYPE:.*]], %[[BARG1:.*]]: [[ETYPE]]) -// ;CHECK: %[[ADD:.*]] = xla_hlo.add %[[BARG0]], %[[BARG1]] : [[ETYPE]] -// ;CHECK: "xla_hlo.return"(%[[ADD]]) -// ;CHECK: }) -// ;CHECK: tensor_store %[[RED]], %[[RESULT]] : [[RTYPE]] -// ;CHECK: "xla_lhlo.terminator"() -// ;CHECK-NEXT: }) -// )"); -// } +TEST_F(LhloGenTest, FusedReduce) { + CompileAndVerifyIr(R"( +HloModule FusedReduce + +%add (x: f32[], y: f32[]) -> f32[] { + %x = f32[] parameter(0) + %y = f32[] parameter(1) + ROOT %add = f32[] add(f32[] %x, f32[] %y) +} + +%fused_computation (param: f32[100,10]) -> f32[10] { + %param = f32[100,10] parameter(0) + %constant = f32[] constant(0) + ROOT %reduce = f32[10]{0} reduce(f32[100,10]{1,0} %param, f32[] %constant), + dimensions={0}, to_apply=%add +} + +ENTRY %FusedReduce (x: f32[100,10]) -> f32[10] { + %x = f32[100,10] parameter(0) + ROOT %fusion = f32[10]{0} fusion(f32[100,10]{1,0} %x), kind=kInput, + calls=%fused_computation +} +)", + R"( +;CHECK: func @fusion(%[[ARG0:.*]]: [[TYPE:.*]], %[[RESULT:.*]]: [[RTYPE:.*]]) +;CHECK: "xla_lhlo.fusion"() ( { +;CHECK: %[[REF0:.*]] = tensor_load %arg0 : [[TYPE]] +;CHECK: %[[CT0:.*]] = xla_hlo.constant dense<0.000000e+00> +;CHECK: %[[RED:.*]] = "xla_hlo.reduce"(%0, %1) ( { +;CHECK: ^bb0(%[[BARG0:.*]]: [[ETYPE:.*]], %[[BARG1:.*]]: [[ETYPE]]) +;CHECK: %[[ADD:.*]] = xla_hlo.add %[[BARG0]], %[[BARG1]] : [[ETYPE]] +;CHECK: "xla_hlo.return"(%[[ADD]]) +;CHECK: }) +;CHECK: tensor_store %[[RED]], %[[RESULT]] : [[RTYPE]] +;CHECK: "xla_lhlo.terminator"() +;CHECK-NEXT: }) + )"); +} TEST_F(LhloGenTest, Broadcast) { CompileAndVerifyIr(R"( From 0e3f480596c3b5dbbc0203839bb46bc17c0739fc Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 02:46:51 -0800 Subject: [PATCH 148/383] Add TPU to saved_model/integration_tests. Along the way, pass --alsologtostderr to subprocesses, so that their logs it shows up in the main test's output. 
PiperOrigin-RevId: 283935606 Change-Id: If0320bcb3b0e1ac1650591b8d8161f143e41d140 --- .../distribution_strategy_utils.py | 11 ++++++++++- .../integration_tests/integration_scripts.py | 14 +++++++++++++- .../saved_model/integration_tests/mnist_util.py | 2 +- .../integration_tests/saved_model_test.py | 3 ++- 4 files changed, 26 insertions(+), 4 deletions(-) diff --git a/tensorflow/examples/saved_model/integration_tests/distribution_strategy_utils.py b/tensorflow/examples/saved_model/integration_tests/distribution_strategy_utils.py index ea91c97a5a0..1501e61d475 100644 --- a/tensorflow/examples/saved_model/integration_tests/distribution_strategy_utils.py +++ b/tensorflow/examples/saved_model/integration_tests/distribution_strategy_utils.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import collections +import sys from tensorflow.python.distribute import strategy_combinations @@ -28,11 +29,19 @@ _strategies = [ strategy_combinations.mirrored_strategy_with_one_gpu, strategy_combinations.mirrored_strategy_with_gpu_and_cpu, strategy_combinations.mirrored_strategy_with_two_gpus, + strategy_combinations.tpu_strategy, ] +# TODO(b/145386854): The presence of GPU strategies upsets TPU initialization, +# despite their test instances being skipped early on. +if "test_tpu" in sys.argv[0]: + _strategies = [s for s in _strategies if "GPU" not in str(s)] + named_strategies = collections.OrderedDict( - [(None, None)] + [(str(s), s) for s in _strategies]) + [(None, None)] + + [(str(s), s) for s in _strategies] +) class MaybeDistributionScope(object): diff --git a/tensorflow/examples/saved_model/integration_tests/integration_scripts.py b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py index fae61ddbd9a..b4e37fba5bc 100644 --- a/tensorflow/examples/saved_model/integration_tests/integration_scripts.py +++ b/tensorflow/examples/saved_model/integration_tests/integration_scripts.py @@ -34,6 +34,7 @@ import subprocess import sys from absl import app +from absl import flags as absl_flags import tensorflow.compat.v2 as tf from tensorflow.python.platform import tf_logging as logging @@ -49,10 +50,21 @@ class TestCase(tf.test.TestCase): command_parts = [sys.executable, run_script] else: command_parts = [run_script] + command_parts.append("--alsologtostderr") # For visibility in sponge. for flag_key, flag_value in flags.items(): command_parts.append("--%s=%s" % (flag_key, flag_value)) + + # TODO(b/143247229): Remove forwarding this flag once the BUILD rule + # `distribute_py_test()` stops setting it. 
+ deepsea_flag_name = "register_deepsea_platform" + deepsea_flag_value = getattr(absl_flags.FLAGS, deepsea_flag_name, None) + if deepsea_flag_value is not None: + command_parts.append("--%s=%s" % (deepsea_flag_name, + str(deepsea_flag_value).lower())) + env = dict(TF2_BEHAVIOR="enabled", SCRIPT_NAME=script_name) - logging.info("Running: %s with environment flags %s" % (command_parts, env)) + logging.info("Running %s with added environment variables %s" % + (command_parts, env)) subprocess.check_call(command_parts, env=dict(os.environ, **env)) diff --git a/tensorflow/examples/saved_model/integration_tests/mnist_util.py b/tensorflow/examples/saved_model/integration_tests/mnist_util.py index 8e4cdac748f..9770c849603 100644 --- a/tensorflow/examples/saved_model/integration_tests/mnist_util.py +++ b/tensorflow/examples/saved_model/integration_tests/mnist_util.py @@ -33,7 +33,7 @@ def _load_random_data(num_train_and_test): def load_reshaped_data(use_fashion_mnist=False, fake_tiny_data=False): """Returns MNIST or Fashion MNIST or fake train and test data.""" - load = ((lambda: _load_random_data([16, 128])) if fake_tiny_data else + load = ((lambda: _load_random_data([128, 128])) if fake_tiny_data else tf.keras.datasets.fashion_mnist.load_data if use_fashion_mnist else tf.keras.datasets.mnist.load_data) (x_train, y_train), (x_test, y_test) = load() diff --git a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py index 232a5b5e1ba..d97b93418af 100644 --- a/tensorflow/examples/saved_model/integration_tests/saved_model_test.py +++ b/tensorflow/examples/saved_model/integration_tests/saved_model_test.py @@ -89,7 +89,8 @@ class SavedModelTest(scripts.TestCase, parameterized.TestCase): retrain_flag_value=["true", "false"], regularization_loss_multiplier=[None, 2], # Test for b/134528831. )), - test_combinations=[combinations.NamedGPUCombination()]) + test_combinations=(combinations.NamedGPUCombination(), + combinations.NamedTPUCombination())) @combinations.generate(**TEST_MNIST_CNN_GENERATE_KWARGS) def test_mnist_cnn(self, use_keras_save_api, named_strategy, From e0c933f2b6bc1aadaf81fdc2b2747429f850a973 Mon Sep 17 00:00:00 2001 From: Tres Popp Date: Thu, 5 Dec 2019 03:56:18 -0800 Subject: [PATCH 149/383] Move ModuleManager functionality into mlir::SymbolTable. 
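As a rough before/after sketch of a typical call site (hypothetical pass
code, not taken from this change; the exact mapping is listed below):

  // Before:
  ModuleManager manager(module);
  if (!manager.lookupSymbol<FuncOp>("foo")) manager.insert(func);

  // After:
  SymbolTable table(module);
  if (!table.lookup<FuncOp>("foo")) table.insert(func);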
Note for broken code, the following transformations occurred: ModuleManager::insert(Block::iterator, Operation*) - > SymbolTable::insert(Operation*, Block::iterator) ModuleManager::lookupSymbol -> SymbolTable::lookup ModuleManager::getModule() -> SymbolTable::getOp() ModuleManager::getContext() -> SymbolTable::getOp()->getContext() ModuleManager::* -> SymbolTable::* PiperOrigin-RevId: 283944635 Change-Id: Ic85c75df6aa855ff6f82b3265f78265c8055f1ee --- .../transforms/lower_static_tensor_list.cc | 4 +- .../transforms/optimize_functional_ops.cc | 6 +- .../transforms/cluster_outlining.cc | 14 ++--- .../tensorflow/transforms/tpu_rewrite_pass.cc | 6 +- third_party/mlir/bindings/python/pybind.cpp | 8 +-- third_party/mlir/include/mlir/IR/Module.h | 50 ----------------- .../mlir/include/mlir/IR/SymbolTable.h | 32 ++++++----- .../GPU/Transforms/KernelOutlining.cpp | 20 +++---- third_party/mlir/lib/IR/SymbolTable.cpp | 55 +++++++++++++------ third_party/mlir/test/BUILD | 1 + .../mlir/test/lib/IR/TestSymbolUses.cpp | 18 +++++- .../mlir/test/lib/TestDialect/TestOps.td | 6 ++ 12 files changed, 107 insertions(+), 113 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc index bf0e7169584..89be9ee8442 100644 --- a/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc +++ b/tensorflow/compiler/mlir/lite/transforms/lower_static_tensor_list.cc @@ -484,9 +484,9 @@ struct ConvertTensorListResize : public ConversionPattern { &rewriter); // Inserts the two blocks' names into the symbol table held by the module. - // Using ModuleManager will ensure that the inserted symbol names are + // Using SymbolTable will ensure that the inserted symbol names are // unique. - ModuleManager manager(resize_op.getParentOfType()); + SymbolTable manager(resize_op.getParentOfType()); manager.insert(then_branch_op); manager.insert(else_branch_op); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc index c8b54d26653..173785ba5b0 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_functional_ops.cc @@ -135,15 +135,15 @@ class FoldIfOp : public OpRewritePattern { static void EraseDeadFuncs(const FuncSet& candiate_funcs, ModuleOp module) { if (candiate_funcs.empty()) return; - ModuleManager manager(module); + SymbolTable manager(module); // Identify the functions that are used as symbols in the module and shouldn't // be erased. FuncSet in_use_funcs; - manager.getModule().walk([&](Operation* op) { + manager.getOp()->walk([&](Operation* op) { for (auto attr : op->getAttrs()) { if (auto symbol = attr.second.dyn_cast()) { - auto func = manager.lookupSymbol(symbol.getValue()); + auto func = manager.lookup(symbol.getValue()); in_use_funcs.insert(func); } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc index 7dab06124dc..67517129f33 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/cluster_outlining.cc @@ -52,8 +52,8 @@ void ReplaceLaunchReturnWithReturn(tf_device::ReturnOp launch_return_op, // Builds a function that outlines region attached to launch_op and inserts // built function into given module. 
FuncOp BuildFunction(StringRef device, llvm::ArrayRef live_ins, - tf_device::LaunchOp launch_op, - ModuleManager* module_manager, OpBuilder* builder) { + tf_device::LaunchOp launch_op, SymbolTable* symbol_table, + OpBuilder* builder) { llvm::SmallVector operand_types; operand_types.reserve(live_ins.size()); for (Value* v : live_ins) operand_types.emplace_back(v->getType()); @@ -92,14 +92,14 @@ FuncOp BuildFunction(StringRef device, llvm::ArrayRef live_ins, builder->setInsertionPoint(launch_return_op); ReplaceLaunchReturnWithReturn(launch_return_op, builder); - module_manager->insert(outlined_func); + symbol_table->insert(outlined_func); return outlined_func; } // Outlines body of `tf_device.launch` into a function and create a // `tf_device.launch_func` to invoke that function. `tf_device.launch` is // removed afterwards.` -void OutlineLaunch(tf_device::LaunchOp launch_op, ModuleManager* module_manager, +void OutlineLaunch(tf_device::LaunchOp launch_op, SymbolTable* symbol_table, OpBuilder* builder) { llvm::SetVector live_ins; getUsedValuesDefinedAbove(launch_op.body(), launch_op.body(), live_ins); @@ -108,7 +108,7 @@ void OutlineLaunch(tf_device::LaunchOp launch_op, ModuleManager* module_manager, launch_op.getAttrOfType(kDeviceAttr).getValue(); FuncOp outlined_func = BuildFunction(device, live_ins.getArrayRef(), - launch_op, module_manager, builder); + launch_op, symbol_table, builder); launch_op.setAttr(builder->getIdentifier(kFuncAttr), builder->getSymbolRefAttr(outlined_func.getName())); @@ -124,10 +124,10 @@ void OutlineLaunch(tf_device::LaunchOp launch_op, ModuleManager* module_manager, void ClusterOutliningPass::runOnModule() { ModuleOp m = getModule(); - ModuleManager module_manager(m); + SymbolTable symbol_table(m); OpBuilder builder(m.getContext()); m.walk([&](tf_device::LaunchOp launch) { - OutlineLaunch(launch, &module_manager, &builder); + OutlineLaunch(launch, &symbol_table, &builder); }); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc index c5bf918a496..1033670dd1c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_rewrite_pass.cc @@ -109,13 +109,13 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, return parent_module.emitError(CreateMissingAttributeMsg(kVersionsAttr)); module_for_func.get().getOperation()->setAttr(kVersionsAttr, versions_attr); - ModuleManager module_manager(module_for_func.get()); + SymbolTable symbol_table(module_for_func.get()); while (!referenced.empty()) { auto func = referenced.pop_back_val(); // Skip functions that have already been cloned into new module. - if (module_manager.lookupSymbol(func.getName())) continue; + if (symbol_table.lookup(func.getName())) continue; // Find any SymbolRefAttr in func that maps to a FuncOp. We need to clone // all found FuncOps to new_module to make sure new_module is @@ -138,7 +138,7 @@ LogicalResult EncapsulateFuncAndSerialize(FuncOp entry_func, // should be no other reference to it. clone.setName("main"); } - module_manager.insert(clone); + symbol_table.insert(clone); } // Serialize module and return. 
diff --git a/third_party/mlir/bindings/python/pybind.cpp b/third_party/mlir/bindings/python/pybind.cpp index b1be0d21336..e3333b669be 100644 --- a/third_party/mlir/bindings/python/pybind.cpp +++ b/third_party/mlir/bindings/python/pybind.cpp @@ -153,7 +153,7 @@ struct PythonMLIRModule { PythonMLIRModule() : mlirContext(), module(mlir::ModuleOp::create(mlir::UnknownLoc::get(&mlirContext))), - moduleManager(*module) {} + symbolTable(*module) {} PythonType makeMemRefType(PythonType elemType, std::vector sizes) { return ::makeMemRefType(mlir_context_t{&mlirContext}, elemType, @@ -270,7 +270,7 @@ struct PythonMLIRModule { } PythonFunction getNamedFunction(const std::string &name) { - return moduleManager.lookupSymbol(name); + return symbolTable.lookup(name); } PythonFunctionContext @@ -282,7 +282,7 @@ private: mlir::MLIRContext mlirContext; // One single module in a python-exposed MLIRContext for now. mlir::OwningModuleRef module; - mlir::ModuleManager moduleManager; + mlir::SymbolTable symbolTable; // An execution engine and an associated target machine. The latter must // outlive the former since it may be used by the transformation layers. @@ -692,7 +692,7 @@ PythonMLIRModule::declareFunction(const std::string &name, UnknownLoc::get(&mlirContext), name, mlir::Type::getFromOpaquePointer(funcType).cast(), attrs, inputAttrs); - moduleManager.insert(func); + symbolTable.insert(func); return func; } diff --git a/third_party/mlir/include/mlir/IR/Module.h b/third_party/mlir/include/mlir/IR/Module.h index 9ac985ff586..1ff885d4b66 100644 --- a/third_party/mlir/include/mlir/IR/Module.h +++ b/third_party/mlir/include/mlir/IR/Module.h @@ -118,56 +118,6 @@ public: static void build(Builder *, OperationState &) {} }; -//===----------------------------------------------------------------------===// -// Module Manager. -//===----------------------------------------------------------------------===// - -/// A class used to manage the symbols held by a module. This class handles -/// ensures that symbols inserted into a module have a unique name, and provides -/// efficient named lookup to held symbols. -class ModuleManager { -public: - ModuleManager(ModuleOp module) : module(module), symbolTable(module) {} - - /// Look up a symbol with the specified name, returning null if no such - /// name exists. Names must never include the @ on them. - template T lookupSymbol(NameTy &&name) const { - return symbolTable.lookup(name); - } - - /// Look up a symbol with the specified name, returning null if no such - /// name exists. Names must never include the @ on them. - template Operation *lookupSymbol(NameTy &&name) const { - return symbolTable.lookup(name); - } - - /// Insert a new symbol into the module, auto-renaming it as necessary. - void insert(Operation *op) { - symbolTable.insert(op); - module.push_back(op); - } - void insert(Block::iterator insertPt, Operation *op) { - symbolTable.insert(op); - module.insert(insertPt, op); - } - - /// Remove the given symbol from the module symbol table and then erase it. - void erase(Operation *op) { - symbolTable.erase(op); - op->erase(); - } - - /// Return the internally held module. - ModuleOp getModule() const { return module; } - - /// Return the context of the internal module. - MLIRContext *getContext() { return module.getContext(); } - -private: - ModuleOp module; - SymbolTable symbolTable; -}; - /// This class acts as an owning reference to a module, and will automatically /// destroy the held module if valid. 
class OwningModuleRef { diff --git a/third_party/mlir/include/mlir/IR/SymbolTable.h b/third_party/mlir/include/mlir/IR/SymbolTable.h index 58084183dac..ea7986172cb 100644 --- a/third_party/mlir/include/mlir/IR/SymbolTable.h +++ b/third_party/mlir/include/mlir/IR/SymbolTable.h @@ -23,15 +23,16 @@ namespace mlir { class Identifier; -class MLIRContext; class Operation; /// This class allows for representing and managing the symbol table used by -/// operations with the 'SymbolTable' trait. +/// operations with the 'SymbolTable' trait. Inserting into and erasing from +/// this SymbolTable will also insert and erase from the Operation given to it +/// at construction. class SymbolTable { public: /// Build a symbol table with the symbols within the given operation. - SymbolTable(Operation *op); + SymbolTable(Operation *symbolTableOp); /// Look up a symbol with the specified name, returning null if no such /// name exists. Names never include the @ on them. @@ -44,15 +45,16 @@ public: void erase(Operation *symbol); /// Insert a new symbol into the table, and rename it as necessary to avoid - /// collisions. - void insert(Operation *symbol); - - /// Returns the context held by this symbol table. - MLIRContext *getContext() const { return context; } + /// collisions. Also insert at the specified location in the body of the + /// associated operation. + void insert(Operation *symbol, Block::iterator insertPt = {}); /// Return the name of the attribute used for symbol names. static StringRef getSymbolAttrName() { return "sym_name"; } + /// Returns the associated operation. + Operation *getOp() const { return symbolTableOp; } + //===--------------------------------------------------------------------===// // Symbol Utilities //===--------------------------------------------------------------------===// @@ -60,7 +62,7 @@ public: /// Returns the operation registered with the given symbol name with the /// regions of 'symbolTableOp'. 'symbolTableOp' is required to be an operation /// with the 'OpTrait::SymbolTable' trait. - static Operation *lookupSymbolIn(Operation *symbolTableOp, StringRef symbol); + static Operation *lookupSymbolIn(Operation *op, StringRef symbol); /// Returns the operation registered with the given symbol name within the /// closest parent operation of, or including, 'from' with the @@ -118,11 +120,11 @@ public: /// are any unknown operations that may potentially be symbol tables. static Optional getSymbolUses(StringRef symbol, Operation *from); - /// Return if the given symbol is known to have no uses that are nested within - /// the given operation 'from'. This does not traverse into any nested symbol - /// tables, and will also only count uses on 'from' if it does not also define - /// a symbol table. This is because we treat the region as the boundary of - /// the symbol table, and not the op itself. This function will also return + /// Return if the given symbol is known to have no uses that are nested + /// within the given operation 'from'. This does not traverse into any nested + /// symbol tables, and will also only count uses on 'from' if it does not also + /// define a symbol table. This is because we treat the region as the boundary + /// of the symbol table, and not the op itself. This function will also return /// false if there are any unknown operations that may potentially be symbol /// tables. This doesn't necessarily mean that there are no uses, we just /// can't convervatively prove it. 
@@ -141,7 +143,7 @@ public: Operation *from); private: - MLIRContext *context; + Operation *symbolTableOp; /// This is a mapping from a name to the symbol with that name. llvm::StringMap symbolTable; diff --git a/third_party/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp b/third_party/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp index 235a74ba1c3..81d585219a1 100644 --- a/third_party/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp +++ b/third_party/mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp @@ -24,6 +24,7 @@ #include "mlir/Dialect/StandardOps/Ops.h" #include "mlir/IR/BlockAndValueMapping.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/SymbolTable.h" #include "mlir/Pass/Pass.h" using namespace mlir; @@ -155,7 +156,7 @@ namespace { class GpuKernelOutliningPass : public ModulePass { public: void runOnModule() override { - ModuleManager moduleManager(getModule()); + SymbolTable symbolTable(getModule()); bool modified = false; for (auto func : getModule().getOps()) { // Insert just after the function. @@ -166,8 +167,8 @@ public: // Create nested module and insert outlinedFunc. The module will // originally get the same name as the function, but may be renamed on // insertion into the parent module. - auto kernelModule = createKernelModule(outlinedFunc, moduleManager); - moduleManager.insert(insertPt, kernelModule); + auto kernelModule = createKernelModule(outlinedFunc, symbolTable); + symbolTable.insert(kernelModule, insertPt); // Potentially changes signature, pulling in constants. convertToLaunchFuncOp(op, outlinedFunc); @@ -185,16 +186,15 @@ public: private: // Returns a module containing kernelFunc and all callees (recursive). ModuleOp createKernelModule(FuncOp kernelFunc, - const ModuleManager &parentModuleManager) { + const SymbolTable &parentSymbolTable) { auto context = getModule().getContext(); Builder builder(context); auto kernelModule = ModuleOp::create(builder.getUnknownLoc(), kernelFunc.getName()); kernelModule.setAttr(gpu::GPUDialect::getKernelModuleAttrName(), builder.getUnitAttr()); - ModuleManager moduleManager(kernelModule); - - moduleManager.insert(kernelFunc); + SymbolTable symbolTable(kernelModule); + symbolTable.insert(kernelFunc); llvm::SmallVector symbolDefWorklist = {kernelFunc}; while (!symbolDefWorklist.empty()) { @@ -203,13 +203,13 @@ private: for (SymbolTable::SymbolUse symbolUse : *symbolUses) { StringRef symbolName = symbolUse.getSymbolRef().cast().getValue(); - if (moduleManager.lookupSymbol(symbolName)) + if (symbolTable.lookup(symbolName)) continue; Operation *symbolDefClone = - parentModuleManager.lookupSymbol(symbolName)->clone(); + parentSymbolTable.lookup(symbolName)->clone(); symbolDefWorklist.push_back(symbolDefClone); - moduleManager.insert(symbolDefClone); + symbolTable.insert(symbolDefClone); } } } diff --git a/third_party/mlir/lib/IR/SymbolTable.cpp b/third_party/mlir/lib/IR/SymbolTable.cpp index 44d56071479..b61308b74af 100644 --- a/third_party/mlir/lib/IR/SymbolTable.cpp +++ b/third_party/mlir/lib/IR/SymbolTable.cpp @@ -31,23 +31,24 @@ static bool isPotentiallyUnknownSymbolTable(Operation *op) { //===----------------------------------------------------------------------===// /// Build a symbol table with the symbols within the given operation. 
-SymbolTable::SymbolTable(Operation *op) : context(op->getContext()) { - assert(op->hasTrait() && +SymbolTable::SymbolTable(Operation *symbolTableOp) + : symbolTableOp(symbolTableOp) { + assert(symbolTableOp->hasTrait() && "expected operation to have SymbolTable trait"); - assert(op->getNumRegions() == 1 && + assert(symbolTableOp->getNumRegions() == 1 && "expected operation to have a single region"); + assert(has_single_element(symbolTableOp->getRegion(0)) && + "expected operation to have a single block"); - for (auto &block : op->getRegion(0)) { - for (auto &op : block) { - auto nameAttr = op.getAttrOfType(getSymbolAttrName()); - if (!nameAttr) - continue; + for (auto &op : symbolTableOp->getRegion(0).front()) { + auto nameAttr = op.getAttrOfType(getSymbolAttrName()); + if (!nameAttr) + continue; - auto inserted = symbolTable.insert({nameAttr.getValue(), &op}); - (void)inserted; - assert(inserted.second && - "expected region to contain uniquely named symbol operations"); - } + auto inserted = symbolTable.insert({nameAttr.getValue(), &op}); + (void)inserted; + assert(inserted.second && + "expected region to contain uniquely named symbol operations"); } } @@ -61,18 +62,32 @@ Operation *SymbolTable::lookup(StringRef name) const { void SymbolTable::erase(Operation *symbol) { auto nameAttr = symbol->getAttrOfType(getSymbolAttrName()); assert(nameAttr && "expected valid 'name' attribute"); + assert(symbol->getParentOp() == symbolTableOp && + "expected this operation to be inside of the operation with this " + "SymbolTable"); auto it = symbolTable.find(nameAttr.getValue()); - if (it != symbolTable.end() && it->second == symbol) + if (it != symbolTable.end() && it->second == symbol) { symbolTable.erase(it); + symbol->erase(); + } } -/// Insert a new symbol into the table, and rename it as necessary to avoid -/// collisions. -void SymbolTable::insert(Operation *symbol) { +/// Insert a new symbol into the table and associated operation, and rename it +/// as necessary to avoid collisions. +void SymbolTable::insert(Operation *symbol, Block::iterator insertPt) { auto nameAttr = symbol->getAttrOfType(getSymbolAttrName()); assert(nameAttr && "expected valid 'name' attribute"); + auto &body = symbolTableOp->getRegion(0).front(); + if (insertPt == Block::iterator() || insertPt == body.end()) + insertPt = Block::iterator(body.getTerminator()); + + assert(insertPt->getParentOp() == symbolTableOp && + "expected insertPt to be in the associated module operation"); + + body.getOperations().insert(insertPt, symbol); + // Add this symbol to the symbol table, uniquing the name if a conflict is // detected. if (symbolTable.insert({nameAttr.getValue(), symbol}).second) @@ -89,7 +104,8 @@ void SymbolTable::insert(Operation *symbol) { nameBuffer += '_'; nameBuffer += std::to_string(uniquingCounter++); } while (!symbolTable.insert({nameBuffer, symbol}).second); - symbol->setAttr(getSymbolAttrName(), StringAttr::get(nameBuffer, context)); + symbol->setAttr(getSymbolAttrName(), + StringAttr::get(nameBuffer, symbolTableOp->getContext())); } /// Returns the operation registered with the given symbol name with the @@ -136,6 +152,9 @@ LogicalResult OpTrait::impl::verifySymbolTable(Operation *op) { if (op->getNumRegions() != 1) return op->emitOpError() << "Operations with a 'SymbolTable' must have exactly one region"; + if (!has_single_element(op->getRegion(0))) + return op->emitOpError() + << "Operations with a 'SymbolTable' must have exactly one block"; // Check that all symbols are uniquely named within child regions. 
llvm::StringMap nameToOrigLoc; diff --git a/third_party/mlir/test/BUILD b/third_party/mlir/test/BUILD index 25f7b8399eb..63138125ed0 100644 --- a/third_party/mlir/test/BUILD +++ b/third_party/mlir/test/BUILD @@ -101,6 +101,7 @@ cc_library( "lib/IR/TestSymbolUses.cpp", ], deps = [ + ":TestDialect", "@llvm//:support", "@local_config_mlir//:IR", "@local_config_mlir//:Pass", diff --git a/third_party/mlir/test/lib/IR/TestSymbolUses.cpp b/third_party/mlir/test/lib/IR/TestSymbolUses.cpp index c8e1da18760..8ef4bb48a1c 100644 --- a/third_party/mlir/test/lib/IR/TestSymbolUses.cpp +++ b/third_party/mlir/test/lib/IR/TestSymbolUses.cpp @@ -15,6 +15,7 @@ // limitations under the License. // ============================================================================= +#include "TestDialect.h" #include "mlir/IR/Function.h" #include "mlir/Pass/Pass.h" @@ -22,10 +23,11 @@ using namespace mlir; namespace { /// This is a symbol test pass that tests the symbol uselist functionality -/// provided by the symbol table. +/// provided by the symbol table along with erasing from the symbol table. struct SymbolUsesPass : public ModulePass { void runOnModule() override { auto module = getModule(); + std::vector ops_to_delete; for (FuncOp func : module.getOps()) { // Test computing uses on a non symboltable op. @@ -45,6 +47,8 @@ struct SymbolUsesPass : public ModulePass { // Test the functionality of symbolKnownUseEmpty. if (func.symbolKnownUseEmpty(module)) { func.emitRemark() << "function has no uses"; + if (func.getBody().empty()) + ops_to_delete.push_back(func); continue; } @@ -58,6 +62,18 @@ struct SymbolUsesPass : public ModulePass { func.emitRemark() << "function has " << llvm::size(*symbolUses) << " uses"; } + + for (FuncOp func : ops_to_delete) { + // In order to test the SymbolTable::erase method, also erase completely + // useless functions. + SymbolTable table(module); + auto func_name = func.getName(); + assert(table.lookup(func_name) && "expected no unknown operations"); + table.erase(func); + assert(!table.lookup(func_name) && + "expected erased operation to be unknown now"); + module.emitRemark() << func_name << " function successfully erased"; + } } }; diff --git a/third_party/mlir/test/lib/TestDialect/TestOps.td b/third_party/mlir/test/lib/TestDialect/TestOps.td index 6952eaa7717..d998eb37e74 100644 --- a/third_party/mlir/test/lib/TestDialect/TestOps.td +++ b/third_party/mlir/test/lib/TestDialect/TestOps.td @@ -92,6 +92,12 @@ def SymbolScopeOp : TEST_Op<"symbol_scope", let regions = (region SizedRegion<1>:$region); } +def SymbolTableRegionOp : TEST_Op<"symbol_table_region", [SymbolTable]> { + let summary = "operation which defines a new symbol table without a " + "restriction on a terminator"; + let regions = (region SizedRegion<1>:$region); +} + //===----------------------------------------------------------------------===// // Test Operands //===----------------------------------------------------------------------===// From c3c42f9afe3f88317a0263b66833fa8b32adcb88 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Thu, 5 Dec 2019 04:32:13 -0800 Subject: [PATCH 150/383] Fix MLIR Build after LLVM upstream JIT changes (getMainJITDylib removed) The getMainJITDylib() method was removed in 4fc68b9b7f, replace it by creating a JITDylib on the fly. 
PiperOrigin-RevId: 283948595 Change-Id: Ib6b6933188e5e36bced2be03a5fe08cb41101cf0 --- third_party/mlir/lib/ExecutionEngine/ExecutionEngine.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/third_party/mlir/lib/ExecutionEngine/ExecutionEngine.cpp b/third_party/mlir/lib/ExecutionEngine/ExecutionEngine.cpp index 58f783824bc..2913c436ad5 100644 --- a/third_party/mlir/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/third_party/mlir/lib/ExecutionEngine/ExecutionEngine.cpp @@ -234,9 +234,12 @@ Expected> ExecutionEngine::create( auto objectLayer = std::make_unique( session, []() { return std::make_unique(); }); auto dataLayout = deserModule->getDataLayout(); + llvm::orc::JITDylib *mainJD = session.getJITDylibByName("
"); + if (!mainJD) + mainJD = &session.createJITDylib("
"); // Resolve symbols that are statically linked in the current process. - session.getMainJITDylib().addGenerator( + mainJD->addGenerator( cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess( dataLayout.getGlobalPrefix()))); From 0ea36de9ee4bfda89966599b58daf45122b33743 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 5 Dec 2019 04:39:06 -0800 Subject: [PATCH 151/383] [spirv] Fix nested loop (de)serialization For serialization, when we have nested ops, the inner loop will create multiple SPIR-V blocks. If the outer loop has block arguments (which corresponds to OpPhi instructions), we defer the handling of OpPhi's parent block handling until we serialized all blocks and then fix it up with the result . These two cases happening together was generating invalid SPIR-V blob because we previously assume the parent block to be the block containing the terminator. That is not true anymore when the block contains structured control flow ops. If that happens, it should be fixed to use the structured control flow op's merge block. For deserialization, we record a map from header blocks to their corresponding merge and continue blocks during the initial deserialization and then use the info to construct spv.selection/spv.loop. The existing implementation will also fall apart when we have nested loops. If so, we clone all blocks for the outer loop, including the ones for the inner loop, to the spv.loop's region. So the map for header blocks' merge info need to be updated; otherwise we are operating on already deleted blocks. PiperOrigin-RevId: 283949230 Change-Id: I3bc26d671073d6d704f4b5eda3e1e3bab05cd803 --- .../SPIRV/Serialization/Deserializer.cpp | 121 ++++++++++++------ .../SPIRV/Serialization/Serializer.cpp | 32 +++-- 2 files changed, 108 insertions(+), 45 deletions(-) diff --git a/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp b/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp index 11509bb7688..2011c750d83 100644 --- a/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp +++ b/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp @@ -61,6 +61,23 @@ static inline bool isFnEntryBlock(Block *block) { } namespace { +/// A struct for containing a header block's merge and continue targets. +/// +/// This struct is used to track original structured control flow info from +/// SPIR-V blob. This info will be used to create spv.selection/spv.loop +/// later. +struct BlockMergeInfo { + Block *mergeBlock; + Block *continueBlock; // nullptr for spv.selection + + BlockMergeInfo() : mergeBlock(nullptr), continueBlock(nullptr) {} + BlockMergeInfo(Block *m, Block *c = nullptr) + : mergeBlock(m), continueBlock(c) {} +}; + +/// Map from a selection/loop's header block to its merge (and continue) target. +using BlockMergeInfoMap = DenseMap; + /// A SPIR-V module serializer. /// /// A SPIR-V binary module is a single linear stream of instructions; each @@ -253,15 +270,6 @@ private: // block and redirect all branches to the old header block to the old // merge block (which contains the spv.selection/spv.loop op now). - /// A struct for containing a header block's merge and continue targets. - struct BlockMergeInfo { - Block *mergeBlock; - Block *continueBlock; // nullptr for spv.selection - - BlockMergeInfo() : mergeBlock(nullptr), continueBlock(nullptr) {} - BlockMergeInfo(Block *m, Block *c = nullptr) - : mergeBlock(m), continueBlock(c) {} - }; /// For OpPhi instructions, we use block arguments to represent them. 
OpPhi /// encodes a list of (value, predecessor) pairs. At the time of handling the @@ -433,7 +441,7 @@ private: DenseMap blockMap; // Header block to its merge (and continue) target mapping. - DenseMap blockMergeInfo; + BlockMergeInfoMap blockMergeInfo; // Block to its phi (block argument) mapping. DenseMap blockPhiInfo; @@ -1648,17 +1656,21 @@ public: /// This method will create an spv.loop op in the `mergeBlock` and move all /// blocks in the structured loop into the spv.loop's region. All branches to /// the `headerBlock` will be redirected to the `mergeBlock`. - static LogicalResult structurize(Location loc, Block *headerBlock, - Block *mergeBlock, Block *continueBlock) { - return ControlFlowStructurizer(loc, headerBlock, mergeBlock, continueBlock) + /// This method will also update `mergeInfo` by remapping all blocks inside to + /// the newly cloned ones inside structured control flow op's regions. + static LogicalResult structurize(Location loc, BlockMergeInfoMap &mergeInfo, + Block *headerBlock, Block *mergeBlock, + Block *continueBlock) { + return ControlFlowStructurizer(loc, mergeInfo, headerBlock, mergeBlock, + continueBlock) .structurizeImpl(); } private: - ControlFlowStructurizer(Location loc, Block *header, Block *merge, - Block *cont) - : location(loc), headerBlock(header), mergeBlock(merge), - continueBlock(cont) {} + ControlFlowStructurizer(Location loc, BlockMergeInfoMap &mergeInfo, + Block *header, Block *merge, Block *cont) + : location(loc), blockMergeInfo(mergeInfo), headerBlock(header), + mergeBlock(merge), continueBlock(cont) {} /// Creates a new spv.selection op at the beginning of the `mergeBlock`. spirv::SelectionOp createSelectionOp(); @@ -1666,14 +1678,15 @@ private: /// Creates a new spv.loop op at the beginning of the `mergeBlock`. spirv::LoopOp createLoopOp(); - /// Collects all blocks reachable from `headerBlock` except `mergeBlock` and - /// `continueBlock` into `constructBlocks`. + /// Collects all blocks reachable from `headerBlock` except `mergeBlock`. void collectBlocksInConstruct(); LogicalResult structurizeImpl(); Location location; + BlockMergeInfoMap &blockMergeInfo; + Block *headerBlock; Block *mergeBlock; Block *continueBlock; // nullptr for spv.selection @@ -1713,10 +1726,11 @@ void ControlFlowStructurizer::collectBlocksInConstruct() { // Put the header block in the work list first. constructBlocks.insert(headerBlock); - // For each item in the work list, add its successors under conditions. + // For each item in the work list, add its successors excluding the merge + // block. for (unsigned i = 0; i < constructBlocks.size(); ++i) { for (auto *successor : constructBlocks[i]->getSuccessors()) - if (successor != mergeBlock && successor != continueBlock) + if (successor != mergeBlock) constructBlocks.insert(successor); } } @@ -1741,11 +1755,6 @@ LogicalResult ControlFlowStructurizer::structurizeImpl() { mapper.map(mergeBlock, &body.back()); collectBlocksInConstruct(); - if (isLoop) { - // Add the loop continue block at the last so it's the second to last block - // in LoopOp's region. - constructBlocks.insert(continueBlock); - } // We've identified all blocks belonging to the selection/loop's region. Now // need to "move" them into the selection/loop. 
Instead of really moving the @@ -1779,8 +1788,11 @@ LogicalResult ControlFlowStructurizer::structurizeImpl() { auto *newArg = newBlock->addArgument(blockArg->getType()); mapper.map(blockArg, newArg); LLVM_DEBUG(llvm::dbgs() << "[cf] remapped block argument " << blockArg - << " to " << newArg); + << " to " << newArg << '\n'); } + } else { + LLVM_DEBUG(llvm::dbgs() + << "[cf] block " << block << " is a function entry block\n"); } for (auto &op : *block) @@ -1833,13 +1845,40 @@ LogicalResult ControlFlowStructurizer::structurizeImpl() { // All the blocks cloned into the SelectionOp/LoopOp's region can now be // cleaned up. LLVM_DEBUG(llvm::dbgs() << "[cf] cleaning up blocks after clone\n"); - // First we need to drop all uses on ops inside all blocks. This is needed - // because we can have blocks referencing SSA values from one another. + // First we need to drop all operands' references inside all blocks. This is + // needed because we can have blocks referencing SSA values from one another. for (auto *block : constructBlocks) block->dropAllReferences(); - // Then erase all blocks except the old header block. + // Then erase all old blocks. for (auto *block : constructBlocks) { + // We've cloned all blocks belonging to this construct into the structured + // control flow op's region. Among these blocks, some may compose another + // selection/loop. If so, they will be recorded within blockMergeInfo. + // We need to update the pointers there to the newly remapped ones so we can + // continue structurizing them later. + // TODO(antiagainst): The asserts in the following assumes input SPIR-V blob + // forms correctly nested selection/loop constructs. We should relax this + // and support error cases better. + auto it = blockMergeInfo.find(block); + if (it != blockMergeInfo.end()) { + Block *newHeader = mapper.lookupOrNull(block); + assert(newHeader && "nested loop header block should be remapped!"); + + Block *newContinue = it->second.continueBlock; + if (newContinue) { + newContinue = mapper.lookupOrNull(newContinue); + assert(newContinue && "nested loop continue block should be remapped!"); + } + + Block *newMerge = it->second.mergeBlock; + if (Block *mappedTo = mapper.lookupOrNull(newMerge)) + newMerge = mappedTo; + + blockMergeInfo.try_emplace(newHeader, newMerge, newContinue); + blockMergeInfo.erase(it); + } + // The structured selection/loop's entry block does not have arguments. 
// If the function's header block is also part of the structured control // flow, we cannot just simply erase it because it may contain arguments @@ -1858,6 +1897,11 @@ LogicalResult ControlFlowStructurizer::structurizeImpl() { } } + LLVM_DEBUG( + llvm::dbgs() << "[cf] after structurizing construct with header block " + << headerBlock << ":\n" + << *op << '\n'); + return success(); } @@ -1913,13 +1957,13 @@ LogicalResult Deserializer::wireUpBlockArgument() { LogicalResult Deserializer::structurizeControlFlow() { LLVM_DEBUG(llvm::dbgs() << "[cf] start structurizing control flow\n"); - for (const auto &info : blockMergeInfo) { - auto *headerBlock = info.first; + while (!blockMergeInfo.empty()) { + Block *headerBlock = blockMergeInfo.begin()->first; + BlockMergeInfo mergeInfo = blockMergeInfo.begin()->second; + LLVM_DEBUG(llvm::dbgs() << "[cf] header block " << headerBlock << ":\n"); LLVM_DEBUG(headerBlock->print(llvm::dbgs())); - const auto &mergeInfo = info.second; - auto *mergeBlock = mergeInfo.mergeBlock; assert(mergeBlock && "merge block cannot be nullptr"); if (!mergeBlock->args_empty()) @@ -1934,11 +1978,14 @@ LogicalResult Deserializer::structurizeControlFlow() { LLVM_DEBUG(continueBlock->print(llvm::dbgs())); } - if (failed(ControlFlowStructurizer::structurize(unknownLoc, headerBlock, - mergeBlock, continueBlock))) + // Erase this case before calling into structurizer, who will update + // blockMergeInfo. + blockMergeInfo.erase(blockMergeInfo.begin()); + if (failed(ControlFlowStructurizer::structurize(unknownLoc, blockMergeInfo, + headerBlock, mergeBlock, + continueBlock))) return failure(); } - blockMergeInfo.clear(); LLVM_DEBUG(llvm::dbgs() << "[cf] completed structurizing control flow\n"); return success(); diff --git a/third_party/mlir/lib/Dialect/SPIRV/Serialization/Serializer.cpp b/third_party/mlir/lib/Dialect/SPIRV/Serialization/Serializer.cpp index ebe3ceba336..ebafcb8675e 100644 --- a/third_party/mlir/lib/Dialect/SPIRV/Serialization/Serializer.cpp +++ b/third_party/mlir/lib/Dialect/SPIRV/Serialization/Serializer.cpp @@ -43,14 +43,12 @@ using namespace mlir; /// Encodes an SPIR-V instruction with the given `opcode` and `operands` into /// the given `binary` vector. -LogicalResult encodeInstructionInto(SmallVectorImpl &binary, - spirv::Opcode op, - ArrayRef operands) { +static LogicalResult encodeInstructionInto(SmallVectorImpl &binary, + spirv::Opcode op, + ArrayRef operands) { uint32_t wordCount = 1 + operands.size(); binary.push_back(spirv::getPrefixedOpcode(wordCount, op)); - if (!operands.empty()) { binary.append(operands.begin(), operands.end()); - } return success(); } @@ -84,6 +82,18 @@ static LogicalResult visitInPrettyBlockOrder( return success(); } +/// Returns the last structured control flow op's merge block if the given +/// `block` contains any structured control flow op. Otherwise returns nullptr. +static Block *getLastStructuredControlFlowOpMergeBlock(Block *block) { + for (Operation &op : llvm::reverse(block->getOperations())) { + if (auto selectionOp = dyn_cast(op)) + return selectionOp.getMergeBlock(); + if (auto loopOp = dyn_cast(op)) + return loopOp.getMergeBlock(); + } + return nullptr; +} + namespace { /// A SPIR-V module serializer. @@ -1375,11 +1385,17 @@ LogicalResult Serializer::emitPhiForBlockArguments(Block *block) { // to this block. 
SmallVector, 4> predecessors; for (Block *predecessor : block->getPredecessors()) { - auto *op = predecessor->getTerminator(); - if (auto branchOp = dyn_cast(op)) { + auto *terminator = predecessor->getTerminator(); + // Check whether this predecessor block contains a structured control flow + // op. If so, the structured control flow op will be serialized to multiple + // SPIR-V blocks. The branch op jumping to the OpPhi's block then resides in + // the last structured control flow op's merge block. + if (auto *merge = getLastStructuredControlFlowOpMergeBlock(predecessor)) + predecessor = merge; + if (auto branchOp = dyn_cast(terminator)) { predecessors.emplace_back(predecessor, branchOp.operand_begin()); } else { - return op->emitError("unimplemented terminator for Phi creation"); + return terminator->emitError("unimplemented terminator for Phi creation"); } } From 5aa02563151930be9f12369577dfe97df1b3f0b7 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Thu, 5 Dec 2019 04:41:45 -0800 Subject: [PATCH 152/383] Unpin psutil as v5.6.7 no longer shows the 'egg_base' issue. PiperOrigin-RevId: 283949442 Change-Id: I6a18e5ac36f99749294aab6d7b24b2f02f2ecaae --- tensorflow/tools/ci_build/install/install_pip_packages.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh index a8c898af72a..3f9c0e671e5 100755 --- a/tensorflow/tools/ci_build/install/install_pip_packages.sh +++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh @@ -89,9 +89,8 @@ pip2 install pandas==0.19.2 pip3 install pandas==0.19.2 # Benchmark tests require the following: -# 5.6.4 fails to pip2 install. TODO(b/143872855): remove pinning once fixed. -pip2 install psutil==5.6.3 -pip3 install psutil==5.6.3 +pip2 install psutil +pip3 install psutil pip2 install py-cpuinfo pip3 install py-cpuinfo From 99b636a4c784ff38176dacdb5c447302cf6e8625 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 05:35:45 -0800 Subject: [PATCH 153/383] Add plumbing to allow for compiling for ChromeOS. 
PiperOrigin-RevId: 283955023 Change-Id: I9fb819e7c1942feef1d0b42a4bd7d3911340fbde --- tensorflow/BUILD | 6 +++++ tensorflow/core/BUILD | 5 ++++- tensorflow/core/platform/BUILD | 40 ++++++++++++++++++++++++++++++++++ tensorflow/tensorflow.bzl | 6 +++++ 4 files changed, 56 insertions(+), 1 deletion(-) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 0f299ec13f8..bc5e32bd572 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -195,6 +195,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "chromiumos", + values = {"crosstool_top": "//external:android/chromiumos"}, + visibility = ["//visibility:public"], +) + config_setting( name = "linux_aarch64", values = {"cpu": "aarch64"}, diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 107a726ea60..d5730316774 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -67,6 +67,7 @@ load( "//tensorflow:tensorflow.bzl", "cc_header_only_library", "if_android", + "if_chromiumos", "if_emscripten", "if_ios", "if_mobile", @@ -1570,7 +1571,6 @@ filegroup( "//tensorflow/core/lib/strings:legacy_lib_strings_all_headers", "//tensorflow/core/lib/strings:legacy_lib_strings_all_srcs", "//tensorflow/core/platform/default/build_config:android_srcs", - "//tensorflow/core/platform:legacy_srcs_no_runtime", "//tensorflow/core/profiler:mobile_srcs", "//tensorflow/core/util/ctc:android_srcs", "//tensorflow/core/util/sparse:mobile_srcs_no_runtime_group", @@ -1605,6 +1605,9 @@ filegroup( "common_runtime/eager/*", "common_runtime/gpu_device_factory.*", ], + ) + if_chromiumos( + ["//tensorflow/core/platform:legacy_srcs_no_runtime_google"], + otherwise = ["//tensorflow/core/platform:legacy_srcs_no_runtime"], ), visibility = ["//visibility:private"], ) diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index 8e6fd49d1ab..001f7827a46 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -849,6 +849,46 @@ filegroup( visibility = ["//tensorflow/core:__pkg__"], ) +filegroup( + name = "legacy_srcs_no_runtime_google", + srcs = glob( + [ + "**/*.h", + "**/*.cc", + ], + exclude = [ + "*test.*", + "*testutil*", + "*testlib*", + "*main.cc", + "**/*test.*", + "**/*testutil*", + "**/*testlib*", + "**/*main.cc", + "**/cuda_libdevice_path.*", + # Exclude env_time and logging to avoid collisions with + # :platform_base, a common dependency for downstream targets. 
+ "**/env_time.cc", + "**/logging.cc", + "**/mutex.cc", + "**/rocm_rocdl_path.*", + "google/test_benchmark.*", + "google/monitoring.*", + "cuda.h", + "rocm.h", + "default/**/*", + "hadoop/**/*", + "gif.h", + "jpeg.h", + "png.h", + "logger.cc", + "stream_executor.*", + "windows/**/*", + ], + ), + visibility = ["//tensorflow/core:__pkg__"], +) + filegroup( name = "legacy_lib_internal_headers", srcs = glob( diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 5f9f2296c3c..e3cfbb63d34 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -170,6 +170,12 @@ def if_emscripten(a): "//conditions:default": [], }) +def if_chromiumos(a, otherwise = []): + return select({ + clean_dep("//tensorflow:chromiumos"): a, + "//conditions:default": otherwise, + }) + def if_macos(a, otherwise = []): return select({ clean_dep("//tensorflow:macos"): a, From f15505003261b9954a8424912caaa09e29cdc39a Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 5 Dec 2019 06:08:02 -0800 Subject: [PATCH 154/383] Refactors NnApiMock to extract a class to be used to do failure injection on NNAPI in native tests PiperOrigin-RevId: 283959245 Change-Id: I3210136453c025b2d09b9aa16cab86d7e424cfb7 --- tensorflow/lite/delegates/nnapi/BUILD | 1 - .../nnapi/nnapi_delegate_mock_test.h | 132 +++++++++++- tensorflow/lite/nnapi/BUILD | 26 +-- tensorflow/lite/nnapi/nnapi_handler.cc | 44 ---- tensorflow/lite/nnapi/nnapi_handler.h | 197 ------------------ tensorflow/lite/nnapi/nnapi_handler_test.cc | 143 ------------- 6 files changed, 130 insertions(+), 413 deletions(-) delete mode 100644 tensorflow/lite/nnapi/nnapi_handler.cc delete mode 100644 tensorflow/lite/nnapi/nnapi_handler.h delete mode 100644 tensorflow/lite/nnapi/nnapi_handler_test.cc diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD index 54251676da3..6e48b214d66 100644 --- a/tensorflow/lite/delegates/nnapi/BUILD +++ b/tensorflow/lite/delegates/nnapi/BUILD @@ -103,7 +103,6 @@ cc_library( }), deps = [ ":nnapi_delegate", - "//tensorflow/lite/nnapi:nnapi_handler", "//tensorflow/lite/nnapi:nnapi_implementation", "@com_google_absl//absl/memory", "@com_google_googletest//:gtest", diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h index 4a48409de1e..6e5e2098f42 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_mock_test.h @@ -28,17 +28,134 @@ limitations under the License. 
#include #include "absl/memory/memory.h" #include "tensorflow/lite/delegates/nnapi/nnapi_delegate.h" -#include "tensorflow/lite/nnapi/nnapi_handler.h" #include "tensorflow/lite/nnapi/nnapi_implementation.h" namespace tflite { namespace delegate { namespace nnapi { -class NnApiMock : public ::tflite::nnapi::NnApiHandler { +class NnApiMock { public: + template + void GetDeviceCountReturns() { + nnapi_->ANeuralNetworks_getDeviceCount = [](uint32_t* numDevices) -> int { + *numDevices = 2; + return Value; + }; + } + + template + void ModelCreateReturns() { + nnapi_->ANeuralNetworksModel_create = [](ANeuralNetworksModel** model) { + *model = reinterpret_cast(1); + return Value; + }; + } + + template + void AddOperandReturns() { + nnapi_->ANeuralNetworksModel_addOperand = + [](ANeuralNetworksModel* model, + const ANeuralNetworksOperandType* type) { return Value; }; + } + + template + void SetOperandValueReturns() { + nnapi_->ANeuralNetworksModel_setOperandValue = + [](ANeuralNetworksModel* model, int32_t index, const void* buffer, + size_t length) { return Value; }; + } + + template + void AddOperationReturns() { + nnapi_->ANeuralNetworksModel_addOperation = + [](ANeuralNetworksModel* model, ANeuralNetworksOperationType type, + uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, + const uint32_t* outputs) { return Value; }; + } + + template + void IdentifyInputAndOutputsReturns() { + nnapi_->ANeuralNetworksModel_identifyInputsAndOutputs = + [](ANeuralNetworksModel* model, uint32_t inputCount, + const uint32_t* inputs, uint32_t outputCount, + const uint32_t* outputs) { return Value; }; + } + + template + void RelaxComputationFloatReturns() { + nnapi_->ANeuralNetworksModel_relaxComputationFloat32toFloat16 = + [](ANeuralNetworksModel* model, bool allow) { return Value; }; + } + + template + void ModelFinishReturns() { + nnapi_->ANeuralNetworksModel_finish = [](ANeuralNetworksModel* model) { + return Value; + }; + } + + template + void MemoryCreateFromFdReturns() { + nnapi_->ANeuralNetworksMemory_createFromFd = + [](size_t size, int protect, int fd, size_t offset, + ANeuralNetworksMemory** memory) { + *memory = reinterpret_cast(2); + return Value; + }; + } + + template + void CompilationCreateReturns() { + nnapi_->ANeuralNetworksCompilation_create = + [](ANeuralNetworksModel* model, + ANeuralNetworksCompilation** compilation) { + *compilation = reinterpret_cast(3); + return Value; + }; + } + + template + void CompilationFinishReturns() { + nnapi_->ANeuralNetworksCompilation_finish = + [](ANeuralNetworksCompilation* compilation) { return Value; }; + } + + template + void ExecutionCreateReturns() { + nnapi_->ANeuralNetworksExecution_create = + [](ANeuralNetworksCompilation* compilation, + ANeuralNetworksExecution** execution) { + if (compilation == nullptr) return 1; + *execution = reinterpret_cast(4); + return Value; + }; + } + template + void ExecutionSetInputFromMemoryReturns() { + nnapi_->ANeuralNetworksExecution_setInputFromMemory = + [](ANeuralNetworksExecution* execution, int32_t index, + const ANeuralNetworksOperandType* type, + const ANeuralNetworksMemory* memory, size_t offset, + size_t length) { return Value; }; + } + template + void ExecutionSetOutputFromMemoryReturns() { + nnapi_->ANeuralNetworksExecution_setOutputFromMemory = + [](ANeuralNetworksExecution* execution, int32_t index, + const ANeuralNetworksOperandType* type, + const ANeuralNetworksMemory* memory, size_t offset, + size_t length) { return Value; }; + } + + template + void ExecutionComputeReturns() { + 
nnapi_->ANeuralNetworksExecution_compute = + [](ANeuralNetworksExecution* execution) { return Value; }; + } + explicit NnApiMock(NnApi* nnapi, int android_sdk_version = 29) - : ::tflite::nnapi::NnApiHandler(nnapi) { + : nnapi_(nnapi), prev_nnapi_(*nnapi) { nnapi_->nnapi_exists = true; nnapi_->android_sdk_version = android_sdk_version; @@ -69,7 +186,14 @@ class NnApiMock : public ::tflite::nnapi::NnApiHandler { ExecutionComputeReturns<0>(); } - ~NnApiMock() { Reset(); } + ~NnApiMock() { + // Restores global NNAPI to original value for non mocked tests + *nnapi_ = prev_nnapi_; + } + + private: + NnApi* nnapi_; + NnApi prev_nnapi_; }; class NnApiDelegateMockTest : public ::testing::Test { diff --git a/tensorflow/lite/nnapi/BUILD b/tensorflow/lite/nnapi/BUILD index 0a687e83131..e26d9567337 100644 --- a/tensorflow/lite/nnapi/BUILD +++ b/tensorflow/lite/nnapi/BUILD @@ -57,7 +57,7 @@ cc_library( "//conditions:default": ["-lrt"], }), deps = [ - ":nnapi_lib", + "//tensorflow/lite/nnapi:nnapi_lib", ], ) @@ -76,29 +76,7 @@ cc_test( name = "nnapi_implementation_test", srcs = ["nnapi_implementation_test.cc"], deps = [ - ":nnapi_implementation", - "@com_google_googletest//:gtest_main", - ], -) - -cc_library( - name = "nnapi_handler", - srcs = ["nnapi_handler.cc"], - hdrs = ["nnapi_handler.h"], - deps = [ - ":nnapi_implementation", - ":nnapi_lib", - "//tensorflow/core/platform:logging", - "//tensorflow/lite:framework", - ], -) - -cc_test( - name = "nnapi_handler_test", - srcs = ["nnapi_handler_test.cc"], - deps = [ - ":nnapi_handler", - ":nnapi_implementation", + "//tensorflow/lite/nnapi:nnapi_implementation", "@com_google_googletest//:gtest_main", ], ) diff --git a/tensorflow/lite/nnapi/nnapi_handler.cc b/tensorflow/lite/nnapi/nnapi_handler.cc deleted file mode 100644 index 354ad66463c..00000000000 --- a/tensorflow/lite/nnapi/nnapi_handler.cc +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#include "tensorflow/lite/nnapi/nnapi_handler.h" - -#include - -#include "tensorflow/lite/nnapi/nnapi_implementation.h" - -namespace tflite { -namespace nnapi { - -const NnApi* NnApiPassthroughInstance() { - static const NnApi orig_nnapi_copy = *NnApiImplementation(); - return &orig_nnapi_copy; -} - -// static -NnApiHandler* NnApiHandler::Instance() { - // Ensuring that the original copy of nnapi is saved before we return - // access to NnApiHandler - NnApiPassthroughInstance(); - static NnApiHandler handler{const_cast(NnApiImplementation())}; - return &handler; -} - -void NnApiHandler::Reset() { - // Restores global NNAPI to original value - *nnapi_ = *NnApiPassthroughInstance(); -} - -} // namespace nnapi -} // namespace tflite diff --git a/tensorflow/lite/nnapi/nnapi_handler.h b/tensorflow/lite/nnapi/nnapi_handler.h deleted file mode 100644 index 70406ba2c6e..00000000000 --- a/tensorflow/lite/nnapi/nnapi_handler.h +++ /dev/null @@ -1,197 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#ifndef TENSORFLOW_LITE_NNAPI_NNAPI_HANDLER_H_ -#define TENSORFLOW_LITE_NNAPI_NNAPI_HANDLER_H_ - -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/lite/nnapi/nnapi_implementation.h" - -namespace tflite { -namespace nnapi { - -// Offers an interface to alter the behaviour of the NNAPI instance. -// As for NNAPI, it is designed to be a singleton. -// It allows to change the behaviour of some of the methods with some stub -// implementation and then to reset the behavior to the original one using -// Reset(). -// -class NnApiHandler { - public: - // No destructor defined to allow this class to be used as singleton. - - // Factory method, only one instance per process/jni library. - static NnApiHandler* Instance(); - - // Makes the current object a transparent proxy again, resetting any - // applied changes to its methods. - void Reset(); - - // Using templates in the ...Returns methods because the functions need to be - // stateless and the template generated code is more readable than using a - // file-local variable in the method implementation to store the configured - // result. 
- - template - void GetDeviceCountReturns() { - nnapi_->ANeuralNetworks_getDeviceCount = [](uint32_t* numDevices) -> int { - *numDevices = 2; - return Value; - }; - } - - void StubGetDeviceCountWith(int(stub)(uint32_t*)) { - nnapi_->ANeuralNetworks_getDeviceCount = stub; - } - - template - void ModelCreateReturns() { - nnapi_->ANeuralNetworksModel_create = [](ANeuralNetworksModel** model) { - *model = reinterpret_cast(1); - return Value; - }; - } - - template - void AddOperandReturns() { - nnapi_->ANeuralNetworksModel_addOperand = - [](ANeuralNetworksModel* model, - const ANeuralNetworksOperandType* type) { return Value; }; - } - - template - void SetOperandValueReturns() { - nnapi_->ANeuralNetworksModel_setOperandValue = - [](ANeuralNetworksModel* model, int32_t index, const void* buffer, - size_t length) { return Value; }; - } - - template - void AddOperationReturns() { - nnapi_->ANeuralNetworksModel_addOperation = - [](ANeuralNetworksModel* model, ANeuralNetworksOperationType type, - uint32_t inputCount, const uint32_t* inputs, uint32_t outputCount, - const uint32_t* outputs) { return Value; }; - } - - template - void IdentifyInputAndOutputsReturns() { - nnapi_->ANeuralNetworksModel_identifyInputsAndOutputs = - [](ANeuralNetworksModel* model, uint32_t inputCount, - const uint32_t* inputs, uint32_t outputCount, - const uint32_t* outputs) { return Value; }; - } - - template - void RelaxComputationFloatReturns() { - nnapi_->ANeuralNetworksModel_relaxComputationFloat32toFloat16 = - [](ANeuralNetworksModel* model, bool allow) { return Value; }; - } - - template - void ModelFinishReturns() { - nnapi_->ANeuralNetworksModel_finish = [](ANeuralNetworksModel* model) { - return Value; - }; - } - - template - void MemoryCreateFromFdReturns() { - nnapi_->ANeuralNetworksMemory_createFromFd = - [](size_t size, int protect, int fd, size_t offset, - ANeuralNetworksMemory** memory) { - *memory = reinterpret_cast(2); - return Value; - }; - } - - template - void CompilationCreateReturns() { - nnapi_->ANeuralNetworksCompilation_create = - [](ANeuralNetworksModel* model, - ANeuralNetworksCompilation** compilation) { - *compilation = reinterpret_cast(3); - return Value; - }; - } - - template - void CompilationFinishReturns() { - nnapi_->ANeuralNetworksCompilation_finish = - [](ANeuralNetworksCompilation* compilation) { return Value; }; - } - - template - void ExecutionCreateReturns() { - nnapi_->ANeuralNetworksExecution_create = - [](ANeuralNetworksCompilation* compilation, - ANeuralNetworksExecution** execution) { - if (compilation == nullptr) return 1; - *execution = reinterpret_cast(4); - return Value; - }; - } - template - void ExecutionSetInputFromMemoryReturns() { - nnapi_->ANeuralNetworksExecution_setInputFromMemory = - [](ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, - const ANeuralNetworksMemory* memory, size_t offset, - size_t length) { return Value; }; - } - template - void ExecutionSetOutputFromMemoryReturns() { - nnapi_->ANeuralNetworksExecution_setOutputFromMemory = - [](ANeuralNetworksExecution* execution, int32_t index, - const ANeuralNetworksOperandType* type, - const ANeuralNetworksMemory* memory, size_t offset, - size_t length) { return Value; }; - } - - template - void ExecutionComputeReturns() { - nnapi_->ANeuralNetworksExecution_compute = - [](ANeuralNetworksExecution* execution) { return Value; }; - } - - protected: - explicit NnApiHandler(NnApi* nnapi) : nnapi_(nnapi) { DCHECK(nnapi); } - - NnApi* nnapi_; -}; - -// Returns a pointer to an 
unaltered instance of NNAPI. Is intended -// to be used by stub methods when wanting to pass-through to original -// implementation for example: -// -// NnApiTestUtility()->StubGetDeviceWith( -// [](uint32_t devIndex, ANeuralNetworksDevice** device) -> int { -// static int count = 0; -// if (count++ < 1) { -// NnApiPassthroughInstance()->ANeuralNetworks_getDevice( -// devIndex, device); -// } else { -// return ANEURALNETWORKS_BAD_DATA; -// } -// }); -const NnApi* NnApiPassthroughInstance(); - -// Returns an instance of NnApiProxy that can be used to alter -// the behaviour of the TFLite wide instance of NnApi. -NnApiHandler* NnApiProxyInstance(); - -} // namespace nnapi -} // namespace tflite - -#endif // TENSORFLOW_LITE_NNAPI_NNAPI_HANDLER_H_ diff --git a/tensorflow/lite/nnapi/nnapi_handler_test.cc b/tensorflow/lite/nnapi/nnapi_handler_test.cc deleted file mode 100644 index aea766ef036..00000000000 --- a/tensorflow/lite/nnapi/nnapi_handler_test.cc +++ /dev/null @@ -1,143 +0,0 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ -#include "tensorflow/lite/nnapi/nnapi_handler.h" - -#include -#include - -#include -#include -#include "tensorflow/lite/nnapi/nnapi_implementation.h" - -namespace tflite { -namespace nnapi { - -using testing::Eq; -using testing::Ne; -using testing::NotNull; - -void ExpectEquals(const NnApi& left, const NnApi& right); - -class NnApiHandlerTest : public ::testing::Test { - protected: - ~NnApiHandlerTest() override { NnApiHandler::Instance()->Reset(); } -}; - -TEST_F(NnApiHandlerTest, ShouldAlterNnApiInstanceBehaviour) { - const NnApi* nnapi = NnApiImplementation(); - - const auto device_count_stub = [](uint32_t* device_count) -> int { - *device_count = 999; - return ANEURALNETWORKS_NO_ERROR; - }; - - NnApiHandler::Instance()->StubGetDeviceCountWith(device_count_stub); - - ASSERT_THAT(nnapi->ANeuralNetworks_getDeviceCount, NotNull()); - - uint32_t device_count = 0; - nnapi->ANeuralNetworks_getDeviceCount(&device_count); - EXPECT_THAT(device_count, Eq(999)); -} - -TEST_F(NnApiHandlerTest, ShouldRestoreNnApiToItsOriginalValueWithReset) { - NnApi nnapi_orig_copy = *NnApiImplementation(); - - auto device_count_override = [](uint32_t* device_count) -> int { - *device_count = 777; - return ANEURALNETWORKS_NO_ERROR; - }; - - NnApiHandler::Instance()->StubGetDeviceCountWith(device_count_override); - - EXPECT_THAT(nnapi_orig_copy.ANeuralNetworks_getDeviceCount, - Ne(NnApiImplementation()->ANeuralNetworks_getDeviceCount)); - - NnApiHandler::Instance()->Reset(); - - ExpectEquals(nnapi_orig_copy, *NnApiImplementation()); -} - -int (*device_count_ptr)(uint32_t*); -TEST_F(NnApiHandlerTest, ShouldSupportPassthroughCalls) { - const NnApi* nnapi = NnApiImplementation(); - device_count_ptr = nnapi->ANeuralNetworks_getDeviceCount; - - NnApiHandler::Instance()->StubGetDeviceCountWith( - [](uint32_t* device_count) -> int { - return 
NnApiPassthroughInstance()->ANeuralNetworks_getDeviceCount == - device_count_ptr; - }); - - uint32_t device_count = 0; - EXPECT_THAT(nnapi->ANeuralNetworks_getDeviceCount(&device_count), Eq(1)); -} - -void ExpectEquals(const NnApi& left, const NnApi& right) { -#define EXPECT_NNAPI_MEMBER_EQ(name) EXPECT_EQ(left.name, right.name) - - EXPECT_NNAPI_MEMBER_EQ(nnapi_exists); - EXPECT_NNAPI_MEMBER_EQ(android_sdk_version); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksMemory_createFromFd); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksMemory_free); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_create); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_free); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_finish); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_addOperand); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_setOperandValue); - EXPECT_NNAPI_MEMBER_EQ( - ANeuralNetworksModel_setOperandSymmPerChannelQuantParams); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_setOperandValueFromMemory); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_addOperation); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_identifyInputsAndOutputs); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_relaxComputationFloat32toFloat16); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksCompilation_create); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksCompilation_free); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksCompilation_setPreference); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksCompilation_finish); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_create); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_free); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_setInput); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_setInputFromMemory); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_setOutput); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_setOutputFromMemory); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_startCompute); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksEvent_wait); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksEvent_free); - EXPECT_NNAPI_MEMBER_EQ(ASharedMemory_create); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworks_getDeviceCount); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworks_getDevice); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksDevice_getName); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksDevice_getVersion); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksDevice_getFeatureLevel); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksDevice_getType); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksModel_getSupportedOperationsForDevices); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksCompilation_createForDevices); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksCompilation_setCaching); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_compute); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_getOutputOperandRank); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_getOutputOperandDimensions); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksBurst_create); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksBurst_free); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_burstCompute); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksMemory_createFromAHardwareBuffer); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_setMeasureTiming); - EXPECT_NNAPI_MEMBER_EQ(ANeuralNetworksExecution_getDuration); - -#undef EXPECT_NNAPI_MEMBER_EQ -} - -} // namespace nnapi -} // namespace tflite From 664b9e5dac3b2fe155e260a69b8f2f6f77d0729c Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Thu, 5 Dec 2019 06:14:25 -0800 Subject: [PATCH 155/383] Open source prepare_reference_module. This is in preparation of open sourcing another tool. 
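
For reviewers unfamiliar with the new helper: PrepareReferenceModule() clones the
test module into a "reference" module whose DebugOptions disable fast-math, then
either runs the Despecializer or a caller-supplied module_modifier_hook. A minimal
usage sketch follows; it is only an illustration under the assumption that the
template parameters elided above are the HloModuleConfig*/HloModule* hooks and a
StatusOr<std::unique_ptr<HloModule>> result, and names such as RunAgainstReference
and test_module are hypothetical:

  #include <memory>
  #include <utility>

  #include "tensorflow/compiler/xla/tools/prepare_reference_module.h"

  xla::Status RunAgainstReference(const xla::HloModule& test_module) {
    // With no hooks supplied, the helper disables fast-math in the cloned
    // module's config and runs the Despecializer over it.
    xla::StatusOr<std::unique_ptr<xla::HloModule>> reference_or =
        xla::PrepareReferenceModule(test_module);
    if (!reference_or.ok()) return reference_or.status();
    std::unique_ptr<xla::HloModule> reference_module =
        std::move(reference_or.ValueOrDie());
    // ... run both modules and compare their outputs (out of scope here) ...
    return xla::Status::OK();
  }
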
PiperOrigin-RevId: 283959961 Change-Id: I262f08b07223b182be3df812fbe12532ab2bb5b6 --- tensorflow/compiler/xla/tools/BUILD | 16 ++++++ .../xla/tools/prepare_reference_module.cc | 56 +++++++++++++++++++ .../xla/tools/prepare_reference_module.h | 41 ++++++++++++++ 3 files changed, 113 insertions(+) create mode 100644 tensorflow/compiler/xla/tools/prepare_reference_module.cc create mode 100644 tensorflow/compiler/xla/tools/prepare_reference_module.h diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index d18cf667848..77274980698 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -279,3 +279,19 @@ tf_cc_test( "//tensorflow/core:test", ], ) + +cc_library( + name = "prepare_reference_module", + srcs = ["prepare_reference_module.cc"], + hdrs = ["prepare_reference_module.h"], + deps = [ + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:xla_proto_cc", + "//tensorflow/compiler/xla/service:despecializer", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/core/platform:errors", + "//tensorflow/stream_executor/lib", + ], +) diff --git a/tensorflow/compiler/xla/tools/prepare_reference_module.cc b/tensorflow/compiler/xla/tools/prepare_reference_module.cc new file mode 100644 index 00000000000..1f4cc67205c --- /dev/null +++ b/tensorflow/compiler/xla/tools/prepare_reference_module.cc @@ -0,0 +1,56 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/tools/prepare_reference_module.h" + +#include +#include + +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/despecializer.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/xla.pb.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/stream_executor/lib/status.h" + +namespace xla { + +StatusOr> PrepareReferenceModule( + const HloModule& test_module, + const std::function& config_modifier_hook, + const std::function& module_modifier_hook) { + DebugOptions debug_options = GetDebugOptionsFromFlags(); + // The combination of fast math and optimizations leads to unsound code + // transformations (see third_party/tensorflow/compiler/xla/xla.proto for + // details). The test platform should not change this from the default. 
+ debug_options.set_xla_cpu_enable_fast_math(false); + debug_options.set_xla_gpu_enable_fast_min_max(false); + HloModuleConfig reference_config = test_module.config(); + reference_config.set_debug_options(debug_options); + if (config_modifier_hook) { + config_modifier_hook(&reference_config); + } + std::unique_ptr reference_module = + test_module.Clone(reference_config, "reference"); + if (module_modifier_hook) { + TF_RETURN_IF_ERROR(module_modifier_hook(reference_module.get())); + } else { + TF_RETURN_IF_ERROR(Despecializer().Run(reference_module.get()).status()); + } + return std::move(reference_module); +} +}; // namespace xla diff --git a/tensorflow/compiler/xla/tools/prepare_reference_module.h b/tensorflow/compiler/xla/tools/prepare_reference_module.h new file mode 100644 index 00000000000..45341c08637 --- /dev/null +++ b/tensorflow/compiler/xla/tools/prepare_reference_module.h @@ -0,0 +1,41 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PREPARE_REFERENCE_MODULE_H_ +#define TENSORFLOW_COMPILER_XLA_TOOLS_PREPARE_REFERENCE_MODULE_H_ + +#include +#include + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_module_config.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/stream_executor/lib/status.h" + +namespace xla { + +// A helper function that takes a HloModule, derives a HloModuleConfig from it +// which disables fast-math und sets the DebugOptions from flags, then runs the +// deoptimization pipeline (or calls 'module_modifier_hook' if provided). This +// is meant to produce a reference module that is comparable to our custom test +// platforms. +StatusOr> PrepareReferenceModule( + const HloModule& test_module, + const std::function& config_modifier_hook = {}, + const std::function& module_modifier_hook = {}); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_TOOLS_PREPARE_REFERENCE_MODULE_H_ From e88a251481be63d334d44cc53771b6885580d04d Mon Sep 17 00:00:00 2001 From: Alex Stark Date: Thu, 5 Dec 2019 06:42:18 -0800 Subject: [PATCH 156/383] DepthwiseConv dot-product: Avoid use of x18, x29, x30 in NEON 3x3. 
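
Background for this change: under the AArch64 procedure call standard, x18 is the
platform register, x29 is the frame pointer and x30 is the link register, so
hand-written inline-assembly kernels should keep their scratch values out of those
registers. A minimal, hypothetical sketch of the general pattern (aarch64-only C++,
not taken from this patch; ShiftAndAdd and the choice of x9 are illustrative):

  #include <cstdint>

  // Keep the temporary in x9, a caller-saved scratch register, and declare it
  // as a clobber instead of borrowing x18/x29/x30.
  inline int64_t ShiftAndAdd(int64_t acc, int64_t value) {
    int64_t result;
    asm("lsl x9, %[val], #1\n\t"   // temporary lives in x9 ...
        "add %[res], %[acc], x9"   // ... and is consumed immediately
        : [res] "=r"(result)
        : [acc] "r"(acc), [val] "r"(value)
        : "x9");                   // the clobber keeps the compiler off x9
    return result;
  }

The real kernels apply the same idea to the NEON dot-product sequences in the diff
below, reassigning the scratch registers so that x18, x29 and x30 are no longer
touched.
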
PiperOrigin-RevId: 283963167 Change-Id: Idbffa8e310f1918f8f52f9d55d7bb0531d27ce11 --- .../depthwiseconv_uint8_3x3_filter.h | 2947 ++++++++++------- 1 file changed, 1696 insertions(+), 1251 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h index 9f827e988a4..ec4f664b9fe 100644 --- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h +++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h @@ -7202,166 +7202,234 @@ struct KernelMacroBlockThis Loop Header: Depth=1 + // Child Loop BB225_31 Depth 2 + // Child Loop BB225_34 Depth 2 + // Child Loop BB225_20 Depth 2 + // Child Loop BB225_23 Depth 3 + // Child Loop BB225_27 Depth 4 + // Child Loop BB225_7 Depth 2 + // Child Loop BB225_9 Depth 3 + // Child Loop BB225_15 Depth 3 + "ldp q18, q15, [%[output_block_data]]\n" + "ldp q19, q5, [%[output_block_data], #32]\n" + "ldp q20, q6, [%[output_block_data], #64]\n" + "cmp w17, #4\n" // =4 + "add %[output_block_data], x3, #96\n" // =96 + "stp x12, %[output_block_data], [sp, #40]\n" // 16-byte Folded Spill + "b.ne " DC_KERNEL_NO_MULT_16 "f\n" + // %bb.5: // in Loop: Header=BB225_4 Depth=1 + "mov x24, x12\n" + "ldr x12, [sp, #64]\n" // 8-byte Folded Reload + "mov x16, xzr\n" + "stp q6, q5, [sp, #144]\n" // 32-byte Folded Spill + "str q15, [sp, #112]\n" // 16-byte Folded Spill + "str x12, [sp, #232]\n" // 8-byte Folded Spill + "b " DC_KERNEL_NO_MULT_7 "f\n" + DC_KERNEL_NO_MULT_6 ":\n" // in Loop: Header=BB225_7 Depth=2 + "ldr x12, [sp, #232]\n" // 8-byte Folded Reload + "ldp q20, q19, [sp, #144]\n" // 32-byte Folded Reload + "add x16, x16, #1\n" // =1 + "cmp x16, #2\n" // =2 + "add x12, x12, #16\n" // =16 + "add x24, x24, #4\n" // =4 + "mov v18.16b, v15.16b\n" + "str x12, [sp, #232]\n" // 8-byte Folded Spill + "b.eq " DC_KERNEL_NO_MULT_3 "b\n" + DC_KERNEL_NO_MULT_7 ":\n" // Parent Loop BB225_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB225_9 Depth 3 + // Child Loop BB225_15 Depth 3 + "ldr x12, [sp, #184]\n" // 8-byte Folded Reload + "ldr q21, [%[bias_data]], #16\n" + "add %[output_block_data], x12, x16, lsl #4\n" + "ldr w12, [sp, #280]\n" // 4-byte Folded Reload + "ldr q22, [%[output_block_data]]\n" "mov v31.16b, v21.16b\n" - "ldr q24, [x13, %[scratch_block_data]]\n" - "ldr %[scratch_block_data], [sp, #96]\n" // 8-byte Folded Reload "mov v8.16b, v21.16b\n" + "cmp w12, #1\n" // =1 + "ldr x12, [sp, #264]\n" // 8-byte Folded Reload "mov v9.16b, v21.16b\n" "mov v10.16b, v21.16b\n" - "ldr q25, [x13, %[scratch_block_data]]\n" - "ldr %[scratch_block_data], [sp, #88]\n" // 8-byte Folded Reload - "str q3, [sp, #224]\n" // 16-byte Folded Spill - "shl v3.4s, v20.4s, #8\n" - ".word 0x4e98969f // sdot v31.4s, v20.16b, v24.16b\n" - "ldr q26, [x13, %[scratch_block_data]]\n" - "ldp %[scratch_block_data], x7, [sp, #104]\n" // 16-byte Folded Reload - ".word 0x4e989668 // sdot v8.4s, v19.16b, v24.16b\n" - ".word 0x4e989649 // sdot v9.4s, v18.16b, v24.16b\n" + "ldr q27, [%[output_block_data], x12]\n" + "ldr x12, [sp, #256]\n" // 8-byte Folded Reload + "ldr q26, [%[output_block_data], x12]\n" + "ldr x12, [sp, #224]\n" // 8-byte Folded Reload + ".word 0x4e9a969f // sdot v31.4s, v20.16b, v26.16b\n" + "ldr q25, [%[output_block_data], x12]\n" + "ldr x12, [sp, #216]\n" // 8-byte Folded Reload + ".word 0x4e9a9668 // sdot v8.4s, v19.16b, v26.16b\n" + ".word 0x4e9a9649 // sdot v9.4s, v18.16b, v26.16b\n" ".word 0x4e99964a // sdot v10.4s, 
v18.16b, v25.16b\n" - "ldr q27, [x13, %[scratch_block_data]]\n" - "mov x13, x19\n" - "mov %[scratch_block_data], %[output_block_data]\n" - "str q3, [sp, #208]\n" // 16-byte Folded Spill - "stp %[output_block_data], x19, [sp, #144]\n" // 16-byte Folded Spill - "b " DC_KERNEL_NO_MULT_5 "f\n" - DC_KERNEL_NO_MULT_4 ":\n" // in Loop: Header=BB225_5 Depth=3 + "ldr q24, [%[output_block_data], x12]\n" + "ldr x12, [sp, #208]\n" // 8-byte Folded Reload + "ldr q23, [%[output_block_data], x12]\n" + "b.lt " DC_KERNEL_NO_MULT_11 "f\n" + // %bb.8: // in Loop: Header=BB225_7 Depth=2 + "stp x24, x16, [sp, #192]\n" // 16-byte Folded Spill + "ldr w12, [sp, #280]\n" // 4-byte Folded Reload + "mov x17, x24\n" + "ldr x21, [sp, #232]\n" // 8-byte Folded Reload + "mov x24, x25\n" + "mov x25, %[filter_workspace]\n" + "mov %[filter_workspace], x22\n" + "mov x22, x23\n" + "ldr x23, [sp, #88]\n" // 8-byte Folded Reload + "shl v28.4s, v18.4s, #8\n" + "shl v29.4s, v19.4s, #8\n" + "shl v30.4s, v20.4s, #8\n" + "mov v11.16b, v23.16b\n" + "mov v12.16b, v24.16b\n" + "mov v13.16b, v27.16b\n" + "mov v14.16b, v22.16b\n" + DC_KERNEL_NO_MULT_9 ":\n" // Parent Loop BB225_4 Depth=1 + // Parent Loop BB225_7 Depth=2 + // => This Inner Loop Header: Depth=3 ".word 0x4e8e965f // sdot v31.4s, v18.16b, v14.16b\n" - ".word 0x4e979648 // sdot v8.4s, v18.16b, v23.16b\n" + ".word 0x4e8d9648 // sdot v8.4s, v18.16b, v13.16b\n" ".word 0x4e999669 // sdot v9.4s, v19.16b, v25.16b\n" - ".word 0x4e97967f // sdot v31.4s, v19.16b, v23.16b\n" - ".word 0x4e9a966a // sdot v10.4s, v19.16b, v26.16b\n" + ".word 0x4e8d967f // sdot v31.4s, v19.16b, v13.16b\n" + ".word 0x4e8c966a // sdot v10.4s, v19.16b, v12.16b\n" ".word 0x4e999688 // sdot v8.4s, v20.16b, v25.16b\n" - ".word 0x4e9a9689 // sdot v9.4s, v20.16b, v26.16b\n" + ".word 0x4e8c9689 // sdot v9.4s, v20.16b, v12.16b\n" "sqrdmulh v31.4s, v31.4s, v1.4s\n" - ".word 0x4e9b968a // sdot v10.4s, v20.16b, v27.16b\n" + ".word 0x4e8b968a // sdot v10.4s, v20.16b, v11.16b\n" "sqrdmulh v8.4s, v8.4s, v1.4s\n" "sqrdmulh v9.4s, v9.4s, v1.4s\n" "sqrshl v31.4s, v31.4s, v2.4s\n" @@ -7377,437 +7445,469 @@ struct KernelMacroBlock This Inner Loop Header: Depth=3 - "cmp w2, w14\n" - "b.lt " DC_KERNEL_NO_MULT_4 "b\n" - // %bb.6: // in Loop: Header=BB225_13 Depth=2 - "ldr %[bias_data], [sp, #168]\n" // 8-byte Folded Reload - "ldp d6, d5, [sp, #72]\n" // 16-byte Folded Reload - "cmp w5, #0\n" // =0 - "add %[bias_data], x2, #16\n" // =16 - "str %[bias_data], [sp, #168]\n" // 8-byte Folded Spill - "b.le " DC_KERNEL_NO_MULT_12 "f\n" - // %bb.7: // in Loop: Header=BB225_13 Depth=2 + "st1 { v3.s }[3], [%[output_block_data]]\n" + "b.ne " DC_KERNEL_NO_MULT_9 "b\n" + // %bb.10: // in Loop: Header=BB225_7 Depth=2 + "add %[output_block_data], %[scratch_block_data], x21\n" + "ldr x21, [sp, #136]\n" // 8-byte Folded Reload + "ldp d14, d17, [sp, #96]\n" // 16-byte Folded Reload + "mov x23, x22\n" + "mov x22, %[filter_workspace]\n" + "mov %[filter_workspace], x25\n" + "mov x25, x24\n" + "ldr q15, [sp, #112]\n" // 16-byte Folded Reload + "ldp x24, x16, [sp, #192]\n" // 16-byte Folded Reload + "add x12, x26, x17\n" + "ldr w17, [sp, #284]\n" // 4-byte Folded Reload + "cmp w17, #0\n" // =0 + "b.gt " DC_KERNEL_NO_MULT_12 "f\n" + "b " DC_KERNEL_NO_MULT_6 "b\n" + DC_KERNEL_NO_MULT_11 ":\n" // in Loop: Header=BB225_7 Depth=2 + "ldr x12, [sp, #80]\n" // 8-byte Folded Reload + "add x12, x12, x16, lsl #2\n" + "ldr w17, [sp, #284]\n" // 4-byte Folded Reload + "cmp w17, #0\n" // =0 + "b.le " DC_KERNEL_NO_MULT_6 "b\n" + DC_KERNEL_NO_MULT_12 ":\n" // in 
Loop: Header=BB225_7 Depth=2 + "ldr w17, [sp, #284]\n" // 4-byte Folded Reload "movi v28.16b, #0\n" - "cmp w5, #3\n" // =3 "movi v29.16b, #0\n" "movi v30.16b, #0\n" + "cmp w17, #3\n" // =3 "movi v11.16b, #0\n" "movi v12.16b, #0\n" "movi v13.16b, #0\n" - "b.lt " DC_KERNEL_NO_MULT_9 "f\n" - // %bb.8: // in Loop: Header=BB225_13 Depth=2 - "ldr q28, [x11, x13]\n" - "ldr q29, [x25, x13]\n" - "ldr q30, [x24, x13]\n" - "ldr q11, [x10, x13]\n" - "ldr q12, [x23, x13]\n" - "ldr q13, [x8, x13]\n" - DC_KERNEL_NO_MULT_9 ":\n" // in Loop: Header=BB225_13 Depth=2 - "ldr x19, [sp, #136]\n" // 8-byte Folded Reload - "mov x13, xzr\n" - "mov w2, wzr\n" - "add %[output_block_data], x22, %[scratch_block_data]\n" - "add x6, x17, %[scratch_block_data]\n" - "add x7, x12, %[scratch_block_data]\n" - "add %[scratch_block_data], x19, x0\n" - "b " DC_KERNEL_NO_MULT_11 "f\n" - DC_KERNEL_NO_MULT_10 ":\n" // in Loop: Header=BB225_11 Depth=3 - ".word 0x4e8e965f // sdot v31.4s, v18.16b, v14.16b\n" - ".word 0x4e979648 // sdot v8.4s, v18.16b, v23.16b\n" + "b.lt " DC_KERNEL_NO_MULT_14 "f\n" + // %bb.13: // in Loop: Header=BB225_7 Depth=2 + "add x17, %[output_block_data], #32\n" // =32 + "ldr %[output_block_data], [sp, #264]\n" // 8-byte Folded Reload + "ldr q13, [x17]\n" + "ldr q12, [x17, %[output_block_data]]\n" + "ldr %[output_block_data], [sp, #256]\n" // 8-byte Folded Reload + "ldr q11, [x17, %[output_block_data]]\n" + "ldr %[output_block_data], [sp, #224]\n" // 8-byte Folded Reload + "ldr q30, [x17, %[output_block_data]]\n" + "ldr %[output_block_data], [sp, #216]\n" // 8-byte Folded Reload + "ldr q29, [x17, %[output_block_data]]\n" + "ldr %[output_block_data], [sp, #208]\n" // 8-byte Folded Reload + "ldr q28, [x17, %[output_block_data]]\n" + DC_KERNEL_NO_MULT_14 ":\n" // in Loop: Header=BB225_7 Depth=2 + "ldr w17, [sp, #284]\n" // 4-byte Folded Reload + DC_KERNEL_NO_MULT_15 ":\n" // Parent Loop BB225_4 Depth=1 + // Parent Loop BB225_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ".word 0x4e96965f // sdot v31.4s, v18.16b, v22.16b\n" + ".word 0x4e9b9648 // sdot v8.4s, v18.16b, v27.16b\n" ".word 0x4e999669 // sdot v9.4s, v19.16b, v25.16b\n" - ".word 0x4e97967f // sdot v31.4s, v19.16b, v23.16b\n" - ".word 0x4e9a966a // sdot v10.4s, v19.16b, v26.16b\n" + ".word 0x4e9b967f // sdot v31.4s, v19.16b, v27.16b\n" + ".word 0x4e98966a // sdot v10.4s, v19.16b, v24.16b\n" ".word 0x4e999688 // sdot v8.4s, v20.16b, v25.16b\n" - ".word 0x4e9a9689 // sdot v9.4s, v20.16b, v26.16b\n" + ".word 0x4e989689 // sdot v9.4s, v20.16b, v24.16b\n" "sqrdmulh v3.4s, v31.4s, v1.4s\n" - ".word 0x4e9b968a // sdot v10.4s, v20.16b, v27.16b\n" - "sqrdmulh v31.4s, v8.4s, v1.4s\n" - "sqrdmulh v8.4s, v9.4s, v1.4s\n" + ".word 0x4e97968a // sdot v10.4s, v20.16b, v23.16b\n" + "sqrdmulh v4.4s, v8.4s, v1.4s\n" + "sqrdmulh v5.4s, v9.4s, v1.4s\n" "sqrshl v3.4s, v3.4s, v2.4s\n" - "sqrdmulh v9.4s, v10.4s, v1.4s\n" - "sqrshl v31.4s, v31.4s, v2.4s\n" - "sqrshl v8.4s, v8.4s, v2.4s\n" + "sqrdmulh v6.4s, v10.4s, v1.4s\n" + "sqrshl v4.4s, v4.4s, v2.4s\n" + "sqrshl v5.4s, v5.4s, v2.4s\n" "sqxtn v3.4h, v3.4s\n" - "sqrshl v9.4s, v9.4s, v2.4s\n" - "sqxtn v8.4h, v8.4s\n" - "sqxtn2 v3.8h, v31.4s\n" - "sqxtn2 v8.8h, v9.4s\n" + "sqrshl v6.4s, v6.4s, v2.4s\n" + "sqxtn v5.4h, v5.4s\n" + "sqxtn2 v3.8h, v4.4s\n" + "sqxtn2 v5.8h, v6.4s\n" "sqadd v3.8h, v3.8h, v0.8h\n" - "sqadd v31.8h, v8.8h, v0.8h\n" + "sqadd v4.8h, v5.8h, v0.8h\n" "sqxtun v3.8b, v3.8h\n" - "sqxtun2 v3.16b, v31.8h\n" - "ldr q4, [sp, #272]\n" // 16-byte Folded Reload - "add x19, x7, x13\n" - "ushr v24.4s, v24.4s, #8\n" 
- "ushr v25.4s, v25.4s, #8\n" - "umax v3.16b, v3.16b, v4.16b\n" - "ldr q4, [sp, #256]\n" // 16-byte Folded Reload - "ushr v14.4s, v14.4s, #8\n" - "ushr v23.4s, v23.4s, #8\n" - "sli v24.4s, v30.4s, #24\n" - "umin v3.16b, v3.16b, v4.16b\n" - "str s3, [%[scratch_block_data], x13]\n" - "st1 { v3.s }[1], [x19]\n" - "add x19, x6, x13\n" - "st1 { v3.s }[2], [x19]\n" - "add x19, %[output_block_data], x13\n" + "sqxtun2 v3.16b, v4.8h\n" + "umax v3.16b, v3.16b, v7.16b\n" + "add %[output_block_data], x12, x21\n" + "umin v3.16b, v3.16b, v16.16b\n" "ushr v26.4s, v26.4s, #8\n" + "ushr v25.4s, v25.4s, #8\n" + "str s3, [x12]\n" + "st1 { v3.s }[1], [%[output_block_data]]\n" + "add %[output_block_data], x12, x28\n" + "ushr v22.4s, v22.4s, #8\n" "ushr v27.4s, v27.4s, #8\n" - "sli v25.4s, v11.4s, #24\n" + "sli v26.4s, v11.4s, #24\n" + "ushr v24.4s, v24.4s, #8\n" + "ushr v23.4s, v23.4s, #8\n" + "sli v25.4s, v30.4s, #24\n" "mov v31.16b, v21.16b\n" "mov v8.16b, v21.16b\n" "mov v9.16b, v21.16b\n" "mov v10.16b, v21.16b\n" - "add w2, w2, #1\n" // =1 - "sli v14.4s, v28.4s, #24\n" - "ushr v28.4s, v28.4s, #8\n" - "ushr v30.4s, v30.4s, #8\n" - "sli v23.4s, v29.4s, #24\n" - "ushr v29.4s, v29.4s, #8\n" - "ushr v11.4s, v11.4s, #8\n" - "sli v26.4s, v12.4s, #24\n" - "ushr v12.4s, v12.4s, #8\n" - "sli v27.4s, v13.4s, #24\n" + "st1 { v3.s }[2], [%[output_block_data]]\n" + "add %[output_block_data], x12, x27\n" + "subs w17, w17, #1\n" // =1 + "sli v22.4s, v13.4s, #24\n" "ushr v13.4s, v13.4s, #8\n" - "st1 { v3.s }[3], [x19]\n" - ".word 0x4e98969f // sdot v31.4s, v20.16b, v24.16b\n" - ".word 0x4e989668 // sdot v8.4s, v19.16b, v24.16b\n" - ".word 0x4e989649 // sdot v9.4s, v18.16b, v24.16b\n" + "ushr v11.4s, v11.4s, #8\n" + "sli v27.4s, v12.4s, #24\n" + "ushr v12.4s, v12.4s, #8\n" + "ushr v30.4s, v30.4s, #8\n" + "sli v24.4s, v29.4s, #24\n" + "ushr v29.4s, v29.4s, #8\n" + "sli v23.4s, v28.4s, #24\n" + "ushr v28.4s, v28.4s, #8\n" + ".word 0x4e9a969f // sdot v31.4s, v20.16b, v26.16b\n" + ".word 0x4e9a9668 // sdot v8.4s, v19.16b, v26.16b\n" + ".word 0x4e9a9649 // sdot v9.4s, v18.16b, v26.16b\n" + "add x12, x12, x5\n" ".word 0x4e99964a // sdot v10.4s, v18.16b, v25.16b\n" - "add x13, x13, x16\n" - DC_KERNEL_NO_MULT_11 ":\n" // Parent Loop BB225_26 Depth=1 - // Parent Loop BB225_13 Depth=2 - // => This Inner Loop Header: Depth=3 - "cmp w2, w5\n" - "b.lt " DC_KERNEL_NO_MULT_10 "b\n" - DC_KERNEL_NO_MULT_12 ":\n" // in Loop: Header=BB225_13 Depth=2 - "ldp x19, %[scratch_block_data], [sp, #152]\n" // 16-byte Folded Reload - "ldr %[output_block_data], [sp, #144]\n" // 8-byte Folded Reload - "mov v20.16b, v17.16b\n" - "mov v19.16b, v16.16b\n" - "add %[scratch_block_data], x0, #1\n" // =1 - "add %[output_block_data], x3, #4\n" // =4 - "add x19, x19, #16\n" // =16 - "mov v18.16b, v7.16b\n" - DC_KERNEL_NO_MULT_13 ":\n" // Parent Loop BB225_26 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB225_5 Depth 3 - // Child Loop BB225_11 Depth 3 - "cmp %[scratch_block_data], #2\n" // =2 - "b.ne " DC_KERNEL_NO_MULT_3 "b\n" - "b " DC_KERNEL_NO_MULT_25 "f\n" - DC_KERNEL_NO_MULT_14 ":\n" // in Loop: Header=BB225_26 Depth=1 + "st1 { v3.s }[3], [%[output_block_data]]\n" + "b.ne " DC_KERNEL_NO_MULT_15 "b\n" + "b " DC_KERNEL_NO_MULT_6 "b\n" + DC_KERNEL_NO_MULT_16 ":\n" // in Loop: Header=BB225_4 Depth=1 + "cmp w17, #1\n" // =1 + "add x16, %[bias_data], #32\n" // =32 + "b.lt " DC_KERNEL_NO_MULT_2 "b\n" + // %bb.17: // in Loop: Header=BB225_4 Depth=1 + "ldr w23, [sp, #276]\n" // 4-byte Folded Reload + "cmp w23, #1\n" // =1 + "b.lt " DC_KERNEL_NO_MULT_29 
"f\n" + // %bb.18: // in Loop: Header=BB225_4 Depth=1 + "str x16, [sp, #192]\n" // 8-byte Folded Spill "ldp q21, q22, [%[bias_data]]\n" - "ldr %[bias_data], [sp, #64]\n" // 8-byte Folded Reload - "ldr x7, [sp, #128]\n" // 8-byte Folded Reload - "mov w0, wzr\n" - "b " DC_KERNEL_NO_MULT_24 "f\n" - DC_KERNEL_NO_MULT_15 ":\n" // in Loop: Header=BB225_24 Depth=2 - "str w0, [sp, #240]\n" // 4-byte Folded Spill - "ldr %[scratch_block_data], [sp, #176]\n" // 8-byte Folded Reload - "add %[output_block_data], x7, %[filter_workspace]\n" - "ldp q23, q24, [x7]\n" - "ldp q25, q26, [%[output_block_data]]\n" - "add %[scratch_block_data], x7, x0\n" - "str %[output_block_data], [sp, #208]\n" // 8-byte Folded Spill - "ldp q27, q28, [%[scratch_block_data]]\n" - "mov w13, wzr\n" - "mov %[scratch_block_data], %[bias_data]\n" - "str %[bias_data], [sp, #224]\n" // 8-byte Folded Spill - "b " DC_KERNEL_NO_MULT_22 "f\n" - DC_KERNEL_NO_MULT_16 ":\n" // in Loop: Header=BB225_22 Depth=3 - "cmp w13, w14\n" - "orr w2, wzr, #0x4\n" - "csel w6, w5, w2, eq\n" - "add %[output_block_data], x7, #32\n" // =32 + "ldr x17, [sp, #184]\n" // 8-byte Folded Reload + "ldr x12, [sp, #80]\n" // 8-byte Folded Reload + "ldr x23, [sp, #248]\n" // 8-byte Folded Reload + "mov w24, wzr\n" + "b " DC_KERNEL_NO_MULT_20 "f\n" + DC_KERNEL_NO_MULT_19 ":\n" // in Loop: Header=BB225_20 Depth=2 + "ldr w12, [sp, #76]\n" // 4-byte Folded Reload + "add w24, w24, #1\n" // =1 + "ldr x21, [sp, #136]\n" // 8-byte Folded Reload + "ldr x17, [sp, #200]\n" // 8-byte Folded Reload + "cmp w24, w12\n" + "ldr x12, [sp, #232]\n" // 8-byte Folded Reload + "add x12, x12, x21\n" + "b.eq " DC_KERNEL_NO_MULT_28 "f\n" + DC_KERNEL_NO_MULT_20 ":\n" // Parent Loop BB225_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB225_23 Depth 3 + // Child Loop BB225_27 Depth 4 + "ldr %[output_block_data], [sp, #264]\n" // 8-byte Folded Reload + "ldp q23, q24, [x17]\n" + "mov x21, x12\n" + "mov w12, wzr\n" + "add x16, x17, %[output_block_data]\n" + "ldr %[output_block_data], [sp, #256]\n" // 8-byte Folded Reload + "ldp q25, q26, [x16]\n" + "str x16, [sp, #200]\n" // 8-byte Folded Spill + "add %[output_block_data], x17, x3\n" + "ldp q27, q28, [%[output_block_data]]\n" + "str x21, [sp, #232]\n" // 8-byte Folded Spill + "b " DC_KERNEL_NO_MULT_23 "f\n" + DC_KERNEL_NO_MULT_21 ":\n" // in Loop: Header=BB225_23 Depth=3 + "mov %[filter_workspace], x26\n" + DC_KERNEL_NO_MULT_22 ":\n" // in Loop: Header=BB225_23 Depth=3 + "ldr w17, [sp, #276]\n" // 4-byte Folded Reload + "add w12, w12, #1\n" // =1 + "cmp w12, w17\n" + "mov x17, x16\n" + "b.eq " DC_KERNEL_NO_MULT_19 "b\n" + DC_KERNEL_NO_MULT_23 ":\n" // Parent Loop BB225_4 Depth=1 + // Parent Loop BB225_20 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB225_27 Depth 4 + "mov x26, %[filter_workspace]\n" + "ldr w1, [sp, #280]\n" // 4-byte Folded Reload + "ldr w3, [sp, #284]\n" // 4-byte Folded Reload + "add x16, x17, #32\n" // =32 + "cmp w12, w1\n" + "mov w1, #4\n" + "csel w3, w3, w1, eq\n" + "cmp w3, #3\n" // =3 + "b.ge " DC_KERNEL_NO_MULT_25 "f\n" + // %bb.24: // in Loop: Header=BB225_23 Depth=3 "movi v29.16b, #0\n" + "cmp w3, #1\n" // =1 "movi v30.16b, #0\n" - "movi v8.16b, #0\n" "movi v31.16b, #0\n" - "cmp w6, #3\n" // =3 "movi v9.16b, #0\n" "movi v10.16b, #0\n" - "b.lt " DC_KERNEL_NO_MULT_18 "f\n" - // %bb.17: // in Loop: Header=BB225_22 Depth=3 - "ldr %[bias_data], [sp, #176]\n" // 8-byte Folded Reload - "add x19, %[output_block_data], %[filter_workspace]\n" - "ldp q29, q31, [x7, #32]\n" - "ldp q30, q9, [x19]\n" - "add 
%[bias_data], %[output_block_data], x2\n" - "ldp q8, q10, [%[bias_data]]\n" - DC_KERNEL_NO_MULT_18 ":\n" // in Loop: Header=BB225_22 Depth=3 - "mov w7, wzr\n" - "b " DC_KERNEL_NO_MULT_20 "f\n" - DC_KERNEL_NO_MULT_19 ":\n" // in Loop: Header=BB225_20 Depth=4 + "movi v8.16b, #0\n" + "b.ge " DC_KERNEL_NO_MULT_26 "f\n" + "b " DC_KERNEL_NO_MULT_21 "b\n" + DC_KERNEL_NO_MULT_25 ":\n" // in Loop: Header=BB225_23 Depth=3 + "ldr x23, [sp, #264]\n" // 8-byte Folded Reload + "mov %[filter_workspace], x22\n" + "mov x22, x15\n" + "mov x15, x14\n" + "add x23, x16, x23\n" + "mov x14, x13\n" + "mov x13, x20\n" + "mov x20, x16\n" + "mov x16, x25\n" + "ldr x25, [sp, #256]\n" // 8-byte Folded Reload + "ldp q8, q31, [x17, #32]\n" + "ldp q10, q30, [x23]\n" + "ldp x6, x23, [sp, #240]\n" // 16-byte Folded Reload + "add x25, x20, x25\n" + "ldp q9, q29, [x25]\n" + "mov x25, x16\n" + "mov x16, x20\n" + "mov x20, x13\n" + "mov x13, x14\n" + "mov x14, x15\n" + "mov x15, x22\n" + "mov x22, %[filter_workspace]\n" + "mov %[bias_data], x7\n" + DC_KERNEL_NO_MULT_26 ":\n" // in Loop: Header=BB225_23 Depth=3 + "mov %[filter_workspace], x26\n" + DC_KERNEL_NO_MULT_27 ":\n" // Parent Loop BB225_4 Depth=1 + // Parent Loop BB225_20 Depth=2 + // Parent Loop BB225_23 Depth=3 + // => This Inner Loop Header: Depth=4 "mov v3.16b, v21.16b\n" - "mov v11.16b, v22.16b\n" + "mov v4.16b, v22.16b\n" ".word 0x4e979643 // sdot v3.4s, v18.16b, v23.16b\n" - ".word 0x4e9894eb // sdot v11.4s, v7.16b, v24.16b\n" + ".word 0x4e9895e4 // sdot v4.4s, v15.16b, v24.16b\n" ".word 0x4e999663 // sdot v3.4s, v19.16b, v25.16b\n" - ".word 0x4e9a960b // sdot v11.4s, v16.16b, v26.16b\n" + ".word 0x4e9a94a4 // sdot v4.4s, v5.16b, v26.16b\n" ".word 0x4e9b9683 // sdot v3.4s, v20.16b, v27.16b\n" - ".word 0x4e9c962b // sdot v11.4s, v17.16b, v28.16b\n" + ".word 0x4e9c94c4 // sdot v4.4s, v6.16b, v28.16b\n" "sqrdmulh v3.4s, v3.4s, v1.4s\n" - "sqrdmulh v11.4s, v11.4s, v1.4s\n" + "sqrdmulh v4.4s, v4.4s, v1.4s\n" "sqrshl v3.4s, v3.4s, v2.4s\n" - "sqrshl v11.4s, v11.4s, v2.4s\n" + "sqrshl v4.4s, v4.4s, v2.4s\n" "sqxtn v3.4h, v3.4s\n" - "sqxtn2 v3.8h, v11.4s\n" + "sqxtn2 v3.8h, v4.4s\n" "sqadd v3.8h, v3.8h, v0.8h\n" "sqxtun v3.8b, v3.8h\n" - "umax v3.8b, v3.8b, v5.8b\n" + "umax v3.8b, v3.8b, v17.8b\n" "ushr v23.4s, v23.4s, #8\n" "ushr v24.4s, v24.4s, #8\n" "ushr v25.4s, v25.4s, #8\n" "ushr v26.4s, v26.4s, #8\n" "ushr v27.4s, v27.4s, #8\n" "ushr v28.4s, v28.4s, #8\n" - "umin v3.8b, v3.8b, v6.8b\n" - "sli v23.4s, v29.4s, #24\n" - "ushr v29.4s, v29.4s, #8\n" + "umin v3.8b, v3.8b, v14.8b\n" + "subs w3, w3, #1\n" // =1 + "sli v23.4s, v8.4s, #24\n" + "ushr v8.4s, v8.4s, #8\n" "sli v24.4s, v31.4s, #24\n" "ushr v31.4s, v31.4s, #8\n" - "sli v25.4s, v30.4s, #24\n" - "ushr v30.4s, v30.4s, #8\n" - "sli v26.4s, v9.4s, #24\n" - "ushr v9.4s, v9.4s, #8\n" - "sli v27.4s, v8.4s, #24\n" - "ushr v8.4s, v8.4s, #8\n" - "sli v28.4s, v10.4s, #24\n" + "sli v25.4s, v10.4s, #24\n" "ushr v10.4s, v10.4s, #8\n" - "str d3, [%[scratch_block_data]]\n" - "add %[scratch_block_data], x0, x16\n" - "add w7, w7, #1\n" // =1 - DC_KERNEL_NO_MULT_20 ":\n" // Parent Loop BB225_26 Depth=1 - // Parent Loop BB225_24 Depth=2 - // Parent Loop BB225_22 Depth=3 - // => This Inner Loop Header: Depth=4 - "cmp w7, w6\n" - "b.lt " DC_KERNEL_NO_MULT_19 "b\n" - // %bb.21: // in Loop: Header=BB225_22 Depth=3 - "add w13, w13, #1\n" // =1 - "mov x7, %[output_block_data]\n" - DC_KERNEL_NO_MULT_22 ":\n" // Parent Loop BB225_26 Depth=1 - // Parent Loop BB225_24 Depth=2 - // => This Loop Header: Depth=3 - // Child Loop BB225_20 Depth 4 
- "ldr w2, [sp, #292]\n" // 4-byte Folded Reload - "cmp w13, w2\n" - "b.lt " DC_KERNEL_NO_MULT_16 "b\n" - // %bb.23: // in Loop: Header=BB225_24 Depth=2 - "ldr x13, [sp, #120]\n" // 8-byte Folded Reload - "ldr %[bias_data], [sp, #224]\n" // 8-byte Folded Reload - "ldr w0, [sp, #240]\n" // 4-byte Folded Reload - "ldr x7, [sp, #208]\n" // 8-byte Folded Reload - "add %[bias_data], x2, x13\n" - "add w0, w0, #1\n" // =1 - DC_KERNEL_NO_MULT_24 ":\n" // Parent Loop BB225_26 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB225_22 Depth 3 - // Child Loop BB225_20 Depth 4 - "ldr w13, [sp, #288]\n" // 4-byte Folded Reload - "cmp w0, w13\n" - "b.lt " DC_KERNEL_NO_MULT_15 "b\n" - DC_KERNEL_NO_MULT_25 ":\n" // in Loop: Header=BB225_26 Depth=1 - "ldr x13, [sp, #128]\n" // 8-byte Folded Reload - "ldr %[scratch_block_data], [sp, #8]\n" // 8-byte Folded Reload - "ldp x19, %[bias_data], [sp, #48]\n" // 16-byte Folded Reload - "ldr w7, [sp, #36]\n" // 4-byte Folded Reload - "ldr x6, [sp, #24]\n" // 8-byte Folded Reload - "add x13, x13, %[scratch_block_data]\n" - "str x13, [sp, #128]\n" // 8-byte Folded Spill - "ldr x13, [sp, #64]\n" // 8-byte Folded Reload - "add %[bias_data], x2, #32\n" // =32 - "add w7, w7, #1\n" // =1 - "add x19, x19, %[scratch_block_data]\n" - "add x13, x13, #8\n" // =8 - "str x13, [sp, #64]\n" // 8-byte Folded Spill - "ldr x13, [sp, #40]\n" // 8-byte Folded Reload - "add x13, x13, #8\n" // =8 - DC_KERNEL_NO_MULT_26 ":\n" // =>This Loop Header: Depth=1 - // Child Loop BB225_24 Depth 2 - // Child Loop BB225_22 Depth 3 - // Child Loop BB225_20 Depth 4 - // Child Loop BB225_13 Depth 2 - // Child Loop BB225_5 Depth 3 - // Child Loop BB225_11 Depth 3 - "ldr w0, [sp, #20]\n" // 4-byte Folded Reload - "cmp w7, w0\n" - "b.lt " DC_KERNEL_NO_MULT_1 "b\n" - // %bb.27: - // Compiled intrinsics total stack 448, now 304 for spillage only. - "add sp, sp, #304\n" // =448 + "sli v26.4s, v30.4s, #24\n" + "ushr v30.4s, v30.4s, #8\n" + "sli v27.4s, v9.4s, #24\n" + "ushr v9.4s, v9.4s, #8\n" + "sli v28.4s, v29.4s, #24\n" + "ushr v29.4s, v29.4s, #8\n" + "str d3, [x21]\n" + "add x21, x21, x5\n" + "b.ne " DC_KERNEL_NO_MULT_27 "b\n" + "b " DC_KERNEL_NO_MULT_22 "b\n" + DC_KERNEL_NO_MULT_28 ":\n" // in Loop: Header=BB225_4 Depth=1 + "ldr %[bias_data], [sp, #192]\n" // 8-byte Folded Reload + "ldr x26, [sp, #16]\n" // 8-byte Folded Reload + "b " DC_KERNEL_NO_MULT_3 "b\n" + DC_KERNEL_NO_MULT_29 ":\n" // in Loop: Header=BB225_4 Depth=1 + "ldr w12, [sp, #12]\n" // 4-byte Folded Reload + "cmp w17, #2\n" // =2 + "b.hs " DC_KERNEL_NO_MULT_31 "f\n" + // %bb.30: // in Loop: Header=BB225_4 Depth=1 + "ldr x23, [sp, #248]\n" // 8-byte Folded Reload + "mov w12, wzr\n" + "b " DC_KERNEL_NO_MULT_33 "f\n" + DC_KERNEL_NO_MULT_31 ":\n" // Parent Loop BB225_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "subs w12, w12, #2\n" // =2 + "b.ne " DC_KERNEL_NO_MULT_31 "b\n" + // %bb.32: // in Loop: Header=BB225_4 Depth=1 + "ldr w12, [sp, #12]\n" // 4-byte Folded Reload + "ldr x23, [sp, #248]\n" // 8-byte Folded Reload + "cmp w17, w12\n" + "b.eq " DC_KERNEL_NO_MULT_2 "b\n" + DC_KERNEL_NO_MULT_33 ":\n" // in Loop: Header=BB225_4 Depth=1 + "sub w12, w17, w12\n" + DC_KERNEL_NO_MULT_34 ":\n" // Parent Loop BB225_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "subs w12, w12, #1\n" // =1 + "b.ne " DC_KERNEL_NO_MULT_34 "b\n" + "b " DC_KERNEL_NO_MULT_2 "b\n" + DC_KERNEL_NO_MULT_35 ":\n" + // Compiled intrinsics total stack 464, now 320 for spillage only. + "add sp, sp, #320\n" // =464 : // Outputs. 
[ scratch_block_data ] "+r"(scratch_block_data), @@ -7827,8 +7927,8 @@ struct KernelMacroBlock This Inner Loop Header: Depth=3 - "cmp x30, x19\n" - "b.lt " DC_KERNEL_NO_MULT_STRIDE_4 "b\n" - "b " DC_KERNEL_NO_MULT_STRIDE_7 "f\n" - DC_KERNEL_NO_MULT_STRIDE_6 ":\n" // in Loop: Header=BB227_7 Depth=3 - "mov v8.16b, v24.16b\n" - "mov v10.16b, v24.16b\n" - ".word 0x4e9e9728 // sdot v8.4s, v25.16b, v30.16b\n" - ".word 0x4e9d9748 // sdot v8.4s, v26.16b, v29.16b\n" - ".word 0x4e9c972a // sdot v10.4s, v25.16b, v28.16b\n" - ".word 0x4e9c9768 // sdot v8.4s, v27.16b, v28.16b\n" - ".word 0x4e9f974a // sdot v10.4s, v26.16b, v31.16b\n" - ".word 0x4e89976a // sdot v10.4s, v27.16b, v9.16b\n" - "sqrdmulh v8.4s, v8.4s, v3.4s\n" + // implicit-def: $q17 + // implicit-def: $q6 + // implicit-def: $q11 + // implicit-def: $q13 + // implicit-def: $q14 + // implicit-def: $q15 + // implicit-def: $q20 + "b " DC_KERNEL_NO_MULT_STRIDE_4 "f\n" + DC_KERNEL_NO_MULT_STRIDE_2 ":\n" // in Loop: Header=BB227_4 Depth=1 + "add x25, %[bias_data], #32\n" // =32 + "mov v22.16b, v12.16b\n" + DC_KERNEL_NO_MULT_STRIDE_3 ":\n" // in Loop: Header=BB227_4 Depth=1 + "add x10, x10, #1\n" // =1 + "cmp x10, x8\n" + "add x9, x9, #8\n" // =8 + "mov %[bias_data], x25\n" + "b.eq " DC_KERNEL_NO_MULT_STRIDE_35 "f\n" + DC_KERNEL_NO_MULT_STRIDE_4 ":\n" // =>This Loop Header: Depth=1 + // Child Loop BB227_30 Depth 2 + // Child Loop BB227_22 Depth 2 + // Child Loop BB227_7 Depth 2 + // Child Loop BB227_10 Depth 2 + // Child Loop BB227_13 Depth 2 + // Child Loop BB227_26 Depth 2 + "ldr x15, [sp, #152]\n" // 8-byte Folded Reload + "add w14, w10, w10, lsl #1\n" + "lsl w14, w14, #5\n" + "cmp w1, #2\n" // =2 + "add x27, x15, x14\n" + "madd x26, x10, %[function_params], %[scratch_block_data]\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_15 "f\n" + // %bb.5: // in Loop: Header=BB227_4 Depth=1 + "ubfx x14, x9, #3, #29\n" + "lsl x25, x14, #3\n" + "ldr x14, [sp, #88]\n" // 8-byte Folded Reload + "ldr q24, [x27]\n" + "ldr q25, [x27, #32]\n" + "ldr q26, [x27, #64]\n" + "add x24, x14, x25\n" + "ldr x14, [sp, #144]\n" // 8-byte Folded Reload + "ldr q27, [%[bias_data]]\n" + "ldr q31, [x26]\n" + "ldr q8, [x26, x12]\n" + "ldr q30, [x26, x5]\n" + "ldr q29, [x26, x19]\n" + "ldr q28, [x26, x7]\n" + "lsl w15, w10, #3\n" + "cmp w23, #1\n" // =1 + "add x28, x14, x15\n" + "mov v12.16b, v22.16b\n" + "mov w14, wzr\n" + "b.lt " DC_KERNEL_NO_MULT_STRIDE_9 "f\n" + // %bb.6: // in Loop: Header=BB227_4 Depth=1 + "mov x17, xzr\n" + "add x22, x26, #32\n" // =32 + "mov x21, x23\n" + "mov v19.16b, v30.16b\n" + DC_KERNEL_NO_MULT_STRIDE_7 ":\n" // Parent Loop BB227_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v20.16b, v27.16b\n" + "mov v21.16b, v27.16b\n" + ".word 0x4e9f9714 // sdot v20.4s, v24.16b, v31.16b\n" + ".word 0x4e939715 // sdot v21.4s, v24.16b, v19.16b\n" + ".word 0x4e889734 // sdot v20.4s, v25.16b, v8.16b\n" + ".word 0x4e9d9735 // sdot v21.4s, v25.16b, v29.16b\n" + ".word 0x4e939754 // sdot v20.4s, v26.16b, v19.16b\n" + ".word 0x4e9c9755 // sdot v21.4s, v26.16b, v28.16b\n" + "sqrdmulh v20.4s, v20.4s, v3.4s\n" + "and %[output_block_data], x17, #0xffffffe0\n" + "sqrdmulh v21.4s, v21.4s, v3.4s\n" + "sqrshl v20.4s, v20.4s, v4.4s\n" + "add %[output_block_data], x22, x3\n" + "sqrshl v21.4s, v21.4s, v4.4s\n" + "sqxtn v20.4h, v20.4s\n" + "rev32 v22.8h, v31.8h\n" + "rev32 v23.8h, v8.8h\n" + "rev32 v9.8h, v30.8h\n" + "rev32 v10.8h, v29.8h\n" + "ldr q31, [%[output_block_data]]\n" + "ldr q8, [%[output_block_data], x12]\n" + "ldr q30, [%[output_block_data], x5]\n" + "ldr q29, 
[%[output_block_data], x19]\n" + "rev32 v19.8h, v28.8h\n" + "ldr q28, [%[output_block_data], x7]\n" + "sqxtn2 v20.8h, v21.4s\n" + "sqadd v20.8h, v20.8h, v0.8h\n" + "sqxtun v20.8b, v20.8h\n" + "add x15, x28, w14, sxtw\n" + "umax v20.8b, v20.8b, v1.8b\n" + "add %[output_block_data], x15, x11\n" + "umin v20.8b, v20.8b, v2.8b\n" + "mov v11.16b, v27.16b\n" + "str s20, [x15]\n" + "st1 { v20.s }[1], [%[output_block_data]]\n" + "trn1 v20.8h, v22.8h, v31.8h\n" + "mov v21.16b, v27.16b\n" + "trn1 v22.8h, v23.8h, v8.8h\n" + "trn1 v23.8h, v9.8h, v30.8h\n" + ".word 0x4e94970b // sdot v11.4s, v24.16b, v20.16b\n" + "trn1 v9.8h, v10.8h, v29.8h\n" + ".word 0x4e979715 // sdot v21.4s, v24.16b, v23.16b\n" + ".word 0x4e96972b // sdot v11.4s, v25.16b, v22.16b\n" + "trn1 v19.8h, v19.8h, v28.8h\n" + ".word 0x4e899735 // sdot v21.4s, v25.16b, v9.16b\n" + ".word 0x4e97974b // sdot v11.4s, v26.16b, v23.16b\n" + ".word 0x4e939755 // sdot v21.4s, v26.16b, v19.16b\n" + "sqrdmulh v19.4s, v11.4s, v3.4s\n" + "sqrdmulh v20.4s, v21.4s, v3.4s\n" + "sqrshl v19.4s, v19.4s, v4.4s\n" + "sqrshl v20.4s, v20.4s, v4.4s\n" + "sqxtn v19.4h, v19.4s\n" + "sqxtn2 v19.8h, v20.4s\n" + "sqadd v19.8h, v19.8h, v0.8h\n" + "sqxtun v19.8b, v19.8h\n" + "add x15, x15, x6\n" + "umax v19.8b, v19.8b, v1.8b\n" + "add %[output_block_data], x15, x11\n" + "umin v19.8b, v19.8b, v2.8b\n" + "add x17, x17, #32\n" // =32 + "subs x21, x21, #1\n" // =1 + "str s19, [x15]\n" + "st1 { v19.s }[1], [%[output_block_data]]\n" + "add w14, w14, w20\n" + "mov v19.16b, v30.16b\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_7 "b\n" + // %bb.8: // in Loop: Header=BB227_4 Depth=1 + "mov v20.16b, v31.16b\n" + "mov v15.16b, v8.16b\n" + "mov v14.16b, v30.16b\n" + "mov v13.16b, v29.16b\n" + "mov v11.16b, v28.16b\n" + "mov w14, w23\n" + DC_KERNEL_NO_MULT_STRIDE_9 ":\n" // in Loop: Header=BB227_4 Depth=1 + "cmp w14, w13\n" + "ldr x14, [sp, #136]\n" // 8-byte Folded Reload + "b.ge " DC_KERNEL_NO_MULT_STRIDE_11 "f\n" + DC_KERNEL_NO_MULT_STRIDE_10 ":\n" // Parent Loop BB227_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v9.16b, v27.16b\n" + "mov v10.16b, v27.16b\n" + ".word 0x4e9f9709 // sdot v9.4s, v24.16b, v31.16b\n" + ".word 0x4e889729 // sdot v9.4s, v25.16b, v8.16b\n" + ".word 0x4e9e970a // sdot v10.4s, v24.16b, v30.16b\n" + ".word 0x4e9e9749 // sdot v9.4s, v26.16b, v30.16b\n" + ".word 0x4e9d972a // sdot v10.4s, v25.16b, v29.16b\n" + ".word 0x4e9c974a // sdot v10.4s, v26.16b, v28.16b\n" + "sqrdmulh v9.4s, v9.4s, v3.4s\n" "sqrdmulh v10.4s, v10.4s, v3.4s\n" - "sqrshl v8.4s, v8.4s, v4.4s\n" + "sqrshl v9.4s, v9.4s, v4.4s\n" "sqrshl v10.4s, v10.4s, v4.4s\n" - "sqxtn v8.4h, v8.4s\n" - "sqxtn2 v8.8h, v10.4s\n" - "sqadd v8.8h, v8.8h, v0.8h\n" - "sqxtun v8.8b, v8.8h\n" - "umax v8.8b, v8.8b, v1.8b\n" - "add x12, x28, x24\n" + "sqxtn v9.4h, v9.4s\n" + "sqxtn2 v9.8h, v10.4s\n" + "sqadd v9.8h, v9.8h, v0.8h\n" + "sqxtun v9.8b, v9.8h\n" + "umax v9.8b, v9.8b, v1.8b\n" + "rev32 v31.8h, v31.8h\n" + "rev32 v8.8h, v8.8h\n" "rev32 v30.8h, v30.8h\n" "rev32 v29.8h, v29.8h\n" "rev32 v28.8h, v28.8h\n" - "rev32 v31.8h, v31.8h\n" - "rev32 v9.8h, v9.8h\n" - "umin v8.8b, v8.8b, v2.8b\n" - "add x30, x30, #1\n" // =1 - "trn1 v30.8h, v30.8h, v19.8h\n" - "trn1 v29.8h, v29.8h, v20.8h\n" - "trn1 v31.8h, v31.8h, v22.8h\n" - "trn1 v28.8h, v28.8h, v21.8h\n" - "trn1 v9.8h, v9.8h, v23.8h\n" - "str s8, [x22, x24]\n" - "st1 { v8.s }[1], [x12]\n" - "add x24, x24, x7\n" - DC_KERNEL_NO_MULT_STRIDE_7 ":\n" // Parent Loop BB227_19 Depth=1 - // Parent Loop BB227_9 Depth=2 - // => This Inner Loop Header: Depth=3 - "cmp x30, 
x11\n" - "b.lt " DC_KERNEL_NO_MULT_STRIDE_6 "b\n" - // %bb.8: // in Loop: Header=BB227_9 Depth=2 - "add x29, x29, #16\n" // =16 - "add x25, x25, #1\n" // =1 - "add x15, x15, #4\n" // =4 - DC_KERNEL_NO_MULT_STRIDE_9 ":\n" // Parent Loop BB227_19 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB227_5 Depth 3 - // Child Loop BB227_7 Depth 3 - "cmp x25, #2\n" // =2 - "b.ne " DC_KERNEL_NO_MULT_STRIDE_3 "b\n" - // %bb.10: // in Loop: Header=BB227_19 Depth=1 - "ldr %[filter_workspace], [sp, #48]\n" // 8-byte Folded Reload - "ldp %[scratch_block_data], %[output_block_data], [sp, #32]\n" // 16-byte Folded Reload - "ldr x26, [sp, #24]\n" // 8-byte Folded Reload - "ldr w27, [sp, #20]\n" // 4-byte Folded Reload - "ldr x28, [sp, #8]\n" // 8-byte Folded Reload - "b " DC_KERNEL_NO_MULT_STRIDE_18 "f\n" - DC_KERNEL_NO_MULT_STRIDE_11 ":\n" // in Loop: Header=BB227_19 Depth=1 - "mul w12, w28, w8\n" - "add x12, %[scratch_block_data], w12, sxtw\n" - "add x16, x12, x10\n" - "ldp q8, q9, [x16]\n" - "add x16, x12, x17\n" - "ldp q24, q25, [x23]\n" - "ldp q26, q27, [x23, #32]\n" - "ldp q28, q29, [x23, #64]\n" - "ldp q10, q12, [x16]\n" - "ldp q30, q31, [%[bias_data]]\n" - "ldp q13, q11, [x12]\n" - "mov x24, xzr\n" - "add x23, x12, #32\n" // =32 - "b " DC_KERNEL_NO_MULT_STRIDE_17 "f\n" - DC_KERNEL_NO_MULT_STRIDE_12 ":\n" // in Loop: Header=BB227_17 Depth=2 - "cmp w11, w14\n" - "ccmp x21, x15, #0, eq\n" - "b.eq " DC_KERNEL_NO_MULT_STRIDE_14 "f\n" - // %bb.13: // in Loop: Header=BB227_17 Depth=2 - "and x12, x15, #0xffffffe0\n" - "add x12, x23, x12\n" - "add x16, x12, x10\n" - "add x25, x12, x17\n" - "ldp q5, q7, [x12]\n" - "ldp q6, q17, [x16]\n" - "ldp q16, q18, [x25]\n" - DC_KERNEL_NO_MULT_STRIDE_14 ":\n" // in Loop: Header=BB227_17 Depth=2 - "mov v14.16b, v30.16b\n" - "mov v15.16b, v31.16b\n" - ".word 0x4e8d970e // sdot v14.4s, v24.16b, v13.16b\n" - ".word 0x4e88974e // sdot v14.4s, v26.16b, v8.16b\n" - ".word 0x4e8b972f // sdot v15.4s, v25.16b, v11.16b\n" - ".word 0x4e8a978e // sdot v14.4s, v28.16b, v10.16b\n" - ".word 0x4e89976f // sdot v15.4s, v27.16b, v9.16b\n" - ".word 0x4e8c97af // sdot v15.4s, v29.16b, v12.16b\n" - "sqrdmulh v14.4s, v14.4s, v3.4s\n" - "sqrdmulh v15.4s, v15.4s, v3.4s\n" - "sqrshl v14.4s, v14.4s, v4.4s\n" - "sqrshl v15.4s, v15.4s, v4.4s\n" - "sqxtn v14.4h, v14.4s\n" - "sqxtn2 v14.8h, v15.4s\n" - "sqadd v14.8h, v14.8h, v0.8h\n" - "sqxtun v14.8b, v14.8h\n" - "rev32 v13.8h, v13.8h\n" - "rev32 v8.8h, v8.8h\n" - "rev32 v10.8h, v10.8h\n" - "rev32 v11.8h, v11.8h\n" - "rev32 v9.8h, v9.8h\n" - "rev32 v12.8h, v12.8h\n" + "umin v9.8b, v9.8b, v2.8b\n" + "add x15, x24, x11\n" + "subs x14, x14, #1\n" // =1 + "trn1 v31.8h, v31.8h, v20.8h\n" + "trn1 v8.8h, v8.8h, v15.8h\n" + "trn1 v29.8h, v29.8h, v13.8h\n" + "trn1 v30.8h, v30.8h, v14.8h\n" + "trn1 v28.8h, v28.8h, v11.8h\n" + "str s9, [x24]\n" + "add x24, x24, x20\n" + "st1 { v9.s }[1], [x15]\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_10 "b\n" + DC_KERNEL_NO_MULT_STRIDE_11 ":\n" // in Loop: Header=BB227_4 Depth=1 + "ldr q24, [x27, #16]\n" + "ldr q25, [x27, #48]\n" + "ldr q26, [x27, #80]\n" + "ldr q30, [x26, #16]!\n" + "ldr q27, [%[bias_data], #16]\n" + "cmp w23, #0\n" // =0 + "ldr q8, [x26, x12]\n" + "ldr q31, [x26, x5]\n" + "ldr q29, [x26, x19]\n" + "ldr q28, [x26, x7]\n" + "b.le " DC_KERNEL_NO_MULT_STRIDE_24 "f\n" + // %bb.12: // in Loop: Header=BB227_4 Depth=1 + "mov w14, wzr\n" + "mov x17, xzr\n" + "add x22, x26, #32\n" // =32 + "add x24, x28, #4\n" // =4 + "mov x21, x23\n" + "mov v19.16b, v31.16b\n" + DC_KERNEL_NO_MULT_STRIDE_13 ":\n" // Parent Loop 
BB227_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v5.16b, v27.16b\n" + "mov v20.16b, v27.16b\n" + ".word 0x4e9e9705 // sdot v5.4s, v24.16b, v30.16b\n" + ".word 0x4e939714 // sdot v20.4s, v24.16b, v19.16b\n" + ".word 0x4e889725 // sdot v5.4s, v25.16b, v8.16b\n" + ".word 0x4e9d9734 // sdot v20.4s, v25.16b, v29.16b\n" + ".word 0x4e939745 // sdot v5.4s, v26.16b, v19.16b\n" + ".word 0x4e9c9754 // sdot v20.4s, v26.16b, v28.16b\n" + "sqrdmulh v5.4s, v5.4s, v3.4s\n" + "and %[output_block_data], x17, #0xffffffe0\n" + "sqrdmulh v20.4s, v20.4s, v3.4s\n" + "sqrshl v5.4s, v5.4s, v4.4s\n" + "add %[output_block_data], x22, x3\n" + "sqrshl v20.4s, v20.4s, v4.4s\n" + "sqxtn v5.4h, v5.4s\n" + "rev32 v21.8h, v30.8h\n" + "rev32 v22.8h, v8.8h\n" + "rev32 v23.8h, v31.8h\n" + "rev32 v9.8h, v29.8h\n" + "ldr q30, [%[output_block_data]]\n" + "ldr q8, [%[output_block_data], x12]\n" + "ldr q31, [%[output_block_data], x5]\n" + "ldr q29, [%[output_block_data], x19]\n" + "rev32 v19.8h, v28.8h\n" + "ldr q28, [%[output_block_data], x7]\n" + "sqxtn2 v5.8h, v20.4s\n" + "sqadd v5.8h, v5.8h, v0.8h\n" + "sqxtun v5.8b, v5.8h\n" + "add x15, x24, w14, sxtw\n" + "umax v5.8b, v5.8b, v1.8b\n" + "add %[output_block_data], x15, x11\n" + "umin v5.8b, v5.8b, v2.8b\n" + "mov v10.16b, v27.16b\n" + "str s5, [x15]\n" + "st1 { v5.s }[1], [%[output_block_data]]\n" + "trn1 v5.8h, v21.8h, v30.8h\n" + "mov v20.16b, v27.16b\n" + "trn1 v21.8h, v22.8h, v8.8h\n" + "trn1 v22.8h, v23.8h, v31.8h\n" + ".word 0x4e85970a // sdot v10.4s, v24.16b, v5.16b\n" + "trn1 v23.8h, v9.8h, v29.8h\n" + ".word 0x4e969714 // sdot v20.4s, v24.16b, v22.16b\n" + ".word 0x4e95972a // sdot v10.4s, v25.16b, v21.16b\n" + "trn1 v19.8h, v19.8h, v28.8h\n" + ".word 0x4e979734 // sdot v20.4s, v25.16b, v23.16b\n" + ".word 0x4e96974a // sdot v10.4s, v26.16b, v22.16b\n" + ".word 0x4e939754 // sdot v20.4s, v26.16b, v19.16b\n" + "sqrdmulh v5.4s, v10.4s, v3.4s\n" + "sqrdmulh v19.4s, v20.4s, v3.4s\n" + "sqrshl v5.4s, v5.4s, v4.4s\n" + "sqrshl v19.4s, v19.4s, v4.4s\n" + "sqxtn v5.4h, v5.4s\n" + "sqxtn2 v5.8h, v19.4s\n" + "sqadd v5.8h, v5.8h, v0.8h\n" + "sqxtun v5.8b, v5.8h\n" + "add x15, x15, x6\n" + "umax v5.8b, v5.8b, v1.8b\n" + "add x17, x17, #32\n" // =32 + "subs x21, x21, #1\n" // =1 + "add %[output_block_data], x15, x11\n" + "umin v5.8b, v5.8b, v2.8b\n" + "add w14, w14, w20\n" + "mov v19.16b, v31.16b\n" + "str s5, [x15]\n" + "st1 { v5.s }[1], [%[output_block_data]]\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_13 "b\n" + // %bb.14: // in Loop: Header=BB227_4 Depth=1 + "mov v20.16b, v30.16b\n" + "mov v15.16b, v8.16b\n" + "mov v14.16b, v31.16b\n" + "mov v13.16b, v29.16b\n" + "mov v11.16b, v28.16b\n" + "mov w14, w23\n" + "cmp w14, w13\n" + "b.ge " DC_KERNEL_NO_MULT_STRIDE_2 "b\n" + "b " DC_KERNEL_NO_MULT_STRIDE_25 "f\n" + DC_KERNEL_NO_MULT_STRIDE_15 ":\n" // in Loop: Header=BB227_4 Depth=1 "cmp w13, #1\n" // =1 - "umax v14.8b, v14.8b, v1.8b\n" - "trn1 v13.8h, v13.8h, v5.8h\n" - "trn1 v11.8h, v11.8h, v7.8h\n" - "ccmp x21, x15, #0, le\n" - "trn1 v8.8h, v8.8h, v6.8h\n" - "trn1 v9.8h, v9.8h, v17.8h\n" - "trn1 v10.8h, v10.8h, v16.8h\n" - "umin v14.8b, v14.8b, v2.8b\n" - "trn1 v12.8h, v12.8h, v18.8h\n" - "str d14, [x22]\n" - "b.eq " DC_KERNEL_NO_MULT_STRIDE_16 "f\n" - // %bb.15: // in Loop: Header=BB227_17 Depth=2 + "add x25, %[bias_data], #32\n" // =32 + "b.lt " DC_KERNEL_NO_MULT_STRIDE_3 "b\n" + // %bb.16: // in Loop: Header=BB227_4 Depth=1 + "stp q13, q11, [sp, #96]\n" // 32-byte Folded Spill + "add x15, x26, x12\n" + "ldp q9, q10, [x15]\n" + "ldr x15, [sp, #144]\n" // 8-byte Folded 
Reload + "lsl w14, w10, #3\n" + "ldp q30, q31, [%[bias_data]]\n" + "add x17, x26, x5\n" + "add %[bias_data], x15, x14\n" + "ldr w14, [sp, #84]\n" // 4-byte Folded Reload + "ldp q24, q25, [x27]\n" + "ldp q26, q27, [x27, #32]\n" + "ldp q28, q29, [x27, #64]\n" + "ldp q12, q11, [x26], #32\n" + "ldp q8, q13, [x17]\n" + "cmp w13, w14\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_27 "f\n" + // %bb.17: // in Loop: Header=BB227_4 Depth=1 + "ldr x14, [sp, #72]\n" // 8-byte Folded Reload + "mov x24, xzr\n" + "mov w27, wzr\n" + "mov x28, x13\n" + "mov v19.16b, v15.16b\n" + "mov v5.16b, v14.16b\n" + "cbnz x14, " DC_KERNEL_NO_MULT_STRIDE_21 "f\n" + "b " DC_KERNEL_NO_MULT_STRIDE_22 "f\n" + DC_KERNEL_NO_MULT_STRIDE_18 ":\n" // in Loop: Header=BB227_22 Depth=2 "mov v14.16b, v30.16b\n" - ".word 0x4e8d970e // sdot v14.4s, v24.16b, v13.16b\n" - "mov v13.16b, v31.16b\n" - ".word 0x4e8b972d // sdot v13.4s, v25.16b, v11.16b\n" - ".word 0x4e88974e // sdot v14.4s, v26.16b, v8.16b\n" - ".word 0x4e89976d // sdot v13.4s, v27.16b, v9.16b\n" - ".word 0x4e8a978e // sdot v14.4s, v28.16b, v10.16b\n" - ".word 0x4e8c97ad // sdot v13.4s, v29.16b, v12.16b\n" + ".word 0x4e8c970e // sdot v14.4s, v24.16b, v12.16b\n" + "mov v12.16b, v31.16b\n" + ".word 0x4e8b972c // sdot v12.4s, v25.16b, v11.16b\n" + ".word 0x4e89974e // sdot v14.4s, v26.16b, v9.16b\n" + ".word 0x4e8a976c // sdot v12.4s, v27.16b, v10.16b\n" + ".word 0x4e88978e // sdot v14.4s, v28.16b, v8.16b\n" + ".word 0x4e8d97ac // sdot v12.4s, v29.16b, v13.16b\n" "sqrdmulh v8.4s, v14.4s, v3.4s\n" - "sqrdmulh v9.4s, v13.4s, v3.4s\n" + "sqrdmulh v9.4s, v12.4s, v3.4s\n" "sqrshl v8.4s, v8.4s, v4.4s\n" "sqrshl v9.4s, v9.4s, v4.4s\n" "sqxtn v8.4h, v8.4s\n" @@ -8205,34 +8396,220 @@ struct KernelMacroBlock This Inner Loop Header: Depth=2 - "cmp x24, x11\n" - "b.lt " DC_KERNEL_NO_MULT_STRIDE_12 "b\n" - DC_KERNEL_NO_MULT_STRIDE_18 ":\n" // in Loop: Header=BB227_19 Depth=1 - "add %[bias_data], x2, #32\n" // =32 - "add x8, x8, #1\n" // =1 - DC_KERNEL_NO_MULT_STRIDE_19 ":\n" // =>This Loop Header: Depth=1 - // Child Loop BB227_17 Depth 2 - // Child Loop BB227_9 Depth 2 - // Child Loop BB227_5 Depth 3 - // Child Loop BB227_7 Depth 3 - "cmp x8, x26\n" - "b.lt " DC_KERNEL_NO_MULT_STRIDE_1 "b\n" - // %bb.20: - // Compiled intrinsics total stack 208, now 64 for spillage only. 
- "add sp, sp, #64\n" // =208 + "mov v14.16b, v30.16b\n" + "mov v15.16b, v31.16b\n" + ".word 0x4e8c970e // sdot v14.4s, v24.16b, v12.16b\n" + ".word 0x4e89974e // sdot v14.4s, v26.16b, v9.16b\n" + ".word 0x4e8b972f // sdot v15.4s, v25.16b, v11.16b\n" + ".word 0x4e88978e // sdot v14.4s, v28.16b, v8.16b\n" + ".word 0x4e8a976f // sdot v15.4s, v27.16b, v10.16b\n" + ".word 0x4e8d97af // sdot v15.4s, v29.16b, v13.16b\n" + "sqrdmulh v14.4s, v14.4s, v3.4s\n" + "sqrdmulh v15.4s, v15.4s, v3.4s\n" + "sqrshl v14.4s, v14.4s, v4.4s\n" + "sqrshl v15.4s, v15.4s, v4.4s\n" + "sqxtn v14.4h, v14.4s\n" + "sqxtn2 v14.8h, v15.4s\n" + "sqadd v14.8h, v14.8h, v0.8h\n" + "sqxtun v14.8b, v14.8h\n" + "rev32 v12.8h, v12.8h\n" + "rev32 v9.8h, v9.8h\n" + "rev32 v8.8h, v8.8h\n" + "rev32 v11.8h, v11.8h\n" + "rev32 v10.8h, v10.8h\n" + "rev32 v13.8h, v13.8h\n" + "umax v14.8b, v14.8b, v1.8b\n" + "add x15, %[bias_data], w27, sxtw\n" + "cmp w16, #1\n" // =1 + "trn1 v12.8h, v12.8h, v6.8h\n" + "trn1 v11.8h, v11.8h, v22.8h\n" + "trn1 v9.8h, v9.8h, v17.8h\n" + "trn1 v10.8h, v10.8h, v7.8h\n" + "trn1 v8.8h, v8.8h, v18.8h\n" + "umin v14.8b, v14.8b, v2.8b\n" + "trn1 v13.8h, v13.8h, v16.8h\n" + "str d14, [x15]\n" + "b.gt " DC_KERNEL_NO_MULT_STRIDE_18 "b\n" + // %bb.23: // in Loop: Header=BB227_22 Depth=2 + "cbz x14, " DC_KERNEL_NO_MULT_STRIDE_19 "b\n" + "b " DC_KERNEL_NO_MULT_STRIDE_18 "b\n" + DC_KERNEL_NO_MULT_STRIDE_24 ":\n" // in Loop: Header=BB227_4 Depth=1 + "mov w14, wzr\n" + "cmp w14, w13\n" + "b.ge " DC_KERNEL_NO_MULT_STRIDE_2 "b\n" + DC_KERNEL_NO_MULT_STRIDE_25 ":\n" // in Loop: Header=BB227_4 Depth=1 + "ldr x14, [sp, #8]\n" // 8-byte Folded Reload + "ldr x15, [sp, #136]\n" // 8-byte Folded Reload + "add x14, x14, x25\n" + DC_KERNEL_NO_MULT_STRIDE_26 ":\n" // Parent Loop BB227_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v5.16b, v27.16b\n" + "mov v19.16b, v27.16b\n" + ".word 0x4e9e9705 // sdot v5.4s, v24.16b, v30.16b\n" + ".word 0x4e889725 // sdot v5.4s, v25.16b, v8.16b\n" + ".word 0x4e9f9713 // sdot v19.4s, v24.16b, v31.16b\n" + ".word 0x4e9f9745 // sdot v5.4s, v26.16b, v31.16b\n" + ".word 0x4e9d9733 // sdot v19.4s, v25.16b, v29.16b\n" + ".word 0x4e9c9753 // sdot v19.4s, v26.16b, v28.16b\n" + "sqrdmulh v5.4s, v5.4s, v3.4s\n" + "sqrdmulh v19.4s, v19.4s, v3.4s\n" + "sqrshl v5.4s, v5.4s, v4.4s\n" + "sqrshl v19.4s, v19.4s, v4.4s\n" + "sqxtn v5.4h, v5.4s\n" + "sqxtn2 v5.8h, v19.4s\n" + "sqadd v5.8h, v5.8h, v0.8h\n" + "sqxtun v5.8b, v5.8h\n" + "umax v5.8b, v5.8b, v1.8b\n" + "mov v9.16b, v20.16b\n" + "rev32 v20.8h, v30.8h\n" + "rev32 v21.8h, v8.8h\n" + "rev32 v22.8h, v31.8h\n" + "rev32 v23.8h, v29.8h\n" + "rev32 v28.8h, v28.8h\n" + "umin v5.8b, v5.8b, v2.8b\n" + "add x17, x14, x11\n" + "subs x15, x15, #1\n" // =1 + "trn1 v30.8h, v20.8h, v9.8h\n" + "mov v20.16b, v9.16b\n" + "trn1 v8.8h, v21.8h, v15.8h\n" + "trn1 v29.8h, v23.8h, v13.8h\n" + "trn1 v31.8h, v22.8h, v14.8h\n" + "trn1 v28.8h, v28.8h, v11.8h\n" + "str s5, [x14]\n" + "add x14, x14, x20\n" + "st1 { v5.s }[1], [x17]\n" + "b.ne " DC_KERNEL_NO_MULT_STRIDE_26 "b\n" + "b " DC_KERNEL_NO_MULT_STRIDE_2 "b\n" + DC_KERNEL_NO_MULT_STRIDE_27 ":\n" // in Loop: Header=BB227_4 Depth=1 + "ldr x28, [sp, #72]\n" // 8-byte Folded Reload + "mov w14, wzr\n" + "mov x24, xzr\n" + "mov x27, x13\n" + "stp q20, q15, [sp, #16]\n" // 32-byte Folded Spill + "str q14, [sp, #48]\n" // 16-byte Folded Spill + "b " DC_KERNEL_NO_MULT_STRIDE_30 "f\n" + DC_KERNEL_NO_MULT_STRIDE_28 ":\n" // in Loop: Header=BB227_30 Depth=2 + "mov v5.16b, v30.16b\n" + ".word 0x4e8c9705 // sdot v5.4s, v24.16b, v12.16b\n" 
+ "mov v19.16b, v31.16b\n" + ".word 0x4e8b9733 // sdot v19.4s, v25.16b, v11.16b\n" + ".word 0x4e899745 // sdot v5.4s, v26.16b, v9.16b\n" + ".word 0x4e8a9773 // sdot v19.4s, v27.16b, v10.16b\n" + ".word 0x4e889785 // sdot v5.4s, v28.16b, v8.16b\n" + ".word 0x4e8d97b3 // sdot v19.4s, v29.16b, v13.16b\n" + "sqrdmulh v5.4s, v5.4s, v3.4s\n" + "sqrdmulh v19.4s, v19.4s, v3.4s\n" + "sqrshl v5.4s, v5.4s, v4.4s\n" + "sqrshl v19.4s, v19.4s, v4.4s\n" + "sqxtn v5.4h, v5.4s\n" + "sqxtn2 v5.8h, v19.4s\n" + "sqadd v5.8h, v5.8h, v0.8h\n" + "sqxtun v5.8b, v5.8h\n" + "umax v5.8b, v5.8b, v1.8b\n" + "umin v5.8b, v5.8b, v2.8b\n" + "mov v6.16b, v14.16b\n" + "mov v12.16b, v14.16b\n" + "mov v9.16b, v17.16b\n" + "mov v8.16b, v18.16b\n" + "mov v11.16b, v22.16b\n" + "mov v10.16b, v7.16b\n" + "mov v13.16b, v16.16b\n" + "str d5, [x15, x6]\n" + DC_KERNEL_NO_MULT_STRIDE_29 ":\n" // in Loop: Header=BB227_30 Depth=2 + "add x24, x24, #32\n" // =32 + "sub x28, x28, #1\n" // =1 + "subs x27, x27, #1\n" // =1 + "add w14, w14, w20\n" + "b.eq " DC_KERNEL_NO_MULT_STRIDE_34 "f\n" + DC_KERNEL_NO_MULT_STRIDE_30 ":\n" // Parent Loop BB227_4 Depth=1 + // => This Inner Loop Header: Depth=2 + "mov v14.16b, v30.16b\n" + "mov v15.16b, v31.16b\n" + ".word 0x4e8c970e // sdot v14.4s, v24.16b, v12.16b\n" + "and x17, x24, #0xffffffe0\n" + ".word 0x4e8b972f // sdot v15.4s, v25.16b, v11.16b\n" + ".word 0x4e89974e // sdot v14.4s, v26.16b, v9.16b\n" + "add x17, x26, x17\n" + ".word 0x4e8a976f // sdot v15.4s, v27.16b, v10.16b\n" + ".word 0x4e88978e // sdot v14.4s, v28.16b, v8.16b\n" + "rev32 v21.8h, v8.8h\n" + "rev32 v6.8h, v11.8h\n" + "ldp q11, q22, [x17]\n" + ".word 0x4e8d97af // sdot v15.4s, v29.16b, v13.16b\n" + "sqrdmulh v8.4s, v14.4s, v3.4s\n" + "rev32 v20.8h, v9.8h\n" + "sqrdmulh v9.4s, v15.4s, v3.4s\n" + "sqrshl v8.4s, v8.4s, v4.4s\n" + "rev32 v5.8h, v13.8h\n" + "add %[output_block_data], x17, x12\n" + "add x17, x17, x5\n" + "sqrshl v9.4s, v9.4s, v4.4s\n" + "sqxtn v13.4h, v8.4s\n" + "rev32 v19.8h, v12.8h\n" + "ldp q17, q7, [%[output_block_data]]\n" + "ldp q18, q16, [x17]\n" + "sqxtn2 v13.8h, v9.4s\n" + "trn1 v12.8h, v19.8h, v11.8h\n" + "sqadd v19.8h, v13.8h, v0.8h\n" + "sqxtun v19.8b, v19.8h\n" + "rev32 v23.8h, v10.8h\n" + "umax v19.8b, v19.8b, v1.8b\n" + "add x15, %[bias_data], w14, sxtw\n" + "cmp w16, #1\n" // =1 + "mov v14.16b, v11.16b\n" + "trn1 v11.8h, v6.8h, v22.8h\n" + "trn1 v9.8h, v20.8h, v17.8h\n" + "trn1 v8.8h, v21.8h, v18.8h\n" + "trn1 v10.8h, v23.8h, v7.8h\n" + "umin v19.8b, v19.8b, v2.8b\n" + "trn1 v13.8h, v5.8h, v16.8h\n" + "str d19, [x15]\n" + "b.gt " DC_KERNEL_NO_MULT_STRIDE_28 "b\n" + // %bb.31: // in Loop: Header=BB227_30 Depth=2 + "cbnz x28, " DC_KERNEL_NO_MULT_STRIDE_28 "b\n" + // %bb.32: // in Loop: Header=BB227_30 Depth=2 + "mov v6.16b, v14.16b\n" + "b " DC_KERNEL_NO_MULT_STRIDE_29 "b\n" + DC_KERNEL_NO_MULT_STRIDE_33 ":\n" // in Loop: Header=BB227_4 Depth=1 + "ldp q13, q11, [sp, #96]\n" // 32-byte Folded Reload + "b " DC_KERNEL_NO_MULT_STRIDE_3 "b\n" + DC_KERNEL_NO_MULT_STRIDE_34 ":\n" // in Loop: Header=BB227_4 Depth=1 + "ldp q13, q11, [sp, #96]\n" // 32-byte Folded Reload + "ldp q15, q14, [sp, #32]\n" // 32-byte Folded Reload + "ldr q20, [sp, #16]\n" // 16-byte Folded Reload + "b " DC_KERNEL_NO_MULT_STRIDE_3 "b\n" + DC_KERNEL_NO_MULT_STRIDE_35 ":\n" + // Compiled intrinsics total stack 304, now 160 for spillage only. + "add sp, sp, #160\n" // =304 : // Outputs. 
[ scratch_block_data ] "+r"(scratch_block_data), @@ -8252,8 +8629,8 @@ struct KernelMacroBlockThis Loop Header: Depth=1 + // Child Loop BB205_18 Depth 2 + // Child Loop BB205_20 Depth 3 + // Child Loop BB205_21 Depth 4 + // Child Loop BB205_7 Depth 2 + // Child Loop BB205_9 Depth 3 + // Child Loop BB205_13 Depth 3 + "ldr x12, [sp, #32]\n" // 8-byte Folded Reload + "ldr x14, [sp, #56]\n" // 8-byte Folded Reload + "ldp q20, q7, [x12]\n" + "ldp q19, q16, [x12, #32]\n" + "ldp q18, q17, [x12, #64]\n" + "cmp w14, #4\n" // =4 + "add x12, x12, #96\n" // =96 + "str x12, [sp, #32]\n" // 8-byte Folded Spill + "str x13, [sp, #16]\n" // 8-byte Folded Spill + "b.ne " DC_KERNEL_MULT_15 "f\n" + // %bb.5: // in Loop: Header=BB205_4 Depth=1 + "mov %[filter_workspace], xzr\n" + "mov x5, x13\n" + "b " DC_KERNEL_MULT_7 "f\n" + DC_KERNEL_MULT_6 ":\n" // in Loop: Header=BB205_7 Depth=2 + "add %[filter_workspace], x1, #1\n" // =1 + "cmp %[filter_workspace], #2\n" // =2 + "add x5, x5, #4\n" // =4 + "mov v18.16b, v17.16b\n" + "mov v19.16b, v16.16b\n" + "mov v20.16b, v7.16b\n" + "b.eq " DC_KERNEL_MULT_3 "b\n" + DC_KERNEL_MULT_7 ":\n" // Parent Loop BB205_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB205_9 Depth 3 + // Child Loop BB205_13 Depth 3 + "ldr q21, [%[bias_data]], #16\n" "ldr w12, [%[scratch_block_data]]\n" - "add %[output_block_data], %[scratch_block_data], x11\n" - "ldr x7, [sp, #72]\n" // 8-byte Folded Reload - "ldr w6, [%[scratch_block_data], x8]\n" - "fmov s21, w12\n" - "mov v21.s[1], w12\n" - "ld1 { v21.s }[2], [%[output_block_data]]\n" - "ldr %[output_block_data], [sp, #120]\n" // 8-byte Folded Reload - "ldr w7, [%[scratch_block_data], x7]\n" - "fmov s23, w6\n" - "mov v23.s[1], w6\n" - "ldr q22, [%[output_block_data]]\n" - "ldr %[output_block_data], [sp, #48]\n" // 8-byte Folded Reload - "mov v23.s[2], w7\n" - "dup v8.4s, w7\n" - "dup v31.4s, w6\n" - "ldr w3, [%[scratch_block_data], %[output_block_data]]\n" - "mov v23.s[3], w6\n" - "ldp x7, x6, [sp, #56]\n" // 16-byte Folded Reload - "mov v28.16b, v22.16b\n" - "fmov s24, w3\n" - "mov v24.s[1], w3\n" - "ld1 { v24.s }[2], [x6]\n" - "ldr x6, [sp, #96]\n" // 8-byte Folded Reload - "mov v29.16b, v22.16b\n" - "mov v30.16b, v22.16b\n" - ".word 0x4e9f969c // sdot v28.4s, v20.16b, v31.16b\n" - ".word 0x4e9f967d // sdot v29.4s, v19.16b, v31.16b\n" - ".word 0x4e9f965e // sdot v30.4s, v18.16b, v31.16b\n" - "mov v31.16b, v22.16b\n" - "mov x13, xzr\n" - "shl v25.4s, v18.4s, #8\n" + "ldp %[function_params], x13, [sp, #248]\n" // 16-byte Folded Reload + "ldr x16, [sp, #240]\n" // 8-byte Folded Reload + "ldr x14, [sp, #280]\n" // 8-byte Folded Reload + "fmov s22, w12\n" + "add x13, %[scratch_block_data], x13\n" + "ldr w16, [%[scratch_block_data], x16]\n" + "mov v22.s[1], w12\n" + "ld1 { v22.s }[2], [x13]\n" + "ldr x13, [sp, #232]\n" // 8-byte Folded Reload + "ldr w14, [%[scratch_block_data], x14]\n" + "fmov s23, w16\n" + "ldr w4, [%[scratch_block_data], %[function_params]]\n" + "add x13, %[scratch_block_data], x13\n" + "mov v23.s[1], w16\n" + "ld1 { v23.s }[2], [x13]\n" + "fmov s24, w14\n" + "mov v24.s[1], w14\n" + "dup v25.4s, w14\n" + "mov v28.16b, v21.16b\n" + "mov v29.16b, v21.16b\n" + "mov v30.16b, v21.16b\n" + "dup v26.4s, w4\n" + "mov v31.16b, v21.16b\n" + "mov v24.s[2], w4\n" + "cmp w25, #1\n" // =1 + ".word 0x4e99965c // sdot v28.4s, v18.16b, v25.16b\n" + ".word 0x4e99967d // sdot v29.4s, v19.16b, v25.16b\n" + ".word 0x4e99969e // sdot v30.4s, v20.16b, v25.16b\n" + "mov v24.s[3], w14\n" + "mov v22.s[3], w12\n" + "mov v23.s[3], w16\n" + ".word 
0x4e9a969f // sdot v31.4s, v20.16b, v26.16b\n" + "b.lt " DC_KERNEL_MULT_14 "f\n" + // %bb.8: // in Loop: Header=BB205_7 Depth=2 + "stp %[filter_workspace], %[bias_data], [sp, #216]\n" // 16-byte Folded Spill + "mov w13, w25\n" + "str x5, [sp, #208]\n" // 8-byte Folded Spill + "mov x16, x5\n" + "mov x14, %[scratch_block_data]\n" + "ldp x25, %[scratch_block_data], [sp, #168]\n" // 16-byte Folded Reload + "mov x15, x10\n" + "mov x9, x8\n" + "mov x8, x24\n" + "mov x24, x28\n" + "mov x28, x27\n" + "ldp %[filter_workspace], x27, [sp, #144]\n" // 16-byte Folded Reload + "ldr x5, [sp, #136]\n" // 8-byte Folded Reload + "ldr %[bias_data], [sp, #104]\n" // 8-byte Folded Reload + "ldp x10, x11, [sp, #64]\n" // 16-byte Folded Reload + "shl v25.4s, v20.4s, #8\n" "shl v26.4s, v19.4s, #8\n" - "shl v27.4s, v20.4s, #8\n" - "mov v21.s[3], w12\n" - "mov v24.s[3], w3\n" - ".word 0x4e88965f // sdot v31.4s, v18.16b, v8.16b\n" - "mov x12, x19\n" - "b " DC_KERNEL_MULT_5 "f\n" - DC_KERNEL_MULT_4 ":\n" // in Loop: Header=BB205_5 Depth=3 - ".word 0x4f95e25c // sdot v28.4s, v18.16b, v21.4b[0]\n" - ".word 0x4f95ea5d // sdot v29.4s, v18.16b, v21.4b[2]\n" - ".word 0x4f97ea7e // sdot v30.4s, v19.16b, v23.4b[2]\n" - ".word 0x4f95ea7c // sdot v28.4s, v19.16b, v21.4b[2]\n" - ".word 0x4f98e27f // sdot v31.4s, v19.16b, v24.4b[0]\n" - ".word 0x4f97ea9d // sdot v29.4s, v20.16b, v23.4b[2]\n" - ".word 0x4f98e29e // sdot v30.4s, v20.16b, v24.4b[0]\n" + "shl v27.4s, v18.4s, #8\n" + DC_KERNEL_MULT_9 ":\n" // Parent Loop BB205_4 Depth=1 + // Parent Loop BB205_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ".word 0x4f96e29c // sdot v28.4s, v20.16b, v22.4b[0]\n" + ".word 0x4f96ea9d // sdot v29.4s, v20.16b, v22.4b[2]\n" + ".word 0x4f98ea7e // sdot v30.4s, v19.16b, v24.4b[2]\n" + ".word 0x4f96ea7c // sdot v28.4s, v19.16b, v22.4b[2]\n" + ".word 0x4f97e27f // sdot v31.4s, v19.16b, v23.4b[0]\n" + ".word 0x4f98ea5d // sdot v29.4s, v18.16b, v24.4b[2]\n" + ".word 0x4f97e25e // sdot v30.4s, v18.16b, v23.4b[0]\n" "sqrdmulh v28.4s, v28.4s, v1.4s\n" - ".word 0x4f98ea9f // sdot v31.4s, v20.16b, v24.4b[2]\n" + ".word 0x4f97ea5f // sdot v31.4s, v18.16b, v23.4b[2]\n" "sqrdmulh v29.4s, v29.4s, v1.4s\n" "sqrdmulh v30.4s, v30.4s, v1.4s\n" "sqrshl v28.4s, v28.4s, v2.4s\n" @@ -8492,52 +8958,51 @@ struct KernelMacroBlock This Inner Loop Header: Depth=3 - "cmp w13, w9\n" - "b.lt " DC_KERNEL_MULT_4 "b\n" - // %bb.6: // in Loop: Header=BB205_11 Depth=2 - "ldr %[output_block_data], [sp, #120]\n" // 8-byte Folded Reload - "cmp w13, w25\n" - "str x19, [sp, #104]\n" // 8-byte Folded Spill - "add %[output_block_data], x3, #16\n" // =16 - "str %[output_block_data], [sp, #120]\n" // 8-byte Folded Spill - "b.ge " DC_KERNEL_MULT_10 "f\n" - // %bb.7: // in Loop: Header=BB205_11 Depth=2 - "add x7, %[scratch_block_data], x13, lsl #2\n" - "add x19, x23, x13, lsl #2\n" - "ld1 { v23.s }[1], [x19]\n" - "add x19, x22, x13, lsl #2\n" - "add x7, x7, #4\n" // =4 - "ld1 { v24.s }[1], [x19]\n" - "ld1 { v21.s }[1], [x7]\n" - "add x19, x24, x13, lsl #2\n" - "add x7, x14, x13, lsl #2\n" - "add x13, x16, x13, lsl #2\n" - "ldr x20, [sp, #96]\n" // 8-byte Folded Reload - "ld1 { v23.s }[3], [x7]\n" - "ld1 { v24.s }[3], [x13]\n" - "ld1 { v21.s }[3], [x19]\n" - "mov %[output_block_data], xzr\n" - "mov w6, wzr\n" - "add x13, x21, x12\n" - "add x7, %[function_params], x12\n" - "add x19, x5, x12\n" - "add x12, x20, x12\n" - "b " DC_KERNEL_MULT_9 "f\n" - DC_KERNEL_MULT_8 ":\n" // in Loop: Header=BB205_9 Depth=3 - ".word 0x4f95e25c // sdot v28.4s, v18.16b, v21.4b[0]\n" - ".word 0x4f95ea5d 
// sdot v29.4s, v18.16b, v21.4b[2]\n" - ".word 0x4f97ea7e // sdot v30.4s, v19.16b, v23.4b[2]\n" - ".word 0x4f95ea7c // sdot v28.4s, v19.16b, v21.4b[2]\n" - ".word 0x4f98e27f // sdot v31.4s, v19.16b, v24.4b[0]\n" - ".word 0x4f97ea9d // sdot v29.4s, v20.16b, v23.4b[2]\n" + "add %[function_params], x15, x16\n" + "add x12, x12, x16\n" + "subs w13, w13, #1\n" // =1 + "ushr v22.2d, v22.2d, #32\n" + "ushr v23.2d, v23.2d, #32\n" + ".word 0x4f98e25c // sdot v28.4s, v18.16b, v24.4b[0]\n" + ".word 0x4f98e27d // sdot v29.4s, v19.16b, v24.4b[0]\n" ".word 0x4f98e29e // sdot v30.4s, v20.16b, v24.4b[0]\n" - "sqrdmulh v25.4s, v28.4s, v1.4s\n" ".word 0x4f98ea9f // sdot v31.4s, v20.16b, v24.4b[2]\n" + "add x16, x16, x6\n" + "st1 { v8.s }[2], [%[function_params]]\n" + "st1 { v8.s }[3], [x12]\n" + "b.ne " DC_KERNEL_MULT_9 "b\n" + // %bb.10: // in Loop: Header=BB205_7 Depth=2 + "ldr w25, [sp, #164]\n" // 4-byte Folded Reload + "ldp x21, %[scratch_block_data], [sp, #192]\n" // 16-byte Folded Reload + "ldr %[function_params], [sp, #184]\n" // 8-byte Folded Reload + "ldp %[filter_workspace], %[bias_data], [sp, #216]\n" // 16-byte Folded Reload + "ldr x5, [sp, #208]\n" // 8-byte Folded Reload + "add x13, %[output_block_data], x16\n" + "mov w12, w25\n" + "mov x27, x28\n" + "mov x28, x24\n" + "mov x24, x8\n" + "mov x8, x9\n" + "mov x10, x15\n" + "mov w15, #4\n" + "ldr w16, [sp, #276]\n" // 4-byte Folded Reload + "cmp w12, w16\n" + "b.ge " DC_KERNEL_MULT_6 "b\n" + DC_KERNEL_MULT_11 ":\n" // in Loop: Header=BB205_7 Depth=2 + "ldr w12, [sp, #272]\n" // 4-byte Folded Reload + "cmp w12, #1\n" // =1 + "b.lt " DC_KERNEL_MULT_6 "b\n" + // %bb.12: // in Loop: Header=BB205_7 Depth=2 + "add x12, x14, #4\n" // =4 + "ldr x14, [sp, #240]\n" // 8-byte Folded Reload + "ldr x16, [sp, #280]\n" // 8-byte Folded Reload + "add x14, x12, x14\n" + "ld1 { v23.s }[1], [x14]\n" + "ldr x14, [sp, #232]\n" // 8-byte Folded Reload + "add x16, x12, x16\n" + "ld1 { v24.s }[1], [x16]\n" + "add x14, x12, x14\n" + "ld1 { v23.s }[3], [x14]\n" + "ldp x16, x14, [sp, #248]\n" // 16-byte Folded Reload + "add x16, x12, x16\n" + "ld1 { v24.s }[3], [x16]\n" + "ldr x16, [sp, #40]\n" // 8-byte Folded Reload + "ld1 { v22.s }[1], [x12], x14\n" + "ld1 { v22.s }[3], [x12]\n" + "ldr w12, [sp, #272]\n" // 4-byte Folded Reload + DC_KERNEL_MULT_13 ":\n" // Parent Loop BB205_4 Depth=1 + // Parent Loop BB205_7 Depth=2 + // => This Inner Loop Header: Depth=3 + ".word 0x4f96e29c // sdot v28.4s, v20.16b, v22.4b[0]\n" + ".word 0x4f96ea9d // sdot v29.4s, v20.16b, v22.4b[2]\n" + ".word 0x4f98ea7e // sdot v30.4s, v19.16b, v24.4b[2]\n" + ".word 0x4f96ea7c // sdot v28.4s, v19.16b, v22.4b[2]\n" + ".word 0x4f97e27f // sdot v31.4s, v19.16b, v23.4b[0]\n" + ".word 0x4f98ea5d // sdot v29.4s, v18.16b, v24.4b[2]\n" + ".word 0x4f97e25e // sdot v30.4s, v18.16b, v23.4b[0]\n" + "sqrdmulh v25.4s, v28.4s, v1.4s\n" + ".word 0x4f97ea5f // sdot v31.4s, v18.16b, v23.4b[2]\n" "sqrdmulh v26.4s, v29.4s, v1.4s\n" "sqrdmulh v27.4s, v30.4s, v1.4s\n" "sqrshl v25.4s, v25.4s, v2.4s\n" @@ -8717,85 +9187,107 @@ struct KernelMacroBlock This Inner Loop Header: Depth=3 - "cmp w6, w17\n" - "b.lt " DC_KERNEL_MULT_8 "b\n" - DC_KERNEL_MULT_10 ":\n" // in Loop: Header=BB205_11 Depth=2 - "ldp x19, x12, [sp, #104]\n" // 16-byte Folded Reload - "mov v20.16b, v17.16b\n" - "mov v19.16b, v16.16b\n" - "mov v18.16b, v7.16b\n" - "add x12, x12, #1\n" // =1 - "add x19, x19, #4\n" // =4 - DC_KERNEL_MULT_11 ":\n" // Parent Loop BB205_22 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB205_5 Depth 3 - // Child Loop 
BB205_9 Depth 3 - "cmp x12, #2\n" // =2 - "b.ne " DC_KERNEL_MULT_3 "b\n" - "b " DC_KERNEL_MULT_21 "f\n" - DC_KERNEL_MULT_12 ":\n" // in Loop: Header=BB205_22 Depth=1 - "ldr x13, [sp, #40]\n" // 8-byte Folded Reload - "ldp q21, q22, [x13]\n" - "ldr x13, [sp, #24]\n" // 8-byte Folded Reload - "str x13, [sp, #120]\n" // 8-byte Folded Spill - "b " DC_KERNEL_MULT_20 "f\n" - DC_KERNEL_MULT_13 ":\n" // in Loop: Header=BB205_20 Depth=2 - "madd x6, x12, x11, %[scratch_block_data]\n" - "ldr w13, [x6]\n" - "add x7, x6, x11\n" - "mov w3, wzr\n" - "fmov s23, w13\n" - "mov v23.s[1], w13\n" - "ld1 { v23.s }[2], [x7]\n" - "add x7, x6, x8\n" - "ld1r { v24.4s }, [x7]\n" - "ldr x7, [sp, #120]\n" // 8-byte Folded Reload - "mov v23.s[3], w13\n" + "mov v28.16b, v21.16b\n" + "mov v29.16b, v21.16b\n" + "mov v30.16b, v21.16b\n" + "mov v31.16b, v21.16b\n" + "st1 { v25.s }[2], [x14]\n" + "add x14, x13, x16\n" + "subs w12, w12, #1\n" // =1 + "ushr v22.2d, v22.2d, #8\n" + "ushr v23.2d, v23.2d, #8\n" + ".word 0x4f98e25c // sdot v28.4s, v18.16b, v24.4b[0]\n" + ".word 0x4f98e27d // sdot v29.4s, v19.16b, v24.4b[0]\n" + ".word 0x4f98e29e // sdot v30.4s, v20.16b, v24.4b[0]\n" + "add x13, x13, x17\n" + ".word 0x4f98ea9f // sdot v31.4s, v20.16b, v24.4b[2]\n" + "st1 { v25.s }[3], [x14]\n" + "b.ne " DC_KERNEL_MULT_13 "b\n" + "b " DC_KERNEL_MULT_6 "b\n" + DC_KERNEL_MULT_14 ":\n" // in Loop: Header=BB205_7 Depth=2 + "ldr x11, [sp, #48]\n" // 8-byte Folded Reload + "ldr %[function_params], [sp, #184]\n" // 8-byte Folded Reload + "mov w12, wzr\n" + "mov x14, %[scratch_block_data]\n" + "add x13, x11, %[filter_workspace], lsl #2\n" + "ldr w16, [sp, #276]\n" // 4-byte Folded Reload + "cmp w12, w16\n" + "b.ge " DC_KERNEL_MULT_6 "b\n" + "b " DC_KERNEL_MULT_11 "b\n" + DC_KERNEL_MULT_15 ":\n" // in Loop: Header=BB205_4 Depth=1 + "ldr w14, [sp, #8]\n" // 4-byte Folded Reload + "add x11, %[bias_data], #32\n" // =32 + "tbnz w14, #0, " DC_KERNEL_MULT_2 "b\n" + // %bb.16: // in Loop: Header=BB205_4 Depth=1 + "ldp q21, q22, [%[bias_data]]\n" + "ldr %[filter_workspace], [sp, #48]\n" // 8-byte Folded Reload + "mov x14, xzr\n" "b " DC_KERNEL_MULT_18 "f\n" - DC_KERNEL_MULT_14 ":\n" // in Loop: Header=BB205_18 Depth=3 - "add x6, x6, #4\n" // =4 - "mov x13, x6\n" - "ld1 { v23.s }[1], [x13], x8\n" - "add x20, x6, x11\n" - "cmp w3, w26\n" - "mov w19, wzr\n" - "ld1 { v23.s }[3], [x20]\n" - "ld1 { v24.s }[1], [x13]\n" - "orr w13, wzr, #0x4\n" - "csel w13, w17, w13, eq\n" - "b " DC_KERNEL_MULT_16 "f\n" - DC_KERNEL_MULT_15 ":\n" // in Loop: Header=BB205_16 Depth=4 + DC_KERNEL_MULT_17 ":\n" // in Loop: Header=BB205_18 Depth=2 + "ldr x12, [sp, #56]\n" // 8-byte Folded Reload + "ldp x21, %[scratch_block_data], [sp, #192]\n" // 16-byte Folded Reload + "add x14, x14, #1\n" // =1 + "cmp x14, x12\n" + "add %[filter_workspace], x1, x21\n" + "b.eq " DC_KERNEL_MULT_2 "b\n" + DC_KERNEL_MULT_18 ":\n" // Parent Loop BB205_4 Depth=1 + // => This Loop Header: Depth=2 + // Child Loop BB205_20 Depth 3 + // Child Loop BB205_21 Depth 4 + "ldr x16, [sp, #256]\n" // 8-byte Folded Reload + "mov w13, wzr\n" + "madd x12, x14, x16, %[scratch_block_data]\n" + "mov %[scratch_block_data], x16\n" + "ldr w16, [x12]\n" + "add %[function_params], x12, %[scratch_block_data]\n" + "fmov s23, w16\n" + "mov v23.s[1], w16\n" + "ld1 { v23.s }[2], [%[function_params]]\n" + "ldr %[function_params], [sp, #280]\n" // 8-byte Folded Reload + "mov v23.s[3], w16\n" + "add %[function_params], x12, %[function_params]\n" + "ld1r { v24.4s }, [%[function_params]]\n" + "mov x16, %[filter_workspace]\n" + "b " 
DC_KERNEL_MULT_20 "f\n" + DC_KERNEL_MULT_19 ":\n" // in Loop: Header=BB205_20 Depth=3 + "ldr w4, [sp, #276]\n" // 4-byte Folded Reload + "add w13, w13, #1\n" // =1 + "cmp w13, w4\n" + "b.eq " DC_KERNEL_MULT_17 "b\n" + DC_KERNEL_MULT_20 ":\n" // Parent Loop BB205_4 Depth=1 + // Parent Loop BB205_18 Depth=2 + // => This Loop Header: Depth=3 + // Child Loop BB205_21 Depth 4 + "ldr x21, [sp, #280]\n" // 8-byte Folded Reload + "add x12, x12, #4\n" // =4 + "mov %[function_params], x12\n" + "ld1 { v23.s }[1], [%[function_params]], x21\n" + "ldr w21, [sp, #268]\n" // 4-byte Folded Reload + "ld1 { v24.s }[1], [%[function_params]]\n" + "ldr w4, [sp, #272]\n" // 4-byte Folded Reload + "cmp w13, w21\n" + "add x21, x12, %[scratch_block_data]\n" + "ld1 { v23.s }[3], [x21]\n" + "csel w4, w4, w15, eq\n" + "cmp w4, #1\n" // =1 + "b.lt " DC_KERNEL_MULT_19 "b\n" + DC_KERNEL_MULT_21 ":\n" // Parent Loop BB205_4 Depth=1 + // Parent Loop BB205_18 Depth=2 + // Parent Loop BB205_20 Depth=3 + // => This Inner Loop Header: Depth=4 "mov v25.16b, v21.16b\n" "mov v26.16b, v22.16b\n" - ".word 0x4f97e259 // sdot v25.4s, v18.16b, v23.4b[0]\n" + ".word 0x4f97e299 // sdot v25.4s, v20.16b, v23.4b[0]\n" ".word 0x4f97e0fa // sdot v26.4s, v7.16b, v23.4b[0]\n" ".word 0x4f97ea79 // sdot v25.4s, v19.16b, v23.4b[2]\n" ".word 0x4f97ea1a // sdot v26.4s, v16.16b, v23.4b[2]\n" - ".word 0x4f98e299 // sdot v25.4s, v20.16b, v24.4b[0]\n" + ".word 0x4f98e259 // sdot v25.4s, v18.16b, v24.4b[0]\n" ".word 0x4f98e23a // sdot v26.4s, v17.16b, v24.4b[0]\n" "sqrdmulh v25.4s, v25.4s, v1.4s\n" "sqrdmulh v26.4s, v26.4s, v1.4s\n" @@ -8807,63 +9299,16 @@ struct KernelMacroBlock This Inner Loop Header: Depth=4 - "cmp w19, w13\n" - "b.lt " DC_KERNEL_MULT_15 "b\n" - // %bb.17: // in Loop: Header=BB205_18 Depth=3 - "add w3, w3, #1\n" // =1 - DC_KERNEL_MULT_18 ":\n" // Parent Loop BB205_22 Depth=1 - // Parent Loop BB205_20 Depth=2 - // => This Loop Header: Depth=3 - // Child Loop BB205_16 Depth 4 - "cmp w3, w25\n" - "b.lt " DC_KERNEL_MULT_14 "b\n" - // %bb.19: // in Loop: Header=BB205_20 Depth=2 - "ldr x13, [sp, #80]\n" // 8-byte Folded Reload - "ldr %[output_block_data], [sp, #120]\n" // 8-byte Folded Reload - "add x12, x12, #1\n" // =1 - "add %[output_block_data], x3, x13\n" - "str %[output_block_data], [sp, #120]\n" // 8-byte Folded Spill - DC_KERNEL_MULT_20 ":\n" // Parent Loop BB205_22 Depth=1 - // => This Loop Header: Depth=2 - // Child Loop BB205_18 Depth 3 - // Child Loop BB205_16 Depth 4 - "ldr x13, [sp, #88]\n" // 8-byte Folded Reload - "cmp x12, x13\n" - "b.lt " DC_KERNEL_MULT_13 "b\n" - DC_KERNEL_MULT_21 ":\n" // in Loop: Header=BB205_22 Depth=1 - "ldr x12, [sp, #40]\n" // 8-byte Folded Reload - "ldr w7, [sp, #12]\n" // 4-byte Folded Reload - "add x12, x12, #32\n" // =32 - "str x12, [sp, #40]\n" // 8-byte Folded Spill - "ldr x12, [sp, #24]\n" // 8-byte Folded Reload - "add w7, w7, #1\n" // =1 - "add x12, x12, #8\n" // =8 - "str x12, [sp, #24]\n" // 8-byte Folded Spill - "ldr x12, [sp, #32]\n" // 8-byte Folded Reload - "add x12, x12, #8\n" // =8 - "str x12, [sp, #32]\n" // 8-byte Folded Spill - DC_KERNEL_MULT_22 ":\n" // =>This Loop Header: Depth=1 - // Child Loop BB205_20 Depth 2 - // Child Loop BB205_18 Depth 3 - // Child Loop BB205_16 Depth 4 - // Child Loop BB205_11 Depth 2 - // Child Loop BB205_5 Depth 3 - // Child Loop BB205_9 Depth 3 - "ldr w12, [sp, #8]\n" // 4-byte Folded Reload - "cmp w7, w12\n" - "b.lt " DC_KERNEL_MULT_1 "b\n" - // %bb.23: - // Compiled intrinsics total stack 266, now 176 for spillage only. 
- "add sp, sp, #176\n" // =288 + "str d25, [x16]\n" + "add x16, x16, x17\n" + "b.ne " DC_KERNEL_MULT_21 "b\n" + "b " DC_KERNEL_MULT_19 "b\n" + DC_KERNEL_MULT_22 ":\n" + // Compiled intrinsics total stack 400, now 304 for spillage only. + "add sp, sp, #304\n" // =400 : // Outputs. [ scratch_block_data ] "+r"(scratch_block_data), @@ -8884,7 +9329,7 @@ struct KernelMacroBlock Date: Thu, 5 Dec 2019 08:09:32 -0800 Subject: [PATCH 157/383] Misc cleanup. Moved non-static methods which use no member data to static. PiperOrigin-RevId: 283975981 Change-Id: I026919677fc29b93d3039bfad199f4f599b253bd --- .../grappler/costs/op_level_cost_estimator.cc | 38 ++--- .../grappler/costs/op_level_cost_estimator.h | 143 +++++++++--------- 2 files changed, 91 insertions(+), 90 deletions(-) diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc index 751bf952213..f7df31a07bf 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc @@ -659,8 +659,8 @@ Costs OpLevelCostEstimator::PredictOpCountBasedCost( return costs; } -int64 OpLevelCostEstimator::CountConv2DOperations( - const OpInfo& op_info, bool* found_unknown_shapes) const { +int64 OpLevelCostEstimator::CountConv2DOperations(const OpInfo& op_info, + bool* found_unknown_shapes) { return CountConv2DOperations(op_info, nullptr, found_unknown_shapes); } @@ -747,7 +747,7 @@ OpLevelCostEstimator::ConvolutionDimensionsFromInputs( int64 OpLevelCostEstimator::CountConv2DOperations( const OpInfo& op_info, ConvolutionDimensions* conv_info, - bool* found_unknown_shapes) const { + bool* found_unknown_shapes) { DCHECK(op_info.op() == kConv2d || op_info.op() == kDepthwiseConv2dNative) << "Invalid Operation: not Conv2D nor DepthwiseConv2dNative"; @@ -779,15 +779,15 @@ int64 OpLevelCostEstimator::CountConv2DOperations( return ops; } -int64 OpLevelCostEstimator::CountMatMulOperations( - const OpInfo& op_info, bool* found_unknown_shapes) const { +int64 OpLevelCostEstimator::CountMatMulOperations(const OpInfo& op_info, + bool* found_unknown_shapes) { return CountMatMulOperations(op_info, nullptr, found_unknown_shapes); } // TODO(nishantpatil): Create separate estimator for Sparse Matmul -int64 OpLevelCostEstimator::CountMatMulOperations( - const OpInfo& op_info, MatMulDimensions* mat_mul, - bool* found_unknown_shapes) const { +int64 OpLevelCostEstimator::CountMatMulOperations(const OpInfo& op_info, + MatMulDimensions* mat_mul, + bool* found_unknown_shapes) { double ops = 0; if (op_info.inputs_size() < 2) { @@ -857,13 +857,13 @@ int64 OpLevelCostEstimator::CountMatMulOperations( } int64 OpLevelCostEstimator::CountBatchMatMulOperations( - const OpInfo& op_info, bool* found_unknown_shapes) const { + const OpInfo& op_info, bool* found_unknown_shapes) { return CountBatchMatMulOperations(op_info, nullptr, found_unknown_shapes); } int64 OpLevelCostEstimator::CountBatchMatMulOperations( const OpInfo& op_info, BatchMatMulDimensions* batch_mat_mul, - bool* found_unknown_shapes) const { + bool* found_unknown_shapes) { if (op_info.op() != kBatchMatMul) { LOG(ERROR) << "Invalid Operation: " << op_info.op(); // TODO(pcma): Try to separate invalid inputs from unknown shapes @@ -1037,7 +1037,7 @@ bool GetTensorShapeProtoFromTensorProto(const TensorProto& tensor_proto, // TODO(cliffy): Dedup this method and CountConv2DBackpropFilterOperations. 
int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations( const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, - bool* found_unknown_shapes) const { + bool* found_unknown_shapes) { int64 ops = 0; DCHECK(op_info.op() == kConv2dBackpropInput || @@ -1095,7 +1095,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations( int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations( const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, - bool* found_unknown_shapes) const { + bool* found_unknown_shapes) { int64 ops = 0; DCHECK(op_info.op() == kConv2dBackpropFilter || @@ -1150,7 +1150,7 @@ int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations( } int64 OpLevelCostEstimator::CalculateTensorElementCount( - const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) const { + const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) { VLOG(2) << " with " << DataTypeString(tensor.dtype()) << " tensor of shape " << tensor.shape().DebugString(); int64 tensor_size = 1; @@ -1164,15 +1164,15 @@ int64 OpLevelCostEstimator::CalculateTensorElementCount( } int64 OpLevelCostEstimator::CalculateTensorSize( - const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) const { + const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes) { int64 count = CalculateTensorElementCount(tensor, found_unknown_shapes); int size = DataTypeSize(BaseType(tensor.dtype())); VLOG(2) << "Count: " << count << " DataTypeSize: " << size; return count * size; } -int64 OpLevelCostEstimator::CalculateInputSize( - const OpInfo& op_info, bool* found_unknown_shapes) const { +int64 OpLevelCostEstimator::CalculateInputSize(const OpInfo& op_info, + bool* found_unknown_shapes) { int64 total_input_size = 0; for (auto& input : op_info.inputs()) { int64 input_size = CalculateTensorSize(input, found_unknown_shapes); @@ -1184,7 +1184,7 @@ int64 OpLevelCostEstimator::CalculateInputSize( } int64 OpLevelCostEstimator::CalculateLargestInputCount( - const OpInfo& op_info, bool* found_unknown_shapes) const { + const OpInfo& op_info, bool* found_unknown_shapes) { int64 largest_input_count = 0; for (auto& input : op_info.inputs()) { int64 input_count = @@ -1198,8 +1198,8 @@ int64 OpLevelCostEstimator::CalculateLargestInputCount( return largest_input_count; } -int64 OpLevelCostEstimator::CalculateOutputSize( - const OpInfo& op_info, bool* found_unknown_shapes) const { +int64 OpLevelCostEstimator::CalculateOutputSize(const OpInfo& op_info, + bool* found_unknown_shapes) { int64 total_output_size = 0; // use float as default for calculations for (const auto& output : op_info.outputs()) { diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.h b/tensorflow/core/grappler/costs/op_level_cost_estimator.h index 3956770ac84..9183c543f11 100644 --- a/tensorflow/core/grappler/costs/op_level_cost_estimator.h +++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.h @@ -54,77 +54,6 @@ class OpLevelCostEstimator { double output_io_bytes, const OpInfo& op_info) const; - // This family of routines counts the number of operations to perform the - // specified TensorFlow Op. - struct MatMulDimensions { - int m; - int n; - int k; - }; - struct BatchMatMulDimensions { - std::vector batch_dims; - MatMulDimensions matmul_dims; - }; - struct ConvolutionDimensions { - int64 batch; // Batch size. - int64 ix; // Input size x. - int64 iy; // Input size y. - int64 iz; // Input depth. - int64 kx; // Kernel x. - int64 ky; // Kernel y. 
- int64 kz; // Kernel depth (in case of group convolution, this will be - // smaller than input depth). - int64 oz; // Output depth. - int64 ox; // Output size x. - int64 oy; // Output size y. - int64 sx; // Stride x. - int64 sy; // Stride y. - Padding padding; // SAME or VALID. - }; - int64 CountConv2DOperations(const OpInfo& op_info, - bool* found_unknown_shapes) const; - int64 CountConv2DOperations(const OpInfo& op_info, - ConvolutionDimensions* conv_info, - bool* found_unknown_shapes) const; - int64 CountMatMulOperations(const OpInfo& op_info, - bool* found_unknown_shapes) const; - int64 CountMatMulOperations(const OpInfo& op_info, MatMulDimensions* mat_mul, - bool* found_unknown_shapes) const; - int64 CountBatchMatMulOperations(const OpInfo& op_info, - bool* found_unknown_shapes) const; - int64 CountBatchMatMulOperations(const OpInfo& op_info, - BatchMatMulDimensions* batch_mat_mul, - bool* found_unknown_shapes) const; - int64 CountConv2DBackpropInputOperations( - const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, - bool* found_unknown_shapes) const; - int64 CountConv2DBackpropFilterOperations( - const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, - bool* found_unknown_shapes) const; - - // Calculate the element count of an input/output tensor. - int64 CalculateTensorElementCount(const OpInfo::TensorProperties& tensor, - bool* found_unknown_shapes) const; - - // Calculate the total size in bytes of an input/output tensor. - int64 CalculateTensorSize(const OpInfo::TensorProperties& tensor, - bool* found_unknown_shapes) const; - - // Calculate the element count of the largest - // input of specified TensorFlow op. - int64 CalculateLargestInputCount(const OpInfo& op_info, - bool* found_unknown_shapes) const; - - // Calculate the total size in bytes of the all - // the inputs of specified TensorFlow op. - int64 CalculateInputSize(const OpInfo& op_info, - bool* found_unknown_shapes) const; - - // Calculate the total size in bytes of the all - // the outputs of specified TensorFlow op. - int64 CalculateOutputSize(const OpInfo& op_info, - bool* found_unknown_shapes) const; - // This family of routines predicts the costs to // perform the specified TensorFlow Op on the // device represented by a subclass. The default @@ -171,6 +100,78 @@ class OpLevelCostEstimator { } } + // This family of routines counts the number of operations to perform the + // specified TensorFlow Op. + struct MatMulDimensions { + int m; + int n; + int k; + }; + struct BatchMatMulDimensions { + std::vector batch_dims; + MatMulDimensions matmul_dims; + }; + struct ConvolutionDimensions { + int64 batch; // Batch size. + int64 ix; // Input size x. + int64 iy; // Input size y. + int64 iz; // Input depth. + int64 kx; // Kernel x. + int64 ky; // Kernel y. + int64 kz; // Kernel depth (in case of group convolution, this will be + // smaller than input depth). + int64 oz; // Output depth. + int64 ox; // Output size x. + int64 oy; // Output size y. + int64 sx; // Stride x. + int64 sy; // Stride y. + Padding padding; // SAME or VALID. 
+ }; + static int64 CountConv2DOperations(const OpInfo& op_info, + bool* found_unknown_shapes); + static int64 CountConv2DOperations(const OpInfo& op_info, + ConvolutionDimensions* conv_info, + bool* found_unknown_shapes); + static int64 CountMatMulOperations(const OpInfo& op_info, + bool* found_unknown_shapes); + static int64 CountMatMulOperations(const OpInfo& op_info, + MatMulDimensions* mat_mul, + bool* found_unknown_shapes); + static int64 CountBatchMatMulOperations(const OpInfo& op_info, + bool* found_unknown_shapes); + static int64 CountBatchMatMulOperations(const OpInfo& op_info, + BatchMatMulDimensions* batch_mat_mul, + bool* found_unknown_shapes); + static int64 CountConv2DBackpropInputOperations( + const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, + bool* found_unknown_shapes); + static int64 CountConv2DBackpropFilterOperations( + const OpInfo& op_info, ConvolutionDimensions* returned_conv_dims, + bool* found_unknown_shapes); + + // Calculate the element count of an input/output tensor. + static int64 CalculateTensorElementCount( + const OpInfo::TensorProperties& tensor, bool* found_unknown_shapes); + + // Calculate the total size in bytes of an input/output tensor. + static int64 CalculateTensorSize(const OpInfo::TensorProperties& tensor, + bool* found_unknown_shapes); + + // Calculate the element count of the largest + // input of specified TensorFlow op. + static int64 CalculateLargestInputCount(const OpInfo& op_info, + bool* found_unknown_shapes); + + // Calculate the total size in bytes of the all + // the inputs of specified TensorFlow op. + static int64 CalculateInputSize(const OpInfo& op_info, + bool* found_unknown_shapes); + + // Calculate the total size in bytes of the all + // the outputs of specified TensorFlow op. + static int64 CalculateOutputSize(const OpInfo& op_info, + bool* found_unknown_shapes); + // For convolution and its grad ops. static ConvolutionDimensions ConvolutionDimensionsFromInputs( const TensorShapeProto& original_image_shape, From 53b1ed660a82958d8218ba7165417072cd7ac549 Mon Sep 17 00:00:00 2001 From: Yash Katariya Date: Thu, 5 Dec 2019 08:19:56 -0800 Subject: [PATCH 158/383] Make recurrent_v2 testable. PiperOrigin-RevId: 283977811 Change-Id: I450fb94adf8b79b732bea00db840d951ef8c9daa --- .../python/keras/layers/recurrent_v2.py | 132 +++++++++--------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/tensorflow/python/keras/layers/recurrent_v2.py b/tensorflow/python/keras/layers/recurrent_v2.py index a62e3fc8600..68d0884c54b 100644 --- a/tensorflow/python/keras/layers/recurrent_v2.py +++ b/tensorflow/python/keras/layers/recurrent_v2.py @@ -64,6 +64,23 @@ class GRUCell(recurrent.GRUCell): This class processes one step within the whole time sequence input, whereas `tf.keras.layer.GRU` processes the whole sequence. + For example: + + >>> inputs = tf.random.normal([32, 10, 8]) + >>> rnn = tf.keras.layers.RNN(tf.keras.layers.GRUCell(4)) + >>> output = rnn(inputs) + >>> print(output.shape) + (32, 4) + >>> rnn = tf.keras.layers.RNN( + ... tf.keras.layers.GRUCell(4), + ... return_sequences=True, + ... return_state=True) + >>> whole_sequence_output, final_state = rnn(inputs) + >>> print(whole_sequence_output.shape) + (32, 10, 4) + >>> print(final_state.shape) + (32, 4) + Arguments: units: Positive integer, dimensionality of the output space. activation: Activation function to use. 
Default: hyperbolic tangent @@ -114,24 +131,6 @@ class GRUCell(recurrent.GRUCell): training: Python boolean indicating whether the layer should behave in training mode or in inference mode. Only relevant when `dropout` or `recurrent_dropout` is used. - - Examples: - - ```python - inputs = np.random.random([32, 10, 8]).astype(np.float32) - rnn = tf.keras.layers.RNN(tf.keras.layers.GRUCell(4)) - - output = rnn(inputs) # The output has shape `[32, 4]`. - - rnn = tf.keras.layers.RNN( - tf.keras.layers.GRUCell(4), - return_sequences=True, - return_state=True) - - # whole_sequence_output has shape `[32, 10, 4]`. - # final_state has shape `[32, 4]`. - whole_sequence_output, final_state = rnn(inputs) - ``` """ def __init__(self, @@ -207,6 +206,20 @@ class GRU(recurrent.DropoutRNNCellMixin, recurrent.GRU): `recurrent_kernel`. To use this variant, set `'reset_after'=True` and `recurrent_activation='sigmoid'`. + For example: + + >>> inputs = tf.random.normal([32, 10, 8]) + >>> gru = tf.keras.layers.GRU(4) + >>> output = gru(inputs) + >>> print(output.shape) + (32, 4) + >>> gru = tf.keras.layers.GRU(4, return_sequences=True, return_state=True) + >>> whole_sequence_output, final_state = gru(inputs) + >>> print(whole_sequence_output.shape) + (32, 10, 4) + >>> print(final_state.shape) + (32, 4) + Arguments: units: Positive integer, dimensionality of the output space. activation: Activation function to use. @@ -289,21 +302,6 @@ class GRU(recurrent.DropoutRNNCellMixin, recurrent.GRU): initial_state: List of initial state tensors to be passed to the first call of the cell (optional, defaults to `None` which causes creation of zero-filled initial state tensors). - - Examples: - - ```python - inputs = np.random.random([32, 10, 8]).astype(np.float32) - gru = tf.keras.layers.GRU(4) - - output = gru(inputs) # The output has shape `[32, 4]`. - - gru = tf.keras.layers.GRU(4, return_sequences=True, return_state=True) - - # whole_sequence_output has shape `[32, 10, 4]`. - # final_state has shape `[32, 4]`. - whole_sequence_output, final_state = gru(inputs) - ``` """ def __init__(self, @@ -775,6 +773,25 @@ class LSTMCell(recurrent.LSTMCell): This class processes one step within the whole time sequence input, whereas `tf.keras.layer.LSTM` processes the whole sequence. + For example: + + >>> inputs = tf.random.normal([32, 10, 8]) + >>> rnn = tf.keras.layers.RNN(tf.keras.layers.LSTMCell(4)) + >>> output = rnn(inputs) + >>> print(output.shape) + (32, 4) + >>> rnn = tf.keras.layers.RNN( + ... tf.keras.layers.LSTMCell(4), + ... return_sequences=True, + ... return_state=True) + >>> whole_seq_output, final_memory_state, final_carry_state = rnn(inputs) + >>> print(whole_seq_output.shape) + (32, 10, 4) + >>> print(final_memory_state.shape) + (32, 4) + >>> print(final_carry_state.shape) + (32, 4) + Arguments: units: Positive integer, dimensionality of the output space. activation: Activation function to use. Default: hyperbolic tangent @@ -826,24 +843,6 @@ class LSTMCell(recurrent.LSTMCell): training: Python boolean indicating whether the layer should behave in training mode or in inference mode. Only relevant when `dropout` or `recurrent_dropout` is used. - - Examples: - - ```python - inputs = np.random.random([32, 10, 8]).astype(np.float32) - rnn = tf.keras.layers.RNN(tf.keras.layers.LSTMCell(4)) - - output = rnn(inputs) # The output has shape `[32, 4]`. - - rnn = tf.keras.layers.RNN( - tf.keras.layers.LSTMCell(4), - return_sequences=True, - return_state=True) - - # whole_sequence_output has shape `[32, 10, 4]`. 
- # final_memory_state and final_carry_state both have shape `[32, 4]`. - whole_sequence_output, final_memory_state, final_carry_state = rnn(inputs) - ``` """ def __init__(self, @@ -908,6 +907,22 @@ class LSTM(recurrent.DropoutRNNCellMixin, recurrent.LSTM): 5. `use_bias` is `True` 6. Inputs are not masked or strictly right padded. + For example: + + >>> inputs = tf.random.normal([32, 10, 8]) + >>> lstm = tf.keras.layers.LSTM(4) + >>> output = lstm(inputs) + >>> print(output.shape) + (32, 4) + >>> lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True) + >>> whole_seq_output, final_memory_state, final_carry_state = lstm(inputs) + >>> print(whole_seq_output.shape) + (32, 10, 4) + >>> print(final_memory_state.shape) + (32, 4) + >>> print(final_carry_state.shape) + (32, 4) + Arguments: units: Positive integer, dimensionality of the output space. activation: Activation function to use. @@ -983,21 +998,6 @@ class LSTM(recurrent.DropoutRNNCellMixin, recurrent.LSTM): initial_state: List of initial state tensors to be passed to the first call of the cell (optional, defaults to `None` which causes creation of zero-filled initial state tensors). - - Examples: - - ```python - inputs = np.random.random([32, 10, 8]).astype(np.float32) - lstm = tf.keras.layers.LSTM(4) - - output = lstm(inputs) # The output has shape `[32, 4]`. - - lstm = tf.keras.layers.LSTM(4, return_sequences=True, return_state=True) - - # whole_sequence_output has shape `[32, 10, 4]`. - # final_memory_state and final_carry_state both have shape `[32, 4]`. - whole_sequence_output, final_memory_state, final_carry_state = lstm(inputs) - ``` """ def __init__(self, From c6ccf6316430193e18dde016698d2d8c9989b30a Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Thu, 5 Dec 2019 08:22:20 -0800 Subject: [PATCH 159/383] [XLA/GPU] Simplify tiling scheme for batch dimension Previous tiling scheme for Z-dimension chooses GCD(8, $batch_dimension_size) as a Z-tile size. This results in a tile size of 1 chosen for all numbers which are coprime with all integers from 1 to 8, which worsens the performance. On a syntetic benchmark ``` HloModule Reduce Sum { x.1 = f32[] parameter(0) y.1 = f32[] parameter(1) ROOT add.1 = f32[] add(x.1, y.1) } ENTRY reduce.1 { input = f32[1163,2,10000] parameter(0) init_value = f32[] constant(0) ROOT reduce = f32[2] reduce(input, init_value), dimensions={0,2}, to_apply=Sum } ``` we previously got: ``` I1204 17:59:21.767770 259988 executable.cc:208] 412707 cycles (66.59% 67?) :: 283.6 usec ( 142.5 optimal) :: 82.00GFLOP/s :: :: 305.49GiB/s :: 225B/cycle :: %reduce = f32[2]{0} ``` and after this patch: ``` I1204 17:59:32.700495 3410 executable.cc:208] 269674 cycles (53.71% 54?) 
:: 185.3 usec ( 142.5 optimal) :: 125.50GFLOP/s :: :: 467.51GiB/s :: 345B/cycle :: %reduce = f32[2]{0} ``` PiperOrigin-RevId: 283978199 Change-Id: I3f58f561fa36a27923f43dfae3ad398124e8e374 --- tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 6 ++---- tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h | 5 ++--- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index dbc2c95773a..64422180693 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2900,11 +2900,9 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( tile_size_x = kWarpSize * 64; } else { tile_size_x = kWarpSize * 8; - block_size_z = 8; - while (reduction_dimensions.dimensions[0] % block_size_z != 0) { - block_size_z -= 1; - } } + block_size_z = + std::min(reduction_dimensions.dimensions[0], static_cast(8)); } } else { // Column reduction without transpose doesn't require communication among diff --git a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h index 345abbd0935..2eede7036cf 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_mapping_scheme.h @@ -85,15 +85,14 @@ class KernelMappingScheme { dims_in_tiles_{dims_in_elems[0], CeilOfRatio(dims_in_elems[1], tile_size_y), CeilOfRatio(dims_in_elems[2], tile_size_x)}, - dims_in_blocks_{dims_in_tiles_[0] / block_size_z, dims_in_tiles_[1], - dims_in_tiles_[2]}, + dims_in_blocks_{CeilOfRatio(dims_in_tiles_[0], block_size_z), + dims_in_tiles_[1], dims_in_tiles_[2]}, block_size_z_{block_size_z}, num_threads_x_(num_threads_x), num_threads_y_(num_threads_y), dilated_x_(is_dilated_x) { CHECK_EQ(tile_size_y % num_threads_y_, 0); CHECK_EQ(tile_size_x % num_threads_x_, 0); - CHECK_EQ((dims_in_elems[0] % block_size_z), 0); VLOG(10) << "dims_in_elems_ = " << absl::StrJoin(dims_in_elems_, ","); VLOG(10) << "dims_in_tiles_ = " << absl::StrJoin(dims_in_tiles_, ","); VLOG(10) << "dims_in_blocks_ = " << absl::StrJoin(dims_in_blocks_, ","); From 4904836b8b9fcda7747e453ff9f13f58aa82c5f7 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 5 Dec 2019 08:51:09 -0800 Subject: [PATCH 160/383] Pull Eigen from the official gitlab source Otherwise this will break again when the github mirror goes away. This is pulling the same revision as before, the gitlab repo has different hashes. 
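Because the gitlab archive of the same commit is a different tarball than the old github mirror, the `sha256` in the workspace.bzl change below has to be recomputed for the new URL. A minimal sketch of that check (assuming network access, and that gitlab still serves byte-identical archive contents for this commit; the URL is the one added in the diff below):

```python
import hashlib
import urllib.request

# Archive URL taken from the workspace.bzl change below; the printed digest
# should match the new sha256 value if the served archive is unchanged.
URL = ("https://gitlab.com/libeigen/eigen/-/archive/"
       "ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8/"
       "eigen-ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8.tar.gz")

archive_bytes = urllib.request.urlopen(URL).read()
print(hashlib.sha256(archive_bytes).hexdigest())
```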
PiperOrigin-RevId: 283983066 Change-Id: I8af3851b91c243f131d08ccd4fd47880afd1d608 --- tensorflow/workspace.bzl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 0a3c7fe4d89..93969656e67 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -172,11 +172,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "eigen_archive", build_file = clean_dep("//third_party:eigen.BUILD"), patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"), - sha256 = "add24720f99ab4f3222f4c8a887f2609554cf9187d4f7d24a777a151a0ee2548", - strip_prefix = "eigen-git-mirror-4898dcdb06f1b1b0441b8e15119764793f8997e2", + sha256 = "8a4d3ef6c18c9d8e047c6444ec0a28b43d587e7a3363eb9819eb49dd6b390aed", + strip_prefix = "eigen-ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/eigenteam/eigen-git-mirror/archive/4898dcdb06f1b1b0441b8e15119764793f8997e2.tar.gz", - "https://github.com/eigenteam/eigen-git-mirror/archive/4898dcdb06f1b1b0441b8e15119764793f8997e2.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8/eigen-ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8.tar.gz", + "https://gitlab.com/libeigen/eigen/-/archive/ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8/eigen-ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8.tar.gz", ], ) From f98d056fbb68ee9c6f74c1e44fbfdf24548dcd5e Mon Sep 17 00:00:00 2001 From: boron <31139873+boronhub@users.noreply.github.com> Date: Thu, 5 Dec 2019 22:38:59 +0530 Subject: [PATCH 161/383] Update image_ops_impl.py --- tensorflow/python/ops/image_ops_impl.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py index f9252fdd547..d33498c517c 100644 --- a/tensorflow/python/ops/image_ops_impl.py +++ b/tensorflow/python/ops/image_ops_impl.py @@ -2974,6 +2974,14 @@ def rgb_to_yuv(images): Returns: images: tensor with the same shape as `images`. + + Usage Example: + ```python + >> import tensorflow as tf + >> x = tf.random.normal(shape=(256, 256, 3)) + >> tf.image.rgb_to_yuv(x) + ``` + """ images = ops.convert_to_tensor(images, name='images') kernel = ops.convert_to_tensor( From a0d7915aafacad8fa0b25e9827a370ffff683eff Mon Sep 17 00:00:00 2001 From: Robert Crowe Date: Thu, 5 Dec 2019 09:22:42 -0800 Subject: [PATCH 162/383] Expand on doc string for tf.device PiperOrigin-RevId: 283988926 Change-Id: I5fb87ff68b04ca55e950a15a42b7721537a091bb --- tensorflow/python/framework/ops.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index 0b645102d1c..d265ee5f91e 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -5050,10 +5050,16 @@ def device(device_name_or_function): def device_v2(device_name): """Specifies the device for ops created/executed in this context. - `device_name` can be fully specified, as in "/job:worker/task:1/device:cpu:0", - or partially specified, containing only a subset of the "/"-separated - fields. Any fields which are specified override device annotations from outer - scopes. For example: + This function specifies the device to be used for ops created/executed in a + particular context. Nested contexts will inherit and also create/execute + their ops on the specified device. 
If a specific device is not required, + consider not using this function so that a device can be automatically + assigned. In general the use of this function is optional. `device_name` can + be fully specified, as in "/job:worker/task:1/device:cpu:0", or partially + specified, containing only a subset of the "/"-separated fields. Any fields + which are specified will override device annotations from outer scopes. + + For example: ```python with tf.device('/job:foo'): From f575856f039cee05f0ea00da7cb99a18f244a7b0 Mon Sep 17 00:00:00 2001 From: Yanhui Liang Date: Thu, 5 Dec 2019 09:32:27 -0800 Subject: [PATCH 163/383] Add testable docstrings to conv2d. PiperOrigin-RevId: 283990710 Change-Id: I79039b1b2fb5b09099dec9c184314ebdcfc2b903 --- .../python/keras/layers/convolutional.py | 59 ++++++++++++++++--- 1 file changed, 51 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index fefbd1951e9..5ef887fc8e7 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -449,6 +449,33 @@ class Conv2D(Conv): e.g. `input_shape=(128, 128, 3)` for 128x128 RGB pictures in `data_format="channels_last"`. + Examples: + + >>> # Small convolutional model for 128x128 RGB images with `channels_last` + >>> input_shape = (32, 128, 128, 3) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv2D( + ... 2, 3, activation='relu', input_shape=input_shape)(x) + >>> print(y.shape) + (32, 126, 126, 2) + + >>> # With `dilation_rate` as 2. + >>> input_shape = (32, 128, 128, 3) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv2D( + ... 2, 3, activation='relu', dilation_rate=2, input_shape=input_shape)(x) + >>> print(y.shape) + (32, 124, 124, 2) + + >>> # With `padding` as "same". + >>> input_shape = (32, 128, 128, 3) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv2D( + ... 2, 3, activation='relu', padding="same", input_shape=input_shape)(x) + >>> print(y.shape) + (32, 128, 128, 2) + + Arguments: filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). @@ -481,17 +508,25 @@ class Conv2D(Conv): incompatible with specifying any stride value != 1. activation: Activation function to use. If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). + (ie. "linear" activation: `a(x) = x`). Check `keras.activations` for + available activation functions. use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix. - bias_initializer: Initializer for the bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix. Check + `keras.initializers` for available initializers. + bias_initializer: Initializer for the bias vector. Check + `keras.initializers` for available initializers. kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. + the `kernel` weights matrix. Check + `keras.regularizers` for available regularizers. + bias_regularizer: Regularizer function applied to the bias vector. Check + `keras.regularizers` for available regularizers. activity_regularizer: Regularizer function applied to - the output of the layer (its "activation").. - kernel_constraint: Constraint function applied to the kernel matrix. 
- bias_constraint: Constraint function applied to the bias vector. + the output of the layer (its "activation"). Check + `keras.regularizers` for available regularizers. + kernel_constraint: Constraint function applied to the kernel matrix. Check + `keras.constraints` for available constraints. + bias_constraint: Constraint function applied to the bias vector. Check + `keras.constraints` for available constraints. Input shape: 4D tensor with shape: @@ -505,6 +540,14 @@ class Conv2D(Conv): or 4D tensor with shape: `(samples, new_rows, new_cols, filters)` if data_format='channels_last'. `rows` and `cols` values might have changed due to padding. + + Returns: + A tensor of rank 4 representing + `activation(conv2d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. """ def __init__(self, From 1bcbacd3dff6aa28e27a9c10e5aeeb2138c38f57 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Thu, 5 Dec 2019 09:47:03 -0800 Subject: [PATCH 164/383] [tf.data] Refactoring optimization test methods. This CL breaks down large tests that iterate over different test cases into smaller ones -- one per test case. PiperOrigin-RevId: 283993741 Change-Id: I0e67958279d924d0b139164108e971bf39de96ca --- .../optimization/filter_fusion_test.py | 60 +++++++------ .../optimization/hoist_random_uniform_test.py | 64 ++++++++------ .../map_and_filter_fusion_test.py | 85 +++++++++++-------- .../optimization/map_fusion_test.py | 64 ++++++++------ .../optimization/map_parallelization_test.py | 46 +++++----- .../python/framework/test_combinations.py | 3 + 6 files changed, 190 insertions(+), 132 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py index 949f9e2e25c..1df52da395c 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from absl.testing import parameterized from tensorflow.python.data.experimental.ops import testing @@ -29,12 +31,42 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test +def _test_combinations(): + cases = [] + + take_all = lambda x: constant_op.constant(True) + is_zero = lambda x: math_ops.equal(x, 0) + greater = lambda x: math_ops.greater(x + 5, 0) + predicates = [take_all, is_zero, greater] + for i, x in enumerate(predicates): + for j, y in enumerate(predicates): + cases.append((lambda x: x, "Scalar{}{}".format(i, j), [x, y])) + for k, z in enumerate(predicates): + cases.append((lambda x: x, "Scalar{}{}{}".format(i, j, k), [x, y, z])) + + take_all = lambda x, y: constant_op.constant(True) + is_zero = lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0) + + cases.append((lambda x: (x, x), "Tuple1", [take_all, take_all])) + cases.append((lambda x: (x, 2), "Tuple2", [take_all, is_zero])) + + def reduce_fn(x, y): + function, name, predicates = y + return x + combinations.combine( + function=function, + predicates=combinations.NamedObject(name, predicates)) + + return functools.reduce(reduce_fn, cases, []) + + class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase): - def _testFilterFusion(self, map_function, predicates): + 
@combinations.generate( + combinations.times(test_base.default_test_combinations(), + _test_combinations())) + def testFilterFusion(self, function, predicates): dataset = dataset_ops.Dataset.range(5).apply( - testing.assert_next(["Map", "Filter", - "MemoryCacheImpl"])).map(map_function) + testing.assert_next(["Map", "Filter", "MemoryCacheImpl"])).map(function) for predicate in predicates: dataset = dataset.filter(predicate) @@ -45,7 +77,7 @@ class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = dataset.with_options(options) expected_output = [] for x in range(5): - r = map_function(x) + r = function(x) filtered = False for predicate in predicates: if isinstance(r, tuple): @@ -60,26 +92,6 @@ class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output.append(r) self.assertDatasetProduces(dataset, expected_output=expected_output) - @combinations.generate(test_base.default_test_combinations()) - def testFilterFusionScalar(self): - take_all = lambda x: constant_op.constant(True) - is_zero = lambda x: math_ops.equal(x, 0) - greater = lambda x: math_ops.greater(x + 5, 0) - predicates = [take_all, is_zero, greater] - for x in predicates: - for y in predicates: - self._testFilterFusion(lambda x: x, [x, y]) - for z in predicates: - self._testFilterFusion(lambda x: x, [x, y, z]) - - @combinations.generate(test_base.default_test_combinations()) - def testFilterFusionTuple(self): - take_all = lambda x, y: constant_op.constant(True) - is_zero = lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0) - - self._testFilterFusion(lambda x: (x, x), [take_all, take_all]) - self._testFilterFusion(lambda x: (x, 2), [take_all, is_zero]) - if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py index 59f50fa1752..1097b1effc6 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from absl.testing import parameterized from tensorflow.python.data.experimental.ops import testing @@ -33,6 +35,36 @@ from tensorflow.python.ops import random_ops from tensorflow.python.platform import test +def _test_combinations(): + def random(_): + return random_ops.random_uniform([], + minval=1, + maxval=10, + dtype=dtypes.float32, + seed=42) + + def random_with_assert(x): + y = random(x) + assert_op = control_flow_ops.Assert(math_ops.greater_equal(y, 1), [y]) + with ops.control_dependencies([assert_op]): + return y + + cases = [ + ("Increment", lambda x: x + 1, False), + ("Random", random, True), + ("RandomWithAssert", random_with_assert, True), + ("Complex", lambda x: (random(x) + random(x)) / 2, False), + ] + + def reduce_fn(x, y): + name, map_fn, should_optimize = y + return x + combinations.combine( + map_fn=combinations.NamedObject(name, map_fn), + should_optimize=should_optimize) + + return functools.reduce(reduce_fn, cases, []) + + class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase): def _testDataset(self, dataset): @@ -51,10 +83,13 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): 
self.evaluate(get_next()) - def _testHoistFunction(self, function, should_optimize): + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + _test_combinations())) + def testHoistFunction(self, map_fn, should_optimize): dataset = dataset_ops.Dataset.range(5).apply( testing.assert_next( - ["Zip[0]", "Map"] if should_optimize else ["Map"])).map(function) + ["Zip[0]", "Map"] if should_optimize else ["Map"])).map(map_fn) options = dataset_ops.Options() options.experimental_optimization.apply_default_optimizations = False @@ -62,31 +97,6 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = dataset.with_options(options) self._testDataset(dataset) - @combinations.generate(test_base.default_test_combinations()) - def testNoRandom(self): - self._testHoistFunction(lambda x: x + 1, should_optimize=False) - - @combinations.generate(test_base.default_test_combinations()) - def testRandom(self): - - def random(_): - return random_ops.random_uniform([], - minval=1, - maxval=10, - dtype=dtypes.float32, - seed=42) - - def random_with_assert(x): - y = random(x) - assert_op = control_flow_ops.Assert(math_ops.greater_equal(y, 1), [y]) - with ops.control_dependencies([assert_op]): - return y - - self._testHoistFunction(random, should_optimize=True) - self._testHoistFunction(random_with_assert, should_optimize=True) - self._testHoistFunction( - lambda x: (random(x) + random(x)) / 2, should_optimize=False) - @combinations.generate(test_base.default_test_combinations()) def testCapturedInputs(self): a = constant_op.constant(1, dtype=dtypes.float32) diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py index a0257f76e93..aa0ab40254f 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from absl.testing import parameterized from tensorflow.python.data.experimental.ops import testing @@ -29,6 +31,49 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test +def _test_combinations(): + cases = [] + + identity = lambda x: x + increment = lambda x: x + 1 + minus_five = lambda x: x - 5 + + def increment_and_square(x): + y = x + 1 + return y * y + + functions = [identity, increment, minus_five, increment_and_square] + + take_all = lambda x: constant_op.constant(True) + is_zero = lambda x: math_ops.equal(x, 0) + is_odd = lambda x: math_ops.equal(x % 2, 0) + greater = lambda x: math_ops.greater(x + 5, 0) + predicates = [take_all, is_zero, is_odd, greater] + + for i, function in enumerate(functions): + for j, predicate in enumerate(predicates): + cases.append((function, "Scalar{}{}".format(i, j), predicate)) + + replicate = lambda x: (x, x) + with_two = lambda x: (x, 2) + functions = [replicate, with_two] + take_all = lambda x, y: constant_op.constant(True) + is_zero = lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0) + predicates = [take_all, is_zero] + + for i, function in enumerate(functions): + for j, predicate in enumerate(predicates): + cases.append((function, "Tuple{}{}".format(i, j), predicate)) + + def reduce_fn(x, y): + function, name, predicate = y + return x 
+ combinations.combine( + function=function, + predicate=combinations.NamedObject(name, predicate)) + + return functools.reduce(reduce_fn, cases, []) + + class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase): def _testDataset(self, dataset, function, predicate): @@ -43,7 +88,10 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output.append(r) self.assertDatasetProduces(dataset, expected_output=expected_output) - def _testMapAndFilterFusion(self, function, predicate): + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + _test_combinations())) + def testMapAndFilterFusion(self, function, predicate): dataset = dataset_ops.Dataset.range(10).apply( testing.assert_next(["Map", "Filter", "Map"])).map(function).filter(predicate) @@ -53,41 +101,6 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = dataset.with_options(options) self._testDataset(dataset, function, predicate) - @combinations.generate(test_base.default_test_combinations()) - def testMapAndFilterFusionScalar(self): - identity = lambda x: x - increment = lambda x: x + 1 - minus_five = lambda x: x - 5 - - def increment_and_square(x): - y = x + 1 - return y * y - - functions = [identity, increment, minus_five, increment_and_square] - - take_all = lambda x: constant_op.constant(True) - is_zero = lambda x: math_ops.equal(x, 0) - is_odd = lambda x: math_ops.equal(x % 2, 0) - greater = lambda x: math_ops.greater(x + 5, 0) - predicates = [take_all, is_zero, is_odd, greater] - - for function in functions: - for predicate in predicates: - self._testMapAndFilterFusion(function, predicate) - - @combinations.generate(test_base.default_test_combinations()) - def testMapAndFilterFusionTuple(self): - replicate = lambda x: (x, x) - with_two = lambda x: (x, 2) - functions = [replicate, with_two] - take_all = lambda x, y: constant_op.constant(True) - is_zero = lambda x, y: math_ops.equal(x * math_ops.cast(y, dtypes.int64), 0) - predicates = [take_all, is_zero] - - for function in functions: - for predicate in predicates: - self._testMapAndFilterFusion(function, predicate) - @combinations.generate(test_base.default_test_combinations()) def testCapturedInputs(self): a = constant_op.constant(3, dtype=dtypes.int64) diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py index 28da0474bc9..efe9c4880f2 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from absl.testing import parameterized from tensorflow.python.data.experimental.ops import testing @@ -26,9 +28,44 @@ from tensorflow.python.framework import combinations from tensorflow.python.platform import test +def _test_combinations(): + cases = [] + + identity = lambda x: x + increment = lambda x: x + 1 + + def increment_and_square(x): + y = x + 1 + return y * y + + functions = [identity, increment, increment_and_square] + + for i, x in enumerate(functions): + for j, y in enumerate(functions): + cases.append(("Scalar{}{}".format(i, j), [x, y])) + for k, z in enumerate(functions): + cases.append(("Scalar{}{}{}".format(i, j, k), [x, y, z])) + + with_42 = lambda x: (x, 42) 
+ swap = lambda x, y: (y, x) + + cases.append(("Tuple1", [with_42, swap])) + cases.append(("Tuple2", [with_42, swap, swap])) + + def reduce_fn(x, y): + name, functions = y + return x + combinations.combine( + functions=combinations.NamedObject(name, functions)) + + return functools.reduce(reduce_fn, cases, []) + + class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase): - def _testMapFusion(self, functions): + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + _test_combinations())) + def testMapFusion(self, functions): dataset = dataset_ops.Dataset.range(5).apply( testing.assert_next(["Map", "MemoryCacheImpl"])) for function in functions: @@ -50,31 +87,6 @@ class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output.append(r) self.assertDatasetProduces(dataset, expected_output=expected_output) - @combinations.generate(test_base.default_test_combinations()) - def testMapFusionScalar(self): - identity = lambda x: x - increment = lambda x: x + 1 - - def increment_and_square(x): - y = x + 1 - return y * y - - functions = [identity, increment, increment_and_square] - - for x in functions: - for y in functions: - self._testMapFusion([x, y]) - for z in functions: - self._testMapFusion([x, y, z]) - - @combinations.generate(test_base.default_test_combinations()) - def testMapAndFilterFusionTuple(self): - with_42 = lambda x: (x, 42) - swap = lambda x, y: (y, x) - - self._testMapFusion([with_42, swap]) - self._testMapFusion([with_42, swap, swap]) - if __name__ == "__main__": test.main() diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py index a28a3052abc..ac92ddea529 100644 --- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import functools + from absl.testing import parameterized from tensorflow.python.data.experimental.ops import testing @@ -32,9 +34,33 @@ from tensorflow.python.ops import variables from tensorflow.python.platform import test +def _test_combinations(): + def assert_greater(x): + assert_op = control_flow_ops.Assert(math_ops.greater(x, -1), [x]) + with ops.control_dependencies([assert_op]): + return x + + cases = [ + ("Identity", lambda x: x, True), + ("Increment", lambda x: x + 1, True), + ("AssertGreater", assert_greater, True), + ] + + def reduce_fn(x, y): + name, function, should_optimize = y + return x + combinations.combine( + function=combinations.NamedObject(name, function), + should_optimize=should_optimize) + + return functools.reduce(reduce_fn, cases, []) + + class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase): - def _testMapParallelization(self, function, should_optimize): + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + _test_combinations())) + def testMapParallelization(self, function, should_optimize): next_nodes = ["ParallelMap"] if should_optimize else ["Map"] dataset = dataset_ops.Dataset.range(5).apply( testing.assert_next(next_nodes)).map(function) @@ -45,24 +71,6 @@ class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces( dataset, expected_output=[function(x) for 
x in range(5)]) - @combinations.generate(test_base.default_test_combinations()) - def testIdentity(self): - self._testMapParallelization(lambda x: x, should_optimize=True) - - @combinations.generate(test_base.default_test_combinations()) - def testIncrement(self): - self._testMapParallelization(lambda x: x + 1, should_optimize=True) - - @combinations.generate(test_base.default_test_combinations()) - def testAssert(self): - - def assert_greater(x): - assert_op = control_flow_ops.Assert(math_ops.greater(x, -1), [x]) - with ops.control_dependencies([assert_op]): - return x - - self._testMapParallelization(assert_greater, should_optimize=True) - @combinations.generate(test_base.default_test_combinations()) def testCapturedConstant(self): captured_t = constant_op.constant(42, dtype=dtypes.int64) diff --git a/tensorflow/python/framework/test_combinations.py b/tensorflow/python/framework/test_combinations.py index 95a3dc4827e..0986585fc21 100644 --- a/tensorflow/python/framework/test_combinations.py +++ b/tensorflow/python/framework/test_combinations.py @@ -400,6 +400,9 @@ class NamedObject(object): def __call__(self, *args, **kwargs): return self._obj(*args, **kwargs) + def __iter__(self): + return self._obj.__iter__() + def __repr__(self): return self._name From 85a46fbf52710e01f28b12ee69ad4df9a888248a Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Thu, 5 Dec 2019 09:51:55 -0800 Subject: [PATCH 165/383] Sparse tensor definition in TFLite. PiperOrigin-RevId: 283994681 Change-Id: Ie9ddfa9c044a3b347a9e9ecf01b69516fbf9493a --- tensorflow/lite/BUILD | 1 + tensorflow/lite/c/common.c | 34 ++ tensorflow/lite/c/common.h | 31 ++ tensorflow/lite/c/common_test.cc | 26 ++ tensorflow/lite/core/subgraph.cc | 20 +- tensorflow/lite/core/subgraph.h | 9 +- tensorflow/lite/model.cc | 81 ++++- tensorflow/lite/model.h | 2 + tensorflow/lite/model_test.cc | 72 ++++ tensorflow/lite/schema/schema.fbs | 80 +++++ tensorflow/lite/schema/schema_generated.h | 334 +++++++++++++++++- tensorflow/lite/testdata/sparse_tensor.bin | Bin 0 -> 412 bytes tensorflow/lite/testdata/sparse_tensor.json | 63 ++++ .../benchmark/experimental/c/c_api_types.h | 31 ++ 14 files changed, 769 insertions(+), 15 deletions(-) create mode 100644 tensorflow/lite/testdata/sparse_tensor.bin create mode 100644 tensorflow/lite/testdata/sparse_tensor.json diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index 530b27aa7d3..84150546353 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -318,6 +318,7 @@ cc_test( "testdata/2_subgraphs.bin", "testdata/empty_model.bin", "testdata/multi_add_flex.bin", + "testdata/sparse_tensor.bin", "testdata/test_min_runtime.bin", "testdata/test_model.bin", "testdata/test_model_broken.bin", diff --git a/tensorflow/lite/c/common.c b/tensorflow/lite/c/common.c index 524bf8091fe..0b17c049e93 100644 --- a/tensorflow/lite/c/common.c +++ b/tensorflow/lite/c/common.c @@ -103,12 +103,46 @@ void TfLiteQuantizationFree(TfLiteQuantization* quantization) { quantization->type = kTfLiteNoQuantization; } +void TfLiteSparsityFree(TfLiteSparsity* sparsity) { + if (sparsity == NULL) { + return; + } + + if (sparsity->traversal_order) { + TfLiteIntArrayFree(sparsity->traversal_order); + sparsity->traversal_order = NULL; + } + + if (sparsity->block_map) { + TfLiteIntArrayFree(sparsity->block_map); + sparsity->block_map = NULL; + } + + if (sparsity->dim_metadata) { + for (int i = 0; i < sparsity->dim_metadata_size; i++) { + TfLiteDimensionMetadata metadata = sparsity->dim_metadata[i]; + if (metadata.format == 
kTfLiteDimSparseCSR) { + TfLiteIntArrayFree(metadata.array_segments); + metadata.array_segments = NULL; + TfLiteIntArrayFree(metadata.array_indices); + metadata.array_indices = NULL; + } + } + free(sparsity->dim_metadata); + sparsity->dim_metadata = NULL; + } + + free(sparsity); +} + void TfLiteTensorFree(TfLiteTensor* t) { TfLiteTensorDataFree(t); if (t->dims) TfLiteIntArrayFree(t->dims); t->dims = NULL; TfLiteQuantizationFree(&t->quantization); + TfLiteSparsityFree(t->sparsity); + t->sparsity = NULL; } void TfLiteTensorReset(TfLiteType type, const char* name, TfLiteIntArray* dims, diff --git a/tensorflow/lite/c/common.h b/tensorflow/lite/c/common.h index b3b0ddc059d..332b9b68881 100644 --- a/tensorflow/lite/c/common.h +++ b/tensorflow/lite/c/common.h @@ -303,6 +303,29 @@ enum { kTfLiteNullBufferHandle = -1, }; +// Storage format of each dimension in a sparse tensor. +typedef enum { + kTfLiteDimDense = 0, + kTfLiteDimSparseCSR, +} TfLiteDimensionType; + +// Metadata to encode each dimension in a sparse tensor. +typedef struct { + TfLiteDimensionType format; + int dense_size; + TfLiteIntArray* array_segments; + TfLiteIntArray* array_indices; +} TfLiteDimensionMetadata; + +// Parameters used to encode a sparse tensor. For detailed explanation of each +// field please refer to lite/schema/schema.fbs. +typedef struct { + TfLiteIntArray* traversal_order; + TfLiteIntArray* block_map; + TfLiteDimensionMetadata* dim_metadata; + int dim_metadata_size; +} TfLiteSparsity; + // An tensor in the interpreter system which is a wrapper around a buffer of // data including a dimensionality (or NULL if not currently defined). typedef struct { @@ -357,6 +380,11 @@ typedef struct { // Quantization information. Replaces params field above. TfLiteQuantization quantization; + + // Parameters used to encode a sparse tensor. + // This is optional. The field is NULL if a tensor is dense. + // WARNING: This is an experimental interface that is subject to change. + TfLiteSparsity* sparsity; } TfLiteTensor; // Free data memory of tensor `t`. @@ -365,6 +393,9 @@ void TfLiteTensorDataFree(TfLiteTensor* t); // Free quantization data. void TfLiteQuantizationFree(TfLiteQuantization* quantization); +// Free sparsity parameters. +void TfLiteSparsityFree(TfLiteSparsity* sparsity); + // Free memory of tensor `t`. void TfLiteTensorFree(TfLiteTensor* t); diff --git a/tensorflow/lite/c/common_test.cc b/tensorflow/lite/c/common_test.cc index 88ac181faf6..65c6ec63b28 100644 --- a/tensorflow/lite/c/common_test.cc +++ b/tensorflow/lite/c/common_test.cc @@ -96,6 +96,7 @@ TEST(Quantization, TestQuantizationFree) { t.allocation_type = kTfLiteArenaRw; t.dims = nullptr; t.quantization.type = kTfLiteAffineQuantization; + t.sparsity = nullptr; auto* params = reinterpret_cast( malloc(sizeof(TfLiteAffineQuantization))); params->scale = TfLiteFloatArrayCreate(3); @@ -104,6 +105,31 @@ TEST(Quantization, TestQuantizationFree) { TfLiteTensorFree(&t); } +TEST(Sparsity, TestSparsityFree) { + TfLiteTensor t; + // Set these values, otherwise TfLiteTensorFree has uninitialized values. + t.allocation_type = kTfLiteArenaRw; + t.dims = nullptr; + + // A dummy CSR sparse matrix. 
+ t.sparsity = static_cast(malloc(sizeof(TfLiteSparsity))); + t.sparsity->traversal_order = TfLiteIntArrayCreate(2); + t.sparsity->block_map = nullptr; + + t.sparsity->dim_metadata = static_cast( + malloc(sizeof(TfLiteDimensionMetadata) * 2)); + t.sparsity->dim_metadata_size = 2; + + t.sparsity->dim_metadata[0].format = kTfLiteDimDense; + t.sparsity->dim_metadata[0].dense_size = 4; + + t.sparsity->dim_metadata[1].format = kTfLiteDimSparseCSR; + t.sparsity->dim_metadata[1].array_segments = TfLiteIntArrayCreate(2); + t.sparsity->dim_metadata[1].array_indices = TfLiteIntArrayCreate(3); + + TfLiteTensorFree(&t); +} + } // namespace tflite int main(int argc, char** argv) { diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index e453ff2ff7e..69c39769593 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -40,6 +40,15 @@ struct TfLiteQuantizationDeleter { using ScopedTfLiteQuantization = std::unique_ptr; +struct TfLiteSparsityDeleter { + void operator()(TfLiteSparsity* s) { + if (s) TfLiteSparsityFree(s); + } +}; + +using ScopedTfLiteSparsity = + std::unique_ptr; + TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node, const TfLiteRegistration& registration, int node_index, const char* message) { @@ -908,9 +917,10 @@ TfLiteStatus Subgraph::GetNodeAndRegistration( TfLiteStatus Subgraph::SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantization quantization, const char* buffer, - size_t bytes, const Allocation* allocation) { + size_t bytes, const Allocation* allocation, TfLiteSparsity* sparsity) { // Ensure quantization cleanup on failure. ScopedTfLiteQuantization scoped_quantization(&quantization); + ScopedTfLiteSparsity scoped_sparsity(sparsity); if (state_ == kStateInvokableAndImmutable) { ReportError( "SetTensorParametersReadOnly is disallowed when graph is immutable."); @@ -919,10 +929,12 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly( TF_LITE_ENSURE(&context_, tensor_index < context_.tensors_size && tensor_index >= 0); + // For most tensors we know exactly how much memory is necessary so we can // ensure the buffer is large enough. However, we need to skip string tensors - // because their sizes change with the contents of the individual strings. - if (type != kTfLiteString) { + // and sparse tensors because their sizes change with the contents. + // TODO(b/145615516): Extend BytesRequired to check sparse tensors. + if (type != kTfLiteString && sparsity == nullptr) { size_t required_bytes; TF_LITE_ENSURE_OK(&context_, BytesRequired(type, dims, rank, &required_bytes)); @@ -939,6 +951,7 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly( if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims); tensor.params = GetLegacyQuantization(quantization); tensor.quantization = *scoped_quantization.release(); + tensor.sparsity = scoped_sparsity.release(); tensor.allocation_type = kTfLiteMmapRo; tensor.allocation = allocation; } else { @@ -950,6 +963,7 @@ TfLiteStatus Subgraph::SetTensorParametersReadOnly( // TODO(suharshs): Update TfLiteTensorReset to include the new quantization // if there are other required callers. 
tensor.quantization = *scoped_quantization.release(); + tensor.sparsity = scoped_sparsity.release(); } return kTfLiteOk; } diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index 89a9da7db28..c2572546709 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -94,16 +94,17 @@ class Subgraph { inline TfLiteStatus SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const std::vector& dims, TfLiteQuantization quantization, - const char* buffer, size_t bytes, - const Allocation* allocation = nullptr) { + const char* buffer, size_t bytes, const Allocation* allocation = nullptr, + TfLiteSparsity* sparsity = nullptr) { return SetTensorParametersReadOnly(tensor_index, type, name, dims.size(), dims.data(), quantization, buffer, bytes, - allocation); + allocation, sparsity); } TfLiteStatus SetTensorParametersReadOnly( int tensor_index, TfLiteType type, const char* name, const size_t rank, const int* dims, TfLiteQuantization quantization, const char* buffer, - size_t bytes, const Allocation* allocation = nullptr); + size_t bytes, const Allocation* allocation = nullptr, + TfLiteSparsity* sparsity = nullptr); // Set description of inputs/outputs/data/fptrs for node `node_index`. // This variant assumes an external buffer has been allocated of size diff --git a/tensorflow/lite/model.cc b/tensorflow/lite/model.cc index d060289a0ee..a4287a57ea0 100644 --- a/tensorflow/lite/model.cc +++ b/tensorflow/lite/model.cc @@ -416,6 +416,77 @@ TfLiteStatus InterpreterBuilder::ParseQuantization( return kTfLiteOk; } +// TODO(b/145614687): Add sparse tensor verification check in +// lite/tools/verifier.cc. +TfLiteStatus InterpreterBuilder::ParseSparsity( + const SparsityParameters* src_sparsity, TfLiteSparsity** sparsity_ptr) { + if (!src_sparsity) { + return kTfLiteOk; + } + + auto* sparsity = + reinterpret_cast(malloc(sizeof(TfLiteSparsity))); + memset(sparsity, 0, sizeof(TfLiteSparsity)); + *sparsity_ptr = sparsity; + + if (src_sparsity->traversal_order()) { + const size_t traversal_order_size = src_sparsity->traversal_order()->size(); + sparsity->traversal_order = TfLiteIntArrayCreate(traversal_order_size); + for (int i = 0; i < traversal_order_size; i++) { + sparsity->traversal_order->data[i] = + src_sparsity->traversal_order()->Get(i); + } + } + + if (src_sparsity->block_map()) { + const size_t block_map_size = src_sparsity->block_map()->size(); + sparsity->block_map = TfLiteIntArrayCreate(block_map_size); + for (int i = 0; i < block_map_size; i++) { + sparsity->block_map->data[i] = src_sparsity->block_map()->Get(i); + } + } + + if (src_sparsity->dim_metadata()) { + const size_t dim_metadata_size = src_sparsity->dim_metadata()->size(); + sparsity->dim_metadata_size = dim_metadata_size; + sparsity->dim_metadata = reinterpret_cast( + malloc(dim_metadata_size * sizeof(TfLiteDimensionMetadata))); + memset(sparsity->dim_metadata, 0, + dim_metadata_size * sizeof(TfLiteDimensionMetadata)); + + for (int i = 0; i < dim_metadata_size; i++) { + const auto* src_metadata = src_sparsity->dim_metadata()->Get(i); + auto* tgt_metadata = &sparsity->dim_metadata[i]; + + tgt_metadata->format = + static_cast(src_metadata->format()); + + if (tgt_metadata->format == kTfLiteDimDense) { + tgt_metadata->dense_size = src_metadata->dense_size(); + } else if (tgt_metadata->format == kTfLiteDimSparseCSR) { + const int array_segments_size = src_metadata->array_segments()->size(); + tgt_metadata->array_segments = + TfLiteIntArrayCreate(array_segments_size); 
+ for (int j = 0; j < array_segments_size; j++) { + tgt_metadata->array_segments->data[j] = + src_metadata->array_segments()->Get(j); + } + const int array_indices_size = src_metadata->array_indices()->size(); + tgt_metadata->array_indices = TfLiteIntArrayCreate(array_indices_size); + for (int j = 0; j < array_indices_size; j++) { + tgt_metadata->array_indices->data[j] = + src_metadata->array_indices()->Get(j); + } + } else { + error_reporter_->Report("Unsupported dimension type."); + return kTfLiteError; + } + } + } + + return kTfLiteOk; +} + TfLiteStatus InterpreterBuilder::ParseTensors( const flatbuffers::Vector>* buffers, const flatbuffers::Vector>* tensors, @@ -474,6 +545,13 @@ TfLiteStatus InterpreterBuilder::ParseTensors( continue; } + const auto* src_sparsity = tensor->sparsity(); + TfLiteSparsity* sparsity = nullptr; + if (ParseSparsity(src_sparsity, &sparsity) != kTfLiteOk) { + status = kTfLiteError; + continue; + } + bool is_variable = tensor->is_variable(); if (buffer_ptr) { if (is_variable) { @@ -486,12 +564,13 @@ TfLiteStatus InterpreterBuilder::ParseTensors( if (subgraph->SetTensorParametersReadOnly( i, type, get_name(tensor), dims, quantization, buffer_ptr, - buffer_size, allocation_) != kTfLiteOk) { + buffer_size, allocation_, sparsity) != kTfLiteOk) { error_reporter_->Report("Tensor %d is invalidly specified in schema.\n", i); status = kTfLiteError; } } else { + // TODO(b/144999664): Non-constant sparse tensor is not supported now. if (subgraph->SetTensorParametersReadWrite(i, type, get_name(tensor), dims, quantization, is_variable) != kTfLiteOk) { diff --git a/tensorflow/lite/model.h b/tensorflow/lite/model.h index fafb38ffd10..b8b4b4457da 100644 --- a/tensorflow/lite/model.h +++ b/tensorflow/lite/model.h @@ -223,6 +223,8 @@ class InterpreterBuilder { TfLiteStatus ParseQuantization(const QuantizationParameters* src_quantization, TfLiteQuantization* quantization, const std::vector& dims); + TfLiteStatus ParseSparsity(const SparsityParameters* src_sparsity, + TfLiteSparsity** sparsity); const ::tflite::Model* model_; const OpResolver& op_resolver_; diff --git a/tensorflow/lite/model_test.cc b/tensorflow/lite/model_test.cc index 7dc582b8862..2675715a613 100644 --- a/tensorflow/lite/model_test.cc +++ b/tensorflow/lite/model_test.cc @@ -331,6 +331,78 @@ TEST(BasicFlatBufferModel, TestReadRuntimeVersionFromModel) { ASSERT_EQ(model2->GetMinimumRuntime(), "1.10.0"); } +// The test model has the following tensor encoded in the TACO format: +// [[1, 0, 2, 3], +// [0, 4, 0, 0], +// [0, 0, 5, 0], +// [0, 0, 0, 6]]. +// TACO supports multiple encodings like CSR, CSC, etc. We chose to use the one +// similar to the blocked-CSR format with 2x2 row-major dense blocks. +TEST(BasicFlatBufferModel, TestParseModelWithSparseTensor) { + // The model only has 1 sparse constant tensor. 
+ auto model = FlatBufferModel::BuildFromFile( + "tensorflow/lite/testdata/sparse_tensor.bin"); + ASSERT_TRUE(model); + + std::unique_ptr interpreter(new Interpreter); + ASSERT_EQ(InterpreterBuilder(*model, TrivialResolver())(&interpreter), + kTfLiteOk); + ASSERT_NE(interpreter, nullptr); + ASSERT_EQ(interpreter->tensors_size(), 1); + TfLiteTensor* t1 = interpreter->tensor(0); + ASSERT_EQ(t1->allocation_type, kTfLiteMmapRo); + + TfLiteIntArray* traversal_order = TfLiteIntArrayCreate(4); + traversal_order->data[0] = 0; + traversal_order->data[1] = 1; + traversal_order->data[2] = 2; + traversal_order->data[3] = 3; + ASSERT_TRUE( + TfLiteIntArrayEqual(t1->sparsity->traversal_order, traversal_order)); + TfLiteIntArrayFree(traversal_order); + + TfLiteIntArray* block_map = TfLiteIntArrayCreate(2); + block_map->data[0] = 0; + block_map->data[1] = 1; + ASSERT_TRUE(TfLiteIntArrayEqual(t1->sparsity->block_map, block_map)); + TfLiteIntArrayFree(block_map); + + ASSERT_EQ(t1->sparsity->dim_metadata_size, 4); + + ASSERT_EQ(t1->sparsity->dim_metadata[0].format, kTfLiteDimDense); + ASSERT_EQ(t1->sparsity->dim_metadata[0].dense_size, 2); + ASSERT_EQ(t1->sparsity->dim_metadata[0].array_segments, nullptr); + ASSERT_EQ(t1->sparsity->dim_metadata[0].array_indices, nullptr); + + ASSERT_EQ(t1->sparsity->dim_metadata[1].format, kTfLiteDimSparseCSR); + ASSERT_EQ(t1->sparsity->dim_metadata[1].dense_size, 0); + TfLiteIntArray* array_segments = TfLiteIntArrayCreate(3); + array_segments->data[0] = 0; + array_segments->data[1] = 2; + array_segments->data[2] = 3; + ASSERT_TRUE(TfLiteIntArrayEqual(t1->sparsity->dim_metadata[1].array_segments, + array_segments)); + TfLiteIntArrayFree(array_segments); + + TfLiteIntArray* array_indices = TfLiteIntArrayCreate(3); + array_indices->data[0] = 0; + array_indices->data[1] = 1; + array_indices->data[2] = 1; + ASSERT_TRUE(TfLiteIntArrayEqual(t1->sparsity->dim_metadata[1].array_indices, + array_indices)); + TfLiteIntArrayFree(array_indices); + + ASSERT_EQ(t1->sparsity->dim_metadata[2].format, kTfLiteDimDense); + ASSERT_EQ(t1->sparsity->dim_metadata[2].dense_size, 2); + ASSERT_EQ(t1->sparsity->dim_metadata[2].array_segments, nullptr); + ASSERT_EQ(t1->sparsity->dim_metadata[2].array_indices, nullptr); + + ASSERT_EQ(t1->sparsity->dim_metadata[3].format, kTfLiteDimDense); + ASSERT_EQ(t1->sparsity->dim_metadata[3].dense_size, 2); + ASSERT_EQ(t1->sparsity->dim_metadata[3].array_segments, nullptr); + ASSERT_EQ(t1->sparsity->dim_metadata[3].array_indices, nullptr); +} + // TODO(aselle): Add tests for serialization of builtin op data types. // These tests will occur with the evaluation tests of individual operators, // not here. diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs index f1fbfc655d6..63fd3bbc4d6 100644 --- a/tensorflow/lite/schema/schema.fbs +++ b/tensorflow/lite/schema/schema.fbs @@ -80,6 +80,82 @@ table QuantizationParameters { quantized_dimension:int; } +// Sparse tensors. +// To encode a conceptual n-dimensional dense tensor with dims (d0, ..., dn-1), +// potentially with a k-dimensional block (0 <= k <= n) with dims +// (dn, ..., dn+k-1), the format needs to specify: +// 1. In what order to traverse these dimensions. For example, to store a 2-D +// matrix in row major order, the traversal order would be (d0, d1), +// whereas to store it in column major order, the traversal order would be +// (d1, d0). If the 2-D matrix has a 2-D inner block, the traversal order +// could be (d0, d1, d2, d3). +// 2. 
In the order of (d0, ..., dn-1, dn, ..., dn+k-1), whether each dimension +// is DENSE or SPARSE. +// 3. How each block dimension in (dn, ..., dn+k-1) maps to the original +// tensor dimension in (d0, ..., dn-1). +// 4. Index metadata for each dimension. For a dense dimension, this is just +// the size of that dimension. For a sparse dimension, it's the same as +// the compressed index defined in the Compressed Sparse Row (CSR) format. +// (http://scipy-lectures.org/advanced/scipy_sparse/csr_matrix.html) + +// The storage type for a dimension. Currently we support: +// 1. DENSE: each coordinate in this dimension is stored implicitly. +// 2. SPARSE_CSR: only the coordinates with non-zero elements are stored. The +// compression technique is the same what CSR uses. +// More types like a sparse dimension with a different compression technique +// could be added to the list in the future. +enum DimensionType : byte { + DENSE = 0, + SPARSE_CSR = 1, +} + +table DimensionMetadata { + // Whether each dimension is dense or sparse. + format:DimensionType; + // Index metadata used for each dimension. + // - If format is DimensionType.DENSE then we use the dense_size field to + // store the size of that dimension. Each index in that dimension is + // stored implicitly. + // - If format is DimensionType.SPARSE_CSR then we use array_segments and + // array_indices to encode that dimension. array_segments represents how + // to segment the indices array, each segment corresponds to one element + // in the previous dimension. array_indices represents the index of the + // non-zero elements within this dimension (as those in the CSR matrix + // format, where the first array is row pointers and the second array is + // column indices). + dense_size:int; + array_segments:[int]; + array_indices:[int]; +} + +// Parameters to encode a sparse TfLite tensor. +table SparsityParameters { + // The traversal order of the dimensions defined in the `shape` field of the + // conceptual dense tensor. For a n-dimensional tensors with dims (d0, d1, + // ..., dn-1), + // - if not block sparse, the traversal_order is just a permutation of (d0, + // ..., dn-1). For example, a 2-D matrix stored in row-major order would + // have traversal_order = (d0, d1). + // - if block sparse with a k-dimensional block (0 <= k <= n), the + // traversal_order has n + k elements. The first n elements are still a + // permutation of (d0, ..., dn-1). The lask k elements are a permutation + // of (dn, ..., dn+k-1), defining how to traverse a block internally. For + // example, a 2-D matrix with 2-D blocks, both stored in row-major order + // would have traversal_order = (d0, d1, d2, d3). + traversal_order:[int]; + // For an n-dimensional tensor with a k-dimensional block (0 <= k <= n), + // stores how a block dimension in (dn, ..., dn+k-1) maps to the original + // tensor dimension in (d0, ..., dn). + // It's stored in the order of (dn, ..., dn+k-1). + // If not block-sparse, this field is NULL. + block_map:[int]; + // In the order of (d0, ..., dn-1, dn, ..., dn+k-1), the metadata needed for + // each dimension to locate the non-zero values in the original dense tensor. + // The size of the dim_metadata array = the size of the traversal_order array + // = n + k. + dim_metadata:[DimensionMetadata]; +} + table Tensor { // The tensor shape. The meaning of each entry is operator-specific but // builtin ops use: [batch size, height, width, number of channels] (That's @@ -99,6 +175,10 @@ table Tensor { quantization:QuantizationParameters; // Optional. 
is_variable:bool = false; + + // Parameters to encode a sparse tensor. See the example in + // tensorflow/lite/testdata/sparse_tensor.json. + sparsity:SparsityParameters; // Optional. } // A list of builtin operators. Builtin operators are slightly faster than custom diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h index ea2f1cc0b8b..ae523cc7d5a 100755 --- a/tensorflow/lite/schema/schema_generated.h +++ b/tensorflow/lite/schema/schema_generated.h @@ -28,6 +28,12 @@ struct CustomQuantizationT; struct QuantizationParameters; struct QuantizationParametersT; +struct DimensionMetadata; +struct DimensionMetadataT; + +struct SparsityParameters; +struct SparsityParametersT; + struct Tensor; struct TensorT; @@ -477,6 +483,36 @@ struct QuantizationDetailsUnion { bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const void *obj, QuantizationDetails type); bool VerifyQuantizationDetailsVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); +enum DimensionType { + DimensionType_DENSE = 0, + DimensionType_SPARSE_CSR = 1, + DimensionType_MIN = DimensionType_DENSE, + DimensionType_MAX = DimensionType_SPARSE_CSR +}; + +inline const DimensionType (&EnumValuesDimensionType())[2] { + static const DimensionType values[] = { + DimensionType_DENSE, + DimensionType_SPARSE_CSR + }; + return values; +} + +inline const char * const *EnumNamesDimensionType() { + static const char * const names[] = { + "DENSE", + "SPARSE_CSR", + nullptr + }; + return names; +} + +inline const char *EnumNameDimensionType(DimensionType e) { + if (e < DimensionType_DENSE || e > DimensionType_SPARSE_CSR) return ""; + const size_t index = static_cast(e); + return EnumNamesDimensionType()[index]; +} + enum BuiltinOperator { BuiltinOperator_ADD = 0, BuiltinOperator_AVERAGE_POOL_2D = 1, @@ -2867,6 +2903,206 @@ inline flatbuffers::Offset CreateQuantizationParametersD flatbuffers::Offset CreateQuantizationParameters(flatbuffers::FlatBufferBuilder &_fbb, const QuantizationParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +struct DimensionMetadataT : public flatbuffers::NativeTable { + typedef DimensionMetadata TableType; + DimensionType format; + int32_t dense_size; + std::vector array_segments; + std::vector array_indices; + DimensionMetadataT() + : format(DimensionType_DENSE), + dense_size(0) { + } +}; + +struct DimensionMetadata FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef DimensionMetadataT NativeTableType; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_FORMAT = 4, + VT_DENSE_SIZE = 6, + VT_ARRAY_SEGMENTS = 8, + VT_ARRAY_INDICES = 10 + }; + DimensionType format() const { + return static_cast(GetField(VT_FORMAT, 0)); + } + int32_t dense_size() const { + return GetField(VT_DENSE_SIZE, 0); + } + const flatbuffers::Vector *array_segments() const { + return GetPointer *>(VT_ARRAY_SEGMENTS); + } + const flatbuffers::Vector *array_indices() const { + return GetPointer *>(VT_ARRAY_INDICES); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyField(verifier, VT_FORMAT) && + VerifyField(verifier, VT_DENSE_SIZE) && + VerifyOffset(verifier, VT_ARRAY_SEGMENTS) && + verifier.VerifyVector(array_segments()) && + VerifyOffset(verifier, VT_ARRAY_INDICES) && + verifier.VerifyVector(array_indices()) && + verifier.EndTable(); + } + DimensionMetadataT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) 
const; + void UnPackTo(DimensionMetadataT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct DimensionMetadataBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_format(DimensionType format) { + fbb_.AddElement(DimensionMetadata::VT_FORMAT, static_cast(format), 0); + } + void add_dense_size(int32_t dense_size) { + fbb_.AddElement(DimensionMetadata::VT_DENSE_SIZE, dense_size, 0); + } + void add_array_segments(flatbuffers::Offset> array_segments) { + fbb_.AddOffset(DimensionMetadata::VT_ARRAY_SEGMENTS, array_segments); + } + void add_array_indices(flatbuffers::Offset> array_indices) { + fbb_.AddOffset(DimensionMetadata::VT_ARRAY_INDICES, array_indices); + } + explicit DimensionMetadataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + DimensionMetadataBuilder &operator=(const DimensionMetadataBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateDimensionMetadata( + flatbuffers::FlatBufferBuilder &_fbb, + DimensionType format = DimensionType_DENSE, + int32_t dense_size = 0, + flatbuffers::Offset> array_segments = 0, + flatbuffers::Offset> array_indices = 0) { + DimensionMetadataBuilder builder_(_fbb); + builder_.add_array_indices(array_indices); + builder_.add_array_segments(array_segments); + builder_.add_dense_size(dense_size); + builder_.add_format(format); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateDimensionMetadataDirect( + flatbuffers::FlatBufferBuilder &_fbb, + DimensionType format = DimensionType_DENSE, + int32_t dense_size = 0, + const std::vector *array_segments = nullptr, + const std::vector *array_indices = nullptr) { + auto array_segments__ = array_segments ? _fbb.CreateVector(*array_segments) : 0; + auto array_indices__ = array_indices ? 
_fbb.CreateVector(*array_indices) : 0; + return tflite::CreateDimensionMetadata( + _fbb, + format, + dense_size, + array_segments__, + array_indices__); +} + +flatbuffers::Offset CreateDimensionMetadata(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + +struct SparsityParametersT : public flatbuffers::NativeTable { + typedef SparsityParameters TableType; + std::vector traversal_order; + std::vector block_map; + std::vector> dim_metadata; + SparsityParametersT() { + } +}; + +struct SparsityParameters FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { + typedef SparsityParametersT NativeTableType; + enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { + VT_TRAVERSAL_ORDER = 4, + VT_BLOCK_MAP = 6, + VT_DIM_METADATA = 8 + }; + const flatbuffers::Vector *traversal_order() const { + return GetPointer *>(VT_TRAVERSAL_ORDER); + } + const flatbuffers::Vector *block_map() const { + return GetPointer *>(VT_BLOCK_MAP); + } + const flatbuffers::Vector> *dim_metadata() const { + return GetPointer> *>(VT_DIM_METADATA); + } + bool Verify(flatbuffers::Verifier &verifier) const { + return VerifyTableStart(verifier) && + VerifyOffset(verifier, VT_TRAVERSAL_ORDER) && + verifier.VerifyVector(traversal_order()) && + VerifyOffset(verifier, VT_BLOCK_MAP) && + verifier.VerifyVector(block_map()) && + VerifyOffset(verifier, VT_DIM_METADATA) && + verifier.VerifyVector(dim_metadata()) && + verifier.VerifyVectorOfTables(dim_metadata()) && + verifier.EndTable(); + } + SparsityParametersT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; + void UnPackTo(SparsityParametersT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const; + static flatbuffers::Offset Pack(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); +}; + +struct SparsityParametersBuilder { + flatbuffers::FlatBufferBuilder &fbb_; + flatbuffers::uoffset_t start_; + void add_traversal_order(flatbuffers::Offset> traversal_order) { + fbb_.AddOffset(SparsityParameters::VT_TRAVERSAL_ORDER, traversal_order); + } + void add_block_map(flatbuffers::Offset> block_map) { + fbb_.AddOffset(SparsityParameters::VT_BLOCK_MAP, block_map); + } + void add_dim_metadata(flatbuffers::Offset>> dim_metadata) { + fbb_.AddOffset(SparsityParameters::VT_DIM_METADATA, dim_metadata); + } + explicit SparsityParametersBuilder(flatbuffers::FlatBufferBuilder &_fbb) + : fbb_(_fbb) { + start_ = fbb_.StartTable(); + } + SparsityParametersBuilder &operator=(const SparsityParametersBuilder &); + flatbuffers::Offset Finish() { + const auto end = fbb_.EndTable(start_); + auto o = flatbuffers::Offset(end); + return o; + } +}; + +inline flatbuffers::Offset CreateSparsityParameters( + flatbuffers::FlatBufferBuilder &_fbb, + flatbuffers::Offset> traversal_order = 0, + flatbuffers::Offset> block_map = 0, + flatbuffers::Offset>> dim_metadata = 0) { + SparsityParametersBuilder builder_(_fbb); + builder_.add_dim_metadata(dim_metadata); + builder_.add_block_map(block_map); + builder_.add_traversal_order(traversal_order); + return builder_.Finish(); +} + +inline flatbuffers::Offset CreateSparsityParametersDirect( + flatbuffers::FlatBufferBuilder &_fbb, + const std::vector *traversal_order = nullptr, + const std::vector *block_map = nullptr, + const std::vector> *dim_metadata = nullptr) { + auto traversal_order__ = traversal_order ? 
_fbb.CreateVector(*traversal_order) : 0; + auto block_map__ = block_map ? _fbb.CreateVector(*block_map) : 0; + auto dim_metadata__ = dim_metadata ? _fbb.CreateVector>(*dim_metadata) : 0; + return tflite::CreateSparsityParameters( + _fbb, + traversal_order__, + block_map__, + dim_metadata__); +} + +flatbuffers::Offset CreateSparsityParameters(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); + struct TensorT : public flatbuffers::NativeTable { typedef Tensor TableType; std::vector shape; @@ -2875,6 +3111,7 @@ struct TensorT : public flatbuffers::NativeTable { std::string name; std::unique_ptr quantization; bool is_variable; + std::unique_ptr sparsity; TensorT() : type(TensorType_FLOAT32), buffer(0), @@ -2890,7 +3127,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_BUFFER = 8, VT_NAME = 10, VT_QUANTIZATION = 12, - VT_IS_VARIABLE = 14 + VT_IS_VARIABLE = 14, + VT_SPARSITY = 16 }; const flatbuffers::Vector *shape() const { return GetPointer *>(VT_SHAPE); @@ -2910,6 +3148,9 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { bool is_variable() const { return GetField(VT_IS_VARIABLE, 0) != 0; } + const SparsityParameters *sparsity() const { + return GetPointer(VT_SPARSITY); + } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_SHAPE) && @@ -2921,6 +3162,8 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_QUANTIZATION) && verifier.VerifyTable(quantization()) && VerifyField(verifier, VT_IS_VARIABLE) && + VerifyOffset(verifier, VT_SPARSITY) && + verifier.VerifyTable(sparsity()) && verifier.EndTable(); } TensorT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const; @@ -2949,6 +3192,9 @@ struct TensorBuilder { void add_is_variable(bool is_variable) { fbb_.AddElement(Tensor::VT_IS_VARIABLE, static_cast(is_variable), 0); } + void add_sparsity(flatbuffers::Offset sparsity) { + fbb_.AddOffset(Tensor::VT_SPARSITY, sparsity); + } explicit TensorBuilder(flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); @@ -2968,8 +3214,10 @@ inline flatbuffers::Offset CreateTensor( uint32_t buffer = 0, flatbuffers::Offset name = 0, flatbuffers::Offset quantization = 0, - bool is_variable = false) { + bool is_variable = false, + flatbuffers::Offset sparsity = 0) { TensorBuilder builder_(_fbb); + builder_.add_sparsity(sparsity); builder_.add_quantization(quantization); builder_.add_name(name); builder_.add_buffer(buffer); @@ -2986,7 +3234,8 @@ inline flatbuffers::Offset CreateTensorDirect( uint32_t buffer = 0, const char *name = nullptr, flatbuffers::Offset quantization = 0, - bool is_variable = false) { + bool is_variable = false, + flatbuffers::Offset sparsity = 0) { auto shape__ = shape ? _fbb.CreateVector(*shape) : 0; auto name__ = name ? 
_fbb.CreateString(name) : 0; return tflite::CreateTensor( @@ -2996,7 +3245,8 @@ inline flatbuffers::Offset CreateTensorDirect( buffer, name__, quantization, - is_variable); + is_variable, + sparsity); } flatbuffers::Offset CreateTensor(flatbuffers::FlatBufferBuilder &_fbb, const TensorT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr); @@ -9726,6 +9976,73 @@ inline flatbuffers::Offset CreateQuantizationParameters( _quantized_dimension); } +inline DimensionMetadataT *DimensionMetadata::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new DimensionMetadataT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void DimensionMetadata::UnPackTo(DimensionMetadataT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = format(); _o->format = _e; }; + { auto _e = dense_size(); _o->dense_size = _e; }; + { auto _e = array_segments(); if (_e) { _o->array_segments.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->array_segments[_i] = _e->Get(_i); } } }; + { auto _e = array_indices(); if (_e) { _o->array_indices.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->array_indices[_i] = _e->Get(_i); } } }; +} + +inline flatbuffers::Offset DimensionMetadata::Pack(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateDimensionMetadata(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateDimensionMetadata(flatbuffers::FlatBufferBuilder &_fbb, const DimensionMetadataT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const DimensionMetadataT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _format = _o->format; + auto _dense_size = _o->dense_size; + auto _array_segments = _o->array_segments.size() ? _fbb.CreateVector(_o->array_segments) : 0; + auto _array_indices = _o->array_indices.size() ? 
_fbb.CreateVector(_o->array_indices) : 0; + return tflite::CreateDimensionMetadata( + _fbb, + _format, + _dense_size, + _array_segments, + _array_indices); +} + +inline SparsityParametersT *SparsityParameters::UnPack(const flatbuffers::resolver_function_t *_resolver) const { + auto _o = new SparsityParametersT(); + UnPackTo(_o, _resolver); + return _o; +} + +inline void SparsityParameters::UnPackTo(SparsityParametersT *_o, const flatbuffers::resolver_function_t *_resolver) const { + (void)_o; + (void)_resolver; + { auto _e = traversal_order(); if (_e) { _o->traversal_order.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->traversal_order[_i] = _e->Get(_i); } } }; + { auto _e = block_map(); if (_e) { _o->block_map.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->block_map[_i] = _e->Get(_i); } } }; + { auto _e = dim_metadata(); if (_e) { _o->dim_metadata.resize(_e->size()); for (flatbuffers::uoffset_t _i = 0; _i < _e->size(); _i++) { _o->dim_metadata[_i] = std::unique_ptr(_e->Get(_i)->UnPack(_resolver)); } } }; +} + +inline flatbuffers::Offset SparsityParameters::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT* _o, const flatbuffers::rehasher_function_t *_rehasher) { + return CreateSparsityParameters(_fbb, _o, _rehasher); +} + +inline flatbuffers::Offset CreateSparsityParameters(flatbuffers::FlatBufferBuilder &_fbb, const SparsityParametersT *_o, const flatbuffers::rehasher_function_t *_rehasher) { + (void)_rehasher; + (void)_o; + struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SparsityParametersT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va; + auto _traversal_order = _o->traversal_order.size() ? _fbb.CreateVector(_o->traversal_order) : 0; + auto _block_map = _o->block_map.size() ? _fbb.CreateVector(_o->block_map) : 0; + auto _dim_metadata = _o->dim_metadata.size() ? _fbb.CreateVector> (_o->dim_metadata.size(), [](size_t i, _VectorArgs *__va) { return CreateDimensionMetadata(*__va->__fbb, __va->__o->dim_metadata[i].get(), __va->__rehasher); }, &_va ) : 0; + return tflite::CreateSparsityParameters( + _fbb, + _traversal_order, + _block_map, + _dim_metadata); +} + inline TensorT *Tensor::UnPack(const flatbuffers::resolver_function_t *_resolver) const { auto _o = new TensorT(); UnPackTo(_o, _resolver); @@ -9741,6 +10058,7 @@ inline void Tensor::UnPackTo(TensorT *_o, const flatbuffers::resolver_function_t { auto _e = name(); if (_e) _o->name = _e->str(); }; { auto _e = quantization(); if (_e) _o->quantization = std::unique_ptr(_e->UnPack(_resolver)); }; { auto _e = is_variable(); _o->is_variable = _e; }; + { auto _e = sparsity(); if (_e) _o->sparsity = std::unique_ptr(_e->UnPack(_resolver)); }; } inline flatbuffers::Offset Tensor::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TensorT* _o, const flatbuffers::rehasher_function_t *_rehasher) { @@ -9757,6 +10075,7 @@ inline flatbuffers::Offset CreateTensor(flatbuffers::FlatBufferBuilder & auto _name = _o->name.empty() ? 0 : _fbb.CreateString(_o->name); auto _quantization = _o->quantization ? CreateQuantizationParameters(_fbb, _o->quantization.get(), _rehasher) : 0; auto _is_variable = _o->is_variable; + auto _sparsity = _o->sparsity ? 
CreateSparsityParameters(_fbb, _o->sparsity.get(), _rehasher) : 0; return tflite::CreateTensor( _fbb, _shape, @@ -9764,7 +10083,8 @@ inline flatbuffers::Offset CreateTensor(flatbuffers::FlatBufferBuilder & _buffer, _name, _quantization, - _is_variable); + _is_variable, + _sparsity); } inline Conv2DOptionsT *Conv2DOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const { @@ -12544,7 +12864,7 @@ inline bool VerifyQuantizationDetails(flatbuffers::Verifier &verifier, const voi auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } - default: return false; + default: return true; } } @@ -12997,7 +13317,7 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } - default: return false; + default: return true; } } diff --git a/tensorflow/lite/testdata/sparse_tensor.bin b/tensorflow/lite/testdata/sparse_tensor.bin new file mode 100644 index 0000000000000000000000000000000000000000..d1445ac648065da9918a1ba72ab8b53374273b5e GIT binary patch literal 412 zcmX|-O%4G;6ohMNKM_P^79tK{Z{q}3mi8iHi_BO!f&*DtSU89SIE7Z<{CJbk-x$r^34h&r zRMgI_rG;MB`uGoOCIuU7THMM+bV)V#`Zn;qjE6x+>Ul3`@0s0 Date: Thu, 5 Dec 2019 09:59:52 -0800 Subject: [PATCH 166/383] Add a flag to dump the current stack trace when emitting a diagnostic. It is often desirable to know where within the program that a diagnostic was emitted, without reverting to assert/unreachable which crash the program. This change adds a flag `mlir-print-stacktrace-on-diagnostic` that attaches the current stack trace as a note to every diagnostic that gets emitted. PiperOrigin-RevId: 283996373 Change-Id: I51392e0a93e8a687d9069afa64d048cb7110242a --- third_party/mlir/lib/IR/Diagnostics.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/third_party/mlir/lib/IR/Diagnostics.cpp b/third_party/mlir/lib/IR/Diagnostics.cpp index b89b44dd3e0..f2f2f83b3a8 100644 --- a/third_party/mlir/lib/IR/Diagnostics.cpp +++ b/third_party/mlir/lib/IR/Diagnostics.cpp @@ -25,15 +25,22 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/Regex.h" +#include "llvm/Support/Signals.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" using namespace mlir; using namespace mlir::detail; +static llvm::cl::opt printStackTraceOnDiagnostic( + "mlir-print-stacktrace-on-diagnostic", + llvm::cl::desc("When a diagnostic is emitted, also print the stack trace " + "as an attached note")); + //===----------------------------------------------------------------------===// // DiagnosticArgument //===----------------------------------------------------------------------===// @@ -285,6 +292,18 @@ static InFlightDiagnostic emitDiag(Location location, auto diag = diagEngine.emit(location, severity); if (!message.isTriviallyEmpty()) diag << message; + + // Add the stack trace as a note if necessary. + if (printStackTraceOnDiagnostic) { + std::string bt; + { + llvm::raw_string_ostream stream(bt); + llvm::sys::PrintStackTrace(stream); + } + if (!bt.empty()) + diag.attachNote() << "diagnostic emitted with trace:\n" << bt; + } + return diag; } From 8874d95aaaa5a1643bc66cfcb23e5650e75329d4 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Thu, 5 Dec 2019 10:04:38 -0800 Subject: [PATCH 167/383] Update documentation for tf.range. 
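A quick illustration of the behaviour this docstring update documents — a minimal sketch, not part of the patch itself; the printed values are taken from the updated examples in the hunk below:

```python
import tensorflow as tf

# Matches the doctest examples added in the hunk below.
print(tf.range(3, 18, 3))    # [3, 6, 9, 12, 15], dtype=int32
print(tf.range(3, 1, -0.5))  # [3.0, 2.5, 2.0, 1.5], dtype=float32
print(tf.range(5))           # [0, 1, 2, 3, 4], dtype=int32
```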
PiperOrigin-RevId: 283997583 Change-Id: I4d6049b45bfd9de9b98f417f185404d8285db062 --- tensorflow/python/ops/math_ops.py | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index d890d4266ba..340cbf0606b 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1418,18 +1418,25 @@ def range(start, limit=None, delta=1, dtype=None, name="range"): # pylint: disa For example: ```python - start = 3 - limit = 18 - delta = 3 - tf.range(start, limit, delta) # [3, 6, 9, 12, 15] + >>> start = 3 + >>> limit = 18 + >>> delta = 3 + >>> tf.range(start, limit, delta) + - start = 3 - limit = 1 - delta = -0.5 - tf.range(start, limit, delta) # [3, 2.5, 2, 1.5] + >>> start = 3 + >>> limit = 1 + >>> delta = -0.5 + >>> tf.range(start, limit, delta) + + + >>> limit = 5 + >>> tf.range(limit) + - limit = 5 - tf.range(limit) # [0, 1, 2, 3, 4] ``` Args: From da7a278b676ed50f73a48f3f4377098819154188 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 5 Dec 2019 10:05:54 -0800 Subject: [PATCH 168/383] Add spv.AtomicCompareExchangeWeak PiperOrigin-RevId: 283997917 Change-Id: I69d5502ef1256919e6a776b6c897c7f393deaa09 --- third_party/mlir/BUILD | 1 + .../mlir/Dialect/SPIRV/SPIRVAtomicOps.td | 74 +++++ .../include/mlir/Dialect/SPIRV/SPIRVBase.td | 254 +++++++++--------- .../include/mlir/Dialect/SPIRV/SPIRVOps.td | 1 + .../mlir/lib/Dialect/SPIRV/SPIRVOps.cpp | 77 ++++++ 5 files changed, 281 insertions(+), 126 deletions(-) create mode 100644 third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVAtomicOps.td diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index b298b0d897f..cbcc7eb18fd 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -950,6 +950,7 @@ filegroup( srcs = [ "include/mlir/Analysis/CallInterfaces.td", "include/mlir/Dialect/SPIRV/SPIRVArithmeticOps.td", + "include/mlir/Dialect/SPIRV/SPIRVAtomicOps.td", "include/mlir/Dialect/SPIRV/SPIRVBase.td", "include/mlir/Dialect/SPIRV/SPIRVBitOps.td", "include/mlir/Dialect/SPIRV/SPIRVCastOps.td", diff --git a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVAtomicOps.td b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVAtomicOps.td new file mode 100644 index 00000000000..7042bf2cd3e --- /dev/null +++ b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVAtomicOps.td @@ -0,0 +1,74 @@ +//===-- SPIRVAtomicOps.td - MLIR SPIR-V Atomic Ops ---------*- tablegen -*-===// +// +// Copyright 2019 The MLIR Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= +// +// This file contains atomic ops for the SPIR-V dialect. It corresponds to +// "3.32.18. Atomic Instructions" of the SPIR-V specification. 
+// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_ATOMIC_OPS +#define SPIRV_ATOMIC_OPS + +// ----- + +def SPV_AtomicCompareExchangeWeakOp : SPV_Op<"AtomicCompareExchangeWeak", []> { + let summary = "Deprecated (use OpAtomicCompareExchange)."; + + let description = [{ + Has the same semantics as OpAtomicCompareExchange. + + Memory must be a valid memory Scope. + + ### Custom assembly form + + ``` {.ebnf} + scope ::= `"CrossDevice"` | `"Device"` | `"Workgroup"` | ... + + memory-semantics ::= `"None"` | `"Acquire"` | "Release"` | ... + + atomic-compare-exchange-weak-op ::= + `spv.AtomicCompareExchangeWeak` scope memory-semantics memory-semantics + ssa-use `,` ssa-use `,` ssa-use + `:` spv-pointer-type + ``` + + For example: + + ``` + %0 = spv.AtomicCompareExchangeWeak "Workgroup" "Acquire" "None" + %pointer, %value, %comparator + : !spv.ptr + ``` + }]; + + let arguments = (ins + SPV_AnyPtr:$pointer, + SPV_ScopeAttr:$memory_scope, + SPV_MemorySemanticsAttr:$equal_semantics, + SPV_MemorySemanticsAttr:$unequal_semantics, + SPV_Integer:$value, + SPV_Integer:$comparator + ); + + let results = (outs + SPV_Integer:$result + ); +} + +// ----- + +#endif // SPIRV_ATOMIC_OPS diff --git a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td index dd15895fbb6..c7acc3720e9 100644 --- a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td +++ b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td @@ -1031,128 +1031,129 @@ class SPV_OpCode { // Begin opcode section. Generated from SPIR-V spec; DO NOT MODIFY! -def SPV_OC_OpNop : I32EnumAttrCase<"OpNop", 0>; -def SPV_OC_OpUndef : I32EnumAttrCase<"OpUndef", 1>; -def SPV_OC_OpSourceContinued : I32EnumAttrCase<"OpSourceContinued", 2>; -def SPV_OC_OpSource : I32EnumAttrCase<"OpSource", 3>; -def SPV_OC_OpSourceExtension : I32EnumAttrCase<"OpSourceExtension", 4>; -def SPV_OC_OpName : I32EnumAttrCase<"OpName", 5>; -def SPV_OC_OpMemberName : I32EnumAttrCase<"OpMemberName", 6>; -def SPV_OC_OpString : I32EnumAttrCase<"OpString", 7>; -def SPV_OC_OpExtension : I32EnumAttrCase<"OpExtension", 10>; -def SPV_OC_OpExtInstImport : I32EnumAttrCase<"OpExtInstImport", 11>; -def SPV_OC_OpExtInst : I32EnumAttrCase<"OpExtInst", 12>; -def SPV_OC_OpMemoryModel : I32EnumAttrCase<"OpMemoryModel", 14>; -def SPV_OC_OpEntryPoint : I32EnumAttrCase<"OpEntryPoint", 15>; -def SPV_OC_OpExecutionMode : I32EnumAttrCase<"OpExecutionMode", 16>; -def SPV_OC_OpCapability : I32EnumAttrCase<"OpCapability", 17>; -def SPV_OC_OpTypeVoid : I32EnumAttrCase<"OpTypeVoid", 19>; -def SPV_OC_OpTypeBool : I32EnumAttrCase<"OpTypeBool", 20>; -def SPV_OC_OpTypeInt : I32EnumAttrCase<"OpTypeInt", 21>; -def SPV_OC_OpTypeFloat : I32EnumAttrCase<"OpTypeFloat", 22>; -def SPV_OC_OpTypeVector : I32EnumAttrCase<"OpTypeVector", 23>; -def SPV_OC_OpTypeArray : I32EnumAttrCase<"OpTypeArray", 28>; -def SPV_OC_OpTypeRuntimeArray : I32EnumAttrCase<"OpTypeRuntimeArray", 29>; -def SPV_OC_OpTypeStruct : I32EnumAttrCase<"OpTypeStruct", 30>; -def SPV_OC_OpTypePointer : I32EnumAttrCase<"OpTypePointer", 32>; -def SPV_OC_OpTypeFunction : I32EnumAttrCase<"OpTypeFunction", 33>; -def SPV_OC_OpConstantTrue : I32EnumAttrCase<"OpConstantTrue", 41>; -def SPV_OC_OpConstantFalse : I32EnumAttrCase<"OpConstantFalse", 42>; -def SPV_OC_OpConstant : I32EnumAttrCase<"OpConstant", 43>; -def SPV_OC_OpConstantComposite : I32EnumAttrCase<"OpConstantComposite", 44>; -def SPV_OC_OpConstantNull : I32EnumAttrCase<"OpConstantNull", 46>; -def 
SPV_OC_OpSpecConstantTrue : I32EnumAttrCase<"OpSpecConstantTrue", 48>; -def SPV_OC_OpSpecConstantFalse : I32EnumAttrCase<"OpSpecConstantFalse", 49>; -def SPV_OC_OpSpecConstant : I32EnumAttrCase<"OpSpecConstant", 50>; -def SPV_OC_OpSpecConstantComposite : I32EnumAttrCase<"OpSpecConstantComposite", 51>; -def SPV_OC_OpFunction : I32EnumAttrCase<"OpFunction", 54>; -def SPV_OC_OpFunctionParameter : I32EnumAttrCase<"OpFunctionParameter", 55>; -def SPV_OC_OpFunctionEnd : I32EnumAttrCase<"OpFunctionEnd", 56>; -def SPV_OC_OpFunctionCall : I32EnumAttrCase<"OpFunctionCall", 57>; -def SPV_OC_OpVariable : I32EnumAttrCase<"OpVariable", 59>; -def SPV_OC_OpLoad : I32EnumAttrCase<"OpLoad", 61>; -def SPV_OC_OpStore : I32EnumAttrCase<"OpStore", 62>; -def SPV_OC_OpAccessChain : I32EnumAttrCase<"OpAccessChain", 65>; -def SPV_OC_OpDecorate : I32EnumAttrCase<"OpDecorate", 71>; -def SPV_OC_OpMemberDecorate : I32EnumAttrCase<"OpMemberDecorate", 72>; -def SPV_OC_OpCompositeExtract : I32EnumAttrCase<"OpCompositeExtract", 81>; -def SPV_OC_OpConvertFToU : I32EnumAttrCase<"OpConvertFToU", 109>; -def SPV_OC_OpConvertFToS : I32EnumAttrCase<"OpConvertFToS", 110>; -def SPV_OC_OpConvertSToF : I32EnumAttrCase<"OpConvertSToF", 111>; -def SPV_OC_OpConvertUToF : I32EnumAttrCase<"OpConvertUToF", 112>; -def SPV_OC_OpUConvert : I32EnumAttrCase<"OpUConvert", 113>; -def SPV_OC_OpSConvert : I32EnumAttrCase<"OpSConvert", 114>; -def SPV_OC_OpFConvert : I32EnumAttrCase<"OpFConvert", 115>; -def SPV_OC_OpBitcast : I32EnumAttrCase<"OpBitcast", 124>; -def SPV_OC_OpFNegate : I32EnumAttrCase<"OpFNegate", 127>; -def SPV_OC_OpIAdd : I32EnumAttrCase<"OpIAdd", 128>; -def SPV_OC_OpFAdd : I32EnumAttrCase<"OpFAdd", 129>; -def SPV_OC_OpISub : I32EnumAttrCase<"OpISub", 130>; -def SPV_OC_OpFSub : I32EnumAttrCase<"OpFSub", 131>; -def SPV_OC_OpIMul : I32EnumAttrCase<"OpIMul", 132>; -def SPV_OC_OpFMul : I32EnumAttrCase<"OpFMul", 133>; -def SPV_OC_OpUDiv : I32EnumAttrCase<"OpUDiv", 134>; -def SPV_OC_OpSDiv : I32EnumAttrCase<"OpSDiv", 135>; -def SPV_OC_OpFDiv : I32EnumAttrCase<"OpFDiv", 136>; -def SPV_OC_OpUMod : I32EnumAttrCase<"OpUMod", 137>; -def SPV_OC_OpSRem : I32EnumAttrCase<"OpSRem", 138>; -def SPV_OC_OpSMod : I32EnumAttrCase<"OpSMod", 139>; -def SPV_OC_OpFRem : I32EnumAttrCase<"OpFRem", 140>; -def SPV_OC_OpFMod : I32EnumAttrCase<"OpFMod", 141>; -def SPV_OC_OpLogicalEqual : I32EnumAttrCase<"OpLogicalEqual", 164>; -def SPV_OC_OpLogicalNotEqual : I32EnumAttrCase<"OpLogicalNotEqual", 165>; -def SPV_OC_OpLogicalOr : I32EnumAttrCase<"OpLogicalOr", 166>; -def SPV_OC_OpLogicalAnd : I32EnumAttrCase<"OpLogicalAnd", 167>; -def SPV_OC_OpLogicalNot : I32EnumAttrCase<"OpLogicalNot", 168>; -def SPV_OC_OpSelect : I32EnumAttrCase<"OpSelect", 169>; -def SPV_OC_OpIEqual : I32EnumAttrCase<"OpIEqual", 170>; -def SPV_OC_OpINotEqual : I32EnumAttrCase<"OpINotEqual", 171>; -def SPV_OC_OpUGreaterThan : I32EnumAttrCase<"OpUGreaterThan", 172>; -def SPV_OC_OpSGreaterThan : I32EnumAttrCase<"OpSGreaterThan", 173>; -def SPV_OC_OpUGreaterThanEqual : I32EnumAttrCase<"OpUGreaterThanEqual", 174>; -def SPV_OC_OpSGreaterThanEqual : I32EnumAttrCase<"OpSGreaterThanEqual", 175>; -def SPV_OC_OpULessThan : I32EnumAttrCase<"OpULessThan", 176>; -def SPV_OC_OpSLessThan : I32EnumAttrCase<"OpSLessThan", 177>; -def SPV_OC_OpULessThanEqual : I32EnumAttrCase<"OpULessThanEqual", 178>; -def SPV_OC_OpSLessThanEqual : I32EnumAttrCase<"OpSLessThanEqual", 179>; -def SPV_OC_OpFOrdEqual : I32EnumAttrCase<"OpFOrdEqual", 180>; -def SPV_OC_OpFUnordEqual : I32EnumAttrCase<"OpFUnordEqual", 181>; -def 
SPV_OC_OpFOrdNotEqual : I32EnumAttrCase<"OpFOrdNotEqual", 182>; -def SPV_OC_OpFUnordNotEqual : I32EnumAttrCase<"OpFUnordNotEqual", 183>; -def SPV_OC_OpFOrdLessThan : I32EnumAttrCase<"OpFOrdLessThan", 184>; -def SPV_OC_OpFUnordLessThan : I32EnumAttrCase<"OpFUnordLessThan", 185>; -def SPV_OC_OpFOrdGreaterThan : I32EnumAttrCase<"OpFOrdGreaterThan", 186>; -def SPV_OC_OpFUnordGreaterThan : I32EnumAttrCase<"OpFUnordGreaterThan", 187>; -def SPV_OC_OpFOrdLessThanEqual : I32EnumAttrCase<"OpFOrdLessThanEqual", 188>; -def SPV_OC_OpFUnordLessThanEqual : I32EnumAttrCase<"OpFUnordLessThanEqual", 189>; -def SPV_OC_OpFOrdGreaterThanEqual : I32EnumAttrCase<"OpFOrdGreaterThanEqual", 190>; -def SPV_OC_OpFUnordGreaterThanEqual : I32EnumAttrCase<"OpFUnordGreaterThanEqual", 191>; -def SPV_OC_OpShiftRightLogical : I32EnumAttrCase<"OpShiftRightLogical", 194>; -def SPV_OC_OpShiftRightArithmetic : I32EnumAttrCase<"OpShiftRightArithmetic", 195>; -def SPV_OC_OpShiftLeftLogical : I32EnumAttrCase<"OpShiftLeftLogical", 196>; -def SPV_OC_OpBitwiseOr : I32EnumAttrCase<"OpBitwiseOr", 197>; -def SPV_OC_OpBitwiseXor : I32EnumAttrCase<"OpBitwiseXor", 198>; -def SPV_OC_OpBitwiseAnd : I32EnumAttrCase<"OpBitwiseAnd", 199>; -def SPV_OC_OpNot : I32EnumAttrCase<"OpNot", 200>; -def SPV_OC_OpBitFieldInsert : I32EnumAttrCase<"OpBitFieldInsert", 201>; -def SPV_OC_OpBitFieldSExtract : I32EnumAttrCase<"OpBitFieldSExtract", 202>; -def SPV_OC_OpBitFieldUExtract : I32EnumAttrCase<"OpBitFieldUExtract", 203>; -def SPV_OC_OpBitReverse : I32EnumAttrCase<"OpBitReverse", 204>; -def SPV_OC_OpBitCount : I32EnumAttrCase<"OpBitCount", 205>; -def SPV_OC_OpControlBarrier : I32EnumAttrCase<"OpControlBarrier", 224>; -def SPV_OC_OpMemoryBarrier : I32EnumAttrCase<"OpMemoryBarrier", 225>; -def SPV_OC_OpPhi : I32EnumAttrCase<"OpPhi", 245>; -def SPV_OC_OpLoopMerge : I32EnumAttrCase<"OpLoopMerge", 246>; -def SPV_OC_OpSelectionMerge : I32EnumAttrCase<"OpSelectionMerge", 247>; -def SPV_OC_OpLabel : I32EnumAttrCase<"OpLabel", 248>; -def SPV_OC_OpBranch : I32EnumAttrCase<"OpBranch", 249>; -def SPV_OC_OpBranchConditional : I32EnumAttrCase<"OpBranchConditional", 250>; -def SPV_OC_OpReturn : I32EnumAttrCase<"OpReturn", 253>; -def SPV_OC_OpReturnValue : I32EnumAttrCase<"OpReturnValue", 254>; -def SPV_OC_OpUnreachable : I32EnumAttrCase<"OpUnreachable", 255>; -def SPV_OC_OpModuleProcessed : I32EnumAttrCase<"OpModuleProcessed", 330>; -def SPV_OC_OpGroupNonUniformBallot : I32EnumAttrCase<"OpGroupNonUniformBallot", 339>; -def SPV_OC_OpSubgroupBallotKHR : I32EnumAttrCase<"OpSubgroupBallotKHR", 4421>; +def SPV_OC_OpNop : I32EnumAttrCase<"OpNop", 0>; +def SPV_OC_OpUndef : I32EnumAttrCase<"OpUndef", 1>; +def SPV_OC_OpSourceContinued : I32EnumAttrCase<"OpSourceContinued", 2>; +def SPV_OC_OpSource : I32EnumAttrCase<"OpSource", 3>; +def SPV_OC_OpSourceExtension : I32EnumAttrCase<"OpSourceExtension", 4>; +def SPV_OC_OpName : I32EnumAttrCase<"OpName", 5>; +def SPV_OC_OpMemberName : I32EnumAttrCase<"OpMemberName", 6>; +def SPV_OC_OpString : I32EnumAttrCase<"OpString", 7>; +def SPV_OC_OpExtension : I32EnumAttrCase<"OpExtension", 10>; +def SPV_OC_OpExtInstImport : I32EnumAttrCase<"OpExtInstImport", 11>; +def SPV_OC_OpExtInst : I32EnumAttrCase<"OpExtInst", 12>; +def SPV_OC_OpMemoryModel : I32EnumAttrCase<"OpMemoryModel", 14>; +def SPV_OC_OpEntryPoint : I32EnumAttrCase<"OpEntryPoint", 15>; +def SPV_OC_OpExecutionMode : I32EnumAttrCase<"OpExecutionMode", 16>; +def SPV_OC_OpCapability : I32EnumAttrCase<"OpCapability", 17>; +def SPV_OC_OpTypeVoid : I32EnumAttrCase<"OpTypeVoid", 19>; +def 
SPV_OC_OpTypeBool : I32EnumAttrCase<"OpTypeBool", 20>; +def SPV_OC_OpTypeInt : I32EnumAttrCase<"OpTypeInt", 21>; +def SPV_OC_OpTypeFloat : I32EnumAttrCase<"OpTypeFloat", 22>; +def SPV_OC_OpTypeVector : I32EnumAttrCase<"OpTypeVector", 23>; +def SPV_OC_OpTypeArray : I32EnumAttrCase<"OpTypeArray", 28>; +def SPV_OC_OpTypeRuntimeArray : I32EnumAttrCase<"OpTypeRuntimeArray", 29>; +def SPV_OC_OpTypeStruct : I32EnumAttrCase<"OpTypeStruct", 30>; +def SPV_OC_OpTypePointer : I32EnumAttrCase<"OpTypePointer", 32>; +def SPV_OC_OpTypeFunction : I32EnumAttrCase<"OpTypeFunction", 33>; +def SPV_OC_OpConstantTrue : I32EnumAttrCase<"OpConstantTrue", 41>; +def SPV_OC_OpConstantFalse : I32EnumAttrCase<"OpConstantFalse", 42>; +def SPV_OC_OpConstant : I32EnumAttrCase<"OpConstant", 43>; +def SPV_OC_OpConstantComposite : I32EnumAttrCase<"OpConstantComposite", 44>; +def SPV_OC_OpConstantNull : I32EnumAttrCase<"OpConstantNull", 46>; +def SPV_OC_OpSpecConstantTrue : I32EnumAttrCase<"OpSpecConstantTrue", 48>; +def SPV_OC_OpSpecConstantFalse : I32EnumAttrCase<"OpSpecConstantFalse", 49>; +def SPV_OC_OpSpecConstant : I32EnumAttrCase<"OpSpecConstant", 50>; +def SPV_OC_OpSpecConstantComposite : I32EnumAttrCase<"OpSpecConstantComposite", 51>; +def SPV_OC_OpFunction : I32EnumAttrCase<"OpFunction", 54>; +def SPV_OC_OpFunctionParameter : I32EnumAttrCase<"OpFunctionParameter", 55>; +def SPV_OC_OpFunctionEnd : I32EnumAttrCase<"OpFunctionEnd", 56>; +def SPV_OC_OpFunctionCall : I32EnumAttrCase<"OpFunctionCall", 57>; +def SPV_OC_OpVariable : I32EnumAttrCase<"OpVariable", 59>; +def SPV_OC_OpLoad : I32EnumAttrCase<"OpLoad", 61>; +def SPV_OC_OpStore : I32EnumAttrCase<"OpStore", 62>; +def SPV_OC_OpAccessChain : I32EnumAttrCase<"OpAccessChain", 65>; +def SPV_OC_OpDecorate : I32EnumAttrCase<"OpDecorate", 71>; +def SPV_OC_OpMemberDecorate : I32EnumAttrCase<"OpMemberDecorate", 72>; +def SPV_OC_OpCompositeExtract : I32EnumAttrCase<"OpCompositeExtract", 81>; +def SPV_OC_OpConvertFToU : I32EnumAttrCase<"OpConvertFToU", 109>; +def SPV_OC_OpConvertFToS : I32EnumAttrCase<"OpConvertFToS", 110>; +def SPV_OC_OpConvertSToF : I32EnumAttrCase<"OpConvertSToF", 111>; +def SPV_OC_OpConvertUToF : I32EnumAttrCase<"OpConvertUToF", 112>; +def SPV_OC_OpUConvert : I32EnumAttrCase<"OpUConvert", 113>; +def SPV_OC_OpSConvert : I32EnumAttrCase<"OpSConvert", 114>; +def SPV_OC_OpFConvert : I32EnumAttrCase<"OpFConvert", 115>; +def SPV_OC_OpBitcast : I32EnumAttrCase<"OpBitcast", 124>; +def SPV_OC_OpFNegate : I32EnumAttrCase<"OpFNegate", 127>; +def SPV_OC_OpIAdd : I32EnumAttrCase<"OpIAdd", 128>; +def SPV_OC_OpFAdd : I32EnumAttrCase<"OpFAdd", 129>; +def SPV_OC_OpISub : I32EnumAttrCase<"OpISub", 130>; +def SPV_OC_OpFSub : I32EnumAttrCase<"OpFSub", 131>; +def SPV_OC_OpIMul : I32EnumAttrCase<"OpIMul", 132>; +def SPV_OC_OpFMul : I32EnumAttrCase<"OpFMul", 133>; +def SPV_OC_OpUDiv : I32EnumAttrCase<"OpUDiv", 134>; +def SPV_OC_OpSDiv : I32EnumAttrCase<"OpSDiv", 135>; +def SPV_OC_OpFDiv : I32EnumAttrCase<"OpFDiv", 136>; +def SPV_OC_OpUMod : I32EnumAttrCase<"OpUMod", 137>; +def SPV_OC_OpSRem : I32EnumAttrCase<"OpSRem", 138>; +def SPV_OC_OpSMod : I32EnumAttrCase<"OpSMod", 139>; +def SPV_OC_OpFRem : I32EnumAttrCase<"OpFRem", 140>; +def SPV_OC_OpFMod : I32EnumAttrCase<"OpFMod", 141>; +def SPV_OC_OpLogicalEqual : I32EnumAttrCase<"OpLogicalEqual", 164>; +def SPV_OC_OpLogicalNotEqual : I32EnumAttrCase<"OpLogicalNotEqual", 165>; +def SPV_OC_OpLogicalOr : I32EnumAttrCase<"OpLogicalOr", 166>; +def SPV_OC_OpLogicalAnd : I32EnumAttrCase<"OpLogicalAnd", 167>; +def SPV_OC_OpLogicalNot : 
I32EnumAttrCase<"OpLogicalNot", 168>; +def SPV_OC_OpSelect : I32EnumAttrCase<"OpSelect", 169>; +def SPV_OC_OpIEqual : I32EnumAttrCase<"OpIEqual", 170>; +def SPV_OC_OpINotEqual : I32EnumAttrCase<"OpINotEqual", 171>; +def SPV_OC_OpUGreaterThan : I32EnumAttrCase<"OpUGreaterThan", 172>; +def SPV_OC_OpSGreaterThan : I32EnumAttrCase<"OpSGreaterThan", 173>; +def SPV_OC_OpUGreaterThanEqual : I32EnumAttrCase<"OpUGreaterThanEqual", 174>; +def SPV_OC_OpSGreaterThanEqual : I32EnumAttrCase<"OpSGreaterThanEqual", 175>; +def SPV_OC_OpULessThan : I32EnumAttrCase<"OpULessThan", 176>; +def SPV_OC_OpSLessThan : I32EnumAttrCase<"OpSLessThan", 177>; +def SPV_OC_OpULessThanEqual : I32EnumAttrCase<"OpULessThanEqual", 178>; +def SPV_OC_OpSLessThanEqual : I32EnumAttrCase<"OpSLessThanEqual", 179>; +def SPV_OC_OpFOrdEqual : I32EnumAttrCase<"OpFOrdEqual", 180>; +def SPV_OC_OpFUnordEqual : I32EnumAttrCase<"OpFUnordEqual", 181>; +def SPV_OC_OpFOrdNotEqual : I32EnumAttrCase<"OpFOrdNotEqual", 182>; +def SPV_OC_OpFUnordNotEqual : I32EnumAttrCase<"OpFUnordNotEqual", 183>; +def SPV_OC_OpFOrdLessThan : I32EnumAttrCase<"OpFOrdLessThan", 184>; +def SPV_OC_OpFUnordLessThan : I32EnumAttrCase<"OpFUnordLessThan", 185>; +def SPV_OC_OpFOrdGreaterThan : I32EnumAttrCase<"OpFOrdGreaterThan", 186>; +def SPV_OC_OpFUnordGreaterThan : I32EnumAttrCase<"OpFUnordGreaterThan", 187>; +def SPV_OC_OpFOrdLessThanEqual : I32EnumAttrCase<"OpFOrdLessThanEqual", 188>; +def SPV_OC_OpFUnordLessThanEqual : I32EnumAttrCase<"OpFUnordLessThanEqual", 189>; +def SPV_OC_OpFOrdGreaterThanEqual : I32EnumAttrCase<"OpFOrdGreaterThanEqual", 190>; +def SPV_OC_OpFUnordGreaterThanEqual : I32EnumAttrCase<"OpFUnordGreaterThanEqual", 191>; +def SPV_OC_OpShiftRightLogical : I32EnumAttrCase<"OpShiftRightLogical", 194>; +def SPV_OC_OpShiftRightArithmetic : I32EnumAttrCase<"OpShiftRightArithmetic", 195>; +def SPV_OC_OpShiftLeftLogical : I32EnumAttrCase<"OpShiftLeftLogical", 196>; +def SPV_OC_OpBitwiseOr : I32EnumAttrCase<"OpBitwiseOr", 197>; +def SPV_OC_OpBitwiseXor : I32EnumAttrCase<"OpBitwiseXor", 198>; +def SPV_OC_OpBitwiseAnd : I32EnumAttrCase<"OpBitwiseAnd", 199>; +def SPV_OC_OpNot : I32EnumAttrCase<"OpNot", 200>; +def SPV_OC_OpBitFieldInsert : I32EnumAttrCase<"OpBitFieldInsert", 201>; +def SPV_OC_OpBitFieldSExtract : I32EnumAttrCase<"OpBitFieldSExtract", 202>; +def SPV_OC_OpBitFieldUExtract : I32EnumAttrCase<"OpBitFieldUExtract", 203>; +def SPV_OC_OpBitReverse : I32EnumAttrCase<"OpBitReverse", 204>; +def SPV_OC_OpBitCount : I32EnumAttrCase<"OpBitCount", 205>; +def SPV_OC_OpControlBarrier : I32EnumAttrCase<"OpControlBarrier", 224>; +def SPV_OC_OpMemoryBarrier : I32EnumAttrCase<"OpMemoryBarrier", 225>; +def SPV_OC_OpAtomicCompareExchangeWeak : I32EnumAttrCase<"OpAtomicCompareExchangeWeak", 231>; +def SPV_OC_OpPhi : I32EnumAttrCase<"OpPhi", 245>; +def SPV_OC_OpLoopMerge : I32EnumAttrCase<"OpLoopMerge", 246>; +def SPV_OC_OpSelectionMerge : I32EnumAttrCase<"OpSelectionMerge", 247>; +def SPV_OC_OpLabel : I32EnumAttrCase<"OpLabel", 248>; +def SPV_OC_OpBranch : I32EnumAttrCase<"OpBranch", 249>; +def SPV_OC_OpBranchConditional : I32EnumAttrCase<"OpBranchConditional", 250>; +def SPV_OC_OpReturn : I32EnumAttrCase<"OpReturn", 253>; +def SPV_OC_OpReturnValue : I32EnumAttrCase<"OpReturnValue", 254>; +def SPV_OC_OpUnreachable : I32EnumAttrCase<"OpUnreachable", 255>; +def SPV_OC_OpModuleProcessed : I32EnumAttrCase<"OpModuleProcessed", 330>; +def SPV_OC_OpGroupNonUniformBallot : I32EnumAttrCase<"OpGroupNonUniformBallot", 339>; +def SPV_OC_OpSubgroupBallotKHR : 
I32EnumAttrCase<"OpSubgroupBallotKHR", 4421>; def SPV_OpcodeAttr : I32EnumAttr<"Opcode", "valid SPIR-V instructions", [ @@ -1190,10 +1191,11 @@ def SPV_OpcodeAttr : SPV_OC_OpBitwiseAnd, SPV_OC_OpNot, SPV_OC_OpBitFieldInsert, SPV_OC_OpBitFieldSExtract, SPV_OC_OpBitFieldUExtract, SPV_OC_OpBitReverse, SPV_OC_OpBitCount, SPV_OC_OpControlBarrier, SPV_OC_OpMemoryBarrier, - SPV_OC_OpPhi, SPV_OC_OpLoopMerge, SPV_OC_OpSelectionMerge, SPV_OC_OpLabel, - SPV_OC_OpBranch, SPV_OC_OpBranchConditional, SPV_OC_OpReturn, - SPV_OC_OpReturnValue, SPV_OC_OpUnreachable, SPV_OC_OpModuleProcessed, - SPV_OC_OpGroupNonUniformBallot, SPV_OC_OpSubgroupBallotKHR + SPV_OC_OpAtomicCompareExchangeWeak, SPV_OC_OpPhi, SPV_OC_OpLoopMerge, + SPV_OC_OpSelectionMerge, SPV_OC_OpLabel, SPV_OC_OpBranch, + SPV_OC_OpBranchConditional, SPV_OC_OpReturn, SPV_OC_OpReturnValue, + SPV_OC_OpUnreachable, SPV_OC_OpModuleProcessed, SPV_OC_OpGroupNonUniformBallot, + SPV_OC_OpSubgroupBallotKHR ]> { let cppNamespace = "::mlir::spirv"; } diff --git a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td index 149c2359fda..000f1ddaa79 100644 --- a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td +++ b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td @@ -32,6 +32,7 @@ include "mlir/Dialect/SPIRV/SPIRVBase.td" include "mlir/Dialect/SPIRV/SPIRVArithmeticOps.td" +include "mlir/Dialect/SPIRV/SPIRVAtomicOps.td" include "mlir/Dialect/SPIRV/SPIRVBitOps.td" include "mlir/Dialect/SPIRV/SPIRVCastOps.td" include "mlir/Dialect/SPIRV/SPIRVControlFlowOps.td" diff --git a/third_party/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp b/third_party/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp index 89abbe894e6..99705f6dcc2 100644 --- a/third_party/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp +++ b/third_party/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp @@ -42,6 +42,7 @@ static constexpr const char kBranchWeightAttrName[] = "branch_weights"; static constexpr const char kCallee[] = "callee"; static constexpr const char kDefaultValueAttrName[] = "default_value"; static constexpr const char kExecutionScopeAttrName[] = "execution_scope"; +static constexpr const char kEqualSemanticsAttrName[] = "equal_semantics"; static constexpr const char kFnNameAttrName[] = "fn"; static constexpr const char kIndicesAttrName[] = "indices"; static constexpr const char kInitializerAttrName[] = "initializer"; @@ -50,6 +51,7 @@ static constexpr const char kMemoryScopeAttrName[] = "memory_scope"; static constexpr const char kSpecConstAttrName[] = "spec_const"; static constexpr const char kSpecIdAttrName[] = "spec_id"; static constexpr const char kTypeAttrName[] = "type"; +static constexpr const char kUnequalSemanticsAttrName[] = "unequal_semantics"; static constexpr const char kValueAttrName[] = "value"; static constexpr const char kValuesAttrName[] = "values"; static constexpr const char kVariableAttrName[] = "variable"; @@ -750,6 +752,81 @@ static LogicalResult verify(spirv::AddressOfOp addressOfOp) { return success(); } +//===----------------------------------------------------------------------===// +// spv.AtomicCompareExchangeWeak +//===----------------------------------------------------------------------===// + +static ParseResult parseAtomicCompareExchangeWeakOp(OpAsmParser &parser, + OperationState &state) { + spirv::Scope memoryScope; + spirv::MemorySemantics equalSemantics, unequalSemantics; + SmallVector operandInfo; + Type type; + if (parseEnumAttribute(memoryScope, parser, state, kMemoryScopeAttrName) || + parseEnumAttribute(equalSemantics, 
parser, state, + kEqualSemanticsAttrName) || + parseEnumAttribute(unequalSemantics, parser, state, + kUnequalSemanticsAttrName) || + parser.parseOperandList(operandInfo, 3)) + return failure(); + + auto loc = parser.getCurrentLocation(); + if (parser.parseColonType(type)) + return failure(); + + auto ptrType = type.dyn_cast(); + if (!ptrType) + return parser.emitError(loc, "expected pointer type"); + + if (parser.resolveOperands( + operandInfo, + {ptrType, ptrType.getPointeeType(), ptrType.getPointeeType()}, + parser.getNameLoc(), state.operands)) + return failure(); + + return parser.addTypeToList(ptrType.getPointeeType(), state.types); +} + +static void print(spirv::AtomicCompareExchangeWeakOp atomOp, + OpAsmPrinter &printer) { + printer << spirv::AtomicCompareExchangeWeakOp::getOperationName() << " \"" + << stringifyScope(atomOp.memory_scope()) << "\" \"" + << stringifyMemorySemantics(atomOp.equal_semantics()) << "\" \"" + << stringifyMemorySemantics(atomOp.unequal_semantics()) << "\" "; + printer.printOperands(atomOp.getOperands()); + printer << " : " << atomOp.pointer()->getType(); +} + +static LogicalResult verify(spirv::AtomicCompareExchangeWeakOp atomOp) { + // According to the spec: + // "The type of Value must be the same as Result Type. The type of the value + // pointed to by Pointer must be the same as Result Type. This type must also + // match the type of Comparator." + if (atomOp.getType() != atomOp.value()->getType()) + return atomOp.emitOpError("value operand must have the same type as the op " + "result, but found ") + << atomOp.value()->getType() << " vs " << atomOp.getType(); + + if (atomOp.getType() != atomOp.comparator()->getType()) + return atomOp.emitOpError( + "comparator operand must have the same type as the op " + "result, but found ") + << atomOp.comparator()->getType() << " vs " << atomOp.getType(); + + Type pointeeType = + atomOp.pointer()->getType().cast().getPointeeType(); + if (atomOp.getType() != pointeeType) + return atomOp.emitOpError( + "pointer operand's pointee type must have the same " + "as the op result type, but found ") + << pointeeType << " vs " << atomOp.getType(); + + // TODO(antiagainst): Unequal cannot be set to Release or Acquire and Release. + // In addition, Unequal cannot be set to a stronger memory-order then Equal. + + return success(); +} + //===----------------------------------------------------------------------===// // spv.BitcastOp //===----------------------------------------------------------------------===// From aad7ef455a75258a5e89975c9eb12435ee1bca66 Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Thu, 5 Dec 2019 10:23:03 -0800 Subject: [PATCH 169/383] Add comments to TfLiteConvParams to indicate what's supported in v1 vs v2. PiperOrigin-RevId: 284001572 Change-Id: Iee505945669c71eaf5556fd3793d2504dd562f98 --- tensorflow/lite/c/builtin_op_data.h | 6 +++++- tensorflow/lite/experimental/micro/kernels/conv_test.cc | 8 ++++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h index 5ac525e0a3c..08440c4cf27 100644 --- a/tensorflow/lite/c/builtin_op_data.h +++ b/tensorflow/lite/c/builtin_op_data.h @@ -75,12 +75,16 @@ typedef enum { } TfLiteFusedActivation; typedef struct { + // Parameters for CONV_2D version 1. TfLitePadding padding; int stride_width; int stride_height; + TfLiteFusedActivation activation; + + // Parameters for CONV_2D version 2. + // Note: Version 2 supports dilation values not equal to 1. 
int dilation_width_factor; int dilation_height_factor; - TfLiteFusedActivation activation; } TfLiteConvParams; typedef struct { diff --git a/tensorflow/lite/experimental/micro/kernels/conv_test.cc b/tensorflow/lite/experimental/micro/kernels/conv_test.cc index 9ac2bea0c0e..352b10cca04 100644 --- a/tensorflow/lite/experimental/micro/kernels/conv_test.cc +++ b/tensorflow/lite/experimental/micro/kernels/conv_test.cc @@ -43,9 +43,9 @@ static TfLiteConvParams common_conv_params = { kTfLitePaddingValid, // padding 2, // stride_width 2, // stride_height + kTfLiteActNone, // activation 1, // dilation_width_factor 1, // dilation_height_factor - kTfLiteActNone, // activation }; template @@ -328,9 +328,9 @@ TF_LITE_MICRO_TEST(SimpleTestQuantizedPerChannel) { TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) { // conv params: - // padding, stride_, dilation_, activation - TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, 1, 1, - kTfLiteActNone}; + // padding, stride_, activation, dilation_ + TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, + kTfLiteActNone, 1, 1}; const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C] const int kInputElements = kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4]; From 6f65a28c16ee5506309979af86b9edb475be873f Mon Sep 17 00:00:00 2001 From: Yunxing Dai Date: Thu, 5 Dec 2019 10:25:13 -0800 Subject: [PATCH 170/383] Re-enable X64 input to program with dynamic shapes. PiperOrigin-RevId: 284002185 Change-Id: Iae45df83dc6c163cc9e82bf418be674472dd3ded --- tensorflow/python/distribute/custom_training_loop_test.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/distribute/custom_training_loop_test.py b/tensorflow/python/distribute/custom_training_loop_test.py index 55c2ae6a1ca..925fd640f52 100644 --- a/tensorflow/python/distribute/custom_training_loop_test.py +++ b/tensorflow/python/distribute/custom_training_loop_test.py @@ -217,11 +217,9 @@ class InputIterationTest(test.TestCase, parameterized.TestCase): def _get_dataset(self): if tf2.enabled(): - return dataset_ops.DatasetV2.range(10).\ - map(lambda x: math_ops.cast(x, dtypes.int32)).batch(2) + return dataset_ops.DatasetV2.range(10).batch(2) else: - return dataset_ops.Dataset.range(10).\ - map(lambda x: math_ops.cast(x, dtypes.int32)).batch(2) + return dataset_ops.Dataset.range(10).batch(2) def _validate_outputs(self, actual_results): expected_results = [[i**2, (i+1)**2] for i in range(0, 10, 2)] From ac143d3ed450243bcb35868a86ae9dc3b71ff5ce Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 10:28:30 -0800 Subject: [PATCH 171/383] Modernize shape docs PiperOrigin-RevId: 284003006 Change-Id: Iba09756c294fb3e55c8896156c0e69f933b3b3dc --- tensorflow/python/framework/ops.py | 35 +++++++++++---------- tensorflow/python/framework/tensor_shape.py | 9 +++++- tensorflow/python/ops/array_ops.py | 6 ++++ 3 files changed, 33 insertions(+), 17 deletions(-) diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py index d265ee5f91e..8cc29115f41 100644 --- a/tensorflow/python/framework/ops.py +++ b/tensorflow/python/framework/ops.py @@ -439,29 +439,32 @@ class Tensor(_TensorLike): for more details of what a shape represents. The inferred shape of a tensor is used to provide shape - information without having to launch the graph in a session. This - can be used for debugging, and providing early error messages. For + information without having to execute the underlying kernel. 
This + can be used for debugging and providing early error messages. For example: ```python - c = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + >>> c = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]) + >>> print(c.shape) # will be TensorShape([2, 3]) + (2, 3) - print(c.shape) - ==> TensorShape([Dimension(2), Dimension(3)]) - - d = tf.constant([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]]) - - print(d.shape) - ==> TensorShape([Dimension(4), Dimension(2)]) + >>> d = tf.constant([[1.0, 0.0], [0.0, 1.0], [1.0, 0.0], [0.0, 1.0]]) + >>> print(d.shape) + (4, 2) # Raises a ValueError, because `c` and `d` do not have compatible # inner dimensions. - e = tf.matmul(c, d) + >>> e = tf.matmul(c, d) + Traceback (most recent call last): + ... + tensorflow.python.framework.errors_impl.InvalidArgumentError: Matrix + size-incompatible: In[0]: [2,3], In[1]: [4,2] [Op:MatMul] name: MatMul/ - f = tf.matmul(c, d, transpose_a=True, transpose_b=True) + # This works because we have compatible shapes. + >>> f = tf.matmul(c, d, transpose_a=True, transpose_b=True) + >>> print(f.shape) + (3, 4) - print(f.shape) - ==> TensorShape([Dimension(3), Dimension(4)]) ``` In some cases, the inferred shape may have unknown dimensions. If @@ -470,7 +473,7 @@ class Tensor(_TensorLike): inferred shape. Returns: - A `TensorShape` representing the shape of this tensor. + A `tf.TensorShape` representing the shape of this tensor. """ if self._shape_val is None: @@ -570,7 +573,7 @@ class Tensor(_TensorLike): return self.shape.ndims def get_shape(self): - """Alias of Tensor.shape.""" + """Alias of `tf.Tensor.shape`.""" return self.shape def set_shape(self, shape): diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py index dc787d54269..e6470bf815d 100644 --- a/tensorflow/python/framework/tensor_shape.py +++ b/tensorflow/python/framework/tensor_shape.py @@ -808,7 +808,14 @@ class TensorShape(object): @property def dims(self): - """Returns a list of Dimensions, or None if the shape is unspecified.""" + """Deprecated. Returns list of dimensions for this shape. + + Suggest `TensorShape.as_list` instead. + + Returns: + A list containing `tf.compat.v1.Dimension`s, or None if the shape is + unspecified. + """ return self._dims @property diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index 20249958486..c7e7eda462e 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -544,6 +544,8 @@ def shape_v2(input, out_type=dtypes.int32, name=None): """Returns the shape of a tensor. This operation returns a 1-D integer tensor representing the shape of `input`. + This represents the minimal set of known information at definition time. + For example: @@ -571,6 +573,10 @@ def shape_v2(input, out_type=dtypes.int32, name=None): `int64`). Defaults to `tf.int32`. name: A name for the operation (optional). + `tf.shape` and `Tensor.shape` should be identical in eager mode. Within + `tf.function` or within a `compat.v1` context, not all dimensions may be + known until execution time. + Returns: A `Tensor` of type `out_type`. """ From ff111665e986fc975fc7aa4ad90b66948371e952 Mon Sep 17 00:00:00 2001 From: Yanhui Liang Date: Thu, 5 Dec 2019 10:29:28 -0800 Subject: [PATCH 172/383] Unify the doc strings of conv layers. 
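A minimal usage sketch of the arguments these docstrings cover — illustrative only and not part of the patch; the layer, initializer, and regularizer names are the standard Keras ones, and the shapes assume the default `channels_last` data format:

```python
import tensorflow as tf
from tensorflow import keras

# activation, initializers, regularizers and constraints are plain keyword
# arguments, as described in the unified docstrings below.
layer = keras.layers.Conv2D(
    filters=32,
    kernel_size=3,
    activation="relu",
    kernel_initializer="he_normal",
    kernel_regularizer=keras.regularizers.l2(1e-4))

# With the default padding="valid" and stride 1, each spatial dimension
# shrinks by kernel_size - 1: 28 -> 26.
print(layer(tf.zeros([1, 28, 28, 3])).shape)  # (1, 26, 26, 32)
```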
PiperOrigin-RevId: 284003254 Change-Id: I48b43d6fbcfb1959068c7c8fa36008f57f76bd92 --- .../python/keras/layers/convolutional.py | 178 +++++++++++------- 1 file changed, 114 insertions(+), 64 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 5ef887fc8e7..6a69af0f580 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -77,8 +77,9 @@ class Conv(Layer): the dilation rate to use for dilated convolution. Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. - activation: Activation function. Set it to None to maintain a - linear activation. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (ie. 'linear' activation: `a(x) = x`). use_bias: Boolean, whether the layer uses a bias. kernel_initializer: An initializer for the convolution kernel. bias_initializer: An initializer for the bias vector. If None, the default @@ -363,17 +364,21 @@ class Conv1D(Conv): incompatible with specifying any `strides` value != 1. activation: Activation function to use. If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). + (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix. bias_initializer: Initializer for the bias vector. kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. + the `kernel` weights matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector ( + see `keras.regularizers`). activity_regularizer: Regularizer function applied to - the output of the layer (its "activation").. - kernel_constraint: Constraint function applied to the kernel matrix. - bias_constraint: Constraint function applied to the bias vector. + the output of the layer (its "activation") ( + see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix ( + see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector ( + see `keras.constraints`). Examples: ```python @@ -509,24 +514,23 @@ class Conv2D(Conv): activation: Activation function to use. If you don't specify anything, no activation is applied (ie. "linear" activation: `a(x) = x`). Check `keras.activations` for - available activation functions. + available activation functions (see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix. Check `keras.initializers` for available initializers. bias_initializer: Initializer for the bias vector. Check `keras.initializers` for available initializers. kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. Check - `keras.regularizers` for available regularizers. - bias_regularizer: Regularizer function applied to the bias vector. Check - `keras.regularizers` for available regularizers. + the `kernel` weights matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector ( + see `keras.regularizers`). activity_regularizer: Regularizer function applied to - the output of the layer (its "activation"). Check - `keras.regularizers` for available regularizers. 
- kernel_constraint: Constraint function applied to the kernel matrix. Check - `keras.constraints` for available constraints. - bias_constraint: Constraint function applied to the bias vector. Check - `keras.constraints` for available constraints. + the output of the layer (its "activation") ( + see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix ( + see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector ( + see `keras.constraints`). Input shape: 4D tensor with shape: @@ -637,17 +641,22 @@ class Conv3D(Conv): incompatible with specifying any stride value != 1. activation: Activation function to use. If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). + (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix. bias_initializer: Initializer for the bias vector. kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. + the `kernel` weights matrix ( + see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector ( + see `keras.regularizers`). activity_regularizer: Regularizer function applied to - the output of the layer (its "activation").. - kernel_constraint: Constraint function applied to the kernel matrix. - bias_constraint: Constraint function applied to the bias vector. + the output of the layer (its "activation") ( + see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix ( + see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector ( + see `keras.constraints`). Input shape: 5D tensor with shape: @@ -763,17 +772,22 @@ class Conv2DTranspose(Conv2D): incompatible with specifying any stride value != 1. activation: Activation function to use. If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). + (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix. - bias_initializer: Initializer for the bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix ( + see `keras.initializers`). + bias_initializer: Initializer for the bias vector ( + see `keras.initializers`). kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. + the `kernel` weights matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector ( + see `keras.regularizers`). activity_regularizer: Regularizer function applied to - the output of the layer (its "activation").. - kernel_constraint: Constraint function applied to the kernel matrix. - bias_constraint: Constraint function applied to the bias vector. + the output of the layer (its "activation") (see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix ( + see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector ( + see `keras.constraints`). Input shape: 4D tensor with shape: @@ -787,12 +801,27 @@ class Conv2DTranspose(Conv2D): or 4D tensor with shape: `(batch, new_rows, new_cols, filters)` if data_format='channels_last'. 
`rows` and `cols` values might have changed due to padding. + If `output_padding` is specified: + ``` + new_rows = ((rows - 1) * strides[0] + kernel_size[0] + - 2 * padding[0] + output_padding[0]) + new_cols = ((cols - 1) * strides[1] + kernel_size[1] + - 2 * padding[1] + output_padding[1]) + ``` References: - [A guide to convolution arithmetic for deep learning](https://arxiv.org/abs/1603.07285v1) - [Deconvolutional Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf) + + Returns: + A tensor of rank 4 representing + `activation(conv2dtranspose(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. """ def __init__(self, @@ -1033,17 +1062,22 @@ class Conv3DTranspose(Conv3D): incompatible with specifying any stride value != 1. activation: Activation function to use. If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). + (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix. bias_initializer: Initializer for the bias vector. kernel_regularizer: Regularizer function applied to - the `kernel` weights matrix. - bias_regularizer: Regularizer function applied to the bias vector. + the `kernel` weights matrix ( + see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector ( + see `keras.regularizers`). activity_regularizer: Regularizer function applied to - the output of the layer (its "activation"). - kernel_constraint: Constraint function applied to the kernel matrix. - bias_constraint: Constraint function applied to the bias vector. + the output of the layer (its "activation") ( + see `keras.regularizers`). + kernel_constraint: Constraint function applied to the kernel matrix ( + see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector ( + see `keras.constraints`). Input shape: 5D tensor with shape: @@ -1294,8 +1328,9 @@ class SeparableConv(Conv): depth_multiplier: The number of depthwise convolution output channels for each input channel. The total number of depthwise convolution output channels will be equal to `num_filters_in * depth_multiplier`. - activation: Activation function. Set it to None to maintain a - linear activation. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (ie. 'linear' activation: `a(x) = x`) (see `keras.activations`). use_bias: Boolean, whether the layer uses a bias. depthwise_initializer: An initializer for the depthwise convolution kernel. pointwise_initializer: An initializer for the pointwise convolution kernel. @@ -1495,29 +1530,35 @@ class SeparableConv1D(SeparableConv): depth_multiplier: The number of depthwise convolution output channels for each input channel. The total number of depthwise convolution output channels will be equal to `num_filters_in * depth_multiplier`. - activation: Activation function. Set it to None to maintain a - linear activation. + activation: Activation function to use. + If you don't specify anything, no activation is applied + (ie. 'linear' activation: `a(x) = x`) (see `keras.activations`). use_bias: Boolean, whether the layer uses a bias. depthwise_initializer: An initializer for the depthwise convolution kernel. pointwise_initializer: An initializer for the pointwise convolution kernel. 
bias_initializer: An initializer for the bias vector. If None, the default initializer will be used. depthwise_regularizer: Optional regularizer for the depthwise - convolution kernel. + convolution kernel (see `keras.regularizers`). pointwise_regularizer: Optional regularizer for the pointwise - convolution kernel. - bias_regularizer: Optional regularizer for the bias vector. - activity_regularizer: Optional regularizer function for the output. + convolution kernel (see `keras.regularizers`). + bias_regularizer: Optional regularizer for the bias vector ( + see `keras.regularizers`). + activity_regularizer: Optional regularizer function for the output ( + see `keras.regularizers`). depthwise_constraint: Optional projection function to be applied to the depthwise kernel after being updated by an `Optimizer` (e.g. used for norm constraints or value constraints for layer weights). The function must take as input the unprojected variable and must return the projected variable (which must have the same shape). Constraints are - not safe to use when doing asynchronous distributed training. + not safe to use when doing asynchronous distributed training ( + see `keras.constraints`). pointwise_constraint: Optional projection function to be applied to the - pointwise kernel after being updated by an `Optimizer`. + pointwise kernel after being updated by an `Optimizer` ( + see `keras.constraints`). bias_constraint: Optional projection function to be applied to the - bias after being updated by an `Optimizer`. + bias after being updated by an `Optimizer` ( + see `keras.constraints`). trainable: Boolean, if `True` the weights of this layer will be marked as trainable (and listed in `layer.trainable_weights`). name: A string, the name of the layer. @@ -1660,23 +1701,28 @@ class SeparableConv2D(SeparableConv): channels will be equal to `filters_in * depth_multiplier`. activation: Activation function to use. If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). + (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. depthwise_initializer: Initializer for the depthwise kernel matrix. pointwise_initializer: Initializer for the pointwise kernel matrix. bias_initializer: Initializer for the bias vector. depthwise_regularizer: Regularizer function applied to - the depthwise kernel matrix. + the depthwise kernel matrix (see `keras.regularizers`). pointwise_regularizer: Regularizer function applied to - the pointwise kernel matrix. - bias_regularizer: Regularizer function applied to the bias vector. + the pointwise kernel matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector ( + see `keras.regularizers`). activity_regularizer: Regularizer function applied to - the output of the layer (its "activation").. + the output of the layer (its "activation") ( + see `keras.regularizers`). depthwise_constraint: Constraint function applied to - the depthwise kernel matrix. + the depthwise kernel matrix ( + see `keras.constraints`). pointwise_constraint: Constraint function applied to - the pointwise kernel matrix. - bias_constraint: Constraint function applied to the bias vector. + the pointwise kernel matrix ( + see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector ( + see `keras.constraints`). 
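(Editorial aside, not part of the patch.) The `Conv2DTranspose` docstring above gains an explicit output-shape formula for the `output_padding` case. A minimal sketch checking that formula against the layer's computed shape, assuming a TF 2.x install where `tf.keras` is available; the concrete sizes below are illustrative only:

```python
# Illustrative sketch (not part of the patch): verify the documented
# Conv2DTranspose output-shape formula for the output_padding case.
# Assumes tf.keras from a TF 2.x installation; sizes are arbitrary examples.
import tensorflow as tf

rows, cols = 10, 10
kernel_size = (3, 3)
strides = (2, 2)
output_padding = (1, 1)
padding = (0, 0)  # with padding='valid' the explicit padding amount is 0

layer = tf.keras.layers.Conv2DTranspose(
    filters=8,
    kernel_size=kernel_size,
    strides=strides,
    padding='valid',
    output_padding=output_padding)

x = tf.zeros([1, rows, cols, 3])  # NHWC input, channels_last
y = layer(x)

# Same formula as in the docstring addition above.
new_rows = ((rows - 1) * strides[0] + kernel_size[0]
            - 2 * padding[0] + output_padding[0])
new_cols = ((cols - 1) * strides[1] + kernel_size[1]
            - 2 * padding[1] + output_padding[1])
assert y.shape == (1, new_rows, new_cols, 8)  # (1, 22, 22, 8)
```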
Input shape: 4D tensor with shape: @@ -1800,18 +1846,22 @@ class DepthwiseConv2D(Conv2D): If you never set it, then it will be 'channels_last'. activation: Activation function to use. If you don't specify anything, no activation is applied - (ie. 'linear' activation: `a(x) = x`). + (ie. 'linear' activation: `a(x) = x`) (see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. depthwise_initializer: Initializer for the depthwise kernel matrix. bias_initializer: Initializer for the bias vector. depthwise_regularizer: Regularizer function applied to - the depthwise kernel matrix. - bias_regularizer: Regularizer function applied to the bias vector. + the depthwise kernel matrix (see `keras.regularizers`). + bias_regularizer: Regularizer function applied to the bias vector ( + see `keras.regularizers`). activity_regularizer: Regularizer function applied to - the output of the layer (its 'activation'). + the output of the layer (its 'activation') ( + see `keras.regularizers`). depthwise_constraint: Constraint function applied to - the depthwise kernel matrix. - bias_constraint: Constraint function applied to the bias vector. + the depthwise kernel matrix ( + see `keras.constraints`). + bias_constraint: Constraint function applied to the bias vector ( + see `keras.constraints`). Input shape: 4D tensor with shape: From c08424a491008a4c6cc71075b15c264d15ce7122 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Thu, 5 Dec 2019 10:33:23 -0800 Subject: [PATCH 173/383] [XLA/GPU] [NFC] Factor out reduction tiling into a separate function PiperOrigin-RevId: 284004115 Change-Id: I26155cedc255a2260d4891d2b0c06687c6cbb792 --- .../xla/service/gpu/ir_emission_utils.cc | 25 +++++++++++-- .../xla/service/gpu/ir_emission_utils.h | 9 +++-- .../xla/service/gpu/ir_emitter_unnested.cc | 35 +++++++------------ 3 files changed, 40 insertions(+), 29 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 72f69ca2017..b2067fe916d 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -124,6 +124,24 @@ bool IsCublasGemm(const HloInstruction& hlo) { hlo.custom_call_target() == kGemmCallTarget; } +std::array GetReductionTiling( + const ReductionDimensions& reduction_dimensions) { + if (reduction_dimensions.is_row_reduction) { + int64 tile_z = std::min(reduction_dimensions.dimensions[0], 8LL); + if (reduction_dimensions.dimensions[1] == 1) { + CHECK_EQ(reduction_dimensions.dimensions[0], 1); + return {tile_z, 1, 16}; + } + if (reduction_dimensions.dimensions[2] % (kWarpSize * 64) == 0) { + return {tile_z, 1, 64}; + } + return {tile_z, 1, 8}; + } + + // Column reduction. 
+ return {1, 128, 1}; +} + const char* const kCudnnBatchNormForwardInferenceCallTarget = "__cudnn$batchNormalizationForwardInference"; const char* const kCudnnBatchNormForwardTrainingCallTarget = @@ -201,8 +219,7 @@ bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce) { } ReductionDimensions reduction_dimensions = - GetReductionKindAndContiguousComponents(input->shape(), - reduce.dimensions()); + GetReductionKindAndContiguousComponents(reduce); if (reduction_dimensions.is_row_reduction) { // For row reduction, the tile block is 1 x tile_size_x, and we are reducing @@ -218,7 +235,9 @@ bool IsReductionFromOrToContiguousDimensions(const HloInstruction& reduce) { } ReductionDimensions GetReductionKindAndContiguousComponents( - const Shape& input_shape, absl::Span dims_to_reduce) { + const HloInstruction& reduce) { + const Shape& input_shape = reduce.operand(0)->shape(); + absl::Span dims_to_reduce = reduce.dimensions(); DimensionVector dims_to_keep; for (int64 dim = 0; dim < input_shape.rank(); ++dim) { if (!absl::c_linear_search(dims_to_reduce, dim)) { diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h index db3cd228841..2c37a63c05a 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h @@ -169,14 +169,17 @@ struct ReductionDimensions { std::array dimensions; }; -// Given the input shape and dimensions to reduce for a reduction, returns -// ReductionDimensions. +// Given the reduction operation, returns ReductionDimensions. // // Prerequisite: the reduction instruction passes the check // IsReductionFromOrToContiguousDimensions, which guarantees either the // dimensions to reduce or the dimensions to keep are consecutive. ReductionDimensions GetReductionKindAndContiguousComponents( - const Shape& input_shape, absl::Span dims_to_reduce); + const HloInstruction& reduce); + +// Get tiling per thread for the given reduction in dimensions [D, H, W]. +std::array GetReductionTiling( + const ReductionDimensions& reduction_dimensions); // Emits call to "vprintf" with given format and arguments. 
llvm::Value* EmitPrintf(absl::string_view fmt, diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 64422180693..0e62e27bd99 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -2876,34 +2876,26 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( const HloInstruction* unnested_hlo, const HloInstruction* first_reduce) { const Shape& input_shape = first_reduce->operand(0)->shape(); ReductionDimensions reduction_dimensions = - GetReductionKindAndContiguousComponents(input_shape, - first_reduce->dimensions()); + GetReductionKindAndContiguousComponents(*first_reduce); VLOG(10) << "is_row_reduction " << reduction_dimensions.is_row_reduction << " " << reduction_dimensions.dimensions[0] << " " << reduction_dimensions.dimensions[1] << " " << reduction_dimensions.dimensions[2]; + std::array reduction_tiling = + GetReductionTiling(reduction_dimensions); + int64 tile_size_y = reduction_tiling[1]; + int64 block_size_z = reduction_tiling[0]; + bool dilated_x = + !reduction_dimensions.is_row_reduction && + !IsUnrollingColumnReductionBeneficial(unnested_hlo, input_shape, + reduction_dimensions.dimensions[2]); + int64 tile_size_x = 1; - int64 tile_size_y = 1; - int64 block_size_z = 1; int64 num_threads_x = 1; - bool dilated_x = true; if (reduction_dimensions.is_row_reduction) { num_threads_x = kWarpSize; - if (reduction_dimensions.dimensions[1] == 1) { - // Scalar reduction is handled differently than the other kind of row - // reduction. - CHECK_EQ(reduction_dimensions.dimensions[0], 1); - tile_size_x = kWarpSize * 16; - } else { - if (reduction_dimensions.dimensions[2] % (kWarpSize * 64) == 0) { - tile_size_x = kWarpSize * 64; - } else { - tile_size_x = kWarpSize * 8; - } - block_size_z = - std::min(reduction_dimensions.dimensions[0], static_cast(8)); - } + tile_size_x = reduction_tiling[2] * kWarpSize; } else { // Column reduction without transpose doesn't require communication among // threads processing elements in the same tile. The current implementation @@ -2913,20 +2905,17 @@ ReductionCodegenInfo IrEmitterUnnested::ComputeReductionCodegenInfo( // num_threads_x and tile_size_x to allow a bigger hardware thread block. int64 hw_threads_per_block_limit = ThreadsPerBlockLimit(ir_emitter_context_->device_description()); - if (IsUnrollingColumnReductionBeneficial( - unnested_hlo, input_shape, reduction_dimensions.dimensions[2])) { + if (!dilated_x) { // Vectorized loads: two elements per thread. tile_size_x = std::min(2 * hw_threads_per_block_limit, reduction_dimensions.dimensions[2]); num_threads_x = tile_size_x / 2; - dilated_x = false; } else { // One element per thread. 
tile_size_x = std::min(hw_threads_per_block_limit, reduction_dimensions.dimensions[2]); num_threads_x = tile_size_x; } - tile_size_y = 128; } KernelMappingScheme mapping_scheme( From c9b4d7389a85eefd3c1b2e7b0606ab4c2ccca2e2 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 5 Dec 2019 10:35:01 -0800 Subject: [PATCH 174/383] Handle ranked types in tf.BiasAdd, tf.Const, tf.ReluGrad and reshaping ops' lowerings to HLO PiperOrigin-RevId: 284004447 Change-Id: Ib38fab2f0fd9f8b788aa6baeeca32d121df766e4 --- .../compiler/mlir/xla/tests/legalize-tf.mlir | 54 +++++++++++++------ .../mlir/xla/transforms/legalize_tf.cc | 3 +- .../xla/transforms/legalize_tf_patterns.td | 28 +++++++--- 3 files changed, 62 insertions(+), 23 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 5af7a1ffc31..7bc9614b72e 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -25,18 +25,25 @@ func @fusedBatchNorm_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, // CHECK-LABEL: func @biasAdd_NHWC func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } // CHECK-LABEL: func @biasAdd_NCHW func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - // CHECK-NEXT: %0 = "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} %0 = "tf.BiasAdd"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", data_format = "NCHW"} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> return %0 : tensor<1x32x10x32xi32> } +// CHECK-LABEL: func @biasAdd_dynamic +func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { + // CHECK: "xla_hlo.add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} + %0 = "tf.BiasAdd"(%arg0, %arg1) {data_format = "NCHW"} : (tensor, tensor) -> tensor + return %0 : tensor +} + //===----------------------------------------------------------------------===// // Binary op legalizations. 
//===----------------------------------------------------------------------===// @@ -666,11 +673,18 @@ func @preventgradient(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: @const func @const() -> tensor<2xi32> { - // CHECK-NEXT: xla_hlo.constant dense<0> : tensor<2xi32> + // CHECK: xla_hlo.constant dense<0> : tensor<2xi32> %0 = "tf.Const"() {device = "", name = "", dtype = "tfdtype$DT_INT32", value = dense<0> : tensor<2xi32>} : () -> (tensor<2xi32>) return %0: tensor<2xi32> } +// CHECK-LABEL: @const_dynamic_output +func @const_dynamic_output() -> tensor<*xi32> { + // CHECK: xla_hlo.constant {value = dense<0> : tensor<2xi32>} : tensor<*xi32> + %0 = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> (tensor<*xi32>) + return %0: tensor<*xi32> +} + // CHECK-LABEL: @opaque_const func @opaque_const() -> tensor>> { // CHECK-NOT: xla_hlo.constant @@ -838,13 +852,14 @@ func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { } // CHECK-LABEL: func @relu_grad -// CHECK-SAME: (%[[GRADIENTS:.*]]: tensor<4x8xf32>, %[[FEATURES:.*]]: tensor<4x8xf32>) -func @relu_grad(%gradients: tensor<4x8xf32>, %features: tensor<4x8xf32>) -> tensor<4x8xf32> { - // CHECK: %[[ZERO:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<4x8xf32> - // CHECK: %[[PRED:.*]] = "xla_hlo.compare"(%[[FEATURES]], %[[ZERO]]) {comparison_direction = "GT"} : (tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xi1> - // CHECK: %[[RESULT:.*]] = "xla_hlo.select"(%[[PRED]], %[[GRADIENTS]], %[[ZERO]]) : (tensor<4x8xi1>, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> - // CHECK: return %[[RESULT]] : tensor<4x8xf32> - %2 = "tf.ReluGrad"(%gradients, %features) : (tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> +// CHECK-SAME: (%[[GRADIENTS:.*]]: tensor<4x8xf32>, %[[FEATURES:.*]]: tensor) +func @relu_grad(%gradients: tensor<4x8xf32>, %features: tensor) -> tensor<4x8xf32> { + // CHECK-DAG: %[[ZERO_SCALAR:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor + // CHECK-DAG: %[[ZERO:.*]] = xla_hlo.constant dense<0.000000e+00> : tensor<4x8xf32> + // CHECK-DAG: %[[PRED:.*]] = "xla_hlo.compare"(%[[FEATURES]], %[[ZERO_SCALAR]]) {comparison_direction = "GT"} : (tensor, tensor) -> tensor<*xi1> + // CHECK-DAG: %[[RESULT:.*]] = "xla_hlo.select"(%[[PRED]], %[[GRADIENTS]], %[[ZERO]]) : (tensor<*xi1>, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> + // CHECK-DAG: return %[[RESULT]] : tensor<4x8xf32> + %2 = "tf.ReluGrad"(%gradients, %features) : (tensor<4x8xf32>, tensor) -> tensor<4x8xf32> return %2 : tensor<4x8xf32> } @@ -1352,35 +1367,42 @@ func @tanh_unranked(%arg0: tensor<*xf32>) -> tensor<*xf32> { // CHECK-LABEL: reshape func @reshape(%arg0: tensor<2xf32>, %arg1: tensor<2xi32>) -> tensor<1x1xf32> { - // CHECK: %0 = "xla_hlo.reshape"(%arg0) : (tensor<2xf32>) -> tensor<1x1xf32> + // CHECK: "xla_hlo.reshape" %0 = "tf.Reshape"(%arg0, %arg1) : (tensor<2xf32>, tensor<2xi32>) -> tensor<1x1xf32> return %0 : tensor<1x1xf32> } // CHECK-LABEL: reshape_dynamic -func @reshape_dynamic(%arg0: tensor<*xf32>, %arg1: tensor<2xi32>) -> tensor { - // CHECK: %0 = "tf.Reshape"(%arg0, %arg1) : (tensor<*xf32>, tensor<2xi32>) -> tensor +func @reshape_dynamic(%arg0: tensor, %arg1: tensor<2xi32>) -> tensor<1x1xf32> { + // CHECK: "xla_hlo.reshape" + %0 = "tf.Reshape"(%arg0, %arg1) : (tensor, tensor<2xi32>) -> tensor<1x1xf32> + return %0 : tensor<1x1xf32> +} + +// CHECK-LABEL: reshape_unranked +func @reshape_unranked(%arg0: tensor<*xf32>, %arg1: tensor<2xi32>) -> tensor { + // CHECK: "tf.Reshape" %0 = "tf.Reshape"(%arg0, %arg1) : (tensor<*xf32>, tensor<2xi32>) 
-> tensor return %0 : tensor } // CHECK-LABEL: squeeze func @squeeze(%arg0: tensor<1x1x10xf32>) -> tensor<1x10xf32> { - // CHECK-NEXT: %0 = "xla_hlo.reshape"(%arg0) : (tensor<1x1x10xf32>) -> tensor<1x10xf32> + // CHECK: "xla_hlo.reshape" %0 = "tf.Squeeze"(%arg0) : (tensor<1x1x10xf32>) -> tensor<1x10xf32> return %0 : tensor<1x10xf32> } // CHECK-LABEL: squeeze_dynamic func @squeeze_dynamic(%arg0: tensor) -> tensor<*xf32> { - // CHECK-NEXT: %0 = "tf.Squeeze"(%arg0) : (tensor) -> tensor<*xf32> + // CHECK: "tf.Squeeze" %0 = "tf.Squeeze"(%arg0) : (tensor) -> tensor<*xf32> return %0 : tensor<*xf32> } // CHECK-LABEL: expand_dims func @expand_dims(%arg0: tensor<2xf32>, %axis: tensor) -> tensor<1x2xf32> { - // CHECK: "xla_hlo.reshape"{{.*}} : (tensor<2xf32>) -> tensor<1x2xf32> + // CHECK: "xla_hlo.reshape" %0 = "tf.ExpandDims"(%arg0, %axis) : (tensor<2xf32>, tensor) -> tensor<1x2xf32> return %0 : tensor<1x2xf32> } diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 262091eb4c2..b427e0124c0 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -207,7 +207,8 @@ static IntegerAttr getFeatureDimensionAttr(Builder &b, StringAttr format, // Bias op utilities. //===----------------------------------------------------------------------===// -/// Return a 1D DenseIntElementsAttr for the feature dimension of a BiasAdd. +// Return a 1D DenseIntElementsAttr for the feature dimension of a BiasAdd. +// Requires input to have ranked tensor. static DenseIntElementsAttr getBiasFeatureDimension(Builder &b, StringAttr format, Value *input) { diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index ca3a8406a2e..bbe0f43fdba 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -49,7 +49,9 @@ def : Pattern< def BiasAddFeatureDimension : NativeCodeCall< "getBiasFeatureDimension($_builder, $0, $1)">; -def : Pat<(TF_BiasAddOp AnyStaticShapeTensor:$input, $bias, $data_format), +// $input needs to be a ranked tensor to identify index of the feature +// dimension depending on the data_format 'NHWC' or 'NCHW'. +def : Pat<(TF_BiasAddOp AnyRankedTensor:$input, $bias, $data_format), (HLO_AddOp $input, $bias, (BiasAddFeatureDimension $data_format, $input))>; @@ -298,7 +300,7 @@ def : Pat<(TF_MatMulOp $a, $b, $transpose_a, $transpose_b), //===----------------------------------------------------------------------===// def : Pat<(TF_ConstOp:$res ElementsAttr:$value), (HLO_ConstOp $value), - [(AnyStaticShapeTensor $res), (HLO_Tensor $res)]>; + [(HLO_Tensor $res)]>; //===----------------------------------------------------------------------===// // Relu op patterns. @@ -316,11 +318,21 @@ def : Pat<(TF_Relu6Op AnyStaticShapeTensor:$input), (HLO_ConstOp (ConstantSplat<"6"> $input)))>; // ReluGrad(gradients, features) = gradients * (features > 0) -def : Pat<(TF_ReluGradOp AnyStaticShapeTensor:$gradients, AnyStaticShapeTensor:$features), +// +// $gradients needs to be of static shape so that on_true and on_false operands +// of SelectOp have same shape. +// +// $features needs to be ranked for computation of the broadcast dimensions for +// CompareOp. +// +// TODO(hinsu): Relax $gradients static shape requirement when there is a way +// to create splat tensor of dynamic shape in HLO. 
+def : Pat<(TF_ReluGradOp AnyStaticShapeTensor:$gradients, AnyRankedTensor:$features), (HLO_SelectOp - (HLO_CompareOp $features, (HLO_ConstOp:$zero (ConstantSplat<"0"> $features)), + (HLO_CompareOp $features, + (HLO_ConstOp (GetScalarOfType<0> $features)), (NullDenseIntElementsAttr), HLO_COMPARISON_DIRECTION_GT), - $gradients, $zero)>; + $gradients, (HLO_ConstOp (ConstantSplat<"0"> $gradients)))>; //===----------------------------------------------------------------------===// // Slice op patterns. @@ -386,8 +398,12 @@ def : Pat<(TF_CastOp HLO_Tensor:$arg, ConstBoolAttrFalse), def : Pat<(TF_TransposeOp:$res $arg, (TF_ConstOp $permutation)), (HLO_TransposeOp $arg, (CastElementsToI64Elements $permutation))>; +// Result of the following ops changing tensor shape needs to have static +// shape as HLO doesn't yet support dynamic reshaping ops. +// +// TODO(hinsu): Update once HLO supports dynamic reshaping ops. foreach TfOp = [TF_ExpandDimsOp, TF_ReshapeOp, TF_SqueezeOp, ] in { - def : Pat<(TfOp:$res AnyStaticShapeTensor:$arg, $ignored), + def : Pat<(TfOp:$res $arg, $ignored), (HLO_ReshapeOp $arg), [(AnyStaticShapeTensor $res)]>; } From aba24098086f6af7d2a19d9a585515b6312d2aa8 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 10:35:29 -0800 Subject: [PATCH 175/383] Updating documentation for tf.train.latest_checkpoint. PiperOrigin-RevId: 284004545 Change-Id: Iacec6e555e2e1d164179144229264ddb3257242b --- tensorflow/python/training/checkpoint_management.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py index 251c1c2b8b9..8d964f7543a 100644 --- a/tensorflow/python/training/checkpoint_management.py +++ b/tensorflow/python/training/checkpoint_management.py @@ -321,11 +321,21 @@ def _prefix_to_checkpoint_path(prefix, format_version): def latest_checkpoint(checkpoint_dir, latest_filename=None): """Finds the filename of latest saved checkpoint file. + Gets the checkpoint state given the provided checkpoint_dir and looks for a + corresponding TensorFlow 2 (preferred) or TensorFlow 1.x checkpoint path. + The latest_filename argument is only applicable if you are saving checkpoint + using `v1.Saver.save` + + + See the [Training Checkpoints + Guide](https://www.tensorflow.org/guide/checkpoint) for more details and + examples.` + Args: checkpoint_dir: Directory where the variables were saved. latest_filename: Optional name for the protocol buffer file that contains the list of most recent checkpoint filenames. - See the corresponding argument to `Saver.save()`. + See the corresponding argument to `v1.Saver.save`. Returns: The full path to the latest checkpoint or `None` if no checkpoint was found. From d32de51461fbe6f83434785b957fab38bcb86a3a Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Thu, 5 Dec 2019 10:56:50 -0800 Subject: [PATCH 176/383] Internal change PiperOrigin-RevId: 284009321 Change-Id: I10d665c32d7b194195ca0563333d6ea8dd487c64 --- .../bucket_by_sequence_length_test.py | 32 ++++++----- .../kernel_tests/copy_to_device_test.py | 47 ++++++++-------- .../experimental/kernel_tests/counter_test.py | 30 ++++++----- .../kernel_tests/csv_dataset_test.py | 54 ++----------------- .../dense_to_sparse_batch_test.py | 10 ++-- .../directed_interleave_dataset_test.py | 11 ++-- .../kernel_tests/get_single_element_test.py | 28 ++++------ .../kernel_tests/group_by_reducer_test.py | 15 ++---- .../kernel_tests/group_by_window_test.py | 17 ++---- .../kernel_tests/ignore_errors_test.py | 11 ++-- .../make_batched_features_dataset_test.py | 15 ++---- .../kernel_tests/make_csv_dataset_test.py | 23 ++------ .../make_tf_record_dataset_test.py | 14 ++--- .../kernel_tests/map_defun_op_test.py | 46 ++-------------- .../kernel_tests/override_threadpool_test.py | 47 +++++++++------- .../kernel_tests/parallel_interleave_test.py | 40 ++------------ .../parse_example_dataset_test.py | 30 +++-------- .../kernel_tests/prefetch_to_device_test.py | 19 +++---- .../kernel_tests/prefetch_with_slack_test.py | 11 ++-- .../kernel_tests/rebatch_dataset_test.py | 44 ++++----------- .../kernel_tests/rejection_resample_test.py | 16 +++--- .../kernel_tests/shuffle_and_repeat_test.py | 16 ++---- .../kernel_tests/sql_dataset_test.py | 42 ++------------- .../kernel_tests/stats_dataset_ops_test.py | 29 ++-------- .../kernel_tests/take_while_test.py | 34 +++--------- .../kernel_tests/tf_record_writer_test.py | 15 ++---- .../experimental/kernel_tests/unique_test.py | 11 ++-- .../experimental/kernel_tests/variant_test.py | 9 ++-- .../kernel_tests/wrap_unwrap_test.py | 12 ++--- 29 files changed, 190 insertions(+), 538 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py index d829863b994..d9c463d744d 100644 --- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py @@ -25,11 +25,11 @@ from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -73,12 +73,14 @@ def _get_record_shape(sparse): return tensor_shape.TensorShape([None]) +@test_util.run_all_in_graph_and_eager_modes class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCase): - @combinations.generate( - combinations.times(test_base.default_test_combinations(), - combinations.combine(param_no_padding=[True, False]))) + @parameterized.named_parameters( + ("WithoutPadding", True), + ("WithPadding", False), + ) def testBucketDropReminder(self, param_no_padding): boundaries = [10, 20, 30] @@ -199,9 +201,10 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, _test_bucket_by_padding(param_no_padding) - 
@combinations.generate( - combinations.times(test_base.default_test_combinations(), - combinations.combine(param_no_padding=[True, False]))) + @parameterized.named_parameters( + ("WithoutPadding", True), + ("WithPadding", False), + ) def testBucket(self, param_no_padding): boundaries = [10, 20, 30] @@ -344,9 +347,10 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, self.assertAllEqual(batches[4], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - @combinations.generate( - combinations.times(test_base.default_test_combinations(), - combinations.combine(param_no_padding=[True, False]))) + @parameterized.named_parameters( + ("WithoutPadding", True), + ("WithPadding", False), + ) def testTupleElements(self, param_no_padding): def build_dataset(sparse): @@ -377,10 +381,10 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, _test_tuple_elements_by_padding(param_no_padding) - @combinations.generate( - combinations.times( - test_base.default_test_combinations(), - combinations.combine(param_drop_remainder=[True, False]))) + @parameterized.named_parameters( + ("DoDropRemainder", True), + ("DoNotDropRemainder", False), + ) def testBucketSparse(self, param_drop_remainder): # pylint: disable=g-doc-args """Tests bucketing of sparse tensors (case where `no_padding` == True). diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py index 2fa149fcbaa..36c61636798 100644 --- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py @@ -17,8 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized - from tensorflow.core.protobuf import config_pb2 from tensorflow.python.compat import compat from tensorflow.python.data.experimental.ops import prefetching_ops @@ -26,7 +24,6 @@ from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import structure -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -38,9 +35,9 @@ from tensorflow.python.util import compat as util_compat # TODO(b/117581999): add eager coverage when supported. 
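(Editorial aside, not part of the patch.) This commit swaps per-method `combinations.generate(...)` decorators for the `test_util` graph/eager decorators across these tf.data kernel tests. A minimal sketch of the two equivalent spellings, using a hypothetical test class; both decorator styles appear verbatim in the diffs below:

```python
# Illustrative sketch (not part of the patch): the two decorator styles this
# change swaps. The test class and test bodies are hypothetical examples.
from absl.testing import parameterized

from tensorflow.python.data.kernel_tests import test_base
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import combinations
from tensorflow.python.framework import test_util
from tensorflow.python.platform import test


class CombinationsStyleTest(test_base.DatasetTestBase, parameterized.TestCase):

  @combinations.generate(test_base.default_test_combinations())
  def testRange(self):
    # Runs once per (graph/eager, TF API version) combination.
    self.assertDatasetProduces(
        dataset_ops.Dataset.range(5), expected_output=list(range(5)))


@test_util.run_all_in_graph_and_eager_modes
class TestUtilStyleTest(test_base.DatasetTestBase):

  def testRange(self):
    # Same coverage expressed with the class-level test_util decorator,
    # which is the style this commit restores.
    self.assertDatasetProduces(
        dataset_ops.Dataset.range(5), expected_output=list(range(5)))


if __name__ == "__main__":
  test.main()
```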
-class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): +class CopyToDeviceTest(test_base.DatasetTestBase): - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -65,7 +62,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceInt32(self): host_dataset = dataset_ops.Dataset.from_tensors([0, 1, 2, 3]) device_dataset = host_dataset.apply( @@ -89,7 +86,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToSameDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -114,7 +111,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceWithPrefetch(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -139,7 +136,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyDictToDevice(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -164,7 +161,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyDictToDeviceWithPrefetch(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -189,7 +186,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopySparseTensorsToDevice(self): def make_tensor(i): @@ -222,7 +219,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopySparseTensorsToDeviceWithPrefetch(self): def make_tensor(i): @@ -255,7 +252,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceGpu(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -276,7 +273,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): 
self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceGpuWithPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -297,7 +294,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceGpuWithMap(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -335,7 +332,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceGpuInt32(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -355,7 +352,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceGpuInt32AndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -375,7 +372,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceGpuStrings(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -395,7 +392,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceGpuStringsAndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -415,7 +412,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDevicePingPongCPUGPU(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -439,7 +436,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceWithReInit(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -468,7 +465,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceWithReInitAndPrefetch(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -497,7 +494,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - 
@combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceGpuWithReInit(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -521,7 +518,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testCopyToDeviceGpuWithReInitAndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -545,7 +542,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testIteratorGetNextAsOptionalOnGPU(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py index 455e49aafc7..79e4523ea43 100644 --- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py @@ -17,33 +17,35 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized - from tensorflow.python.data.experimental.ops import counter from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.platform import test -class CounterTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class CounterTest(test_base.DatasetTestBase): - @combinations.generate( - combinations.times( - test_base.default_test_combinations(), - combinations.combine(start=3, step=4, expected_output=[[3, 7, 11]]) + - combinations.combine(start=0, step=-1, expected_output=[[0, -1, -2]])) - ) - def testCounter(self, start, step, expected_output): + def testCounter(self): """Test dataset construction using `count`.""" - dataset = counter.Counter(start, step) + dataset = counter.Counter(start=3, step=4) self.assertEqual( [], dataset_ops.get_legacy_output_shapes(dataset).as_list()) self.assertEqual(dtypes.int64, dataset_ops.get_legacy_output_types(dataset)) get_next = self.getNext(dataset) - for expected in expected_output: - self.assertEqual(expected, self.evaluate(get_next())) + + negative_dataset = counter.Counter(start=0, step=-1) + negative_get_next = self.getNext(negative_dataset) + + self.assertEqual(3, self.evaluate(get_next())) + self.assertEqual(3 + 4, self.evaluate(get_next())) + self.assertEqual(3 + 2 * 4, self.evaluate(get_next())) + + self.assertEqual(0, self.evaluate(negative_get_next())) + self.assertEqual(-1, self.evaluate(negative_get_next())) + self.assertEqual(-2, self.evaluate(negative_get_next())) if __name__ == "__main__": diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py index 941ca209848..4b349ebd811 100644 --- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py +++ 
b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py @@ -22,22 +22,21 @@ import gzip import os import zlib -from absl.testing import parameterized - from tensorflow.python.data.experimental.ops import error_ops from tensorflow.python.data.experimental.ops import readers from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import readers as core_readers from tensorflow.python.eager import context -from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import test -class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class CsvDatasetTest(test_base.DatasetTestBase): def _setup_files(self, inputs, linebreak='\n', compression_type=None): filenames = [] @@ -118,31 +117,26 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = readers.CsvDataset(filenames, **kwargs) self._verify_output_or_err(dataset, expected_output, expected_err_re) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_requiredFields(self): record_defaults = [[]] * 4 inputs = [['1,2,3,4']] self._test_by_comparison(inputs, record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_int(self): record_defaults = [[0]] * 4 inputs = [['1,2,3,4', '5,6,7,8']] self._test_by_comparison(inputs, record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_float(self): record_defaults = [[0.0]] * 4 inputs = [['1.0,2.1,3.2,4.3', '5.4,6.5,7.6,8.7']] self._test_by_comparison(inputs, record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_string(self): record_defaults = [['']] * 4 inputs = [['1.0,2.1,hello,4.3', '5.4,6.5,goodbye,8.7']] self._test_by_comparison(inputs, record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withEmptyFields(self): record_defaults = [[0]] * 4 inputs = [[',,,', '1,1,1,', ',2,2,2']] @@ -150,7 +144,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]], record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errWithUnquotedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,2"3,4']] @@ -159,7 +152,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_err_re='Unquoted fields cannot have quotes inside', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errWithUnescapedQuotes(self): record_defaults = [['']] * 3 inputs = [['"a"b","c","d"']] @@ -169,7 +161,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): 'Quote inside a string has to be escaped by another quote', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_ignoreErrWithUnescapedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,"2"3",4', '1,"2"3",4",5,5', 'a,b,"c"d"', 'e,f,g']] @@ -178,7 +169,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, 
parameterized.TestCase): dataset = dataset.apply(error_ops.ignore_errors()) self._verify_output_or_err(dataset, [['e', 'f', 'g']]) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_ignoreErrWithUnquotedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,2"3,4', 'a,b,c"d', '9,8"7,6,5', 'e,f,g']] @@ -187,14 +177,12 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = dataset.apply(error_ops.ignore_errors()) self._verify_output_or_err(dataset, [['e', 'f', 'g']]) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNoQuoteDelimAndUnquotedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,2"3,4']] self._test_by_comparison( inputs, record_defaults=record_defaults, use_quote_delim=False) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_mixedTypes(self): record_defaults = [ constant_op.constant([], dtype=dtypes.int32), @@ -205,35 +193,30 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): inputs = [['1,2.1,3.2,4.3', '5,6.5,7.6,8.7']] self._test_by_comparison(inputs, record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withUseQuoteDelimFalse(self): record_defaults = [['']] * 4 inputs = [['1,2,"3,4"', '"5,6",7,8']] self._test_by_comparison( inputs, record_defaults=record_defaults, use_quote_delim=False) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withFieldDelim(self): record_defaults = [[0]] * 4 inputs = [['1:2:3:4', '5:6:7:8']] self._test_by_comparison( inputs, record_defaults=record_defaults, field_delim=':') - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNaValue(self): record_defaults = [[0]] * 4 inputs = [['1,NA,3,4', 'NA,6,7,8']] self._test_by_comparison( inputs, record_defaults=record_defaults, na_value='NA') - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withSelectCols(self): record_defaults = [['']] * 2 inputs = [['1,2,3,4', '"5","6","7","8"']] self._test_by_comparison( inputs, record_defaults=record_defaults, select_cols=[1, 2]) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withSelectColsTooHigh(self): record_defaults = [[0]] * 2 inputs = [['1,2,3,4', '5,6,7,8']] @@ -243,27 +226,23 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): record_defaults=record_defaults, select_cols=[3, 4]) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withOneCol(self): record_defaults = [['NA']] inputs = [['0', '', '2']] self._test_dataset( inputs, [['0'], ['NA'], ['2']], record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withMultipleFiles(self): record_defaults = [[0]] * 4 inputs = [['1,2,3,4', '5,6,7,8'], ['5,6,7,8']] self._test_by_comparison(inputs, record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withLeadingAndTrailingSpaces(self): record_defaults = [[0.0]] * 4 inputs = [['0, 1, 2, 3']] expected = [[0.0, 1.0, 2.0, 3.0]] self._test_dataset(inputs, expected, record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithMissingDefault(self): record_defaults = [[]] * 2 inputs = [['0,']] @@ -272,7 +251,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): 
expected_err_re='Field 1 is required but missing in record!', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithFewerDefaultsThanFields(self): record_defaults = [[0.0]] * 2 inputs = [['0,1,2,3']] @@ -281,7 +259,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_err_re='Expect 2 fields but have more in record', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithMoreDefaultsThanFields(self): record_defaults = [[0.0]] * 5 inputs = [['0,1,2,3']] @@ -290,7 +267,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_err_re='Expect 5 fields but have 4 in record', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withHeader(self): record_defaults = [[0]] * 2 inputs = [['col1,col2', '1,2']] @@ -302,7 +278,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): header=True, ) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withHeaderAndNoRecords(self): record_defaults = [[0]] * 2 inputs = [['col1,col2']] @@ -314,7 +289,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): header=True, ) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithHeaderEmptyFile(self): record_defaults = [[0]] * 2 inputs = [[]] @@ -326,14 +300,12 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): header=True, ) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withEmptyFile(self): record_defaults = [['']] * 2 inputs = [['']] # Empty file self._test_dataset( inputs, expected_output=[], record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithEmptyRecord(self): record_defaults = [['']] * 2 inputs = [['', '1,2']] # First record is empty @@ -342,7 +314,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_err_re='Expect 2 fields but have 1 in record', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withChainedOps(self): # Testing that one dataset can create multiple iterators fine. # `repeat` creates multiple iterators from the same C++ Dataset. 
@@ -354,7 +325,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): ds_actual.repeat(5).prefetch(1), ds_expected.repeat(5).prefetch(1)) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withTypeDefaults(self): # Testing using dtypes as record_defaults for required fields record_defaults = [dtypes.float32, [0.0]] @@ -365,7 +335,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): record_defaults=record_defaults, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCsvDataset_fieldOrder(self): data = [[ '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19', @@ -383,7 +352,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): ## The following tests exercise parsing logic for quoted fields - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withQuoted(self): record_defaults = [['']] * 4 inputs = [['"a","b","c :)","d"', '"e","f","g :(","h"']] @@ -395,7 +363,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self._test_dataset( inputs, [['0'], ['1'], ['2']], record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNewLine(self): # In this case, we expect it to behave differently from # TextLineDataset->map(decode_csv) since that flow has bugs @@ -404,7 +371,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']] self._test_dataset(inputs, expected, record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNewLineInUnselectedCol(self): record_defaults = [['']] inputs = [['1,"2\n3",4', '5,6,7']] @@ -414,7 +380,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): record_defaults=record_defaults, select_cols=[0]) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withMultipleNewLines(self): # In this case, we expect it to behave differently from # TextLineDataset->map(decode_csv) since that flow has bugs @@ -423,7 +388,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']] self._test_dataset(inputs, expected, record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithTerminateMidRecord(self): record_defaults = [['']] * 4 inputs = [['a,b,c,"a']] @@ -433,7 +397,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): 'Reached end of file without closing quoted field in record', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withEscapedQuotes(self): record_defaults = [['']] * 4 inputs = [['1.0,2.1,"she said: ""hello""",4.3', '5.4,6.5,goodbye,8.7']] @@ -443,7 +406,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): ## Testing that parsing works with all buffer sizes, quoted/unquoted fields, ## and different types of line breaks - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withInvalidBufferSize(self): record_defaults = [['']] * 4 inputs = [['a,b,c,d']] @@ -470,7 +432,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): record_defaults=record_defaults, buffer_size=i) - 
@combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withLF(self): record_defaults = [['NA']] * 3 inputs = [['abc,def,ghi', '0,1,2', ',,']] @@ -478,7 +439,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\n', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCR(self): # Test that when the line separator is '\r', parsing works with all buffer # sizes @@ -488,7 +448,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCRLF(self): # Test that when the line separator is '\r\n', parsing works with all buffer # sizes @@ -498,7 +457,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r\n', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withBufferSizeAndQuoted(self): record_defaults = [['NA']] * 3 inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] @@ -507,7 +465,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\n', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCRAndQuoted(self): # Test that when the line separator is '\r', parsing works with all buffer # sizes @@ -518,7 +475,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCRLFAndQuoted(self): # Test that when the line separator is '\r\n', parsing works with all buffer # sizes @@ -529,7 +485,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r\n', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withGzipCompressionType(self): record_defaults = [['NA']] * 3 inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] @@ -542,7 +497,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): compression_type='GZIP', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withZlibCompressionType(self): record_defaults = [['NA']] * 3 inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] @@ -555,7 +509,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): compression_type='ZLIB', record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withScalarDefaults(self): record_defaults = [constant_op.constant(0, dtype=dtypes.int64)] * 4 inputs = [[',,,', '1,1,1,', ',2,2,2']] @@ -563,7 +516,6 @@ class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]], record_defaults=record_defaults) - @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_with2DDefaults(self): record_defaults = 
[constant_op.constant([[0]], dtype=dtypes.int64)] * 4 inputs = [[',,,', '1,1,1,', ',2,2,2']] diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py index 5dd1bb0532c..cca7ae073ee 100644 --- a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py @@ -17,21 +17,20 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class DenseToSparseBatchTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class DenseToSparseBatchTest(test_base.DatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDataset(self): components = np.random.randint(12, size=(100,)).astype(np.int32) dataset = dataset_ops.Dataset.from_tensor_slices( @@ -54,7 +53,6 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDatasetWithUnknownShape(self): components = np.random.randint(5, size=(40,)).astype(np.int32) dataset = dataset_ops.Dataset.from_tensor_slices( @@ -82,14 +80,12 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDatasetWithInvalidShape(self): input_tensor = array_ops.constant([[1]]) with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"): dataset_ops.Dataset.from_tensors(input_tensor).apply( batching.dense_to_sparse_batch(4, [-2])) - @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDatasetShapeErrors(self): def dataset_fn(input_tensor): diff --git a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py index fc18afaa842..4a8c7d1ccc6 100644 --- a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py @@ -17,24 +17,22 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import interleave_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import random_seed 
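The hunks above and below repeat one pattern: the per-test `@combinations.generate(test_base.default_test_combinations())` decorator (which parameterizes each test over the supported TF API versions and graph/eager execution modes) is dropped, the class stops subclassing `parameterized.TestCase` when no per-test parameters remain, and a single class-level `@test_util.run_all_in_graph_and_eager_modes` decorator is added instead. A minimal sketch of the two styles, using only the utilities imported in these files; the class and test names are placeholders, not code from this patch:

from absl.testing import parameterized

from tensorflow.python.data.kernel_tests import test_base
from tensorflow.python.framework import combinations
from tensorflow.python.framework import test_util


# Style being removed: each decorated test is generated once per entry in the
# default combinations matrix (API version x execution mode).
class ExampleCombinationsTest(test_base.DatasetTestBase,
                              parameterized.TestCase):

  @combinations.generate(test_base.default_test_combinations())
  def testSomething(self):
    pass  # Placeholder body.


# Style being reintroduced: one class-level decorator runs every test method
# in both graph and eager mode, so no per-test decorators are needed.
@test_util.run_all_in_graph_and_eager_modes
class ExampleTestUtilTest(test_base.DatasetTestBase):

  def testSomething(self):
    pass  # Placeholder body.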
+from tensorflow.python.framework import test_util from tensorflow.python.platform import test -class DirectedInterleaveDatasetTest(test_base.DatasetTestBase, - parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class DirectedInterleaveDatasetTest(test_base.DatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testBasic(self): selector_dataset = dataset_ops.Dataset.range(10).repeat(100) input_datasets = [ @@ -78,7 +76,6 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase, return freqs - @combinations.generate(test_base.default_test_combinations()) def testSampleFromDatasets(self): random_seed.set_random_seed(1619) num_samples = 5000 @@ -98,7 +95,6 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase, freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples) self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2) - @combinations.generate(test_base.default_test_combinations()) def testSelectFromDatasets(self): words = [b"foo", b"bar", b"baz"] datasets = [dataset_ops.Dataset.from_tensors(w).repeat() for w in words] @@ -111,7 +107,6 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase, with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testErrors(self): with self.assertRaisesRegexp(ValueError, r"vector of length `len\(datasets\)`"): diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py index 59c2ef68d99..f65740c5651 100644 --- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py @@ -23,30 +23,25 @@ from tensorflow.python.data.experimental.ops import get_single_element from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import function -from tensorflow.python.framework import combinations from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import variables from tensorflow.python.platform import test +@test_util.run_all_in_graph_and_eager_modes class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): - @combinations.generate( - combinations.times( - test_base.default_test_combinations(), - combinations.combine( - skip=[0, 5, 10], take=[1], error=[None], error_msg=[None]) + - combinations.combine( - skip=[100], - take=[1], - error=[errors.InvalidArgumentError], - error_msg=["Dataset was empty."]) + combinations.combine( - skip=[0], - take=[2], - error=[errors.InvalidArgumentError], - error_msg=["Dataset had more than one element."]))) + @parameterized.named_parameters( + ("Zero", 0, 1), + ("Five", 5, 1), + ("Ten", 10, 1), + ("Empty", 100, 1, errors.InvalidArgumentError, "Dataset was empty."), + ("MoreThanOne", 0, 2, errors.InvalidArgumentError, + "Dataset had more than one element."), + ) def testGetSingleElement(self, skip, take, error=None, error_msg=None): def make_sparse(x): @@ -67,7 +62,6 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaisesRegexp(error, error_msg): self.evaluate(get_single_element.get_single_element(dataset)) - 
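In the `GetSingleElementTest` hunk above, the `combinations.times(...)` matrix is folded into `@parameterized.named_parameters`, where each tuple carries a case name followed by positional arguments for the test method; trailing arguments such as `error` and `error_msg` fall back to the defaults in the test signature when omitted. A minimal sketch of that per-method pattern, with a placeholder class name and only two of the cases from the hunk:

from absl.testing import parameterized

from tensorflow.python.data.kernel_tests import test_base
from tensorflow.python.framework import errors


class ExampleNamedParametersTest(test_base.DatasetTestBase,
                                 parameterized.TestCase):

  # Each tuple is (case_name, skip, take[, error, error_msg]); omitted
  # trailing values use the defaults declared on the test method.
  @parameterized.named_parameters(
      ("Zero", 0, 1),
      ("Empty", 100, 1, errors.InvalidArgumentError, "Dataset was empty."),
  )
  def testGetSingleElement(self, skip, take, error=None, error_msg=None):
    del skip, take, error, error_msg  # Placeholder body for the sketch.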
@combinations.generate(test_base.default_test_combinations()) def testWindow(self): """Test that `get_single_element()` can consume a nested dataset.""" def flat_map_func(ds): @@ -79,7 +73,6 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces( dataset, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) - @combinations.generate(test_base.default_test_combinations()) def testSideEffect(self): counter_var = variables.Variable(0) @@ -99,7 +92,6 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertEqual(self.evaluate(fn()), b"hello") self.assertEqual(self.evaluate(counter_var), 1) - @combinations.generate(test_base.default_test_combinations()) def testAutomaticControlDependencies(self): counter_var = variables.Variable(1) diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py index bf823143d57..0e9042b2ef8 100644 --- a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py @@ -17,26 +17,25 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class GroupByReducerTest(test_base.DatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testSum(self): reducer = grouping.Reducer( init_func=lambda _: np.int64(0), @@ -50,7 +49,6 @@ class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[(i - 1) * i, i * i]) - @combinations.generate(test_base.default_test_combinations()) def testAverage(self): def reduce_fn(x, y): @@ -70,7 +68,6 @@ class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[i - 1, i]) - @combinations.generate(test_base.default_test_combinations()) def testConcat(self): components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray) reducer = grouping.Reducer( @@ -87,7 +84,6 @@ class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[b"acegikmoqs"[:i], b"bdfhjlnprt"[:i]]) - @combinations.generate(test_base.default_test_combinations()) def testSparseSum(self): def _sparse(i): return sparse_tensor.SparseTensorValue( @@ -107,7 +103,6 @@ class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[(i - 1) * i, i * i]) - 
@combinations.generate(test_base.default_test_combinations()) def testChangingStateShape(self): def reduce_fn(x, _): @@ -135,7 +130,6 @@ class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testTypeMismatch(self): reducer = grouping.Reducer( init_func=lambda x: constant_op.constant(1, dtype=dtypes.int32), @@ -150,7 +144,6 @@ class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): grouping.group_by_reducer(lambda _: np.int64(0), reducer)) # TODO(b/78665031): Remove once non-scalar keys are supported. - @combinations.generate(test_base.default_test_combinations()) def testInvalidKeyShape(self): reducer = grouping.Reducer( init_func=lambda x: np.int64(0), @@ -164,7 +157,6 @@ class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): grouping.group_by_reducer(lambda _: np.int64((0, 0)), reducer)) # TODO(b/78665031): Remove once non-int64 keys are supported. - @combinations.generate(test_base.default_test_combinations()) def testInvalidKeyType(self): reducer = grouping.Reducer( init_func=lambda x: np.int64(0), @@ -177,7 +169,6 @@ class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): dataset.apply( grouping.group_by_reducer(lambda _: "wrong", reducer)) - @combinations.generate(test_base.default_test_combinations()) def testTuple(self): def init_fn(_): return np.array([], dtype=np.int64), np.int64(0) diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py index 2495083cf63..e529364e509 100644 --- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py @@ -17,18 +17,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import string_ops @@ -38,7 +37,8 @@ from tensorflow.python.platform import test # NOTE(mrry): These tests are based on the tests in bucket_ops_test.py. # Currently, they use a constant batch size, though should be made to use a # different batch size per key. 
-class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class GroupByWindowTest(test_base.DatasetTestBase): def _dynamicPad(self, bucket, window, window_size): # TODO(mrry): To match `tf.contrib.training.bucket()`, implement a @@ -51,7 +51,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): 32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape( [None]), tensor_shape.TensorShape([3]))))) - @combinations.generate(test_base.default_test_combinations()) def testSingleBucket(self): def _map_fn(v): @@ -81,7 +80,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertAllEqual(expected_unk_int64, bucketed_values[1]) self.assertAllEqual(expected_vec3_str, bucketed_values[2]) - @combinations.generate(test_base.default_test_combinations()) def testEvenOddBuckets(self): def _map_fn(v): @@ -134,7 +132,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1]) self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2]) - @combinations.generate(test_base.default_test_combinations()) def testEvenOddBucketsFilterOutAllOdd(self): def _map_fn(v): @@ -176,7 +173,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertAllEqual( np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"]) - @combinations.generate(test_base.default_test_combinations()) def testDynamicWindowSize(self): components = np.arange(100).astype(np.int64) @@ -206,7 +202,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertEqual(batches, 15) - @combinations.generate(test_base.default_test_combinations()) def testSimple(self): components = np.random.randint(100, size=(200,)).astype(np.int64) dataset = dataset_ops.Dataset.from_tensor_slices( @@ -227,7 +222,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertGreaterEqual(num_full_batches, 24) self.assertTrue(all(c == 4 for c in counts[:num_full_batches])) - @combinations.generate(test_base.default_test_combinations()) def testImmediateOutput(self): components = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64) @@ -246,7 +240,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next())) self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next())) - @combinations.generate(test_base.default_test_combinations()) def testSmallGroups(self): components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64) dataset = dataset_ops.Dataset.from_tensor_slices(components).apply( @@ -259,7 +252,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertAllEqual([0, 0, 0], self.evaluate(get_next())) self.assertAllEqual([1], self.evaluate(get_next())) - @combinations.generate(test_base.default_test_combinations()) def testEmpty(self): dataset = dataset_ops.Dataset.range(4).apply( grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0)) @@ -270,7 +262,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): "Window size must be greater than zero, but got 0."): print(self.evaluate(get_next())) - @combinations.generate(test_base.default_test_combinations()) def testReduceFuncError(self): components = np.random.randint(100, size=(200,)).astype(np.int64) @@ -289,7 +280,6 @@ class 
GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testConsumeWindowDatasetMoreThanOnce(self): components = np.random.randint(50, size=(200,)).astype(np.int64) @@ -321,7 +311,6 @@ class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): counts.append(tight_result.shape[0]) self.assertEqual(len(components), sum(counts)) - @combinations.generate(test_base.default_test_combinations()) def testShortCircuit(self): dataset = dataset_ops.Dataset.range(10) diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py index 5ed72767425..c37439f328b 100644 --- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py @@ -19,15 +19,14 @@ from __future__ import print_function import os -from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import error_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers -from tensorflow.python.framework import combinations from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.lib.io import python_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import io_ops @@ -37,9 +36,9 @@ from tensorflow.python.util import compat _NUMPY_RANDOM_SEED = 42 -class IgnoreErrorsTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class IgnoreErrorsTest(test_base.DatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) @@ -54,7 +53,6 @@ class IgnoreErrorsTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testParallelMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) @@ -69,7 +67,6 @@ class IgnoreErrorsTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testReadFileIgnoreError(self): def write_string_to_file(value, filename): @@ -105,7 +102,6 @@ class IgnoreErrorsTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testTFRecordDatasetIgnoreError(self): filenames = [] for i in range(5): @@ -130,7 +126,6 @@ class IgnoreErrorsTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testZipIgnoreError(self): a = dataset_ops.Dataset.from_tensor_slices([1., 2., 0., 4.]) b = a.map(lambda x: array_ops.check_numerics(1. 
/ x, "error")) diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py index 980fd03b073..2ddff457bc4 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py @@ -17,29 +17,26 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.ops import readers -from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers as core_readers from tensorflow.python.data.util import nest -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.ops import io_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import test +@test_util.run_all_in_graph_and_eager_modes class MakeBatchedFeaturesDatasetTest( - reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase, - parameterized.TestCase): + reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testRead(self): for batch_size in [1, 2]: for num_epochs in [1, 10]: @@ -88,7 +85,6 @@ class MakeBatchedFeaturesDatasetTest( with self.assertRaises(errors.OutOfRangeError): self._next_actual_batch() - @combinations.generate(test_base.default_test_combinations()) def testReadWithEquivalentDataset(self): features = { "file": parsing_ops.FixedLenFeature([], dtypes.int64), @@ -107,7 +103,6 @@ class MakeBatchedFeaturesDatasetTest( with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testReadWithFusedShuffleRepeatDataset(self): num_epochs = 5 total_records = num_epochs * self._num_records @@ -156,7 +151,6 @@ class MakeBatchedFeaturesDatasetTest( all_equal = all_equal and np.array_equal(batch1[i], batch2[i]) self.assertFalse(all_equal) - @combinations.generate(test_base.default_test_combinations()) def testParallelReadersAndParsers(self): num_epochs = 5 for batch_size in [1, 2]: @@ -192,7 +186,6 @@ class MakeBatchedFeaturesDatasetTest( with self.assertRaises(errors.OutOfRangeError): self._next_actual_batch() - @combinations.generate(test_base.default_test_combinations()) def testDropFinalBatch(self): for batch_size in [1, 2]: for num_epochs in [1, 10]: @@ -208,7 +201,6 @@ class MakeBatchedFeaturesDatasetTest( if isinstance(tensor, ops.Tensor): # Guard against SparseTensor. 
self.assertEqual(tensor.shape[0], batch_size) - @combinations.generate(test_base.default_test_combinations()) def testIndefiniteRepeatShapeInference(self): dataset = self.make_batch_feature( filenames=self.test_filenames[0], @@ -221,7 +213,6 @@ class MakeBatchedFeaturesDatasetTest( if issubclass(clazz, ops.Tensor): self.assertEqual(32, shape[0]) - @combinations.generate(test_base.default_test_combinations()) def testOldStyleReader(self): with self.assertRaisesRegexp( TypeError, r"The `reader` argument must return a `Dataset` object. " diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py index 5f8382f43c4..16c323b3790 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py @@ -21,21 +21,21 @@ import gzip import os import zlib -from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import readers from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest -from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.platform import test -class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class MakeCsvDatasetTest(test_base.DatasetTestBase): def _make_csv_dataset(self, filenames, batch_size, num_epochs=1, **kwargs): return readers.make_csv_dataset( @@ -126,7 +126,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self._verify_output(dataset, batch_size, num_epochs, label_name, expected_output, expected_keys) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset(self): """Tests making a CSV dataset with keys and defaults provided.""" record_defaults = [ @@ -158,7 +157,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): column_defaults=record_defaults, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withBatchSizeAndEpochs(self): """Tests making a CSV dataset with keys and defaults provided.""" record_defaults = [ @@ -190,7 +188,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): column_defaults=record_defaults, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withCompressionType(self): """Tests `compression_type` argument.""" record_defaults = [ @@ -224,7 +221,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): compression_type=compression_type, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withCompressionTypeAndNoColumnNames(self): """Tests `compression_type` argument.""" record_defaults = [ @@ -273,7 +269,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): compression_type="ZLIB", ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withBadInputs(self): """Tests that exception is raised when input is malformed. 
""" @@ -309,7 +304,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): label_name="not_a_real_label", column_names=column_names) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNoLabel(self): """Tests making a CSV dataset with no label provided.""" record_defaults = [ @@ -339,7 +333,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): column_defaults=record_defaults, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNoHeader(self): """Tests that datasets can be created from CSV files with no header line. """ @@ -370,7 +363,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): column_defaults=record_defaults, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypes(self): """Tests that defaults can be a dtype instead of a Tensor for required vals. """ @@ -402,7 +394,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): column_defaults=record_defaults, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNoColNames(self): """Tests that datasets can be created when column names are not specified. @@ -436,7 +427,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): column_defaults=record_defaults, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypeInferenceMismatch(self): # Test that error is thrown when num fields doesn't match columns column_names = ["col%d" % i for i in range(5)] @@ -452,7 +442,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): batch_size=2, num_epochs=10) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypeInference(self): """Tests that datasets can be created when no defaults are specified. @@ -479,7 +468,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): header=True, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypeInferenceFallthrough(self): """Tests that datasets can be created when no defaults are specified. 
@@ -510,7 +498,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): header=True, ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNAValuesAndFieldDelim(self): """Tests that datasets can be created from different delim and na_value.""" column_names = ["col%d" % i for i in range(5)] @@ -533,7 +520,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): field_delim=" ", ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withSelectCols(self): record_defaults = [ constant_op.constant([], dtypes.int32), @@ -602,7 +588,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): select_columns=[column_names[i] for i in select_cols], ) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withSelectColsError(self): record_defaults = [ constant_op.constant([], dtypes.int32), @@ -641,7 +626,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): label_name=None, select_columns=["invalid_col_name"]) - @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withShuffle(self): record_defaults = [ constant_op.constant([], dtypes.int32), @@ -726,7 +710,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): all_equal = all_equal and np.array_equal(batch1[i], batch2[i]) self.assertFalse(all_equal) - @combinations.generate(test_base.default_test_combinations()) def testIndefiniteRepeatShapeInference(self): column_names = ["col%d" % i for i in range(5)] inputs = [[",".join(x for x in column_names), "0,1,2,3,4", "5,6,7,8,9"], [ diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py index a67ccd92842..ec1760398fa 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py @@ -17,22 +17,19 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized - from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.ops import readers -from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest -from tensorflow.python.framework import combinations from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import string_ops from tensorflow.python.platform import test +@test_util.run_all_in_graph_and_eager_modes class MakeTFRecordDatasetTest( - reader_dataset_ops_test_base.TFRecordDatasetTestBase, - parameterized.TestCase): + reader_dataset_ops_test_base.TFRecordDatasetTestBase): def _read_test(self, batch_size, num_epochs, file_index=None, num_parallel_reads=1, drop_final_batch=False, parser_fn=False): @@ -66,7 +63,6 @@ class MakeTFRecordDatasetTest( with self.assertRaises(errors.OutOfRangeError): self.evaluate(outputs()) - @combinations.generate(test_base.default_test_combinations()) def testRead(self): for batch_size in [1, 2]: for num_epochs in [1, 3]: @@ -82,7 +78,6 @@ class MakeTFRecordDatasetTest( # Basic test: read from both files, with parallel reads. 
self._read_test(batch_size, num_epochs, num_parallel_reads=8) - @combinations.generate(test_base.default_test_combinations()) def testDropFinalBatch(self): for batch_size in [1, 2, 10]: for num_epochs in [1, 3]: @@ -96,7 +91,6 @@ class MakeTFRecordDatasetTest( self._read_test(batch_size, num_epochs, num_parallel_reads=8, drop_final_batch=True) - @combinations.generate(test_base.default_test_combinations()) def testParserFn(self): for batch_size in [1, 2]: for num_epochs in [1, 3]: @@ -151,7 +145,6 @@ class MakeTFRecordDatasetTest( actual.extend(b) self.assertAllEqual(sorted(expected), sorted(actual)) - @combinations.generate(test_base.default_test_combinations()) def testShuffle(self): for batch_size in [1, 2]: for num_epochs in [1, 3]: @@ -163,7 +156,6 @@ class MakeTFRecordDatasetTest( self._shuffle_test(batch_size, num_epochs, num_parallel_reads, seed=21345) - @combinations.generate(test_base.default_test_combinations()) def testIndefiniteRepeatShapeInference(self): dataset = readers.make_tf_record_dataset( file_pattern=self.test_filenames, num_epochs=None, batch_size=32) diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py index a2cc54d104e..a42ce40fb29 100644 --- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py @@ -19,19 +19,17 @@ from __future__ import print_function import time -from absl.testing import parameterized - from tensorflow.python.client import session from tensorflow.python.data.experimental.ops import map_defun from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.eager import function -from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_spec +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import data_flow_ops @@ -40,11 +38,9 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test -# TODO(b/123903858): Add eager and V2 test coverage -class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_v1_only("b/123903858: Add eager and V2 test coverage") +class MapDefunTest(test_base.DatasetTestBase): - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testNoIntraOpLimit(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -59,8 +55,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): expected = elems * 2 + 3 self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunSimple(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -73,8 +67,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): expected = elems * 2 + 3 self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunMismatchedTypes(self): 
@function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)]) @@ -87,8 +79,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(r) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunReduceDim(self): # Tests where the output has a different rank from the input @@ -102,8 +92,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): expected = constant_op.constant([1, 3, 5]) self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunMultipleOutputs(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -117,8 +105,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): expected = [elems, elems * 2 + 3] self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunShapeInference(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -130,8 +116,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)])[0] self.assertEqual(result.get_shape(), (3, 2)) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunPartialShapeInference(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -142,8 +126,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)]) self.assertEqual(result[0].get_shape().as_list(), [None, 2]) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunRaisesErrorOnRuntimeShapeMismatch(self): @function.defun(input_signature=[ @@ -163,8 +145,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): "All inputs must have the same dimension 0."): sess.run(result, feed_dict={elems1: [1, 2, 3, 4, 5], elems2: [1, 2, 3]}) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunRaisesDefunError(self): @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)]) @@ -177,8 +157,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(result) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunCancelledCorrectly(self): @function.defun(input_signature=[tensor_spec.TensorSpec([5], dtypes.int64)]) @@ -195,8 +173,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): r"indices = 10 is not in \[0, 5\)"): self.evaluate(map_defun_op) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithUnspecifiedOutputShape(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -214,8 +190,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertAllEqual(self.evaluate(r[1]), self.evaluate(expected + 1)) self.assertAllEqual(self.evaluate(r[2]), self.evaluate(expected + 2)) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithDifferentOutputShapeEachRun(self): @function.defun( @@ 
-230,8 +204,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertAllEqual( sess.run(r, feed_dict={elems: [[0], [1]]}), [[3], [5]]) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithWrongOutputShape(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -244,8 +216,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(r) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithInvalidInput(self): @function.defun( @@ -263,8 +233,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.InvalidArgumentError): sess.run(r, feed_dict={p: 0}) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithParentCancellation(self): # Checks that a cancellation of the parent graph is threaded through to # MapDefunOp correctly. @@ -286,8 +254,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): sess.close() thread.join() - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithCapturedInputs(self): c = constant_op.constant(2) @@ -300,8 +266,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): expected = x + c self.assertAllEqual(self.evaluate(expected), self.evaluate(map_defun_op)) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithVariantTensor(self): @function.defun( @@ -324,8 +288,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): actual = self.evaluate(deserialized) self.assertValuesEqual(expected, actual) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithVariantTensorAsCaptured(self): st = sparse_tensor.SparseTensor( @@ -347,8 +309,6 @@ class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): actual = self.evaluate(deserialized) self.assertValuesEqual(expected, actual) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithStrTensor(self): @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py index d7944042c6e..811a58262ef 100644 --- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py @@ -28,13 +28,14 @@ from tensorflow.python.data.experimental.ops import threadpool from tensorflow.python.data.experimental.ops import unique from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import script_ops from tensorflow.python.platform import test +@test_util.run_all_in_graph_and_eager_modes class OverrideThreadpoolTest(test_base.DatasetTestBase, parameterized.TestCase): @@ -69,13 +70,17 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase, # perform work. 
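The `MapDefunTest` hunks above collapse the per-test `@combinations.generate(combinations.combine(tf_api_version=[1], mode=["graph"]))` decorators into a single class-level `@test_util.run_v1_only(...)` marker whose argument records the tracking bug as the reason the tests stay on TF1 graph mode. A minimal sketch of that pattern; the class and test names below are placeholders:

from tensorflow.python.data.kernel_tests import test_base
from tensorflow.python.framework import test_util


# run_v1_only restricts every test in the class to TF1 behavior and records
# why; here the reason string points at the bug tracking eager/V2 coverage.
@test_util.run_v1_only("b/123903858: Add eager and V2 test coverage")
class ExampleV1OnlyTest(test_base.DatasetTestBase):

  def testGraphOnlyBehavior(self):
    pass  # Placeholder body.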
self.assertLessEqual(len(thread_ids), num_threads) - @combinations.generate( - combinations.times( - test_base.default_test_combinations(), - combinations.combine( - num_threads=[1, 2, 4, 8, 16], max_intra_op_parallelism=[None]) + - combinations.combine( - num_threads=[4], max_intra_op_parallelism=[-1, 0, 4]))) + @parameterized.named_parameters( + ("1", 1, None), + ("2", 2, None), + ("3", 4, None), + ("4", 8, None), + ("5", 16, None), + ("6", 4, -1), + ("7", 4, 0), + ("8", 4, 1), + ("9", 4, 4), + ) def testNumThreadsDeprecated(self, num_threads, max_intra_op_parallelism): def override_threadpool_fn(dataset): @@ -88,17 +93,20 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase, self._testNumThreadsHelper(num_threads, override_threadpool_fn) - @combinations.generate( - combinations.times( - test_base.default_test_combinations(), - combinations.combine( - num_threads=[1, 2, 4, 8, 16], max_intra_op_parallelism=[None]) + - combinations.combine( - num_threads=[None], max_intra_op_parallelism=[0, 1, 4]) + - combinations.combine( - num_threads=[4], max_intra_op_parallelism=[0, 1, 4]) + - combinations.combine( - num_threads=[None], max_intra_op_parallelism=[None]))) + @parameterized.named_parameters( + ("1", 1, None), + ("2", 2, None), + ("3", 4, None), + ("4", 8, None), + ("5", 16, None), + ("6", None, 0), + ("7", None, 1), + ("8", None, 4), + ("9", 4, 0), + ("10", 4, 1), + ("11", 4, 4), + ("12", None, None), + ) def testNumThreads(self, num_threads, max_intra_op_parallelism): def override_threadpool_fn(dataset): @@ -113,7 +121,6 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase, self._testNumThreadsHelper(num_threads, override_threadpool_fn) - @combinations.generate(test_base.default_test_combinations()) def testMaxIntraOpParallelismAsGraphDefInternal(self): dataset = dataset_ops.Dataset.from_tensors(0) dataset = dataset_ops._MaxIntraOpParallelismDataset(dataset, 1) diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py index 0fb8c78a7c0..1fe5655ec02 100644 --- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py @@ -22,25 +22,24 @@ import math import threading import time -from absl.testing import parameterized import numpy as np from six.moves import zip_longest from tensorflow.python.data.experimental.ops import interleave_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import test_util from tensorflow.python.ops import math_ops from tensorflow.python.ops import script_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test -#TODO(feihugis): refactor this test to be parameterized. 
-class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class ParallelInterleaveTest(test_base.DatasetTestBase): def setUp(self): @@ -117,7 +116,6 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): num_open -= 1 break - @combinations.generate(test_base.default_test_combinations()) def testPythonImplementation(self): input_lists = [[4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6], [4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6]] @@ -138,7 +136,6 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertEqual(expected, produced, "Values differ at %s. %s != %s" % (index, expected, produced)) - @combinations.generate(test_base.default_test_combinations()) def testPythonImplementationBlockLength(self): input_lists = [[4] * 4, [5] * 5, [6] * 6] * 2 expected_elements = [ @@ -150,7 +147,6 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertEqual(expected, produced, "Values differ at %s. %s != %s" % (index, expected, produced)) - @combinations.generate(test_base.default_test_combinations()) def testPythonImplementationEmptyLists(self): input_lists = [[4, 4, 4, 4], [], [6, 6, 6, 6, 6, 6], [4, 4, 4, 4], [], [6, 6, 6, 6, 6, 6]] @@ -193,23 +189,18 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testSingleThreaded(self): self._testSingleThreaded() - @combinations.generate(test_base.default_test_combinations()) def testSingleThreadedSloppy(self): self._testSingleThreaded(sloppy=True) - @combinations.generate(test_base.default_test_combinations()) def testSingleThreadedPrefetch1Itr(self): self._testSingleThreaded(prefetch_input_elements=1) - @combinations.generate(test_base.default_test_combinations()) def testSingleThreadedPrefetch1ItrSloppy(self): self._testSingleThreaded(prefetch_input_elements=1, sloppy=True) - @combinations.generate(test_base.default_test_combinations()) def testSingleThreadedRagged(self): # Tests a sequence with wildly different elements per iterator. 
self.skipTest("b/131722904") @@ -268,11 +259,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContention(self): self._testTwoThreadsNoContention() - @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionSloppy(self): self._testTwoThreadsNoContention(sloppy=True) @@ -317,11 +306,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRaces(self): self._testTwoThreadsNoContentionWithRaces() - @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRacesSloppy(self): self._testTwoThreadsNoContentionWithRaces(sloppy=True) @@ -356,11 +343,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionBlockLength(self): self._testTwoThreadsNoContentionBlockLength() - @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionBlockLengthSloppy(self): self._testTwoThreadsNoContentionBlockLength(sloppy=True) @@ -406,11 +391,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRacesAndBlocking(self): self._testTwoThreadsNoContentionWithRacesAndBlocking() - @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRacesAndBlockingSloppy(self): self._testTwoThreadsNoContentionWithRacesAndBlocking(sloppy=True) @@ -428,11 +411,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testEmptyInput(self): self._testEmptyInput() - @combinations.generate(test_base.default_test_combinations()) def testEmptyInputSloppy(self): self._testEmptyInput(sloppy=True) @@ -450,11 +431,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testNonEmptyInputIntoEmptyOutputs(self): self._testNonEmptyInputIntoEmptyOutputs() - @combinations.generate(test_base.default_test_combinations()) def testNonEmptyInputIntoEmptyOutputsSloppy(self): self._testNonEmptyInputIntoEmptyOutputs(sloppy=True) @@ -490,15 +469,12 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): "At index %s: %s expected, got: %s" % (i, expected_element, actual_element)) - @combinations.generate(test_base.default_test_combinations()) def testPartiallyEmptyOutputs(self): self._testPartiallyEmptyOutputs() - @combinations.generate(test_base.default_test_combinations()) def testPartiallyEmptyOutputsSloppy(self): self._testPartiallyEmptyOutputs(sloppy=True, prefetch_input_elements=0) - @combinations.generate(test_base.default_test_combinations()) 
def testDelayedOutputSloppy(self): # Explicitly control the sequence of events to ensure we correctly avoid # head-of-line blocking. @@ -524,7 +500,6 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testBlockLengthWithContentionSloppy(self): self.skipTest("b/131722904") self._clear_coordination_events() @@ -582,11 +557,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): self.read_coordination_events[i].acquire() self.write_coordination_events[i].set() - @combinations.generate(test_base.default_test_combinations()) def testEarlyExit(self): self._testEarlyExit() - @combinations.generate(test_base.default_test_combinations()) def testEarlyExitSloppy(self): self._testEarlyExit(sloppy=True) @@ -611,15 +584,12 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2) self.assertItemsEqual(output_values, expected_values) - @combinations.generate(test_base.default_test_combinations()) def testTooManyReaders(self): self._testTooManyReaders() - @combinations.generate(test_base.default_test_combinations()) def testTooManyReadersSloppy(self): self._testTooManyReaders(sloppy=True) - @combinations.generate(test_base.default_test_combinations()) def testSparse(self): def _map_fn(i): return sparse_tensor.SparseTensor( @@ -640,7 +610,6 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testErrorsInOutputFn(self): self.skipTest("b/131722904") self._clear_coordination_events() @@ -673,7 +642,6 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testErrorsInInputFn(self): def map_py_fn(x): @@ -719,7 +687,6 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testErrorsInInterleaveFn(self): def map_py_fn(x): @@ -763,7 +730,6 @@ class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testShutdownRace(self): dataset = dataset_ops.Dataset.range(20) map_fn = lambda x: dataset_ops.Dataset.range(20 * x, 20 * (x + 1)) diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py index 58cba64617d..794f72365df 100644 --- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py @@ -20,7 +20,6 @@ from __future__ import print_function import copy -from absl.testing import parameterized import numpy as np from tensorflow.core.example import example_pb2 @@ -29,11 +28,11 @@ from tensorflow.python.data.experimental.ops import parsing_ops as contrib_parsi from tensorflow.python.data.kernel_tests import test_base from 
tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import test_util from tensorflow.python.ops import parsing_ops from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test @@ -51,8 +50,8 @@ feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d) sequence_example = example_pb2.SequenceExample -class ParseExampleDatasetTest(test_base.DatasetTestBase, - parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class ParseExampleDatasetTest(test_base.DatasetTestBase): def _compare_output_to_expected(self, dict_tensors, expected_tensors): self.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys())) @@ -108,7 +107,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, self.assertEqual( dataset_ops.get_legacy_output_shapes(dataset)[k].as_list()[1], None) - @combinations.generate(test_base.default_test_combinations()) def testEmptySerializedWithAllDefaults(self): sparse_name = "st_a" a_name = "a" @@ -147,7 +145,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.run_deprecated_v1 def testEmptySerializedWithoutDefaultsShouldFail(self): input_features = { "st_a": @@ -181,7 +179,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_err=(errors_impl.InvalidArgumentError, "Feature: c \\(data type: float\\) is required")) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.run_deprecated_v1 def testDenseNotMatchingShapeShouldFail(self): original = [ example(features=features({ @@ -199,7 +197,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_err=(errors_impl.InvalidArgumentError, "Key: a, Index: 1. 
Number of float values")) - @combinations.generate(test_base.default_test_combinations()) def testDenseDefaultNoShapeShouldFail(self): original = [example(features=features({"a": float_feature([1, 1, 3]),})),] @@ -210,7 +207,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, {"a": parsing_ops.FixedLenFeature(None, dtypes.float32)}, expected_err=(ValueError, "Missing shape for feature a")) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingSparse(self): original = [ example(features=features({ @@ -252,7 +248,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingSparseFeature(self): original = [ example(features=features({ @@ -289,7 +284,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingSparseFeatureReuse(self): original = [ example(features=features({ @@ -331,7 +325,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContaining3DSparseFeature(self): original = [ example(features=features({ @@ -377,7 +370,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDense(self): aname = "a" bname = "b*has+a:tricky_name" @@ -415,7 +407,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, # This test is identical as the previous one except # for the creation of 'serialized'. 
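For reference, a minimal sketch of the two test-selection styles these hunks toggle between, using only decorator and helper names that already appear in the hunks above; the class and test names below are placeholders and this snippet is not part of the applied patch:

from absl.testing import parameterized

from tensorflow.python.data.kernel_tests import test_base
from tensorflow.python.framework import combinations
from tensorflow.python.framework import test_util
from tensorflow.python.platform import test


# Style restored by this patch: test_util decorators. The class decorator runs
# every test in both graph and eager mode; individual tests can still be pinned
# to the legacy v1/graph path.
@test_util.run_all_in_graph_and_eager_modes
class ExampleStyleATest(test_base.DatasetTestBase):

  @test_util.run_deprecated_v1
  def testGraphOnlyCase(self):
    pass  # placeholder body


# Style removed by this patch: the combinations framework, where each test
# declares the (tf_api_version, mode) grid it runs under.
class ExampleStyleBTest(test_base.DatasetTestBase, parameterized.TestCase):

  @combinations.generate(test_base.default_test_combinations())
  def testDefaultCombinations(self):
    pass  # placeholder body

  @combinations.generate(test_base.graph_only_combinations())
  def testGraphOnlyCombinations(self):
    pass  # placeholder body


if __name__ == "__main__":
  test.main()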
- @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDenseWithConcat(self): aname = "a" bname = "b*has+a:tricky_name" @@ -461,7 +452,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDenseScalar(self): original = [ example(features=features({ @@ -486,7 +476,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDenseWithDefaults(self): original = [ example(features=features({ @@ -525,7 +514,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedSparseAndSparseFeatureAndDenseWithNoDefault(self): expected_st_a = sparse_tensor.SparseTensorValue( # indices, values, shape np.empty((0, 2), dtype=np.int64), # indices @@ -581,7 +569,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testerializedContainingSparseAndSparseFeatureWithReuse(self): expected_idx = sparse_tensor.SparseTensorValue( # indices, values, shape np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64), @@ -680,13 +667,11 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingVarLenDenseLargerBatch(self): np.random.seed(3456) for batch_size in (1, 10, 20, 100, 256): self._testSerializedContainingVarLenDenseLargerBatch(batch_size) - @combinations.generate(test_base.default_test_combinations()) def testSerializedShapeMismatch(self): aname = "a" bname = "b" @@ -739,7 +724,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_err=(ValueError, "Cannot reshape a tensor with 0 elements to shape")) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.run_deprecated_v1 def testSerializedContainingVarLenDense(self): aname = "a" bname = "b" @@ -892,7 +877,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, "Unsupported: FixedLenSequenceFeature requires " "allow_missing to be True.")) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingRaggedFeatureWithNoPartitions(self): original = [ example( @@ -938,7 +922,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingRaggedFeatureWithOnePartition(self): original = [ example( @@ -1057,7 +1040,6 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase, expected_values=expected_output, create_iterator_twice=True) - @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingRaggedFeatureWithMultiplePartitions(self): original = [ # rt shape: [(batch), 2, None, None] diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py index 8ac4e239881..f51da6e8b66 100644 --- 
a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py @@ -17,14 +17,11 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized - from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.experimental.ops import prefetching_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import structure -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -34,9 +31,9 @@ from tensorflow.python.platform import test # TODO(b/117581999): add eager coverage when supported. -class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): +class PrefetchToDeviceTest(test_base.DatasetTestBase): - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testPrefetchToDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -60,7 +57,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testPrefetchToSameDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -85,7 +82,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testPrefetchDictToDevice(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -109,7 +106,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testPrefetchSparseTensorsToDevice(self): def make_tensor(i): return sparse_tensor.SparseTensorValue( @@ -139,7 +136,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testPrefetchToDeviceGpu(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -159,7 +156,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def testPrefetchToDeviceWithReInit(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -187,7 +184,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.deprecated_graph_mode_only def 
testPrefetchToDeviceGpuWithReInit(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py index ff1f1680a76..abc9eb5f0ad 100644 --- a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py @@ -24,17 +24,16 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import multi_device_iterator_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.platform import test +@test_util.run_all_in_graph_and_eager_modes class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): - # TODO(b/121264236) - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) + @test_util.run_v1_only("b/121264236") def testPrefetchWithSlackOption(self): """Determines slack_period based on num devices attached to iterator.""" dataset = dataset_ops.Dataset.range(10) @@ -61,7 +60,6 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): self.evaluate(elem_on_1) self.evaluate(elem_on_2) - @combinations.generate(test_base.default_test_combinations()) def testPrefetchWithSlackOptionWithoutIterator(self): """Defaults to slack period of 1 without iterator.""" dataset = dataset_ops.Dataset.range(10) @@ -74,7 +72,6 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): dataset.options()._graph_rewrite_configs()) self.assertDatasetProduces(dataset, range(10)) - @combinations.generate(test_base.default_test_combinations()) def testWithPassthroughDataset(self): """Should still work with a passthrough dataset after prefetch().""" dataset = dataset_ops.Dataset.range(10) @@ -85,7 +82,6 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = dataset.with_options(options) self.assertDatasetProduces(dataset, range(1, 11)) - @combinations.generate(test_base.default_test_combinations()) def testErrorWithoutPrefetch(self): """The rewrite fails if there is no prefetch() in the pipeline.""" dataset = dataset_ops.Dataset.range(10) @@ -96,7 +92,6 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): get_next = self.getNext(dataset) self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testErrorWithInvalidDataset(self): """With a nested dataset op after prefetch, the rewrite should fail.""" dataset = dataset_ops.Dataset.range(10) diff --git a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py index 30496658529..32bcdbe183b 100644 --- a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py @@ -32,8 +32,8 @@ from tensorflow.python.data.experimental.ops import scan_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest -from tensorflow.python.framework import 
combinations from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.lib.io import python_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -47,11 +47,13 @@ def _flat_shapes(dataset): return nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)) +@test_util.run_all_in_graph_and_eager_modes class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): - @combinations.generate( - combinations.times(test_base.default_test_combinations(), - combinations.combine(drop_remainder=[True, False]))) + drop_remainder_cases = [("WithDropRemainder", True), + ("WithoutDropRemainder", False)] + + @parameterized.named_parameters(drop_remainder_cases) def testBasic(self, drop_remainder): dataset = dataset_ops.Dataset.range(1024).batch( 32, drop_remainder=drop_remainder) @@ -62,16 +64,13 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1024, 8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testScalarInputError(self): dataset = dataset_ops.Dataset.range(1024) distribute._RebatchDataset(dataset.batch(4), num_replicas=4) with self.assertRaisesRegexp(ValueError, "at least one dimension"): distribute._RebatchDataset(dataset, num_replicas=4) - @combinations.generate( - combinations.times(test_base.default_test_combinations(), - combinations.combine(drop_remainder=[True, False]))) + @parameterized.named_parameters(drop_remainder_cases) def testBatchNotDivisibleByNumReplicas(self, drop_remainder): dataset = dataset_ops.Dataset.range(1024).batch( 32, drop_remainder=drop_remainder) @@ -90,7 +89,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): i += 4 self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testBatchSizeNotDivisibleByNumReplicas2(self): dataset = dataset_ops.Dataset.range(32).batch(16, drop_remainder=True) rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=5) @@ -104,7 +102,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output.extend([[]]) # Last replica gets an empty batch self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testTupleOutput(self): dataset = dataset_ops.Dataset.range(1024).map(lambda x: (x, x)).batch(32) rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=4) @@ -113,7 +110,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 1024, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testNestedDictionaryOutput(self): dataset = dataset_ops.Dataset.range(1024).map( lambda x: {"a": x, "b": {"c": x}}).batch(32) @@ -123,9 +119,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 1024, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate( - combinations.times(test_base.default_test_combinations(), - combinations.combine(drop_remainder=[True, False]))) + @parameterized.named_parameters(drop_remainder_cases) def testFinalPartialBatch(self, drop_remainder): dataset = 
dataset_ops.Dataset.range(1032).batch( 32, drop_remainder=drop_remainder) @@ -142,9 +136,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): [[k for k in range(i, i + 2)] for i in range(1024, 1032, 2)]) self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate( - combinations.times(test_base.default_test_combinations(), - combinations.combine(drop_remainder=[True, False]))) + @parameterized.named_parameters(drop_remainder_cases) def testFinalPartialBatchAfterRebatch(self, drop_remainder): dataset = dataset_ops.Dataset.range(34).batch( 32, drop_remainder=drop_remainder) @@ -158,7 +150,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output += [[32], [33], [], []] self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testMultipleBatches(self): dataset = dataset_ops.Dataset.range(128).batch(4).batch(8) self.assertEqual([[None, None]], @@ -179,7 +170,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 128, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testMapAndBatch(self): dataset = dataset_ops.Dataset.range(1024).apply( batching.map_and_batch(math_ops.square, 32)) @@ -190,7 +180,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 1024, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testMapAndBatchWithCapturedInput(self): captured_t = variables.Variable(42) dataset = dataset_ops.Dataset.range(1024).apply( @@ -204,7 +193,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces( rebatched_dataset, expected_output, requires_initialization=True) - @combinations.generate(test_base.default_test_combinations()) def testPaddedBatch(self): dataset = dataset_ops.Dataset.range(128).batch( 4, drop_remainder=True).padded_batch( @@ -225,7 +213,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 128, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testConcatenate(self): dataset1 = dataset_ops.Dataset.range(64).batch(8) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -237,7 +224,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): [[i, i + 1] for i in range(0, 32, 2)]) self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testConcatenateDifferentShapes(self): dataset1 = dataset_ops.Dataset.range(64).batch(16) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -249,7 +235,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): [[i, i + 1] for i in range(0, 32, 2)]) self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testZip(self): dataset1 = dataset_ops.Dataset.range(64).batch(8) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -260,7 +245,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [([i, i + 1], [i, i + 1]) for i in range(0, 32, 2)] self.assertDatasetProduces(rebatched_dataset, expected_output) - 
@combinations.generate(test_base.default_test_combinations()) def testZipDifferentShapes(self): dataset1 = dataset_ops.Dataset.range(64).batch(16) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -272,7 +256,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 32, 2)] self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testFlatMapBatching(self): dataset = dataset_ops.Dataset.range(2).flat_map( lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda @@ -291,7 +274,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 32, 8)] # generates 4 elements self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testInterleaveBatching(self): dataset = dataset_ops.Dataset.range(2).interleave( lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda @@ -308,7 +290,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output += expected_output self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testParallelInterleaveBatching(self): dataset = dataset_ops.Dataset.range(2).interleave( lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda @@ -326,7 +307,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output += expected_output self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowStaticBatch(self): dataset = dataset_ops.Dataset.from_tensor_slices( [[array_ops.constant(i, dtype=dtypes.int64)] * 3 for i in range(40)]) @@ -346,7 +326,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for k in range(2)] self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowDynamicBatch(self): # {0, 1, 0, 1, ...} dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2) @@ -371,7 +350,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[value] * batch_size for batch_size, value in pairs] self.assertDatasetProduces(dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowDynamicBatchWithPartialBatch(self): # {0, 1, 0, 1, ...} dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2) @@ -393,7 +371,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[value] * batch_size for batch_size, value in pairs] self.assertDatasetProduces(dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowDynamicBatchWithPartialBatchWithDropRemainder(self): # This test exercises nested batch functionality, dynamic batch size # and drop_remainder=True together. 
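Where a test was parameterized, the hunks in rebatch_dataset_test.py above also translate the case list. A minimal sketch of that mapping, again built only from constructs shown in those hunks (the drop_remainder cases); the test bodies are placeholders, not part of the patch:

from absl.testing import parameterized

from tensorflow.python.data.kernel_tests import test_base
from tensorflow.python.framework import combinations


class ExampleParameterizedTest(test_base.DatasetTestBase,
                               parameterized.TestCase):

  # Restored style: explicit named cases.
  @parameterized.named_parameters(("WithDropRemainder", True),
                                  ("WithoutDropRemainder", False))
  def testNamedCases(self, drop_remainder):
    del drop_remainder  # placeholder body

  # Removed style: the cartesian product of the default (tf_api_version, mode)
  # grid with the drop_remainder values.
  @combinations.generate(
      combinations.times(test_base.default_test_combinations(),
                         combinations.combine(drop_remainder=[True, False])))
  def testCombinationCases(self, drop_remainder):
    del drop_remainder  # placeholder body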
@@ -418,7 +395,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[value] * batch_size for batch_size, value in pairs] self.assertDatasetProduces(dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testScanAfterBatch(self): dataset = dataset_ops.Dataset.range(40).batch(10).apply( scan_ops.scan(np.int64(2), lambda state, value: (state, value * state))) @@ -429,7 +405,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[i * 2 for i in range(j*5, (j+1)*5)] for j in range(8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testMakeBatchedFeaturesDataset(self): # Set up fn = os.path.join(self.get_temp_dir(), "tf_record.txt") @@ -463,7 +438,6 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): } for i in range(0, 1024, 8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(rebatched_dataset, expected_output) - @combinations.generate(test_base.default_test_combinations()) def testRaggedTensorDataset(self): # Set up a dataset that produces ragged tensors with a static batch size. row_lengths = np.random.randint(8, size=128) diff --git a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py index fb1d4ea5d3a..673e77fc3bb 100644 --- a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py @@ -24,9 +24,9 @@ import numpy as np from tensorflow.python.data.experimental.ops import resampling from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import string_ops @@ -34,11 +34,12 @@ from tensorflow.python.platform import test from tensorflow.python.util import compat +@test_util.run_all_in_graph_and_eager_modes class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase): - @combinations.generate( - combinations.times(test_base.default_test_combinations(), - combinations.combine(initial_known=[True, False]))) + @parameterized.named_parameters( + ("InitialDistributionKnown", True), + ("InitialDistributionUnknown", False)) def testDistribution(self, initial_known): classes = np.random.randint(5, size=(20000,)) # Uniformly sampled target_dist = [0.9, 0.05, 0.05, 0.0, 0.0] @@ -71,9 +72,9 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase): returned_dist = class_counts / total_returned self.assertAllClose(target_dist, returned_dist, atol=1e-2) - @combinations.generate( - combinations.times(test_base.default_test_combinations(), - combinations.combine(only_initial_dist=[True, False]))) + @parameterized.named_parameters( + ("OnlyInitial", True), + ("NotInitial", False)) def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist): init_dist = [0.5, 0.5] target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0] @@ -98,7 +99,6 @@ class RejectionResampleTest(test_base.DatasetTestBase, 
parameterized.TestCase): while True: returned.append(self.evaluate(get_next())) - @combinations.generate(test_base.default_test_combinations()) def testRandomClasses(self): init_dist = [0.25, 0.25, 0.25, 0.25] target_dist = [0.0, 0.0, 0.0, 1.0] diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py index 8bb109a6519..92ae528b940 100644 --- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py @@ -17,18 +17,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import shuffle_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.platform import test -class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class ShuffleAndRepeatTest(test_base.DatasetTestBase): def _build_ds(self, seed, count=5, num_elements=20): return dataset_ops.Dataset.range(num_elements).apply( @@ -44,7 +44,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): self.evaluate(get_next()) return outputs - @combinations.generate(test_base.default_test_combinations()) def testCorrectOutput(self): output = self._gen_outputs(lambda: self._build_ds(10), 100) self.assertSequenceEqual( @@ -53,7 +52,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(5): self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20)) - @combinations.generate(test_base.default_test_combinations()) def testReshuffling(self): # Check that the output orders of different epochs are indeed different. 
output = self._gen_outputs(lambda: self._build_ds(10), 100) @@ -62,20 +60,17 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): epoch2 = output[(i + 1) * 20:(i + 2) * 20] self.assertNotEqual(epoch1, epoch2) - @combinations.generate(test_base.default_test_combinations()) def testSameOrderForSameSeeds(self): output1 = self._gen_outputs(lambda: self._build_ds(10), 100) output2 = self._gen_outputs(lambda: self._build_ds(10), 100) self.assertEqual(output1, output2) - @combinations.generate(test_base.default_test_combinations()) def testDifferentOrderForDifferentSeeds(self): output1 = self._gen_outputs(lambda: self._build_ds(10), 100) output2 = self._gen_outputs(lambda: self._build_ds(20), 100) self.assertNotEqual(output1, output2) self.assertEqual(sorted(output1), sorted(output2)) - @combinations.generate(test_base.default_test_combinations()) def testCountNone(self): output1 = self._gen_outputs( lambda: self._build_ds(10, count=None), 100, verify_exhausted=False) @@ -84,7 +79,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertNotEqual(output1, output2) self.assertEqual(sorted(output1), sorted(output2)) - @combinations.generate(test_base.default_test_combinations()) def testCountMinusOne(self): output1 = self._gen_outputs( lambda: self._build_ds(10, count=-1), 100, verify_exhausted=False) @@ -93,7 +87,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertNotEqual(output1, output2) self.assertEqual(sorted(output1), sorted(output2)) - @combinations.generate(test_base.default_test_combinations()) def testInfiniteOutputs(self): # Asserting the iterator is exhausted after producing 100 items should fail. with self.assertRaises(AssertionError): @@ -101,7 +94,6 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(AssertionError): self._gen_outputs(lambda: self._build_ds(10, count=-1), 100) - @combinations.generate(test_base.default_test_combinations()) def testInfiniteEmpty(self): with self.assertRaises(errors.OutOfRangeError): self._gen_outputs(lambda: self._build_ds(10, count=None, num_elements=0), @@ -110,14 +102,12 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): self._gen_outputs(lambda: self._build_ds(10, count=-1, num_elements=0), 100) - @combinations.generate(test_base.default_test_combinations()) def testLargeBufferSize(self): ds = dataset_ops.Dataset.range(20).apply( shuffle_ops.shuffle_and_repeat(buffer_size=21)) get_next = self.getNext(ds) self.evaluate(get_next()) - @combinations.generate(test_base.default_test_combinations()) def testVeryLargeBufferSize(self): num_epochs = 1000 * 1000 # Each element being shuffled and repeated has shape (100,). 
This will OOM diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py index 8e1dd4bd8dc..f55f62f5cb0 100644 --- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py @@ -18,22 +18,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized - from tensorflow.python.data.experimental.kernel_tests import sql_dataset_test_base -from tensorflow.python.data.kernel_tests import test_base -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, - parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that SqlDataset can read from a database table. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSet(self): for _ in range(2): # Run twice to verify statelessness of db operations. dataset = self._createSqlDataset( @@ -48,7 +44,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, num_test_iterations=2) # Test that SqlDataset works on a join query. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetJoinQuery(self): get_next = self.getNext( self._createSqlDataset( @@ -65,7 +60,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that SqlDataset can read a database entry with a null-terminator # in the middle of the text and place the entry in a `string` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetNullTerminator(self): get_next = self.getNext( self._createSqlDataset( @@ -82,7 +76,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that SqlDataset works when used on two different queries. # Because the output types of the dataset must be determined at graph-creation # time, the two queries must have the same number and types of columns. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetReuseSqlDataset(self): get_next = self.getNext( self._createSqlDataset( @@ -107,7 +100,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that an `OutOfRangeError` is raised on the first call to # `get_next_str_only` if result set is empty. - @combinations.generate(test_base.default_test_combinations()) def testReadEmptyResultSet(self): get_next = self.getNext( self._createSqlDataset( @@ -118,7 +110,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, self.evaluate(get_next()) # Test that an error is raised when `driver_name` is invalid. 
- @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithInvalidDriverName(self): with self.assertRaises(errors.InvalidArgumentError): dataset = self._createSqlDataset( @@ -129,7 +120,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, self.assertDatasetProduces(dataset, expected_output=[]) # Test that an error is raised when a column name in `query` is nonexistent - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithInvalidColumnName(self): get_next = self.getNext( self._createSqlDataset( @@ -140,7 +130,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, self.evaluate(get_next()) # Test that an error is raised when there is a syntax error in `query`. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetOfQueryWithSyntaxError(self): get_next = self.getNext( self._createSqlDataset( @@ -152,7 +141,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that an error is raised when the number of columns in `query` # does not match the length of `, output_types`. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithMismatchBetweenColumnsAndOutputTypes(self): get_next = self.getNext( self._createSqlDataset( @@ -166,7 +154,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # than a select query. In particular, the error refers to the number of # output types passed to the op not matching the number of columns in the # result set of the query (namely, 0 for an insert statement.) - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetOfInsertQuery(self): get_next = self.getNext( self._createSqlDataset( @@ -178,7 +165,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in an `int8` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt8(self): get_next = self.getNext( self._createSqlDataset( @@ -192,7 +178,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int8` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt8NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -206,7 +191,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int8` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt8MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -221,7 +205,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in an `int16` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt16(self): get_next = self.getNext( self._createSqlDataset( @@ -235,7 +218,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int16` tensor. 
- @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt16NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -249,7 +231,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int16` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt16MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -265,7 +246,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in an `int32` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32(self): get_next = self.getNext( self._createSqlDataset( @@ -277,7 +257,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int32` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -291,7 +270,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int32` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -307,7 +285,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a numeric `varchar` from a SQLite database # table and place it in an `int32` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32VarCharColumnAsInt(self): get_next = self.getNext( self._createSqlDataset( @@ -321,7 +298,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read an integer from a SQLite database table # and place it in an `int64` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt64(self): get_next = self.getNext( self._createSqlDataset( @@ -335,7 +311,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int64` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt64NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -349,7 +324,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int64` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt64MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -365,7 +339,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in a `uint8` tensor. 
- @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt8(self): get_next = self.getNext( self._createSqlDataset( @@ -379,7 +352,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read the minimum and maximum uint8 values from a # SQLite database table and place them in `uint8` tensors. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt8MinAndMaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -395,7 +367,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read an integer from a SQLite database table # and place it in a `uint16` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt16(self): get_next = self.getNext( self._createSqlDataset( @@ -409,7 +380,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read the minimum and maximum uint16 values from a # SQLite database table and place them in `uint16` tensors. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt16MinAndMaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -426,7 +396,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a 0-valued and 1-valued integer from a # SQLite database table and place them as `True` and `False` respectively # in `bool` tensors. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetBool(self): get_next = self.getNext( self._createSqlDataset( @@ -440,7 +409,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read an integer that is not 0-valued or 1-valued # from a SQLite database table and place it as `True` in a `bool` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetBoolNotZeroOrOne(self): get_next = self.getNext( self._createSqlDataset( @@ -454,7 +422,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a float from a SQLite database table # and place it in a `float64` tensor. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetFloat64(self): get_next = self.getNext( self._createSqlDataset( @@ -470,7 +437,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # Test that `SqlDataset` can read a float from a SQLite database table beyond # the precision of 64-bit IEEE, without throwing an error. Test that # `SqlDataset` identifies such a value as equal to itself. - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetFloat64OverlyPrecise(self): get_next = self.getNext( self._createSqlDataset( @@ -492,7 +458,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, # representing the largest integer representable as a 64-bit IEEE float # such that the previous integer is also representable as a 64-bit IEEE float. # Test that `SqlDataset` can distinguish these two numbers. 
- @combinations.generate(test_base.default_test_combinations()) def testReadResultSetFloat64LargestConsecutiveWholeNumbersNotEqual(self): get_next = self.getNext( self._createSqlDataset( @@ -507,7 +472,6 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, self.evaluate(get_next()) # Test that SqlDataset can stop correctly when combined with batch - @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithBatchStop(self): dataset = self._createSqlDataset( query="SELECT * FROM data", output_types=(dtypes.int32)) diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py index 5b3aaea95f6..4f04a0a3639 100644 --- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py @@ -17,7 +17,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base @@ -25,9 +24,7 @@ from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_ from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import stats_aggregator from tensorflow.python.data.experimental.ops import stats_ops -from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -35,10 +32,8 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, - parameterized.TestCase): +class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testBytesProduced(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).map( @@ -62,7 +57,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, self.assertStatisticsHasCount(handle, "bytes_produced", 100.0, 101) self.assertStatisticsHasSum(handle, "bytes_produced", expected_sum, 101) - @combinations.generate(test_base.default_test_combinations()) def testLatencyStats(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -82,7 +76,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, handle = self.getHandle(aggregator) self.assertStatisticsHasCount(handle, "record_latency", 100.0, 101) - @combinations.generate(test_base.default_test_combinations()) def testPrefetchBufferUtilization(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).map( @@ -124,7 +117,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, 301, offset=2) - @combinations.generate(test_base.default_test_combinations()) def testPrefetchBufferScalars(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(10).map( @@ -148,7 +140,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - 
@combinations.generate(test_base.default_test_combinations()) def testFilteredElementsStats(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(101).filter( @@ -176,7 +167,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, handle, self.regexForNodeName("FilterDataset", "filtered_elements"), 34.0) - @combinations.generate(test_base.default_test_combinations()) def testReinitialize(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -197,7 +187,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, self.assertStatisticsHasCount(handle, "record_latency", (j + 1) * 100.0, (j * 100) + 101) - @combinations.generate(test_base.default_test_combinations()) def testNoAggregatorRegistered(self): dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")) @@ -209,7 +198,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testMultipleTags(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -233,7 +221,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, handle, "record_latency", 100.0, 201, offset=1) self.assertStatisticsHasCount(handle, "record_latency_2", 100.0, 201) - @combinations.generate(test_base.default_test_combinations()) def testRepeatedTags(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -252,7 +239,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, handle = self.getHandle(aggregator) self.assertStatisticsHasCount(handle, "record_latency", 200.0, 201) - @combinations.generate(test_base.default_test_combinations()) def testMultipleIteratorsSameAggregator(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -273,7 +259,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, handle = self.getHandle(aggregator) self.assertStatisticsHasCount(handle, "record_latency", 200.0, 201) - @combinations.generate(test_base.default_test_combinations()) def testMultipleDatasetWithPrefixes(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -304,7 +289,6 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, self.assertStatisticsHasCount(handle, "dataset2::record_latency", 100.0, 201) - @combinations.generate(test_base.default_test_combinations()) def testMultiplePrefetchStats(self): aggregator = stats_aggregator.StatsAggregator() @@ -330,10 +314,8 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, self.evaluate(next_element()) -class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase, - parameterized.TestCase): +class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testMapBufferUtilization(self): def dataset_fn(): @@ -344,7 +326,6 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase, self.parallelCallsStats( dataset_fn, {"ParallelMapDataset"}, 10, function_processing_time=True) - @combinations.generate(test_base.default_test_combinations()) def testMapAutoTuneBufferUtilization(self): def dataset_fn(): @@ -355,7 +336,6 @@ class 
ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase, self.parallelCallsStats( dataset_fn, {"ParallelMapDataset"}, 10, function_processing_time=True) - @combinations.generate(test_base.default_test_combinations()) def testInterleaveAutoTuneBufferUtilization(self): def dataset_fn(): @@ -371,7 +351,6 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase, self.parallelCallsStats(dataset_fn, {"ParallelInterleaveDatasetV2"}, 10) - @combinations.generate(test_base.default_test_combinations()) def testMapAndBatchAutoTuneBufferUtilization(self): def dataset_fn(): @@ -391,10 +370,8 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase, class FeatureStatsDatasetTest( stats_dataset_test_base.StatsDatasetTestBase, - reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase, - parameterized.TestCase): + reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testFeaturesStats(self): num_epochs = 5 total_records = num_epochs * self._num_records diff --git a/tensorflow/python/data/experimental/kernel_tests/take_while_test.py b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py index 959837faa24..b2b0effb0df 100644 --- a/tensorflow/python/data/experimental/kernel_tests/take_while_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py @@ -23,21 +23,18 @@ import numpy as np from tensorflow.python.data.experimental.ops import take_while_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test +@test_util.run_all_in_graph_and_eager_modes class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): - @combinations.generate( - combinations.times( - test_base.default_test_combinations(), - combinations.combine(num_elements=[14, 15], window_size=[2]) + - combinations.combine(num_elements=[100], window_size=[3]))) + @parameterized.parameters((14, 2), (15, 2), (100, 3)) def testTakeWhileDataset(self, num_elements, window_size): def _predicate_func(elem): @@ -52,19 +49,8 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): expected_num_elements = int(num_elements / window_size) * window_size self.assertDatasetProduces(dataset, np.arange(expected_num_elements)) - @combinations.generate( - combinations.times( - test_base.default_test_combinations(), - combinations.combine( - num_elements=[10], upper_bound=[2], out_of_bounds=[False]) + - combinations.combine( - num_elements=[16], upper_bound=[7], out_of_bounds=[False]) + - combinations.combine( - num_elements=[100], upper_bound=[99], out_of_bounds=[False]) + - combinations.combine( - num_elements=[100], upper_bound=[101], out_of_bounds=[True]) + - combinations.combine( - num_elements=[0], upper_bound=[1], out_of_bounds=[True]))) + @parameterized.parameters((10, 2, False), (16, 7, False), (100, 99, False), + (100, 101, True), (0, 1, True)) def testTakeWhileDatasetRange(self, num_elements, upper_bound, out_of_bounds): dataset = dataset_ops.Dataset.range(num_elements).apply( take_while_ops.take_while(lambda x: x < upper_bound)) @@ -76,7 +62,6 @@ class 
TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): else: self.assertDatasetProduces(dataset, np.arange(upper_bound)) - @combinations.generate(test_base.default_test_combinations()) def testTakeWhileDatasetString(self): def not_equal(string): @@ -94,13 +79,7 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.assertEqual(b"test", self.evaluate(next_element())) - @combinations.generate( - combinations.times( - test_base.default_test_combinations(), - combinations.combine(size=[5], index=[3]) + - combinations.combine(size=[10], index=[0]) + - combinations.combine(size=[100], index=[5]) + - combinations.combine(size=[8], index=[7]))) + @parameterized.parameters((5, 3), (10, 0), (100, 5), (8, 7)) def testTakewhileDatasetShortCircuit(self, size, index): def _predicate_func(data_elem): @@ -119,7 +98,6 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) - @combinations.generate(test_base.default_test_combinations()) def testTakeWhileDatasetWithRepeat(self): dataset = dataset_ops.Dataset.range(10).apply( take_while_ops.take_while(lambda x: x < 2)).repeat(5) diff --git a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py index a327fc82466..136a446bbd8 100644 --- a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py @@ -19,16 +19,14 @@ from __future__ import print_function import os -from absl.testing import parameterized - from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.experimental.ops import writers from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers from tensorflow.python.eager import function -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.lib.io import python_io from tensorflow.python.lib.io import tf_record from tensorflow.python.ops import string_ops @@ -36,7 +34,8 @@ from tensorflow.python.platform import test from tensorflow.python.util import compat -class TFRecordWriterTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class TFRecordWriterTest(test_base.DatasetTestBase): def setUp(self): super(TFRecordWriterTest, self).setUp() @@ -64,13 +63,11 @@ class TFRecordWriterTest(test_base.DatasetTestBase, parameterized.TestCase): def _outputFilename(self): return os.path.join(self.get_temp_dir(), "tf_record.out.txt") - @combinations.generate(test_base.default_test_combinations()) def testWrite(self): self.evaluate(self.writer_fn(self._createFile())) for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())): self.assertAllEqual(self._record(i), r) - @combinations.generate(test_base.default_test_combinations()) def testWriteZLIB(self): options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB) self.evaluate( @@ -79,7 +76,6 @@ class TFRecordWriterTest(test_base.DatasetTestBase, parameterized.TestCase): tf_record.tf_record_iterator(self._outputFilename(), options=options)): self.assertAllEqual(self._record(i), r) - 
@combinations.generate(test_base.default_test_combinations()) def testWriteGZIP(self): options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP) self.evaluate( @@ -88,24 +84,20 @@ class TFRecordWriterTest(test_base.DatasetTestBase, parameterized.TestCase): tf_record.tf_record_iterator(self._outputFilename(), options=options)): self.assertAllEqual(self._record(i), r) - @combinations.generate(test_base.default_test_combinations()) def testFailDataset(self): with self.assertRaises(TypeError): writers.TFRecordWriter(self._outputFilename(), "").write("whoops") - @combinations.generate(test_base.default_test_combinations()) def testFailDType(self): input_dataset = dataset_ops.Dataset.from_tensors(10) with self.assertRaises(TypeError): writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset) - @combinations.generate(test_base.default_test_combinations()) def testFailShape(self): input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]]) with self.assertRaises(TypeError): writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset) - @combinations.generate(test_base.default_test_combinations()) def testSideEffect(self): def writer_fn(): input_dataset = readers.TFRecordDataset(self._createFile()) @@ -120,7 +112,6 @@ class TFRecordWriterTest(test_base.DatasetTestBase, parameterized.TestCase): for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())): self.assertAllEqual(self._record(i), r) - @combinations.generate(test_base.default_test_combinations()) def testShard(self): filename = self._createFile() dataset = readers.TFRecordDataset([filename]) diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py index 9a51c4224ff..42d76a2eb30 100644 --- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py @@ -17,18 +17,17 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized - from tensorflow.python.data.experimental.ops import unique from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util from tensorflow.python.platform import test from tensorflow.python.util import compat -class UniqueTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class UniqueTest(test_base.DatasetTestBase): def _testSimpleHelper(self, dtype, test_cases): """Test the `unique()` transformation on a list of test cases. 
@@ -53,7 +52,7 @@ class UniqueTest(test_base.DatasetTestBase, parameterized.TestCase): for element in expected ]) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.run_deprecated_v1 def testSimpleInt(self): for dtype in [dtypes.int32, dtypes.int64]: self._testSimpleHelper(dtype, [ @@ -66,7 +65,7 @@ class UniqueTest(test_base.DatasetTestBase, parameterized.TestCase): ([[1, 1], [1, 1], [2, 2], [3, 3], [1, 1]], [[1, 1], [2, 2], [3, 3]]), ]) - @combinations.generate(test_base.graph_only_combinations()) + @test_util.run_deprecated_v1 def testSimpleString(self): self._testSimpleHelper(dtypes.string, [ ([], []), diff --git a/tensorflow/python/data/experimental/kernel_tests/variant_test.py b/tensorflow/python/data/experimental/kernel_tests/variant_test.py index 897aa223371..6a3a1424d12 100644 --- a/tensorflow/python/data/experimental/kernel_tests/variant_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/variant_test.py @@ -17,18 +17,16 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized - from tensorflow.python.data.experimental.ops import cardinality from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations +from tensorflow.python.framework import test_util from tensorflow.python.platform import test -class VariantTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class VariantTest(test_base.DatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testRoundtripRange(self): dataset = dataset_ops.Dataset.range(10) variant = dataset_ops.to_variant(dataset) @@ -37,7 +35,6 @@ class VariantTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces(dataset, range(10)) self.assertEqual(self.evaluate(cardinality.cardinality(dataset)), 10) - @combinations.generate(test_base.default_test_combinations()) def testRoundtripMap(self): dataset = dataset_ops.Dataset.range(10).map(lambda x: x*x) variant = dataset_ops.to_variant(dataset) diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py index 5d05332f0ab..09627d02994 100644 --- a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py @@ -17,20 +17,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from absl.testing import parameterized - from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import combinations from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_dataset_ops from tensorflow.python.platform import test -class WrapDatasetVariantTest(test_base.DatasetTestBase, parameterized.TestCase): +@test_util.run_all_in_graph_and_eager_modes +class WrapDatasetVariantTest(test_base.DatasetTestBase): - @combinations.generate(test_base.default_test_combinations()) def testBasic(self): ds = dataset_ops.Dataset.range(100) ds_variant = ds._variant_tensor # pylint: disable=protected-access @@ -44,9 +42,7 @@ class WrapDatasetVariantTest(test_base.DatasetTestBase, 
parameterized.TestCase): for i in range(100): self.assertEqual(i, self.evaluate(get_next())) - # TODO("b/123901304") - @combinations.generate( - combinations.combine(tf_api_version=[1], mode=["graph"])) + @test_util.run_v1_only("b/123901304") def testSkipEagerGPU(self): ds = dataset_ops.Dataset.range(100) ds_variant = ds._variant_tensor # pylint: disable=protected-access From 79666d21050b81943500a49e394944fd3fa7646e Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Thu, 5 Dec 2019 11:01:41 -0800 Subject: [PATCH 177/383] Replace absl::Span with llvm::ArrayRef in compile_mlir_util (NFC). This removes some wrapping of std::vector with absl::Span and removes absl::Span dependency in compile_mlir_util. PiperOrigin-RevId: 284010398 Change-Id: Ia6850882dc3309f4cd7a0cdeed137f23b476c475 --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 - .../mlir/tensorflow/utils/compile_mlir_util.cc | 9 +++++---- .../mlir/tensorflow/utils/compile_mlir_util.h | 4 ++-- .../tensorflow/utils/compile_mlir_util_test.cc | 15 ++++++--------- 4 files changed, 13 insertions(+), 16 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 24031c3c4cd..0532c929658 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -810,7 +810,6 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core/platform:logging", "//tensorflow/stream_executor/lib", - "@com_google_absl//absl/types:span", "@llvm//:support", "@local_config_mlir//:IR", "@local_config_mlir//:Parser", diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc index a5839cf7645..dc9ec6aa8ea 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.cc @@ -15,7 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h" -#include "absl/types/span.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir #include "mlir/IR/Function.h" // TF:local_config_mlir @@ -58,7 +58,7 @@ Status ParseMlirModule(llvm::StringRef mlir_module_string, // Converts arg_shapes to xla::Shape's and store into xla_input_shapes. Status GetXlaInputShapes( - mlir::ModuleOp module, absl::Span arg_shapes, + mlir::ModuleOp module, llvm::ArrayRef arg_shapes, const xla::CustomShapeRepresentationFn shape_representation_fn, std::vector* xla_input_shapes) { xla_input_shapes->clear(); @@ -150,7 +150,8 @@ void GetInputMappingForMlir(int num_inputs, std::vector* input_mapping) { } // Refine MLIR types based on new shape information. 
-Status RefineShapes(absl::Span arg_shapes, mlir::ModuleOp module) { +Status RefineShapes(llvm::ArrayRef arg_shapes, + mlir::ModuleOp module) { auto versions = module.getAttrOfType<::mlir::DictionaryAttr>("tf.versions"); if (!versions) { return errors::Internal( @@ -234,7 +235,7 @@ Status ConvertMLIRToXlaComputation(mlir::ModuleOp module_op, } Status CompileSerializedMlirToXlaHlo( - llvm::StringRef mlir_module_string, absl::Span arg_shapes, + llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, XlaCompiler::CompilationResult* compilation_result) { mlir::MLIRContext mlir_context; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h index 635c1d67f82..a07927ce432 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util.h @@ -16,7 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_COMPILE_MLIR_UTIL_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_COMPILE_MLIR_UTIL_H_ -#include "absl/types/span.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "mlir/IR/Module.h" // TF:local_config_mlir #include "tensorflow/compiler/tf2xla/xla_compiler.h" @@ -40,7 +40,7 @@ Status ConvertMLIRToXlaComputation(mlir::ModuleOp module_op, // Compiles a serialized MLIR module into XLA HLO, generates all accompanying // metadata and stores them in CompilationResult. Status CompileSerializedMlirToXlaHlo( - llvm::StringRef mlir_module_string, absl::Span arg_shapes, + llvm::StringRef mlir_module_string, llvm::ArrayRef arg_shapes, const XlaCompiler::ShapeRepresentationFn shape_representation_fn, XlaCompiler::CompilationResult* compilation_result); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc index 3574b336f9a..1668cf615f0 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/compile_mlir_util_test.cc @@ -41,9 +41,9 @@ TEST(CompileSerializedMlirToXlaHloTest, InvalidSerializedMlirModule) { std::vector arg_shapes; XlaCompiler::CompilationResult compilation_result; - Status s = CompileSerializedMlirToXlaHlo( - invalid_mlir_module, absl::Span(arg_shapes), - TestShapeRepresentation, &compilation_result); + Status s = CompileSerializedMlirToXlaHlo(invalid_mlir_module, arg_shapes, + TestShapeRepresentation, + &compilation_result); EXPECT_EQ(s.code(), tensorflow::errors::Code::INVALID_ARGUMENT); } @@ -61,8 +61,7 @@ TEST(CompileSerializedMlirToXlaHloTest, Success) { XlaCompiler::CompilationResult compilation_result; Status s = CompileSerializedMlirToXlaHlo( - mlir_module, absl::Span(arg_shapes), TestShapeRepresentation, - &compilation_result); + mlir_module, arg_shapes, TestShapeRepresentation, &compilation_result); ASSERT_TRUE(s.ok()); const xla::HloModuleConfig module_config( @@ -134,8 +133,7 @@ TEST(CompileSerializedMlirToXlaHloTest, CompileTimeConstantFoldedSuccess) { XlaCompiler::CompilationResult compilation_result; Status s = CompileSerializedMlirToXlaHlo( - mlir_module, absl::Span(arg_shapes), TestShapeRepresentation, - &compilation_result); + mlir_module, arg_shapes, TestShapeRepresentation, &compilation_result); ASSERT_TRUE(s.ok()); const xla::HloModuleConfig module_config( @@ -174,8 +172,7 @@ 
TEST(CompileSerializedMlirToXlaHloTest, ShapeInference) { XlaCompiler::CompilationResult compilation_result; Status s = CompileSerializedMlirToXlaHlo( - mlir_module, absl::Span(arg_shapes), TestShapeRepresentation, - &compilation_result); + mlir_module, arg_shapes, TestShapeRepresentation, &compilation_result); TF_ASSERT_OK(s); const xla::HloModuleConfig module_config( From df09025fc0d70c2965b0511340b51f00e9b32cbd Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Thu, 5 Dec 2019 11:09:49 -0800 Subject: [PATCH 178/383] [XLA] Ensure buffers in xla::Literal are aligned. PiperOrigin-RevId: 284012421 Change-Id: I862f7a79016aa3a526b446765323caec3cc5e868 --- tensorflow/compiler/xla/literal.cc | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc index 3a219673304..bbd640f6064 100644 --- a/tensorflow/compiler/xla/literal.cc +++ b/tensorflow/compiler/xla/literal.cc @@ -38,6 +38,7 @@ limitations under the License. #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/mem.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -131,18 +132,23 @@ void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) { } } else if (shape.IsArray()) { if (allocate_arrays) { + // Literals can be used as DMA targets, which can require alignment. We + // force a 16-byte minimum alignment. + constexpr int kMinimumAlignment = 16; if (LayoutUtil::IsSparseArray(shape)) { // For sparse arrays, the buffer must be of the size of the maximum // number of sparse elements possible. const int64 max_sparse_elements = LayoutUtil::MaxSparseElements(shape.layout()); - piece->set_buffer( - new char[max_sparse_elements * - ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type())]); + piece->set_buffer(static_cast(tensorflow::port::AlignedMalloc( + max_sparse_elements * + ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type()), + kMinimumAlignment))); piece->set_sparse_indices( new SparseIndexArray(max_sparse_elements, shape.rank())); } else { - piece->set_buffer(new char[piece->size_bytes()]); + piece->set_buffer(static_cast(tensorflow::port::AlignedMalloc( + piece->size_bytes(), kMinimumAlignment))); } } } else { @@ -174,7 +180,7 @@ void Literal::DeallocateBuffers() { root_piece_->ForEachMutableSubpiece( [&](const ShapeIndex& index, Piece* piece) { if (piece->buffer() != nullptr) { - delete[] piece->buffer(); + tensorflow::port::AlignedFree(piece->buffer()); delete piece->sparse_indices(); } }); @@ -504,7 +510,7 @@ Status Literal::MoveFrom(Literal&& src_literal, dest_index.push_back(i); } Piece& dest_piece = piece(dest_index); - delete[] dest_piece.buffer(); + tensorflow::port::AlignedFree(dest_piece.buffer()); dest_piece.set_buffer(src_piece.buffer()); delete dest_piece.sparse_indices(); dest_piece.set_sparse_indices(src_piece.sparse_indices()); From 931140aeee4477a2582c798569a7b69a49b25af6 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Thu, 5 Dec 2019 19:14:47 +0000 Subject: [PATCH 179/383] Add _maybe_wrap function to simplify wrapping of AutoCastVariable --- .../experimental/autocast_variable.py | 55 ++++++++++++------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py index fbe0cde5e72..c6f39c42b42 100644 --- 
a/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py +++ b/tensorflow/python/keras/mixed_precision/experimental/autocast_variable.py @@ -186,65 +186,59 @@ class AutoCastVariable(variables.Variable): def assign(self, value, use_locking=None, name=None, read_value=True): assign_op = self._variable.assign(value, use_locking, name, read_value) - if read_value and resource_variable_ops.is_resource_variable(assign_op): - return create_autocast_variable(assign_op) - return assign_op + return _maybe_wrap(assign_op, wrap=read_value) def assign_add(self, delta, use_locking=None, name=None, read_value=True): assign_op = self._variable.assign_add(delta, use_locking, name, read_value) - if read_value and resource_variable_ops.is_resource_variable(assign_op): - return create_autocast_variable(assign_op) - return assign_op + return _maybe_wrap(assign_op, wrap=read_value) def assign_sub(self, delta, use_locking=None, name=None, read_value=True): assign_op = self._variable.assign_sub(delta, use_locking, name, read_value) - if read_value and resource_variable_ops.is_resource_variable(assign_op): - return create_autocast_variable(assign_op) - return assign_op + return _maybe_wrap(assign_op, wrap=read_value) def scatter_sub(self, sparse_delta, use_locking=False, name=None): var = self._variable.scatter_sub(sparse_delta, use_locking, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def scatter_add(self, sparse_delta, use_locking=False, name=None): var = self._variable.scatter_add(sparse_delta, use_locking, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def scatter_max(self, sparse_delta, use_locking=False, name=None): var = self._variable.scatter_max(sparse_delta, use_locking, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def scatter_min(self, sparse_delta, use_locking=False, name=None): var = self._variable.scatter_min(sparse_delta, use_locking, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def scatter_mul(self, sparse_delta, use_locking=False, name=None): var = self._variable.scatter_mul(sparse_delta, use_locking, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def scatter_div(self, sparse_delta, use_locking=False, name=None): var = self._variable.scatter_div(sparse_delta, use_locking, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def scatter_update(self, sparse_delta, use_locking=False, name=None): var = self._variable.scatter_update(sparse_delta, use_locking, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def batch_scatter_update(self, sparse_delta, use_locking=False, name=None): var = self._variable.batch_scatter_update(sparse_delta, use_locking, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def scatter_nd_sub(self, indices, updates, name=None): var = self._variable.scatter_nd_sub(indices, updates, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def scatter_nd_add(self, indices, updates, name=None): var = self._variable.scatter_nd_add(indices, updates, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def scatter_nd_update(self, indices, updates, name=None): var = self._variable.scatter_nd_update(indices, updates, name) - return create_autocast_variable(var) + return _maybe_wrap(var) def load(self, value, session=None): return self._variable.load(value, session) @@ -430,3 +424,24 @@ def create_autocast_variable(variable): # pylint: 
enable=missing-format-attribute return AutoCastDistributedVariable(variable) + + +def _maybe_wrap(variable, wrap=True): + """Creates an AutoCastVariable that wraps another variable if applicable. + + This function is used to wrap the return value of AutoCastVariable.assign. + Unfortunately MirroredVariable.assign will (incorrectly) return a Mirrored + value instead of a MirroredVariable. So we cannot properly wrap it in an + AutoCastVariable. We return the original variable in that case. + + Args: + variable: A tf.Variable or op. + wrap: A boolean to define whether to wrap the variable in an + AutoCastVariable or not. + + Returns: + An AutoCastVariable if wrap is True and variable is a resource variable. + """ + if wrap and resource_variable_ops.is_resource_variable(variable): + return create_autocast_variable(variable) + return variable From 2ae241594410aaf19a5cad2604a2fa5955df3421 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 5 Dec 2019 11:12:03 -0800 Subject: [PATCH 180/383] Fix minor warnings. When compiling with -Wall/-Werror, several warnings related to signed/unsigned comparison and an incorrect format string kill the build. Additionally, when compiling under GCC 4.8.x, `max_align_t` is not a member of `std`. This change fixes these minor errors. PiperOrigin-RevId: 284012863 Change-Id: I52846f0125933d37788b766746c89ab6eb160d8f --- .../lite/experimental/micro/micro_allocator.cc | 17 ++--------------- .../experimental/micro/micro_interpreter.cc | 4 ++-- .../micro/micro_optional_debug_tools.cc | 9 +-------- .../lite/experimental/micro/test_helpers.cc | 2 +- 4 files changed, 6 insertions(+), 26 deletions(-) diff --git a/tensorflow/lite/experimental/micro/micro_allocator.cc b/tensorflow/lite/experimental/micro/micro_allocator.cc index 48a0901c7ce..82b3b350c23 100644 --- a/tensorflow/lite/experimental/micro/micro_allocator.cc +++ b/tensorflow/lite/experimental/micro/micro_allocator.cc @@ -42,19 +42,6 @@ struct TensorInfo { // requirement for SIMD extensions. constexpr int kBufferAlignment = 16; -// If building with GCC 4.8.x or lower, `max_align_t` is not a member of `std`. -// If using a newer version of GCC, we import `max_align_t` into the local -// anonymous namespace to be able to use it like the global `max_align_t` from -// the older clib. -#ifdef __GNUC__ -#if __GNUC_PREREQ(4, 9) -using std::max_align_t; -#endif -#else -// We assume other compilers don't have this issue. -using std::max_align_t; -#endif - class MicroBuiltinDataAllocator : public BuiltinDataAllocator { public: explicit MicroBuiltinDataAllocator(SimpleMemoryAllocator* memory_allocator) @@ -64,7 +51,7 @@ class MicroBuiltinDataAllocator : public BuiltinDataAllocator { // Align to an address that is proper for all primitive types, but no more // than the size. return memory_allocator_->AllocateFromTail( - size, std::min(size, alignof(max_align_t))); + size, std::min(size, alignof(std::max_align_t))); } void Deallocate(void* data) override { // Do not deallocate, builtin data needs to be available for the life time @@ -425,7 +412,7 @@ TfLiteStatus MicroAllocator::InitializeRuntimeTensor( // If we've found a buffer, does it have any data? if (auto* array = buffer->data()) { // If it has any data, is the data size larger than zero? - if (array->size()) { + if (size_t array_size = array->size()) { // We've found a buffer with valid data, so update the runtime tensor // data structure to point to it. 
result->data.raw = diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc index 7185d643514..ba46cbfd95a 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.cc +++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc @@ -21,7 +21,7 @@ limitations under the License. namespace tflite { namespace { -const size_t kStackDataAllocatorSize = 128; +const int kStackDataAllocatorSize = 128; class StackDataAllocator : public BuiltinDataAllocator { public: void* Allocate(size_t size) override { @@ -91,7 +91,7 @@ MicroInterpreter::MicroInterpreter(const Model* model, // NOTE: This requires that the flatbuffer is held in memory which can be // modified by this process. if (!FLATBUFFERS_LITTLEENDIAN) { - for (size_t t = 0; t < tensors_size(); ++t) { + for (int t = 0; t < tensors_size(); ++t) { TfLiteTensor* thisTensor = &context_.tensors[t]; if (thisTensor->allocation_type == kTfLiteMmapRo) CorrectTensorEndianness(thisTensor); diff --git a/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc b/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc index 1f6ce531f05..e27317a5443 100644 --- a/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc @@ -14,13 +14,6 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/experimental/micro/micro_optional_debug_tools.h" -// `cinttypes` requires `__STDC_FORMAT_MACROS` to be defined to expose `PRId32`. -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include - #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { @@ -129,7 +122,7 @@ void PrintInterpreterState(MicroInterpreter* interpreter) { printf("Node %3zu Operator Custom Name %s\n", node_index, reg->custom_name); } else { - printf("Node %3zu Operator Builtin Code %3" PRId32 " %s\n", node_index, + printf("Node %3zu Operator Builtin Code %3d %s\n", node_index, reg->builtin_code, EnumNamesBuiltinOperator()[reg->builtin_code]); } printf(" Inputs:"); diff --git a/tensorflow/lite/experimental/micro/test_helpers.cc b/tensorflow/lite/experimental/micro/test_helpers.cc index a1b9801ffc9..03e1d91fce0 100644 --- a/tensorflow/lite/experimental/micro/test_helpers.cc +++ b/tensorflow/lite/experimental/micro/test_helpers.cc @@ -47,7 +47,7 @@ class StackAllocator : public flatbuffers::Allocator { return *inst; } - static constexpr size_t kStackAllocatorSize = 4096; + static constexpr int kStackAllocatorSize = 4096; private: uint8_t data_backing_[kStackAllocatorSize]; From b06d620237f183771ff663e01b01ddd8d284f267 Mon Sep 17 00:00:00 2001 From: Blake Hechtman Date: Thu, 5 Dec 2019 11:21:34 -0800 Subject: [PATCH 181/383] [XLA] Reset the state of multioutput fusion after run since it may be inside an hlo pipeline. 
PiperOrigin-RevId: 284014817 Change-Id: I23005e869d68a21d9fafe202bc56fe891c639b47 --- tensorflow/compiler/xla/service/multi_output_fusion.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc index 74f2c95102a..07b6fb5bf85 100644 --- a/tensorflow/compiler/xla/service/multi_output_fusion.cc +++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc @@ -108,6 +108,11 @@ StatusOr MultiOutputFusion::Run(HloModule* module) { changed = true; } } + // Clean up state in case this pass is wrapped in an HloPassPipeline. + candidates_.clear(); + candidates_index_.clear(); + all_fusion_candidates_.clear(); + reachability_.reset(); return changed; } From 8589adb8a2f149d97b82549655674353773108b2 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 11:31:28 -0800 Subject: [PATCH 182/383] Allow specification of the workgroup size for GPUToSPIRV lowering. SPIR-V/Vulkan spec requires the workgroups size to be specified with the spv.ExecutionMode operation. This was hard-wired to be set to a particular value. It is now changed to be configurable by clients of the pass or of the patterns that implement the lowering from GPU to SPIRV. PiperOrigin-RevId: 284017482 Change-Id: Iba265f85a858de6940e44831d897ebefd99e7cf4 --- third_party/mlir/BUILD | 1 + .../Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h | 6 ++-- .../GPUToSPIRV/ConvertGPUToSPIRVPass.h | 8 +++-- .../lib/Conversion/GPUToSPIRV/CMakeLists.txt | 1 + .../GPUToSPIRV/ConvertGPUToSPIRV.cpp | 24 +++++++++---- .../GPUToSPIRV/ConvertGPUToSPIRVPass.cpp | 34 ++++++++++++++++--- 6 files changed, 59 insertions(+), 15 deletions(-) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index cbcc7eb18fd..005ac4d445b 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -762,6 +762,7 @@ cc_library( ":SPIRVDialect", ":SPIRVLowering", ":StandardToSPIRVConversions", + ":Support", ":Transforms", ], alwayslink = 1, diff --git a/third_party/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h b/third_party/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h index f617986cdcc..134dbf40b4d 100644 --- a/third_party/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h +++ b/third_party/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.h @@ -27,10 +27,12 @@ namespace mlir { class SPIRVTypeConverter; /// Appends to a pattern list additional patterns for translating GPU Ops to -/// SPIR-V ops. +/// SPIR-V ops. Needs the workgroup size as input since SPIR-V/Vulkan requires +/// the workgroup size to be statically specified. 
void populateGPUToSPIRVPatterns(MLIRContext *context, SPIRVTypeConverter &typeConverter, - OwningRewritePatternList &patterns); + OwningRewritePatternList &patterns, + ArrayRef workGroupSize); } // namespace mlir #endif // MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRV_H diff --git a/third_party/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h b/third_party/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h index be8cad2b3d1..8f0a910c74d 100644 --- a/third_party/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h +++ b/third_party/mlir/include/mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h @@ -22,6 +22,8 @@ #ifndef MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRVPASS_H #define MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRVPASS_H +#include "mlir/Support/LLVM.h" + #include namespace mlir { @@ -29,8 +31,10 @@ namespace mlir { class ModuleOp; template class OpPassBase; -/// Pass to convert GPU Ops to SPIR-V ops. -std::unique_ptr> createConvertGPUToSPIRVPass(); +/// Pass to convert GPU Ops to SPIR-V ops. Needs the workgroup size as input +/// since SPIR-V/Vulkan requires the workgroup size to be statically specified. +std::unique_ptr> +createConvertGPUToSPIRVPass(ArrayRef workGroupSize); } // namespace mlir #endif // MLIR_CONVERSION_GPUTOSPIRV_CONVERTGPUTOSPIRVPASS_H diff --git a/third_party/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt b/third_party/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt index a562439108d..be82894461d 100644 --- a/third_party/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt +++ b/third_party/mlir/lib/Conversion/GPUToSPIRV/CMakeLists.txt @@ -10,5 +10,6 @@ target_link_libraries(MLIRGPUtoSPIRVTransforms MLIRSPIRV MLIRStandardOps MLIRStandardToSPIRVTransforms + MLIRSupport MLIRTransforms ) diff --git a/third_party/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp b/third_party/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp index 23e7b9166bb..2c1847d99ed 100644 --- a/third_party/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp +++ b/third_party/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRV.cpp @@ -54,11 +54,21 @@ public: /// attribute gpu.kernel) within a spv.module. class KernelFnConversion final : public SPIRVOpLowering { public: - using SPIRVOpLowering::SPIRVOpLowering; + KernelFnConversion(MLIRContext *context, SPIRVTypeConverter &converter, + ArrayRef workGroupSize, + PatternBenefit benefit = 1) + : SPIRVOpLowering(context, converter, benefit) { + auto config = workGroupSize.take_front(3); + workGroupSizeAsInt32.assign(config.begin(), config.end()); + workGroupSizeAsInt32.resize(3, 1); + } PatternMatchResult matchAndRewrite(FuncOp funcOp, ArrayRef operands, ConversionPatternRewriter &rewriter) const override; + +private: + SmallVector workGroupSizeAsInt32; }; } // namespace @@ -172,10 +182,10 @@ KernelFnConversion::matchAndRewrite(FuncOp funcOp, ArrayRef operands, argABI.push_back(spirv::getInterfaceVarABIAttr( 0, argNum, spirv::StorageClass::StorageBuffer, rewriter.getContext())); } - // TODO(ravishankarm) : For now set this to {32, 1, 1}. This is incorrect. The - // actual workgroup size needs to be plumbed through. 
+ auto context = rewriter.getContext(); - auto entryPointAttr = spirv::getEntryPointABIAttr({32, 1, 1}, context); + auto entryPointAttr = + spirv::getEntryPointABIAttr(workGroupSizeAsInt32, context); FuncOp newFuncOp = spirv::lowerAsEntryFunction( funcOp, typeConverter, rewriter, argABI, entryPointAttr); if (!newFuncOp) { @@ -189,9 +199,11 @@ KernelFnConversion::matchAndRewrite(FuncOp funcOp, ArrayRef operands, namespace mlir { void populateGPUToSPIRVPatterns(MLIRContext *context, SPIRVTypeConverter &typeConverter, - OwningRewritePatternList &patterns) { + OwningRewritePatternList &patterns, + ArrayRef workGroupSize) { + patterns.insert(context, typeConverter, workGroupSize); patterns.insert< - ForOpConversion, KernelFnConversion, + ForOpConversion, LaunchConfigConversion, LaunchConfigConversion, LaunchConfigConversion, diff --git a/third_party/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp b/third_party/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp index 49f161e3794..cec71ca9b3f 100644 --- a/third_party/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp +++ b/third_party/mlir/lib/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.cpp @@ -28,6 +28,7 @@ #include "mlir/Dialect/SPIRV/SPIRVLowering.h" #include "mlir/Dialect/SPIRV/SPIRVOps.h" #include "mlir/Pass/Pass.h" +#include "mlir/Pass/PassRegistry.h" using namespace mlir; @@ -42,7 +43,23 @@ namespace { /// /// 2) Lower the body of the spirv::ModuleOp. class GPUToSPIRVPass : public ModulePass { +public: + GPUToSPIRVPass(ArrayRef workGroupSize) + : workGroupSize(workGroupSize.begin(), workGroupSize.end()) {} void runOnModule() override; + +private: + SmallVector workGroupSize; +}; + +/// Command line option to specify the workgroup size. +struct GPUToSPIRVPassOptions : public PassOptions { + List workGroupSize{ + *this, "workgroup-size", + llvm::cl::desc( + "Workgroup Sizes in the SPIR-V module for x, followed by y, followed " + "by z dimension of the dispatch (others will be ignored)"), + llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated}; }; } // namespace @@ -80,7 +97,7 @@ void GPUToSPIRVPass::runOnModule() { /// Dialect conversion to lower the functions with the spirv::ModuleOps. SPIRVTypeConverter typeConverter; OwningRewritePatternList patterns; - populateGPUToSPIRVPatterns(context, typeConverter, patterns); + populateGPUToSPIRVPatterns(context, typeConverter, patterns, workGroupSize); populateStandardToSPIRVPatterns(context, typeConverter, patterns); ConversionTarget target(*context); @@ -94,9 +111,16 @@ void GPUToSPIRVPass::runOnModule() { } } -std::unique_ptr> mlir::createConvertGPUToSPIRVPass() { - return std::make_unique(); +std::unique_ptr> +mlir::createConvertGPUToSPIRVPass(ArrayRef workGroupSize) { + return std::make_unique(workGroupSize); } -static PassRegistration - pass("convert-gpu-to-spirv", "Convert GPU dialect to SPIR-V dialect"); +static PassRegistration + pass("convert-gpu-to-spirv", "Convert GPU dialect to SPIR-V dialect", + [](const GPUToSPIRVPassOptions &passOptions) { + SmallVector workGroupSize; + workGroupSize.assign(passOptions.workGroupSize.begin(), + passOptions.workGroupSize.end()); + return std::make_unique(workGroupSize); + }); From cec0d99443b8ad1930594ceaf35c51e80247b7d2 Mon Sep 17 00:00:00 2001 From: River Riddle Date: Thu, 5 Dec 2019 11:52:58 -0800 Subject: [PATCH 183/383] Add support for instance specific pass statistics. Statistics are a way to keep track of what the compiler is doing and how effective various optimizations are. 
It is useful to see what optimizations are contributing to making a particular program run faster. Pass-instance specific statistics take this even further as you can see the effect of placing a particular pass at specific places within the pass pipeline, e.g. they could help answer questions like "what happens if I run CSE again here". Statistics can be added to a pass by simply adding members of type 'Pass::Statistics'. This class takes as a constructor arguments: the parent pass pointer, a name, and a description. Statistics can be dumped by the pass manager in a similar manner to how pass timing information is dumped, i.e. via PassManager::enableStatistics programmatically; or -pass-statistics and -pass-statistics-display via the command line pass manager options. Below is an example: struct MyPass : public OperationPass { Statistic testStat{this, "testStat", "A test statistic"}; void runOnOperation() { ... ++testStat; ... } }; $ mlir-opt -pass-pipeline='func(my-pass,my-pass)' foo.mlir -pass-statistics Pipeline Display: ===-------------------------------------------------------------------------=== ... Pass statistics report ... ===-------------------------------------------------------------------------=== 'func' Pipeline MyPass (S) 15 testStat - A test statistic MyPass (S) 6 testStat - A test statistic List Display: ===-------------------------------------------------------------------------=== ... Pass statistics report ... ===-------------------------------------------------------------------------=== MyPass (S) 21 testStat - A test statistic PiperOrigin-RevId: 284022014 Change-Id: If4794eb4500c89a8e98d20094b08f28eba9a1570 --- third_party/mlir/BUILD | 1 + third_party/mlir/g3doc/WritingAPass.md | 78 +++++- third_party/mlir/include/mlir/Pass/Pass.h | 26 ++ .../mlir/include/mlir/Pass/PassManager.h | 35 ++- third_party/mlir/lib/Pass/Pass.cpp | 28 +- third_party/mlir/lib/Pass/PassDetail.h | 8 + .../mlir/lib/Pass/PassManagerOptions.cpp | 28 +- third_party/mlir/lib/Pass/PassStatistics.cpp | 258 ++++++++++++++++++ third_party/mlir/lib/Pass/PassTiming.cpp | 25 +- third_party/mlir/lib/Transforms/CSE.cpp | 7 + .../mlir/test/lib/Pass/TestPassManager.cpp | 15 + 11 files changed, 476 insertions(+), 33 deletions(-) create mode 100644 third_party/mlir/lib/Pass/PassStatistics.cpp diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 005ac4d445b..2b6da27099d 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -158,6 +158,7 @@ cc_library( "lib/Pass/PassDetail.h", "lib/Pass/PassManagerOptions.cpp", "lib/Pass/PassRegistry.cpp", + "lib/Pass/PassStatistics.cpp", "lib/Pass/PassTiming.cpp", ], hdrs = [ diff --git a/third_party/mlir/g3doc/WritingAPass.md b/third_party/mlir/g3doc/WritingAPass.md index 1e4564aa21d..fc73b7e9ef3 100644 --- a/third_party/mlir/g3doc/WritingAPass.md +++ b/third_party/mlir/g3doc/WritingAPass.md @@ -319,10 +319,10 @@ program has been run through the passes. This provides several benefits: ## Pass Registration -Briefly shown in the example definitions of the various -pass types is the `PassRegistration` class. This is a utility to -register derived pass classes so that they may be created, and inspected, by -utilities like mlir-opt. Registering a pass class takes the form: +Briefly shown in the example definitions of the various pass types is the +`PassRegistration` class. This is a utility to register derived pass classes so +that they may be created, and inspected, by utilities like mlir-opt. 
Registering +a pass class takes the form: ```c++ static PassRegistration pass("command-line-arg", "description"); @@ -469,6 +469,76 @@ struct MyPassOptions : public PassOptions { static PassRegistration pass("my-pass", "description"); ``` +## Pass Statistics + +Statistics are a way to keep track of what the compiler is doing and how +effective various transformations are. It is often useful to see what effect +specific transformations have on a particular program, and how often they +trigger. Pass statistics are instance specific which allow for taking this a +step further as you are able to see the effect of placing a particular +transformation at specific places within the pass pipeline. For example, they +help answer questions like `What happens if I run CSE again here?`. + +Statistics can be added to a pass by using the 'Pass::Statistic' class. This +class takes as a constructor arguments: the parent pass, a name, and a +description. This class acts like an unsigned integer, and may be incremented +and updated accordingly. These statistics use the same infrastructure as +[`llvm::Statistic`](http://llvm.org/docs/ProgrammersManual.html#the-statistic-class-stats-option) +and thus have similar usage constraints. Collected statistics can be dumped by +the [pass manager](#pass-manager) programmatically via +`PassManager::enableStatistics`; or via `-pass-statistics` and +`-pass-statistics-display` on the command line. + +An example is shown below: + +```c++ +struct MyPass : public OperationPass { + Statistic testStat{this, "testStat", "A test statistic"}; + + void runOnOperation() { + ... + + // Update our statistic after some invariant was hit. + ++testStat; + + ... + } +}; +``` + +The collected statistics may be aggregated in two types of views: + +A pipeline view that models the structure of the pass manager, this is the +default view: + +```shell +$ mlir-opt -pass-pipeline='func(my-pass,my-pass)' foo.mlir -pass-statistics + +===-------------------------------------------------------------------------=== + ... Pass statistics report ... +===-------------------------------------------------------------------------=== +'func' Pipeline + MyPass + (S) 15 testStat - A test statistic + VerifierPass + MyPass + (S) 6 testStat - A test statistic + VerifierPass +VerifierPass +``` + +And a list view that aggregates all instances of a specific pass together: + +```shell +$ mlir-opt -pass-pipeline='func(my-pass, my-pass)' foo.mlir -pass-statistics -pass-statistics-display=list + +===-------------------------------------------------------------------------=== + ... Pass statistics report ... +===-------------------------------------------------------------------------=== +MyPass + (S) 21 testStat - A test statistic +``` + ## Pass Instrumentation MLIR provides a customizable framework to instrument pass execution and analysis diff --git a/third_party/mlir/include/mlir/Pass/Pass.h b/third_party/mlir/include/mlir/Pass/Pass.h index 4fc277e3edc..274ae9d12e3 100644 --- a/third_party/mlir/include/mlir/Pass/Pass.h +++ b/third_party/mlir/include/mlir/Pass/Pass.h @@ -23,6 +23,7 @@ #include "mlir/Pass/PassRegistry.h" #include "mlir/Support/LogicalResult.h" #include "llvm/ADT/PointerIntPair.h" +#include "llvm/ADT/Statistic.h" namespace mlir { namespace detail { @@ -76,6 +77,28 @@ public: /// pass to be to be round-trippable to the textual format. virtual void printAsTextualPipeline(raw_ostream &os); + /// This class represents a single pass statistic. 
This statistic functions + /// similarly to an unsigned integer value, and may be updated and incremented + /// accordingly. This class can be used to provide additional information + /// about the transformations and analyses performed by a pass. + class Statistic : public llvm::Statistic { + public: + /// The statistic is initialized by the pass owner, a name, and a + /// description. + Statistic(Pass *owner, const char *name, const char *description); + + /// Assign the statistic to the given value. + Statistic &operator=(unsigned value); + + private: + /// Hide some of the details of llvm::Statistic that we don't use. + using llvm::Statistic::getDebugType; + }; + + /// Returns the main statistics for this pass instance. + ArrayRef getStatistics() const { return statistics; } + MutableArrayRef getStatistics() { return statistics; } + protected: explicit Pass(const PassID *passID, llvm::Optional opName = llvm::None) @@ -125,6 +148,9 @@ private: /// The current execution state for the pass. llvm::Optional passState; + /// The set of statistics held by this pass. + std::vector statistics; + /// Allow access to 'clone' and 'run'. friend class OpPassManager; }; diff --git a/third_party/mlir/include/mlir/Pass/PassManager.h b/third_party/mlir/include/mlir/Pass/PassManager.h index fa0788f28b5..5580806422f 100644 --- a/third_party/mlir/include/mlir/Pass/PassManager.h +++ b/third_party/mlir/include/mlir/Pass/PassManager.h @@ -21,6 +21,9 @@ #include "mlir/Support/LogicalResult.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/iterator.h" + +#include namespace llvm { class Any; @@ -54,6 +57,13 @@ public: ~OpPassManager(); OpPassManager &operator=(const OpPassManager &rhs); + /// Iterator over the passes in this pass manager. + using pass_iterator = + llvm::pointee_iterator>::iterator>; + pass_iterator begin(); + pass_iterator end(); + llvm::iterator_range getPasses() { return {begin(), end()}; } + /// Run the held passes over the given operation. LogicalResult run(Operation *op, AnalysisManager am); @@ -93,6 +103,9 @@ public: /// the correctness of per-pass overrides of Pass::printAsTextualPipeline. void printAsTextualPipeline(raw_ostream &os); + /// Merge the pass statistics of this class into 'other'. + void mergeStatisticsInto(OpPassManager &other); + private: OpPassManager(OperationName name, bool disableThreads, bool verifyPasses); @@ -107,10 +120,10 @@ private: // PassManager //===----------------------------------------------------------------------===// -/// An enum describing the different display modes for the pass timing -/// information within the pass manager. -enum class PassTimingDisplayMode { - // In this mode the results are displayed in a list sorted by total time, +/// An enum describing the different display modes for the information within +/// the pass manager. +enum class PassDisplayMode { + // In this mode the results are displayed in a list sorted by total, // with each pass/analysis instance aggregated into one unique result. List, @@ -162,13 +175,23 @@ public: /// Note: Timing should be enabled after all other instrumentations to avoid /// any potential "ghost" timing from other instrumentations being /// unintentionally included in the timing results. 
- void enableTiming( - PassTimingDisplayMode displayMode = PassTimingDisplayMode::Pipeline); + void enableTiming(PassDisplayMode displayMode = PassDisplayMode::Pipeline); + + /// Prompts the pass manager to print the statistics collected for each of the + /// held passes after each call to 'run'. + void + enableStatistics(PassDisplayMode displayMode = PassDisplayMode::Pipeline); private: + /// Dump the statistics of the passes within this pass manager. + void dumpStatistics(); + /// Flag that specifies if pass timing is enabled. bool passTiming : 1; + /// Flag that specifies if pass statistics should be dumped. + Optional passStatisticsMode; + /// A manager for pass instrumentations. std::unique_ptr instrumentor; diff --git a/third_party/mlir/lib/Pass/Pass.cpp b/third_party/mlir/lib/Pass/Pass.cpp index 6d8e230eeec..fc1ad5bb939 100644 --- a/third_party/mlir/lib/Pass/Pass.cpp +++ b/third_party/mlir/lib/Pass/Pass.cpp @@ -216,6 +216,11 @@ OpPassManager &OpPassManager::operator=(const OpPassManager &rhs) { OpPassManager::~OpPassManager() {} +OpPassManager::pass_iterator OpPassManager::begin() { + return impl->passes.begin(); +} +OpPassManager::pass_iterator OpPassManager::end() { return impl->passes.end(); } + /// Run all of the passes in this manager over the current operation. LogicalResult OpPassManager::run(Operation *op, AnalysisManager am) { // Run each of the held passes. @@ -341,6 +346,17 @@ void OpToOpPassAdaptorBase::mergeInto(OpToOpPassAdaptorBase &rhs) { }); } +/// Returns the adaptor pass name. +std::string OpToOpPassAdaptorBase::getName() { + std::string name = "Pipeline Collection : ["; + llvm::raw_string_ostream os(name); + interleaveComma(getPassManagers(), os, [&](OpPassManager &pm) { + os << '\'' << pm.getOpName() << '\''; + }); + os << ']'; + return os.str(); +} + OpToOpPassAdaptor::OpToOpPassAdaptor(OpPassManager &&mgr) : OpToOpPassAdaptorBase(std::move(mgr)) {} @@ -560,9 +576,15 @@ LogicalResult PassManager::run(ModuleOp module) { // If reproducer generation is enabled, run the pass manager with crash // handling enabled. - if (crashReproducerFileName) - return runWithCrashRecovery(*this, am, module, *crashReproducerFileName); - return OpPassManager::run(module, am); + LogicalResult result = + crashReproducerFileName + ? runWithCrashRecovery(*this, am, module, *crashReproducerFileName) + : OpPassManager::run(module, am); + + // Dump all of the pass statistics if necessary. + if (passStatisticsMode) + dumpStatistics(); + return result; } /// Disable support for multi-threading within the pass manager. diff --git a/third_party/mlir/lib/Pass/PassDetail.h b/third_party/mlir/lib/Pass/PassDetail.h index 29bb04d3ad2..d0a2ea63e7d 100644 --- a/third_party/mlir/lib/Pass/PassDetail.h +++ b/third_party/mlir/lib/Pass/PassDetail.h @@ -48,6 +48,9 @@ public: /// Returns the pass managers held by this adaptor. MutableArrayRef getPassManagers() { return mgrs; } + /// Returns the adaptor pass name. + std::string getName(); + protected: // A set of adaptors to run. SmallVector mgrs; @@ -75,6 +78,11 @@ public: /// Run the held pipeline over all operations. void runOnOperation() override; + /// Return the async pass managers held by this parallel adaptor. + MutableArrayRef> getParallelPassManagers() { + return asyncExecutors; + } + private: // A set of executors, cloned from the main executor, that run asynchronously // on different threads. 
diff --git a/third_party/mlir/lib/Pass/PassManagerOptions.cpp b/third_party/mlir/lib/Pass/PassManagerOptions.cpp index 58eb35c7f6a..c9b19a61556 100644 --- a/third_party/mlir/lib/Pass/PassManagerOptions.cpp +++ b/third_party/mlir/lib/Pass/PassManagerOptions.cpp @@ -69,14 +69,30 @@ struct PassManagerOptions { llvm::cl::opt passTiming{ "pass-timing", llvm::cl::desc("Display the execution times of each pass")}; - llvm::cl::opt passTimingDisplayMode{ + llvm::cl::opt passTimingDisplayMode{ "pass-timing-display", llvm::cl::desc("Display method for pass timing data"), - llvm::cl::init(PassTimingDisplayMode::Pipeline), + llvm::cl::init(PassDisplayMode::Pipeline), llvm::cl::values( - clEnumValN(PassTimingDisplayMode::List, "list", + clEnumValN(PassDisplayMode::List, "list", "display the results in a list sorted by total time"), - clEnumValN(PassTimingDisplayMode::Pipeline, "pipeline", + clEnumValN(PassDisplayMode::Pipeline, "pipeline", + "display the results with a nested pipeline view"))}; + + //===--------------------------------------------------------------------===// + // Pass Statistics + //===--------------------------------------------------------------------===// + llvm::cl::opt passStatistics{ + "pass-statistics", llvm::cl::desc("Display the statistics of each pass")}; + llvm::cl::opt passStatisticsDisplayMode{ + "pass-statistics-display", + llvm::cl::desc("Display method for pass statistics"), + llvm::cl::init(PassDisplayMode::Pipeline), + llvm::cl::values( + clEnumValN( + PassDisplayMode::List, "list", + "display the results in a merged list sorted by pass name"), + clEnumValN(PassDisplayMode::Pipeline, "pipeline", "display the results with a nested pipeline view"))}; /// Add a pass timing instrumentation if enabled by 'pass-timing' flags. @@ -146,6 +162,10 @@ void mlir::applyPassManagerCLOptions(PassManager &pm) { if ((*options)->disableThreads) pm.disableMultithreading(); + // Enable statistics dumping. + if ((*options)->passStatistics) + pm.enableStatistics((*options)->passStatisticsDisplayMode); + // Add the IR printing instrumentation. (*options)->addPrinterInstrumentation(pm); diff --git a/third_party/mlir/lib/Pass/PassStatistics.cpp b/third_party/mlir/lib/Pass/PassStatistics.cpp new file mode 100644 index 00000000000..461cf882bd2 --- /dev/null +++ b/third_party/mlir/lib/Pass/PassStatistics.cpp @@ -0,0 +1,258 @@ +//===- PassStatistics.cpp -------------------------------------------------===// +// +// Copyright 2019 The MLIR Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= + +#include "PassDetail.h" +#include "mlir/Pass/PassManager.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Support/Format.h" + +using namespace mlir; +using namespace mlir::detail; + +constexpr llvm::StringLiteral kPassStatsDescription = + "... Pass statistics report ..."; + +namespace { +/// Information pertaining to a specific statistic. 
+struct Statistic { + const char *name, *desc; + unsigned value; +}; +} // end anonymous namespace + +/// Utility to print a pass entry in the statistics output. +static void printPassEntry(raw_ostream &os, unsigned indent, StringRef pass, + MutableArrayRef stats = llvm::None) { + os.indent(indent) << pass << "\n"; + if (stats.empty()) + return; + + // Make sure to sort the statistics by name. + llvm::array_pod_sort(stats.begin(), stats.end(), + [](const auto *lhs, const auto *rhs) { + return llvm::array_pod_sort_comparator( + &lhs->name, &rhs->name); + }); + + // Collect the largest name and value length from each of the statistics. + size_t largestName = 0, largestValue = 0; + for (auto &stat : stats) { + largestName = std::max(largestName, (size_t)strlen(stat.name)); + largestValue = + std::max(largestValue, (size_t)llvm::utostr(stat.value).size()); + } + + // Print each of the statistics. + for (auto &stat : stats) { + os.indent(indent + 2) << llvm::format("(S) %*u %-*s - %s\n", largestValue, + stat.value, largestName, stat.name, + stat.desc); + } +} + +/// Print the statistics results in a list form, where each pass is sorted by +/// name. +static void printResultsAsList(raw_ostream &os, OpPassManager &pm) { + llvm::StringMap> mergedStats; + std::function addStats = [&](Pass *pass) { + auto *adaptor = getAdaptorPassBase(pass); + + // If this is not an adaptor, add the stats to the list if there are any. + if (!adaptor) { + auto statistics = pass->getStatistics(); + if (statistics.empty()) + return; + + auto &passEntry = mergedStats[pass->getName()]; + if (passEntry.empty()) { + for (Pass::Statistic *it : pass->getStatistics()) + passEntry.push_back({it->getName(), it->getDesc(), it->getValue()}); + } else { + for (auto &it : llvm::enumerate(pass->getStatistics())) + passEntry[it.index()].value += it.value()->getValue(); + } + return; + } + + // Otherwise, recursively add each of the children. + for (auto &mgr : adaptor->getPassManagers()) + for (Pass &pass : mgr.getPasses()) + addStats(&pass); + }; + for (Pass &pass : pm.getPasses()) + addStats(&pass); + + // Sort the statistics by pass name and then by record name. + std::vector>> passAndStatistics; + for (auto &passIt : mergedStats) + passAndStatistics.push_back({passIt.first(), std::move(passIt.second)}); + llvm::sort(passAndStatistics, [](const auto &lhs, const auto &rhs) { + return lhs.first.compare(rhs.first) < 0; + }); + + // Print the timing information sequentially. + for (auto &statData : passAndStatistics) + printPassEntry(os, /*indent=*/2, statData.first, statData.second); +} + +/// Print the results in pipeline mode that mirrors the internal pass manager +/// structure. +static void printResultsAsPipeline(raw_ostream &os, OpPassManager &pm) { + std::function printPass = [&](unsigned indent, + Pass *pass) { + // Handle the case of an adaptor pass. + if (auto *adaptor = getAdaptorPassBase(pass)) { + // If this adaptor has more than one internal pipeline, print an entry for + // it. + auto mgrs = adaptor->getPassManagers(); + if (mgrs.size() > 1) { + printPassEntry(os, indent, adaptor->getName()); + indent += 2; + } + + // Print each of the children passes. + for (OpPassManager &mgr : mgrs) { + auto name = ("'" + mgr.getOpName().getStringRef() + "' Pipeline").str(); + printPassEntry(os, indent, name); + for (Pass &pass : mgr.getPasses()) + printPass(indent + 2, &pass); + } + return; + } + + // Otherwise, we print the statistics for this pass. 
+ std::vector stats; + for (Pass::Statistic *stat : pass->getStatistics()) + stats.push_back({stat->getName(), stat->getDesc(), stat->getValue()}); + printPassEntry(os, indent, pass->getName(), stats); + }; + for (Pass &pass : pm.getPasses()) + printPass(/*indent=*/0, &pass); +} + +void printStatistics(OpPassManager &pm, PassDisplayMode displayMode) { + auto os = llvm::CreateInfoOutputFile(); + + // Print the stats header. + *os << "===" << std::string(73, '-') << "===\n"; + // Figure out how many spaces for the description name. + unsigned padding = (80 - kPassStatsDescription.size()) / 2; + os->indent(padding) << kPassStatsDescription << '\n'; + *os << "===" << std::string(73, '-') << "===\n"; + + // Defer to a specialized printer for each display mode. + switch (displayMode) { + case PassDisplayMode::List: + printResultsAsList(*os, pm); + break; + case PassDisplayMode::Pipeline: + printResultsAsPipeline(*os, pm); + break; + } + *os << "\n"; + os->flush(); +} + +//===----------------------------------------------------------------------===// +// PassStatistics +//===----------------------------------------------------------------------===// + +Pass::Statistic::Statistic(Pass *owner, const char *name, + const char *description) + : llvm::Statistic{/*DebugType=*/"", name, description} { +#if LLVM_ENABLE_STATS + // Always set the 'initialized' bit to true so that this statistic isn't + // placed in the static registry. + // TODO: This is sort of a hack as `llvm::Statistic`s can't be setup to avoid + // automatic registration with the global registry. We should either add + // support for this in LLVM, or just write our own statistics classes. + Initialized = true; +#endif + + // Register this statistic with the parent. + owner->statistics.push_back(this); +} + +auto Pass::Statistic::operator=(unsigned value) -> Statistic & { + llvm::Statistic::operator=(value); + return *this; +} + +//===----------------------------------------------------------------------===// +// PassManager +//===----------------------------------------------------------------------===// + +/// Merge the pass statistics of this class into 'other'. +void OpPassManager::mergeStatisticsInto(OpPassManager &other) { + auto passes = getPasses(), otherPasses = other.getPasses(); + + for (auto passPair : llvm::zip(passes, otherPasses)) { + Pass &pass = std::get<0>(passPair), &otherPass = std::get<1>(passPair); + + // If this is an adaptor, then recursively merge the pass managers. + if (auto *adaptorPass = getAdaptorPassBase(&pass)) { + auto *otherAdaptorPass = getAdaptorPassBase(&otherPass); + for (auto mgrs : llvm::zip(adaptorPass->getPassManagers(), + otherAdaptorPass->getPassManagers())) + std::get<0>(mgrs).mergeStatisticsInto(std::get<1>(mgrs)); + continue; + } + // Otherwise, merge the statistics for the current pass. + assert(pass.statistics.size() == otherPass.statistics.size()); + for (unsigned i = 0, e = pass.statistics.size(); i != e; ++i) { + assert(pass.statistics[i]->getName() == + StringRef(otherPass.statistics[i]->getName())); + *otherPass.statistics[i] += *pass.statistics[i]; + *pass.statistics[i] = 0; + } + } +} + +/// Prepare the statistics of passes within the given pass manager for +/// consumption (e.g. dumping). 
+static void prepareStatistics(OpPassManager &pm) {
+  for (Pass &pass : pm.getPasses()) {
+    OpToOpPassAdaptorBase *adaptor = getAdaptorPassBase(&pass);
+    if (!adaptor)
+      continue;
+    MutableArrayRef<OpPassManager> nestedPms = adaptor->getPassManagers();
+
+    // If this is a parallel adaptor, merge the statistics from the async
+    // pass managers into the main nested pass managers.
+    if (auto *parallelAdaptor = dyn_cast<OpToOpPassAdaptorParallel>(&pass)) {
+      for (auto &asyncPM : parallelAdaptor->getParallelPassManagers()) {
+        for (unsigned i = 0, e = asyncPM.size(); i != e; ++i)
+          asyncPM[i].mergeStatisticsInto(nestedPms[i]);
+      }
+    }
+
+    // Prepare the statistics of each of the nested passes.
+    for (OpPassManager &nestedPM : nestedPms)
+      prepareStatistics(nestedPM);
+  }
+}
+
+/// Dump the statistics of the passes within this pass manager.
+void PassManager::dumpStatistics() {
+  prepareStatistics(*this);
+  printStatistics(*this, *passStatisticsMode);
+}
+
+/// Dump the statistics for each pass after running.
+void PassManager::enableStatistics(PassDisplayMode displayMode) {
+  passStatisticsMode = displayMode;
+}
diff --git a/third_party/mlir/lib/Pass/PassTiming.cpp b/third_party/mlir/lib/Pass/PassTiming.cpp
index 69a2cb723e5..4747249690f 100644
--- a/third_party/mlir/lib/Pass/PassTiming.cpp
+++ b/third_party/mlir/lib/Pass/PassTiming.cpp
@@ -169,7 +169,7 @@ struct Timer {
 };
 
 struct PassTiming : public PassInstrumentation {
-  PassTiming(PassTimingDisplayMode displayMode) : displayMode(displayMode) {}
+  PassTiming(PassDisplayMode displayMode) : displayMode(displayMode) {}
   ~PassTiming() override { print(); }
 
   /// Setup the instrumentation hooks.
@@ -242,7 +242,7 @@ struct PassTiming : public PassInstrumentation {
   DenseMap<uint64_t, SmallVector<Timer *, 4>> activeThreadTimers;
 
   /// The display mode to use when printing the timing results.
-  PassTimingDisplayMode displayMode;
+  PassDisplayMode displayMode;
 
   /// A mapping of pipeline timers that need to be merged into the parent
   /// collection. The timers are mapped to the parent info to merge into.
@@ -289,15 +289,8 @@ void PassTiming::startPassTimer(Pass *pass) {
   auto kind = isAdaptorPass(pass) ? TimerKind::PipelineCollection
                                   : TimerKind::PassOrAnalysis;
   Timer *timer = getTimer(pass, kind, [pass]() -> std::string {
-    if (auto *adaptor = getAdaptorPassBase(pass)) {
-      std::string name = "Pipeline Collection : [";
-      llvm::raw_string_ostream os(name);
-      interleaveComma(adaptor->getPassManagers(), os, [&](OpPassManager &pm) {
-        os << '\'' << pm.getOpName() << '\'';
-      });
-      os << ']';
-      return os.str();
-    }
+    if (auto *adaptor = getAdaptorPassBase(pass))
+      return adaptor->getName();
     return pass->getName();
   });
 
@@ -345,8 +338,8 @@ void PassTiming::runAfterAnalysis(llvm::StringRef, AnalysisID *, Operation *) {
 static void printTimerHeader(llvm::raw_ostream &os, TimeRecord total) {
   os << "===" << std::string(73, '-') << "===\n";
   // Figure out how many spaces to description name.
-  unsigned Padding = (80 - kPassTimingDescription.size()) / 2;
-  os.indent(Padding) << kPassTimingDescription << '\n';
+  unsigned padding = (80 - kPassTimingDescription.size()) / 2;
+  os.indent(padding) << kPassTimingDescription << '\n';
   os << "===" << std::string(73, '-') << "===\n";
 
   // Print the total time followed by the section headers.
@@ -379,10 +372,10 @@ void PassTiming::print() {
   // Defer to a specialized printer for each display mode.
   switch (displayMode) {
-  case PassTimingDisplayMode::List:
+  case PassDisplayMode::List:
     printResultsAsList(*os, rootTimer.get(), totalTime);
     break;
-  case PassTimingDisplayMode::Pipeline:
+  case PassDisplayMode::Pipeline:
     printResultsAsPipeline(*os, rootTimer.get(), totalTime);
     break;
   }
 
@@ -472,7 +465,7 @@ void PassTiming::printResultsAsPipeline(raw_ostream &os, Timer *root,
 
 /// Add an instrumentation to time the execution of passes and the computation
 /// of analyses.
-void PassManager::enableTiming(PassTimingDisplayMode displayMode) {
+void PassManager::enableTiming(PassDisplayMode displayMode) {
   // Check if pass timing is already enabled.
   if (passTiming)
     return;
diff --git a/third_party/mlir/lib/Transforms/CSE.cpp b/third_party/mlir/lib/Transforms/CSE.cpp
index 70eb69c2f9c..18f9fce5e46 100644
--- a/third_party/mlir/lib/Transforms/CSE.cpp
+++ b/third_party/mlir/lib/Transforms/CSE.cpp
@@ -124,6 +124,10 @@ struct CSE : public OperationPass<CSE> {
 private:
   /// Operations marked as dead and to be erased.
   std::vector<Operation *> opsToErase;
+
+  /// Statistics for CSE.
+  Statistic numCSE{this, "num-cse'd", "Number of operations CSE'd"};
+  Statistic numDCE{this, "num-dce'd", "Number of operations trivially DCE'd"};
 };
 } // end anonymous namespace
 
@@ -143,6 +147,7 @@ LogicalResult CSE::simplifyOperation(ScopedMapTy &knownValues, Operation *op) {
   // If the operation is already trivially dead just add it to the erase list.
   if (op->use_empty()) {
     opsToErase.push_back(op);
+    ++numDCE;
     return success();
   }
 
@@ -160,6 +165,8 @@ LogicalResult CSE::simplifyOperation(ScopedMapTy &knownValues, Operation *op) {
       !op->getLoc().isa<UnknownLoc>()) {
     existing->setLoc(op->getLoc());
   }
+
+  ++numCSE;
   return success();
 }
 
diff --git a/third_party/mlir/test/lib/Pass/TestPassManager.cpp b/third_party/mlir/test/lib/Pass/TestPassManager.cpp
index aae83fb8993..d1e1a6d13ee 100644
--- a/third_party/mlir/test/lib/Pass/TestPassManager.cpp
+++ b/third_party/mlir/test/lib/Pass/TestPassManager.cpp
@@ -74,6 +74,18 @@ public:
 class TestCrashRecoveryPass : public OperationPass<TestCrashRecoveryPass> {
   void runOnOperation() final { abort(); }
 };
+
+/// A test pass that contains a statistic.
+struct TestStatisticPass : public OperationPass<TestStatisticPass> {
+  TestStatisticPass() = default;
+  TestStatisticPass(const TestStatisticPass &) {}
+
+  Statistic opCount{this, "num-ops", "Number of operations counted"};
+
+  void runOnOperation() final {
+    getOperation()->walk([&](Operation *) { ++opCount; });
+  }
+};
 } // end anonymous namespace
 
 static void testNestedPipeline(OpPassManager &pm) {
@@ -106,6 +118,9 @@ static PassRegistration<TestCrashRecoveryPass>
     unusedCrashP("test-pass-crash",
                  "Test a pass in the pass manager that always crashes");
 
+static PassRegistration<TestStatisticPass> unusedStatP("test-stats-pass",
+                                                       "Test pass statistics");
+
 static PassPipelineRegistration<> unused("test-pm-nested-pipeline",
                                          "Test a nested pipeline in the pass manager",
                                          testNestedPipeline);

From d70d9fb0710ec533e522f71b4433983bc5e4c1d5 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy
Date: Thu, 5 Dec 2019 12:33:47 -0800
Subject: [PATCH 184/383] Re-enable X64 input to program with dynamic shapes.

PiperOrigin-RevId: 284029794
Change-Id: I3eb5980cc6fd74e29b6736e8332a62402016c040
---
 tensorflow/python/distribute/custom_training_loop_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/distribute/custom_training_loop_test.py b/tensorflow/python/distribute/custom_training_loop_test.py
index 925fd640f52..55c2ae6a1ca 100644
--- a/tensorflow/python/distribute/custom_training_loop_test.py
+++ b/tensorflow/python/distribute/custom_training_loop_test.py
@@ -217,9 +217,11 @@ class InputIterationTest(test.TestCase, parameterized.TestCase):
 
   def _get_dataset(self):
     if tf2.enabled():
-      return dataset_ops.DatasetV2.range(10).batch(2)
+      return dataset_ops.DatasetV2.range(10).\
+          map(lambda x: math_ops.cast(x, dtypes.int32)).batch(2)
     else:
-      return dataset_ops.Dataset.range(10).batch(2)
+      return dataset_ops.Dataset.range(10).\
+          map(lambda x: math_ops.cast(x, dtypes.int32)).batch(2)
 
   def _validate_outputs(self, actual_results):
     expected_results = [[i**2, (i+1)**2] for i in range(0, 10, 2)]

From c4972a57701bb426b917ae850e37203e4102eadc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 5 Dec 2019 12:38:17 -0800
Subject: [PATCH 185/383] Delete std::move on temporaries

PiperOrigin-RevId: 284030547
Change-Id: Id8d6fbcce931aa390f4222be350dd5284fbf64b7
---
 tensorflow/core/distributed_runtime/rpc/grpc_state.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 805a38769f0..0f0aa66d6b7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -107,8 +107,7 @@ class RPCState : public GrpcClientCQTag {
 
     VLOG(2) << "Starting call: " << method_;
 
-    call_ = std::move(
-        stub_->PrepareUnaryCall(context_.get(), method_, request_buf_, cq_));
+    call_ = stub_->PrepareUnaryCall(context_.get(), method_, request_buf_, cq_);
     call_->StartCall();
     call_->Finish(&response_buf_, &status_, this);
   }
@@ -675,7 +674,7 @@ class StreamingRPCDispatcher {
     context_->set_wait_for_ready(true);
 
     std::unique_ptr<::grpc::GenericClientAsyncReaderWriter> call =
-        std::move(stub_->PrepareCall(context_.get(), method_, cq_));
+        stub_->PrepareCall(context_.get(), method_, cq_);
 
    state_.reset(new StreamingRPCState<Response>(std::move(call), context_));
  }

From 7e2b4b8c968c85e08e7d78faf4cd3dc36cf3a4ba Mon Sep 17 00:00:00 2001
From: Denis Khalikov
Date: Thu, 5 Dec 2019 13:10:10 -0800
Subject: [PATCH 186/383] [spirv] Add CompositeInsertOp operation

A CompositeInsertOp operation makes a copy of a composite object, while
modifying one part of it.
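
As a purely illustrative sketch (the SSA value names %object, %composite, and
%result are hypothetical), the custom assembly form introduced by this op is
used as follows:

```
// Copy %composite (an !spv.array<4xf32>), replacing element 1 with %object.
%result = spv.CompositeInsert %object, %composite[1 : i32] : f32 into !spv.array<4xf32>
```
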
Closes #292 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/292 from denis0x0D:sandbox/composite_insert 2200962b9057bda53cd2f2866b461e2797196380 PiperOrigin-RevId: 284036551 Change-Id: I47c2fb14883ebf0efcb709a849af79999c282d7c --- third_party/mlir/BUILD | 1 + .../include/mlir/Dialect/SPIRV/SPIRVBase.td | 29 ++--- .../mlir/Dialect/SPIRV/SPIRVCompositeOps.td | 118 ++++++++++++++++++ .../include/mlir/Dialect/SPIRV/SPIRVOps.td | 47 +------ .../mlir/lib/Dialect/SPIRV/SPIRVOps.cpp | 107 ++++++++++++---- 5 files changed, 221 insertions(+), 81 deletions(-) create mode 100644 third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVCompositeOps.td diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 2b6da27099d..0854e6be4c3 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -956,6 +956,7 @@ filegroup( "include/mlir/Dialect/SPIRV/SPIRVBase.td", "include/mlir/Dialect/SPIRV/SPIRVBitOps.td", "include/mlir/Dialect/SPIRV/SPIRVCastOps.td", + "include/mlir/Dialect/SPIRV/SPIRVCompositeOps.td", "include/mlir/Dialect/SPIRV/SPIRVControlFlowOps.td", "include/mlir/Dialect/SPIRV/SPIRVGLSLOps.td", "include/mlir/Dialect/SPIRV/SPIRVGroupOps.td", diff --git a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td index c7acc3720e9..62095a518e9 100644 --- a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td +++ b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVBase.td @@ -1076,6 +1076,7 @@ def SPV_OC_OpAccessChain : I32EnumAttrCase<"OpAccessChain", 65>; def SPV_OC_OpDecorate : I32EnumAttrCase<"OpDecorate", 71>; def SPV_OC_OpMemberDecorate : I32EnumAttrCase<"OpMemberDecorate", 72>; def SPV_OC_OpCompositeExtract : I32EnumAttrCase<"OpCompositeExtract", 81>; +def SPV_OC_OpCompositeInsert : I32EnumAttrCase<"OpCompositeInsert", 82>; def SPV_OC_OpConvertFToU : I32EnumAttrCase<"OpConvertFToU", 109>; def SPV_OC_OpConvertFToS : I32EnumAttrCase<"OpConvertFToS", 110>; def SPV_OC_OpConvertSToF : I32EnumAttrCase<"OpConvertSToF", 111>; @@ -1170,20 +1171,20 @@ def SPV_OpcodeAttr : SPV_OC_OpSpecConstantComposite, SPV_OC_OpFunction, SPV_OC_OpFunctionParameter, SPV_OC_OpFunctionEnd, SPV_OC_OpFunctionCall, SPV_OC_OpVariable, SPV_OC_OpLoad, SPV_OC_OpStore, SPV_OC_OpAccessChain, SPV_OC_OpDecorate, - SPV_OC_OpMemberDecorate, SPV_OC_OpCompositeExtract, SPV_OC_OpConvertFToU, - SPV_OC_OpConvertFToS, SPV_OC_OpConvertSToF, SPV_OC_OpConvertUToF, - SPV_OC_OpUConvert, SPV_OC_OpSConvert, SPV_OC_OpFConvert, SPV_OC_OpBitcast, - SPV_OC_OpFNegate, SPV_OC_OpIAdd, SPV_OC_OpFAdd, SPV_OC_OpISub, SPV_OC_OpFSub, - SPV_OC_OpIMul, SPV_OC_OpFMul, SPV_OC_OpUDiv, SPV_OC_OpSDiv, SPV_OC_OpFDiv, - SPV_OC_OpUMod, SPV_OC_OpSRem, SPV_OC_OpSMod, SPV_OC_OpFRem, SPV_OC_OpFMod, - SPV_OC_OpLogicalEqual, SPV_OC_OpLogicalNotEqual, SPV_OC_OpLogicalOr, - SPV_OC_OpLogicalAnd, SPV_OC_OpLogicalNot, SPV_OC_OpSelect, SPV_OC_OpIEqual, - SPV_OC_OpINotEqual, SPV_OC_OpUGreaterThan, SPV_OC_OpSGreaterThan, - SPV_OC_OpUGreaterThanEqual, SPV_OC_OpSGreaterThanEqual, SPV_OC_OpULessThan, - SPV_OC_OpSLessThan, SPV_OC_OpULessThanEqual, SPV_OC_OpSLessThanEqual, - SPV_OC_OpFOrdEqual, SPV_OC_OpFUnordEqual, SPV_OC_OpFOrdNotEqual, - SPV_OC_OpFUnordNotEqual, SPV_OC_OpFOrdLessThan, SPV_OC_OpFUnordLessThan, - SPV_OC_OpFOrdGreaterThan, SPV_OC_OpFUnordGreaterThan, + SPV_OC_OpMemberDecorate, SPV_OC_OpCompositeExtract, SPV_OC_OpCompositeInsert, + SPV_OC_OpConvertFToU, SPV_OC_OpConvertFToS, SPV_OC_OpConvertSToF, + SPV_OC_OpConvertUToF, SPV_OC_OpUConvert, SPV_OC_OpSConvert, SPV_OC_OpFConvert, + 
SPV_OC_OpBitcast, SPV_OC_OpFNegate, SPV_OC_OpIAdd, SPV_OC_OpFAdd, + SPV_OC_OpISub, SPV_OC_OpFSub, SPV_OC_OpIMul, SPV_OC_OpFMul, SPV_OC_OpUDiv, + SPV_OC_OpSDiv, SPV_OC_OpFDiv, SPV_OC_OpUMod, SPV_OC_OpSRem, SPV_OC_OpSMod, + SPV_OC_OpFRem, SPV_OC_OpFMod, SPV_OC_OpLogicalEqual, SPV_OC_OpLogicalNotEqual, + SPV_OC_OpLogicalOr, SPV_OC_OpLogicalAnd, SPV_OC_OpLogicalNot, SPV_OC_OpSelect, + SPV_OC_OpIEqual, SPV_OC_OpINotEqual, SPV_OC_OpUGreaterThan, + SPV_OC_OpSGreaterThan, SPV_OC_OpUGreaterThanEqual, SPV_OC_OpSGreaterThanEqual, + SPV_OC_OpULessThan, SPV_OC_OpSLessThan, SPV_OC_OpULessThanEqual, + SPV_OC_OpSLessThanEqual, SPV_OC_OpFOrdEqual, SPV_OC_OpFUnordEqual, + SPV_OC_OpFOrdNotEqual, SPV_OC_OpFUnordNotEqual, SPV_OC_OpFOrdLessThan, + SPV_OC_OpFUnordLessThan, SPV_OC_OpFOrdGreaterThan, SPV_OC_OpFUnordGreaterThan, SPV_OC_OpFOrdLessThanEqual, SPV_OC_OpFUnordLessThanEqual, SPV_OC_OpFOrdGreaterThanEqual, SPV_OC_OpFUnordGreaterThanEqual, SPV_OC_OpShiftRightLogical, SPV_OC_OpShiftRightArithmetic, diff --git a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVCompositeOps.td b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVCompositeOps.td new file mode 100644 index 00000000000..71650504741 --- /dev/null +++ b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVCompositeOps.td @@ -0,0 +1,118 @@ +//===-- SPIRVCompositeOps.td - MLIR SPIR-V Composite Ops ---*- tablegen -*-===// +// +// Copyright 2019 The MLIR Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= +// +// This file contains composite ops for SPIR-V dialect. It corresponds +// to "3.32.12. Composite Instructions" of the SPIR-V spec. +// +//===----------------------------------------------------------------------===// + +#ifndef SPIRV_COMPOSITE_OPS +#define SPIRV_COMPOSITE_OPS + +include "mlir/Dialect/SPIRV/SPIRVBase.td" + +def SPV_CompositeExtractOp : SPV_Op<"CompositeExtract", [NoSideEffect]> { + let summary = "Extract a part of a composite object."; + + let description = [{ + Result Type must be the type of object selected by the last provided + index. The instruction result is the extracted object. + + Composite is the composite to extract from. + + Indexes walk the type hierarchy, potentially down to component + granularity, to select the part to extract. All indexes must be in + bounds. All composite constituents use zero-based numbering, as + described by their OpType… instruction. 
+ + ### Custom assembly form + + ``` {.ebnf} + composite-extract-op ::= ssa-id `=` `spv.CompositeExtract` ssa-use + `[` integer-literal (',' integer-literal)* `]` + `:` composite-type + ``` + + For example: + + ``` + %0 = spv.Variable : !spv.ptr>, Function> + %1 = spv.Load "Function" %0 ["Volatile"] : !spv.array<4x!spv.array<4xf32>> + %2 = spv.CompositeExtract %1[1 : i32] : !spv.array<4x!spv.array<4xf32>> + ``` + + }]; + + let arguments = (ins + SPV_Composite:$composite, + I32ArrayAttr:$indices + ); + + let results = (outs + SPV_Type:$component + ); + + let hasFolder = 1; +} + +// ----- + +def SPV_CompositeInsertOp : SPV_Op<"CompositeInsert", [NoSideEffect]> { + let summary = [{ + Make a copy of a composite object, while modifying one part of it. + }]; + + let description = [{ + Result Type must be the same type as Composite. + + Object is the object to use as the modified part. + + Composite is the composite to copy all but the modified part from. + + Indexes walk the type hierarchy of Composite to the desired depth, + potentially down to component granularity, to select the part to modify. + All indexes must be in bounds. All composite constituents use zero-based + numbering, as described by their OpType… instruction. The type of the + part selected to modify must match the type of Object. + + ### Custom assembly form + + ``` {.ebnf} + composite-insert-op ::= ssa-id `=` `spv.CompositeInsert` ssa-use, ssa-use + `[` integer-literal (',' integer-literal)* `]` + `:` object-type `into` composite-type + ``` + + For example: + + ``` + %0 = spv.CompositeInsert %object, %composite[1 : i32] : f32 into !spv.array<4xf32> + ``` + }]; + + let arguments = (ins + SPV_Type:$object, + SPV_Composite:$composite, + I32ArrayAttr:$indices + ); + + let results = (outs + SPV_Composite:$result + ); +} + +#endif // SPIRV_COMPOSITE_OPS diff --git a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td index 000f1ddaa79..bbb99da7209 100644 --- a/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td +++ b/third_party/mlir/include/mlir/Dialect/SPIRV/SPIRVOps.td @@ -35,6 +35,7 @@ include "mlir/Dialect/SPIRV/SPIRVArithmeticOps.td" include "mlir/Dialect/SPIRV/SPIRVAtomicOps.td" include "mlir/Dialect/SPIRV/SPIRVBitOps.td" include "mlir/Dialect/SPIRV/SPIRVCastOps.td" +include "mlir/Dialect/SPIRV/SPIRVCompositeOps.td" include "mlir/Dialect/SPIRV/SPIRVControlFlowOps.td" include "mlir/Dialect/SPIRV/SPIRVGLSLOps.td" include "mlir/Dialect/SPIRV/SPIRVGroupOps.td" @@ -108,52 +109,6 @@ def SPV_AccessChainOp : SPV_Op<"AccessChain", [NoSideEffect]> { // ----- -def SPV_CompositeExtractOp : SPV_Op<"CompositeExtract", [NoSideEffect]> { - let summary = "Extract a part of a composite object."; - - let description = [{ - Result Type must be the type of object selected by the last provided - index. The instruction result is the extracted object. - - Composite is the composite to extract from. - - Indexes walk the type hierarchy, potentially down to component - granularity, to select the part to extract. All indexes must be in - bounds. All composite constituents use zero-based numbering, as - described by their OpType… instruction. 
- - ### Custom assembly form - - ``` {.ebnf} - composite-extract-op ::= ssa-id `=` `spv.CompositeExtract` ssa-use - `[` integer-literal (',' integer-literal)* `]` - `:` composite-type - ``` - - For example: - - ``` - %0 = spv.Variable : !spv.ptr>, Function> - %1 = spv.Load "Function" %0 ["Volatile"] : !spv.array<4x!spv.array<4xf32>> - %2 = spv.CompositeExtract %1[1 : i32] : !spv.array<4x!spv.array<4xf32>> - ``` - - }]; - - let arguments = (ins - SPV_Composite:$composite, - I32ArrayAttr:$indices - ); - - let results = (outs - SPV_Type:$component - ); - - let hasFolder = 1; -} - -// ----- - def SPV_ControlBarrierOp : SPV_Op<"ControlBarrier", []> { let summary = [{ Wait for other invocations of this module to reach the current point of diff --git a/third_party/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp b/third_party/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp index 99705f6dcc2..34e2e88aca0 100644 --- a/third_party/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp +++ b/third_party/mlir/lib/Dialect/SPIRV/SPIRVOps.cpp @@ -377,6 +377,34 @@ static unsigned getBitWidth(Type type) { llvm_unreachable("unhandled bit width computation for type"); } +/// Walks the given type hierarchy with the given indices, potentially down +/// to component granularity, to select an element type. Returns null type and +/// emits errors with the given loc on failure. +static Type getElementType(Type type, ArrayAttr indices, Location loc) { + if (!indices.size()) { + emitError(loc, "expected at least one index"); + return nullptr; + } + + int32_t index; + for (auto indexAttr : indices) { + index = indexAttr.dyn_cast().getInt(); + if (auto cType = type.dyn_cast()) { + if (index < 0 || static_cast(index) >= cType.getNumElements()) { + emitError(loc, "index ") << index << " out of bounds for " << type; + return nullptr; + } + type = cType.getElementType(index); + } else { + emitError(loc, "cannot extract from non-composite type ") + << type << " with index " << index; + return nullptr; + } + } + + return type; +} + /// Returns true if the given `block` only contains one `spv._merge` op. 
static inline bool isMergeBlock(Block &block) { return !block.empty() && std::next(block.begin()) == block.end() && @@ -1094,28 +1122,11 @@ static void print(spirv::CompositeExtractOp compositeExtractOp, } static LogicalResult verify(spirv::CompositeExtractOp compExOp) { - auto resultType = compExOp.composite()->getType(); auto indicesArrayAttr = compExOp.indices().dyn_cast(); - - if (!indicesArrayAttr.size()) { - return compExOp.emitOpError( - "expected at least one index for spv.CompositeExtractOp"); - } - - int32_t index; - for (auto indexAttr : indicesArrayAttr) { - index = indexAttr.dyn_cast().getInt(); - if (auto cType = resultType.dyn_cast()) { - if (index < 0 || static_cast(index) >= cType.getNumElements()) { - return compExOp.emitOpError("index ") - << index << " out of bounds for " << resultType; - } - resultType = cType.getElementType(index); - } else { - return compExOp.emitError("cannot extract from non-composite type ") - << resultType << " with index " << index; - } - } + auto resultType = getElementType(compExOp.composite()->getType(), + indicesArrayAttr, compExOp.getLoc()); + if (!resultType) + return failure(); if (resultType != compExOp.getType()) { return compExOp.emitOpError("invalid result type: expected ") @@ -1135,6 +1146,60 @@ OpFoldResult spirv::CompositeExtractOp::fold(ArrayRef operands) { return extractCompositeElement(operands[0], indexVector); } +//===----------------------------------------------------------------------===// +// spv.CompositeInsert +//===----------------------------------------------------------------------===// + +static ParseResult parseCompositeInsertOp(OpAsmParser &parser, + OperationState &state) { + SmallVector operands; + Type objectType, compositeType; + Attribute indicesAttr; + auto loc = parser.getCurrentLocation(); + + return failure( + parser.parseOperandList(operands, 2) || + parser.parseAttribute(indicesAttr, kIndicesAttrName, state.attributes) || + parser.parseColonType(objectType) || + parser.parseKeywordType("into", compositeType) || + parser.resolveOperands(operands, {objectType, compositeType}, loc, + state.operands) || + parser.addTypesToList(compositeType, state.types)); +} + +static LogicalResult verify(spirv::CompositeInsertOp compositeInsertOp) { + auto indicesArrayAttr = compositeInsertOp.indices().dyn_cast(); + auto objectType = + getElementType(compositeInsertOp.composite()->getType(), indicesArrayAttr, + compositeInsertOp.getLoc()); + if (!objectType) + return failure(); + + if (objectType != compositeInsertOp.object()->getType()) { + return compositeInsertOp.emitOpError("object operand type should be ") + << objectType << ", but found " + << compositeInsertOp.object()->getType(); + } + + if (compositeInsertOp.composite()->getType() != compositeInsertOp.getType()) { + return compositeInsertOp.emitOpError("result type should be the same as " + "the composite type, but found ") + << compositeInsertOp.composite()->getType() << " vs " + << compositeInsertOp.getType(); + } + + return success(); +} + +static void print(spirv::CompositeInsertOp compositeInsertOp, + OpAsmPrinter &printer) { + printer << spirv::CompositeInsertOp::getOperationName() << " " + << *compositeInsertOp.object() << ", " + << *compositeInsertOp.composite() << compositeInsertOp.indices() + << " : " << compositeInsertOp.object()->getType() << " into " + << compositeInsertOp.composite()->getType(); +} + //===----------------------------------------------------------------------===// // spv.constant 
//===----------------------------------------------------------------------===// From 769892b35320bc17b5ef73c69d44e7acd51ecc78 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Thu, 5 Dec 2019 13:10:45 -0800 Subject: [PATCH 187/383] [tf.data] Rolling forward a previously rolled back change with a fix. PiperOrigin-RevId: 284036647 Change-Id: I9d50ad7aa8123f6928c055a25bc3dc4d69d2b95d --- .../bucket_by_sequence_length_test.py | 32 +++++------ .../kernel_tests/copy_to_device_test.py | 47 ++++++++-------- .../experimental/kernel_tests/counter_test.py | 30 +++++------ .../kernel_tests/csv_dataset_test.py | 54 +++++++++++++++++-- .../dense_to_sparse_batch_test.py | 10 ++-- .../directed_interleave_dataset_test.py | 11 ++-- .../kernel_tests/get_single_element_test.py | 28 ++++++---- .../kernel_tests/group_by_reducer_test.py | 15 ++++-- .../kernel_tests/group_by_window_test.py | 17 ++++-- .../kernel_tests/ignore_errors_test.py | 11 ++-- .../make_batched_features_dataset_test.py | 15 ++++-- .../kernel_tests/make_csv_dataset_test.py | 23 ++++++-- .../make_tf_record_dataset_test.py | 14 +++-- .../kernel_tests/map_defun_op_test.py | 46 ++++++++++++++-- .../kernel_tests/override_threadpool_test.py | 47 +++++++--------- .../kernel_tests/parallel_interleave_test.py | 40 ++++++++++++-- .../parse_example_dataset_test.py | 30 ++++++++--- .../kernel_tests/prefetch_to_device_test.py | 19 ++++--- .../kernel_tests/prefetch_with_slack_test.py | 11 ++-- .../kernel_tests/rebatch_dataset_test.py | 44 +++++++++++---- .../kernel_tests/rejection_resample_test.py | 16 +++--- .../kernel_tests/shuffle_and_repeat_test.py | 16 ++++-- .../kernel_tests/sql_dataset_test.py | 42 +++++++++++++-- .../kernel_tests/stats_dataset_ops_test.py | 30 +++++++++-- .../kernel_tests/take_while_test.py | 34 +++++++++--- .../kernel_tests/tf_record_writer_test.py | 15 ++++-- .../experimental/kernel_tests/unique_test.py | 11 ++-- .../experimental/kernel_tests/variant_test.py | 9 ++-- .../kernel_tests/wrap_unwrap_test.py | 12 +++-- 29 files changed, 539 insertions(+), 190 deletions(-) diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py index d9c463d744d..d829863b994 100644 --- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py @@ -25,11 +25,11 @@ from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -73,14 +73,12 @@ def _get_record_shape(sparse): return tensor_shape.TensorShape([None]) -@test_util.run_all_in_graph_and_eager_modes class BucketBySequenceLengthTest(test_base.DatasetTestBase, parameterized.TestCase): - @parameterized.named_parameters( - ("WithoutPadding", True), - ("WithPadding", False), - ) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(param_no_padding=[True, False]))) def 
testBucketDropReminder(self, param_no_padding): boundaries = [10, 20, 30] @@ -201,10 +199,9 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, _test_bucket_by_padding(param_no_padding) - @parameterized.named_parameters( - ("WithoutPadding", True), - ("WithPadding", False), - ) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(param_no_padding=[True, False]))) def testBucket(self, param_no_padding): boundaries = [10, 20, 30] @@ -347,10 +344,9 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, self.assertAllEqual(batches[4], [[1, 1, 1, 1, 1, 1, 1, 1, 1, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]) - @parameterized.named_parameters( - ("WithoutPadding", True), - ("WithPadding", False), - ) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(param_no_padding=[True, False]))) def testTupleElements(self, param_no_padding): def build_dataset(sparse): @@ -381,10 +377,10 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase, _test_tuple_elements_by_padding(param_no_padding) - @parameterized.named_parameters( - ("DoDropRemainder", True), - ("DoNotDropRemainder", False), - ) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(param_drop_remainder=[True, False]))) def testBucketSparse(self, param_drop_remainder): # pylint: disable=g-doc-args """Tests bucketing of sparse tensors (case where `no_padding` == True). diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py index 36c61636798..2fa149fcbaa 100644 --- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py @@ -17,6 +17,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.core.protobuf import config_pb2 from tensorflow.python.compat import compat from tensorflow.python.data.experimental.ops import prefetching_ops @@ -24,6 +26,7 @@ from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops from tensorflow.python.data.util import structure +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -35,9 +38,9 @@ from tensorflow.python.util import compat as util_compat # TODO(b/117581999): add eager coverage when supported. 
-class CopyToDeviceTest(test_base.DatasetTestBase): +class CopyToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -62,7 +65,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceInt32(self): host_dataset = dataset_ops.Dataset.from_tensors([0, 1, 2, 3]) device_dataset = host_dataset.apply( @@ -86,7 +89,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToSameDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -111,7 +114,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceWithPrefetch(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -136,7 +139,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyDictToDevice(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -161,7 +164,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyDictToDeviceWithPrefetch(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -186,7 +189,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopySparseTensorsToDevice(self): def make_tensor(i): @@ -219,7 +222,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopySparseTensorsToDeviceWithPrefetch(self): def make_tensor(i): @@ -252,7 +255,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpu(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -273,7 +276,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuWithPrefetch(self): if not test_util.is_gpu_available(): 
self.skipTest("No GPU available") @@ -294,7 +297,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuWithMap(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -332,7 +335,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuInt32(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -352,7 +355,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuInt32AndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -372,7 +375,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuStrings(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -392,7 +395,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuStringsAndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -412,7 +415,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDevicePingPongCPUGPU(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -436,7 +439,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceWithReInit(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -465,7 +468,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceWithReInitAndPrefetch(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -494,7 +497,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuWithReInit(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -518,7 +521,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + 
@combinations.generate(test_base.graph_only_combinations()) def testCopyToDeviceGpuWithReInitAndPrefetch(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -542,7 +545,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testIteratorGetNextAsOptionalOnGPU(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py index 79e4523ea43..455e49aafc7 100644 --- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py @@ -17,35 +17,33 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.experimental.ops import counter from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class CounterTest(test_base.DatasetTestBase): +class CounterTest(test_base.DatasetTestBase, parameterized.TestCase): - def testCounter(self): + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(start=3, step=4, expected_output=[[3, 7, 11]]) + + combinations.combine(start=0, step=-1, expected_output=[[0, -1, -2]])) + ) + def testCounter(self, start, step, expected_output): """Test dataset construction using `count`.""" - dataset = counter.Counter(start=3, step=4) + dataset = counter.Counter(start, step) self.assertEqual( [], dataset_ops.get_legacy_output_shapes(dataset).as_list()) self.assertEqual(dtypes.int64, dataset_ops.get_legacy_output_types(dataset)) get_next = self.getNext(dataset) - - negative_dataset = counter.Counter(start=0, step=-1) - negative_get_next = self.getNext(negative_dataset) - - self.assertEqual(3, self.evaluate(get_next())) - self.assertEqual(3 + 4, self.evaluate(get_next())) - self.assertEqual(3 + 2 * 4, self.evaluate(get_next())) - - self.assertEqual(0, self.evaluate(negative_get_next())) - self.assertEqual(-1, self.evaluate(negative_get_next())) - self.assertEqual(-2, self.evaluate(negative_get_next())) + for expected in expected_output: + self.assertEqual(expected, self.evaluate(get_next())) if __name__ == "__main__": diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py index 4b349ebd811..941ca209848 100644 --- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py @@ -22,21 +22,22 @@ import gzip import os import zlib +from absl.testing import parameterized + from tensorflow.python.data.experimental.ops import error_ops from tensorflow.python.data.experimental.ops import readers from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import readers as core_readers from tensorflow.python.eager import context +from tensorflow.python.framework import 
combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class CsvDatasetTest(test_base.DatasetTestBase): +class CsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def _setup_files(self, inputs, linebreak='\n', compression_type=None): filenames = [] @@ -117,26 +118,31 @@ class CsvDatasetTest(test_base.DatasetTestBase): dataset = readers.CsvDataset(filenames, **kwargs) self._verify_output_or_err(dataset, expected_output, expected_err_re) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_requiredFields(self): record_defaults = [[]] * 4 inputs = [['1,2,3,4']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_int(self): record_defaults = [[0]] * 4 inputs = [['1,2,3,4', '5,6,7,8']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_float(self): record_defaults = [[0.0]] * 4 inputs = [['1.0,2.1,3.2,4.3', '5.4,6.5,7.6,8.7']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_string(self): record_defaults = [['']] * 4 inputs = [['1.0,2.1,hello,4.3', '5.4,6.5,goodbye,8.7']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withEmptyFields(self): record_defaults = [[0]] * 4 inputs = [[',,,', '1,1,1,', ',2,2,2']] @@ -144,6 +150,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errWithUnquotedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,2"3,4']] @@ -152,6 +159,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected_err_re='Unquoted fields cannot have quotes inside', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errWithUnescapedQuotes(self): record_defaults = [['']] * 3 inputs = [['"a"b","c","d"']] @@ -161,6 +169,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): 'Quote inside a string has to be escaped by another quote', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_ignoreErrWithUnescapedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,"2"3",4', '1,"2"3",4",5,5', 'a,b,"c"d"', 'e,f,g']] @@ -169,6 +178,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): dataset = dataset.apply(error_ops.ignore_errors()) self._verify_output_or_err(dataset, [['e', 'f', 'g']]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_ignoreErrWithUnquotedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,2"3,4', 'a,b,c"d', '9,8"7,6,5', 'e,f,g']] @@ -177,12 +187,14 @@ class CsvDatasetTest(test_base.DatasetTestBase): dataset = dataset.apply(error_ops.ignore_errors()) self._verify_output_or_err(dataset, [['e', 'f', 'g']]) + @combinations.generate(test_base.default_test_combinations()) def 
testCsvDataset_withNoQuoteDelimAndUnquotedQuotes(self): record_defaults = [['']] * 3 inputs = [['1,2"3,4']] self._test_by_comparison( inputs, record_defaults=record_defaults, use_quote_delim=False) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_mixedTypes(self): record_defaults = [ constant_op.constant([], dtype=dtypes.int32), @@ -193,30 +205,35 @@ class CsvDatasetTest(test_base.DatasetTestBase): inputs = [['1,2.1,3.2,4.3', '5,6.5,7.6,8.7']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withUseQuoteDelimFalse(self): record_defaults = [['']] * 4 inputs = [['1,2,"3,4"', '"5,6",7,8']] self._test_by_comparison( inputs, record_defaults=record_defaults, use_quote_delim=False) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withFieldDelim(self): record_defaults = [[0]] * 4 inputs = [['1:2:3:4', '5:6:7:8']] self._test_by_comparison( inputs, record_defaults=record_defaults, field_delim=':') + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNaValue(self): record_defaults = [[0]] * 4 inputs = [['1,NA,3,4', 'NA,6,7,8']] self._test_by_comparison( inputs, record_defaults=record_defaults, na_value='NA') + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withSelectCols(self): record_defaults = [['']] * 2 inputs = [['1,2,3,4', '"5","6","7","8"']] self._test_by_comparison( inputs, record_defaults=record_defaults, select_cols=[1, 2]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withSelectColsTooHigh(self): record_defaults = [[0]] * 2 inputs = [['1,2,3,4', '5,6,7,8']] @@ -226,23 +243,27 @@ class CsvDatasetTest(test_base.DatasetTestBase): record_defaults=record_defaults, select_cols=[3, 4]) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withOneCol(self): record_defaults = [['NA']] inputs = [['0', '', '2']] self._test_dataset( inputs, [['0'], ['NA'], ['2']], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withMultipleFiles(self): record_defaults = [[0]] * 4 inputs = [['1,2,3,4', '5,6,7,8'], ['5,6,7,8']] self._test_by_comparison(inputs, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withLeadingAndTrailingSpaces(self): record_defaults = [[0.0]] * 4 inputs = [['0, 1, 2, 3']] expected = [[0.0, 1.0, 2.0, 3.0]] self._test_dataset(inputs, expected, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithMissingDefault(self): record_defaults = [[]] * 2 inputs = [['0,']] @@ -251,6 +272,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected_err_re='Field 1 is required but missing in record!', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithFewerDefaultsThanFields(self): record_defaults = [[0.0]] * 2 inputs = [['0,1,2,3']] @@ -259,6 +281,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected_err_re='Expect 2 fields but have more in record', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithMoreDefaultsThanFields(self): record_defaults = [[0.0]] * 5 inputs = [['0,1,2,3']] @@ -267,6 +290,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): 
expected_err_re='Expect 5 fields but have 4 in record', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withHeader(self): record_defaults = [[0]] * 2 inputs = [['col1,col2', '1,2']] @@ -278,6 +302,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withHeaderAndNoRecords(self): record_defaults = [[0]] * 2 inputs = [['col1,col2']] @@ -289,6 +314,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithHeaderEmptyFile(self): record_defaults = [[0]] * 2 inputs = [[]] @@ -300,12 +326,14 @@ class CsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withEmptyFile(self): record_defaults = [['']] * 2 inputs = [['']] # Empty file self._test_dataset( inputs, expected_output=[], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithEmptyRecord(self): record_defaults = [['']] * 2 inputs = [['', '1,2']] # First record is empty @@ -314,6 +342,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected_err_re='Expect 2 fields but have 1 in record', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withChainedOps(self): # Testing that one dataset can create multiple iterators fine. # `repeat` creates multiple iterators from the same C++ Dataset. @@ -325,6 +354,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): ds_actual.repeat(5).prefetch(1), ds_expected.repeat(5).prefetch(1)) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withTypeDefaults(self): # Testing using dtypes as record_defaults for required fields record_defaults = [dtypes.float32, [0.0]] @@ -335,6 +365,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): record_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCsvDataset_fieldOrder(self): data = [[ '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19', @@ -352,6 +383,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): ## The following tests exercise parsing logic for quoted fields + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withQuoted(self): record_defaults = [['']] * 4 inputs = [['"a","b","c :)","d"', '"e","f","g :(","h"']] @@ -363,6 +395,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset( inputs, [['0'], ['1'], ['2']], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNewLine(self): # In this case, we expect it to behave differently from # TextLineDataset->map(decode_csv) since that flow has bugs @@ -371,6 +404,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']] self._test_dataset(inputs, expected, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withNewLineInUnselectedCol(self): record_defaults = [['']] inputs = [['1,"2\n3",4', '5,6,7']] @@ -380,6 +414,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): record_defaults=record_defaults, select_cols=[0]) + @combinations.generate(test_base.default_test_combinations()) def 
testCsvDataset_withMultipleNewLines(self): # In this case, we expect it to behave differently from # TextLineDataset->map(decode_csv) since that flow has bugs @@ -388,6 +423,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']] self._test_dataset(inputs, expected, record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_errorWithTerminateMidRecord(self): record_defaults = [['']] * 4 inputs = [['a,b,c,"a']] @@ -397,6 +433,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): 'Reached end of file without closing quoted field in record', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withEscapedQuotes(self): record_defaults = [['']] * 4 inputs = [['1.0,2.1,"she said: ""hello""",4.3', '5.4,6.5,goodbye,8.7']] @@ -406,6 +443,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): ## Testing that parsing works with all buffer sizes, quoted/unquoted fields, ## and different types of line breaks + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withInvalidBufferSize(self): record_defaults = [['']] * 4 inputs = [['a,b,c,d']] @@ -432,6 +470,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): record_defaults=record_defaults, buffer_size=i) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withLF(self): record_defaults = [['NA']] * 3 inputs = [['abc,def,ghi', '0,1,2', ',,']] @@ -439,6 +478,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\n', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCR(self): # Test that when the line separator is '\r', parsing works with all buffer # sizes @@ -448,6 +488,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCRLF(self): # Test that when the line separator is '\r\n', parsing works with all buffer # sizes @@ -457,6 +498,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r\n', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withBufferSizeAndQuoted(self): record_defaults = [['NA']] * 3 inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] @@ -465,6 +507,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\n', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCRAndQuoted(self): # Test that when the line separator is '\r', parsing works with all buffer # sizes @@ -475,6 +518,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withCRLFAndQuoted(self): # Test that when the line separator is '\r\n', parsing works with all buffer # sizes @@ -485,6 +529,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): self._test_dataset_on_buffer_sizes( inputs, expected, linebreak='\r\n', record_defaults=record_defaults) + 
@combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withGzipCompressionType(self): record_defaults = [['NA']] * 3 inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] @@ -497,6 +542,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): compression_type='GZIP', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withZlibCompressionType(self): record_defaults = [['NA']] * 3 inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] @@ -509,6 +555,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): compression_type='ZLIB', record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_withScalarDefaults(self): record_defaults = [constant_op.constant(0, dtype=dtypes.int64)] * 4 inputs = [[',,,', '1,1,1,', ',2,2,2']] @@ -516,6 +563,7 @@ class CsvDatasetTest(test_base.DatasetTestBase): inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]], record_defaults=record_defaults) + @combinations.generate(test_base.default_test_combinations()) def testCsvDataset_with2DDefaults(self): record_defaults = [constant_op.constant([[0]], dtype=dtypes.int64)] * 4 inputs = [[',,,', '1,1,1,', ',2,2,2']] diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py index cca7ae073ee..5dd1bb0532c 100644 --- a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py @@ -17,20 +17,21 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class DenseToSparseBatchTest(test_base.DatasetTestBase): +class DenseToSparseBatchTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDataset(self): components = np.random.randint(12, size=(100,)).astype(np.int32) dataset = dataset_ops.Dataset.from_tensor_slices( @@ -53,6 +54,7 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDatasetWithUnknownShape(self): components = np.random.randint(5, size=(40,)).astype(np.int32) dataset = dataset_ops.Dataset.from_tensor_slices( @@ -80,12 +82,14 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDatasetWithInvalidShape(self): input_tensor = array_ops.constant([[1]]) with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"): dataset_ops.Dataset.from_tensors(input_tensor).apply( batching.dense_to_sparse_batch(4, [-2])) + 
@combinations.generate(test_base.default_test_combinations()) def testDenseToSparseBatchDatasetShapeErrors(self): def dataset_fn(input_tensor): diff --git a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py index 4a8c7d1ccc6..fc18afaa842 100644 --- a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py @@ -17,22 +17,24 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import interleave_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import random_seed -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class DirectedInterleaveDatasetTest(test_base.DatasetTestBase): +class DirectedInterleaveDatasetTest(test_base.DatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testBasic(self): selector_dataset = dataset_ops.Dataset.range(10).repeat(100) input_datasets = [ @@ -76,6 +78,7 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase): return freqs + @combinations.generate(test_base.default_test_combinations()) def testSampleFromDatasets(self): random_seed.set_random_seed(1619) num_samples = 5000 @@ -95,6 +98,7 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase): freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples) self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2) + @combinations.generate(test_base.default_test_combinations()) def testSelectFromDatasets(self): words = [b"foo", b"bar", b"baz"] datasets = [dataset_ops.Dataset.from_tensors(w).repeat() for w in words] @@ -107,6 +111,7 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testErrors(self): with self.assertRaisesRegexp(ValueError, r"vector of length `len\(datasets\)`"): diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py index f65740c5651..59c2ef68d99 100644 --- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py @@ -23,25 +23,30 @@ from tensorflow.python.data.experimental.ops import get_single_element from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import function +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import variables from 
tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): - @parameterized.named_parameters( - ("Zero", 0, 1), - ("Five", 5, 1), - ("Ten", 10, 1), - ("Empty", 100, 1, errors.InvalidArgumentError, "Dataset was empty."), - ("MoreThanOne", 0, 2, errors.InvalidArgumentError, - "Dataset had more than one element."), - ) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine( + skip=[0, 5, 10], take=[1], error=[None], error_msg=[None]) + + combinations.combine( + skip=[100], + take=[1], + error=[errors.InvalidArgumentError], + error_msg=["Dataset was empty."]) + combinations.combine( + skip=[0], + take=[2], + error=[errors.InvalidArgumentError], + error_msg=["Dataset had more than one element."]))) def testGetSingleElement(self, skip, take, error=None, error_msg=None): def make_sparse(x): @@ -62,6 +67,7 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaisesRegexp(error, error_msg): self.evaluate(get_single_element.get_single_element(dataset)) + @combinations.generate(test_base.default_test_combinations()) def testWindow(self): """Test that `get_single_element()` can consume a nested dataset.""" def flat_map_func(ds): @@ -73,6 +79,7 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces( dataset, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]) + @combinations.generate(test_base.default_test_combinations()) def testSideEffect(self): counter_var = variables.Variable(0) @@ -92,6 +99,7 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertEqual(self.evaluate(fn()), b"hello") self.assertEqual(self.evaluate(counter_var), 1) + @combinations.generate(test_base.default_test_combinations()) def testAutomaticControlDependencies(self): counter_var = variables.Variable(1) diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py index 0e9042b2ef8..bf823143d57 100644 --- a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py @@ -17,25 +17,26 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class GroupByReducerTest(test_base.DatasetTestBase): +class GroupByReducerTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testSum(self): reducer = grouping.Reducer( init_func=lambda _: np.int64(0), @@ -49,6 +50,7 @@ class 
GroupByReducerTest(test_base.DatasetTestBase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[(i - 1) * i, i * i]) + @combinations.generate(test_base.default_test_combinations()) def testAverage(self): def reduce_fn(x, y): @@ -68,6 +70,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[i - 1, i]) + @combinations.generate(test_base.default_test_combinations()) def testConcat(self): components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray) reducer = grouping.Reducer( @@ -84,6 +87,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[b"acegikmoqs"[:i], b"bdfhjlnprt"[:i]]) + @combinations.generate(test_base.default_test_combinations()) def testSparseSum(self): def _sparse(i): return sparse_tensor.SparseTensorValue( @@ -103,6 +107,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): expected_shapes=tensor_shape.TensorShape([]), expected_output=[(i - 1) * i, i * i]) + @combinations.generate(test_base.default_test_combinations()) def testChangingStateShape(self): def reduce_fn(x, _): @@ -130,6 +135,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testTypeMismatch(self): reducer = grouping.Reducer( init_func=lambda x: constant_op.constant(1, dtype=dtypes.int32), @@ -144,6 +150,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): grouping.group_by_reducer(lambda _: np.int64(0), reducer)) # TODO(b/78665031): Remove once non-scalar keys are supported. + @combinations.generate(test_base.default_test_combinations()) def testInvalidKeyShape(self): reducer = grouping.Reducer( init_func=lambda x: np.int64(0), @@ -157,6 +164,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): grouping.group_by_reducer(lambda _: np.int64((0, 0)), reducer)) # TODO(b/78665031): Remove once non-int64 keys are supported. 
+ @combinations.generate(test_base.default_test_combinations()) def testInvalidKeyType(self): reducer = grouping.Reducer( init_func=lambda x: np.int64(0), @@ -169,6 +177,7 @@ class GroupByReducerTest(test_base.DatasetTestBase): dataset.apply( grouping.group_by_reducer(lambda _: "wrong", reducer)) + @combinations.generate(test_base.default_test_combinations()) def testTuple(self): def init_fn(_): return np.array([], dtype=np.int64), np.int64(0) diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py index e529364e509..2495083cf63 100644 --- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py @@ -17,17 +17,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import string_ops @@ -37,8 +38,7 @@ from tensorflow.python.platform import test # NOTE(mrry): These tests are based on the tests in bucket_ops_test.py. # Currently, they use a constant batch size, though should be made to use a # different batch size per key. 
-@test_util.run_all_in_graph_and_eager_modes -class GroupByWindowTest(test_base.DatasetTestBase): +class GroupByWindowTest(test_base.DatasetTestBase, parameterized.TestCase): def _dynamicPad(self, bucket, window, window_size): # TODO(mrry): To match `tf.contrib.training.bucket()`, implement a @@ -51,6 +51,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): 32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape( [None]), tensor_shape.TensorShape([3]))))) + @combinations.generate(test_base.default_test_combinations()) def testSingleBucket(self): def _map_fn(v): @@ -80,6 +81,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual(expected_unk_int64, bucketed_values[1]) self.assertAllEqual(expected_vec3_str, bucketed_values[2]) + @combinations.generate(test_base.default_test_combinations()) def testEvenOddBuckets(self): def _map_fn(v): @@ -132,6 +134,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1]) self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2]) + @combinations.generate(test_base.default_test_combinations()) def testEvenOddBucketsFilterOutAllOdd(self): def _map_fn(v): @@ -173,6 +176,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual( np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"]) + @combinations.generate(test_base.default_test_combinations()) def testDynamicWindowSize(self): components = np.arange(100).astype(np.int64) @@ -202,6 +206,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertEqual(batches, 15) + @combinations.generate(test_base.default_test_combinations()) def testSimple(self): components = np.random.randint(100, size=(200,)).astype(np.int64) dataset = dataset_ops.Dataset.from_tensor_slices( @@ -222,6 +227,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertGreaterEqual(num_full_batches, 24) self.assertTrue(all(c == 4 for c in counts[:num_full_batches])) + @combinations.generate(test_base.default_test_combinations()) def testImmediateOutput(self): components = np.array( [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64) @@ -240,6 +246,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next())) self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next())) + @combinations.generate(test_base.default_test_combinations()) def testSmallGroups(self): components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64) dataset = dataset_ops.Dataset.from_tensor_slices(components).apply( @@ -252,6 +259,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): self.assertAllEqual([0, 0, 0], self.evaluate(get_next())) self.assertAllEqual([1], self.evaluate(get_next())) + @combinations.generate(test_base.default_test_combinations()) def testEmpty(self): dataset = dataset_ops.Dataset.range(4).apply( grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0)) @@ -262,6 +270,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): "Window size must be greater than zero, but got 0."): print(self.evaluate(get_next())) + @combinations.generate(test_base.default_test_combinations()) def testReduceFuncError(self): components = np.random.randint(100, size=(200,)).astype(np.int64) @@ -280,6 +289,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def 
testConsumeWindowDatasetMoreThanOnce(self): components = np.random.randint(50, size=(200,)).astype(np.int64) @@ -311,6 +321,7 @@ class GroupByWindowTest(test_base.DatasetTestBase): counts.append(tight_result.shape[0]) self.assertEqual(len(components), sum(counts)) + @combinations.generate(test_base.default_test_combinations()) def testShortCircuit(self): dataset = dataset_ops.Dataset.range(10) diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py index c37439f328b..5ed72767425 100644 --- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py @@ -19,14 +19,15 @@ from __future__ import print_function import os +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import error_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.lib.io import python_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import io_ops @@ -36,9 +37,9 @@ from tensorflow.python.util import compat _NUMPY_RANDOM_SEED = 42 -@test_util.run_all_in_graph_and_eager_modes -class IgnoreErrorsTest(test_base.DatasetTestBase): +class IgnoreErrorsTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) @@ -53,6 +54,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testParallelMapIgnoreError(self): components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32) @@ -67,6 +69,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testReadFileIgnoreError(self): def write_string_to_file(value, filename): @@ -102,6 +105,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testTFRecordDatasetIgnoreError(self): filenames = [] for i in range(5): @@ -126,6 +130,7 @@ class IgnoreErrorsTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testZipIgnoreError(self): a = dataset_ops.Dataset.from_tensor_slices([1., 2., 0., 4.]) b = a.map(lambda x: array_ops.check_numerics(1. 
/ x, "error")) diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py index 2ddff457bc4..980fd03b073 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py @@ -17,26 +17,29 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.ops import readers +from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers as core_readers from tensorflow.python.data.util import nest +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.ops import io_ops from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class MakeBatchedFeaturesDatasetTest( - reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase): + reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testRead(self): for batch_size in [1, 2]: for num_epochs in [1, 10]: @@ -85,6 +88,7 @@ class MakeBatchedFeaturesDatasetTest( with self.assertRaises(errors.OutOfRangeError): self._next_actual_batch() + @combinations.generate(test_base.default_test_combinations()) def testReadWithEquivalentDataset(self): features = { "file": parsing_ops.FixedLenFeature([], dtypes.int64), @@ -103,6 +107,7 @@ class MakeBatchedFeaturesDatasetTest( with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testReadWithFusedShuffleRepeatDataset(self): num_epochs = 5 total_records = num_epochs * self._num_records @@ -151,6 +156,7 @@ class MakeBatchedFeaturesDatasetTest( all_equal = all_equal and np.array_equal(batch1[i], batch2[i]) self.assertFalse(all_equal) + @combinations.generate(test_base.default_test_combinations()) def testParallelReadersAndParsers(self): num_epochs = 5 for batch_size in [1, 2]: @@ -186,6 +192,7 @@ class MakeBatchedFeaturesDatasetTest( with self.assertRaises(errors.OutOfRangeError): self._next_actual_batch() + @combinations.generate(test_base.default_test_combinations()) def testDropFinalBatch(self): for batch_size in [1, 2]: for num_epochs in [1, 10]: @@ -201,6 +208,7 @@ class MakeBatchedFeaturesDatasetTest( if isinstance(tensor, ops.Tensor): # Guard against SparseTensor. 
self.assertEqual(tensor.shape[0], batch_size) + @combinations.generate(test_base.default_test_combinations()) def testIndefiniteRepeatShapeInference(self): dataset = self.make_batch_feature( filenames=self.test_filenames[0], @@ -213,6 +221,7 @@ class MakeBatchedFeaturesDatasetTest( if issubclass(clazz, ops.Tensor): self.assertEqual(32, shape[0]) + @combinations.generate(test_base.default_test_combinations()) def testOldStyleReader(self): with self.assertRaisesRegexp( TypeError, r"The `reader` argument must return a `Dataset` object. " diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py index 16c323b3790..5f8382f43c4 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py @@ -21,21 +21,21 @@ import gzip import os import zlib +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import readers from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class MakeCsvDatasetTest(test_base.DatasetTestBase): +class MakeCsvDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): def _make_csv_dataset(self, filenames, batch_size, num_epochs=1, **kwargs): return readers.make_csv_dataset( @@ -126,6 +126,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): self._verify_output(dataset, batch_size, num_epochs, label_name, expected_output, expected_keys) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset(self): """Tests making a CSV dataset with keys and defaults provided.""" record_defaults = [ @@ -157,6 +158,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withBatchSizeAndEpochs(self): """Tests making a CSV dataset with keys and defaults provided.""" record_defaults = [ @@ -188,6 +190,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withCompressionType(self): """Tests `compression_type` argument.""" record_defaults = [ @@ -221,6 +224,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): compression_type=compression_type, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withCompressionTypeAndNoColumnNames(self): """Tests `compression_type` argument.""" record_defaults = [ @@ -269,6 +273,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): compression_type="ZLIB", ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withBadInputs(self): """Tests that exception is raised when input is malformed. 
""" @@ -304,6 +309,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): label_name="not_a_real_label", column_names=column_names) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNoLabel(self): """Tests making a CSV dataset with no label provided.""" record_defaults = [ @@ -333,6 +339,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNoHeader(self): """Tests that datasets can be created from CSV files with no header line. """ @@ -363,6 +370,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypes(self): """Tests that defaults can be a dtype instead of a Tensor for required vals. """ @@ -394,6 +402,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNoColNames(self): """Tests that datasets can be created when column names are not specified. @@ -427,6 +436,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): column_defaults=record_defaults, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypeInferenceMismatch(self): # Test that error is thrown when num fields doesn't match columns column_names = ["col%d" % i for i in range(5)] @@ -442,6 +452,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): batch_size=2, num_epochs=10) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypeInference(self): """Tests that datasets can be created when no defaults are specified. @@ -468,6 +479,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withTypeInferenceFallthrough(self): """Tests that datasets can be created when no defaults are specified. 
@@ -498,6 +510,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): header=True, ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withNAValuesAndFieldDelim(self): """Tests that datasets can be created from different delim and na_value.""" column_names = ["col%d" % i for i in range(5)] @@ -520,6 +533,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): field_delim=" ", ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withSelectCols(self): record_defaults = [ constant_op.constant([], dtypes.int32), @@ -588,6 +602,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): select_columns=[column_names[i] for i in select_cols], ) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withSelectColsError(self): record_defaults = [ constant_op.constant([], dtypes.int32), @@ -626,6 +641,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): label_name=None, select_columns=["invalid_col_name"]) + @combinations.generate(test_base.default_test_combinations()) def testMakeCSVDataset_withShuffle(self): record_defaults = [ constant_op.constant([], dtypes.int32), @@ -710,6 +726,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase): all_equal = all_equal and np.array_equal(batch1[i], batch2[i]) self.assertFalse(all_equal) + @combinations.generate(test_base.default_test_combinations()) def testIndefiniteRepeatShapeInference(self): column_names = ["col%d" % i for i in range(5)] inputs = [[",".join(x for x in column_names), "0,1,2,3,4", "5,6,7,8,9"], [ diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py index ec1760398fa..a67ccd92842 100644 --- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py @@ -17,19 +17,22 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base from tensorflow.python.data.experimental.ops import readers +from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import string_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class MakeTFRecordDatasetTest( - reader_dataset_ops_test_base.TFRecordDatasetTestBase): + reader_dataset_ops_test_base.TFRecordDatasetTestBase, + parameterized.TestCase): def _read_test(self, batch_size, num_epochs, file_index=None, num_parallel_reads=1, drop_final_batch=False, parser_fn=False): @@ -63,6 +66,7 @@ class MakeTFRecordDatasetTest( with self.assertRaises(errors.OutOfRangeError): self.evaluate(outputs()) + @combinations.generate(test_base.default_test_combinations()) def testRead(self): for batch_size in [1, 2]: for num_epochs in [1, 3]: @@ -78,6 +82,7 @@ class MakeTFRecordDatasetTest( # Basic test: read from both files, with parallel reads. 
self._read_test(batch_size, num_epochs, num_parallel_reads=8) + @combinations.generate(test_base.default_test_combinations()) def testDropFinalBatch(self): for batch_size in [1, 2, 10]: for num_epochs in [1, 3]: @@ -91,6 +96,7 @@ class MakeTFRecordDatasetTest( self._read_test(batch_size, num_epochs, num_parallel_reads=8, drop_final_batch=True) + @combinations.generate(test_base.default_test_combinations()) def testParserFn(self): for batch_size in [1, 2]: for num_epochs in [1, 3]: @@ -145,6 +151,7 @@ class MakeTFRecordDatasetTest( actual.extend(b) self.assertAllEqual(sorted(expected), sorted(actual)) + @combinations.generate(test_base.default_test_combinations()) def testShuffle(self): for batch_size in [1, 2]: for num_epochs in [1, 3]: @@ -156,6 +163,7 @@ class MakeTFRecordDatasetTest( self._shuffle_test(batch_size, num_epochs, num_parallel_reads, seed=21345) + @combinations.generate(test_base.default_test_combinations()) def testIndefiniteRepeatShapeInference(self): dataset = readers.make_tf_record_dataset( file_pattern=self.test_filenames, num_epochs=None, batch_size=32) diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py index a42ce40fb29..a2cc54d104e 100644 --- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py @@ -19,17 +19,19 @@ from __future__ import print_function import time +from absl.testing import parameterized + from tensorflow.python.client import session from tensorflow.python.data.experimental.ops import map_defun from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.eager import function +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_spec -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops from tensorflow.python.ops import data_flow_ops @@ -38,9 +40,11 @@ from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test -@test_util.run_v1_only("b/123903858: Add eager and V2 test coverage") -class MapDefunTest(test_base.DatasetTestBase): +# TODO(b/123903858): Add eager and V2 test coverage +class MapDefunTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testNoIntraOpLimit(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -55,6 +59,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = elems * 2 + 3 self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunSimple(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -67,6 +73,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = elems * 2 + 3 self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunMismatchedTypes(self): @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)]) @@ -79,6 +87,8 
@@ class MapDefunTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(r) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunReduceDim(self): # Tests where the output has a different rank from the input @@ -92,6 +102,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = constant_op.constant([1, 3, 5]) self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunMultipleOutputs(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -105,6 +117,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = [elems, elems * 2 + 3] self.assertAllEqual(self.evaluate(r), self.evaluate(expected)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunShapeInference(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -116,6 +130,8 @@ class MapDefunTest(test_base.DatasetTestBase): result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)])[0] self.assertEqual(result.get_shape(), (3, 2)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunPartialShapeInference(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -126,6 +142,8 @@ class MapDefunTest(test_base.DatasetTestBase): result = map_defun.map_defun(fn, [elems], [dtypes.int32], [(2,)]) self.assertEqual(result[0].get_shape().as_list(), [None, 2]) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunRaisesErrorOnRuntimeShapeMismatch(self): @function.defun(input_signature=[ @@ -145,6 +163,8 @@ class MapDefunTest(test_base.DatasetTestBase): "All inputs must have the same dimension 0."): sess.run(result, feed_dict={elems1: [1, 2, 3, 4, 5], elems2: [1, 2, 3]}) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunRaisesDefunError(self): @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.int32)]) @@ -157,6 +177,8 @@ class MapDefunTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(result) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunCancelledCorrectly(self): @function.defun(input_signature=[tensor_spec.TensorSpec([5], dtypes.int64)]) @@ -173,6 +195,8 @@ class MapDefunTest(test_base.DatasetTestBase): r"indices = 10 is not in \[0, 5\)"): self.evaluate(map_defun_op) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithUnspecifiedOutputShape(self): @function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -190,6 +214,8 @@ class MapDefunTest(test_base.DatasetTestBase): self.assertAllEqual(self.evaluate(r[1]), self.evaluate(expected + 1)) self.assertAllEqual(self.evaluate(r[2]), self.evaluate(expected + 2)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithDifferentOutputShapeEachRun(self): @function.defun( @@ -204,6 +230,8 @@ class MapDefunTest(test_base.DatasetTestBase): self.assertAllEqual( sess.run(r, feed_dict={elems: [[0], [1]]}), [[3], [5]]) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithWrongOutputShape(self): 
@function.defun(input_signature=[tensor_spec.TensorSpec([2], dtypes.int32)]) @@ -216,6 +244,8 @@ class MapDefunTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): self.evaluate(r) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithInvalidInput(self): @function.defun( @@ -233,6 +263,8 @@ class MapDefunTest(test_base.DatasetTestBase): with self.assertRaises(errors.InvalidArgumentError): sess.run(r, feed_dict={p: 0}) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithParentCancellation(self): # Checks that a cancellation of the parent graph is threaded through to # MapDefunOp correctly. @@ -254,6 +286,8 @@ class MapDefunTest(test_base.DatasetTestBase): sess.close() thread.join() + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithCapturedInputs(self): c = constant_op.constant(2) @@ -266,6 +300,8 @@ class MapDefunTest(test_base.DatasetTestBase): expected = x + c self.assertAllEqual(self.evaluate(expected), self.evaluate(map_defun_op)) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithVariantTensor(self): @function.defun( @@ -288,6 +324,8 @@ class MapDefunTest(test_base.DatasetTestBase): actual = self.evaluate(deserialized) self.assertValuesEqual(expected, actual) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithVariantTensorAsCaptured(self): st = sparse_tensor.SparseTensor( @@ -309,6 +347,8 @@ class MapDefunTest(test_base.DatasetTestBase): actual = self.evaluate(deserialized) self.assertValuesEqual(expected, actual) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testMapDefunWithStrTensor(self): @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)]) diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py index 811a58262ef..d7944042c6e 100644 --- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py @@ -28,14 +28,13 @@ from tensorflow.python.data.experimental.ops import threadpool from tensorflow.python.data.experimental.ops import unique from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import script_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class OverrideThreadpoolTest(test_base.DatasetTestBase, parameterized.TestCase): @@ -70,17 +69,13 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase, # perform work. 
self.assertLessEqual(len(thread_ids), num_threads) - @parameterized.named_parameters( - ("1", 1, None), - ("2", 2, None), - ("3", 4, None), - ("4", 8, None), - ("5", 16, None), - ("6", 4, -1), - ("7", 4, 0), - ("8", 4, 1), - ("9", 4, 4), - ) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine( + num_threads=[1, 2, 4, 8, 16], max_intra_op_parallelism=[None]) + + combinations.combine( + num_threads=[4], max_intra_op_parallelism=[-1, 0, 4]))) def testNumThreadsDeprecated(self, num_threads, max_intra_op_parallelism): def override_threadpool_fn(dataset): @@ -93,20 +88,17 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase, self._testNumThreadsHelper(num_threads, override_threadpool_fn) - @parameterized.named_parameters( - ("1", 1, None), - ("2", 2, None), - ("3", 4, None), - ("4", 8, None), - ("5", 16, None), - ("6", None, 0), - ("7", None, 1), - ("8", None, 4), - ("9", 4, 0), - ("10", 4, 1), - ("11", 4, 4), - ("12", None, None), - ) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine( + num_threads=[1, 2, 4, 8, 16], max_intra_op_parallelism=[None]) + + combinations.combine( + num_threads=[None], max_intra_op_parallelism=[0, 1, 4]) + + combinations.combine( + num_threads=[4], max_intra_op_parallelism=[0, 1, 4]) + + combinations.combine( + num_threads=[None], max_intra_op_parallelism=[None]))) def testNumThreads(self, num_threads, max_intra_op_parallelism): def override_threadpool_fn(dataset): @@ -121,6 +113,7 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase, self._testNumThreadsHelper(num_threads, override_threadpool_fn) + @combinations.generate(test_base.default_test_combinations()) def testMaxIntraOpParallelismAsGraphDefInternal(self): dataset = dataset_ops.Dataset.from_tensors(0) dataset = dataset_ops._MaxIntraOpParallelismDataset(dataset, 1) diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py index 1fe5655ec02..14d3c9d6d7f 100644 --- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py @@ -22,24 +22,25 @@ import math import threading import time +from absl.testing import parameterized import numpy as np from six.moves import zip_longest from tensorflow.python.data.experimental.ops import interleave_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import test_util from tensorflow.python.ops import math_ops from tensorflow.python.ops import script_ops from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class ParallelInterleaveTest(test_base.DatasetTestBase): +# TODO(feihugis): refactor this test to be parameterized. 
+class ParallelInterleaveTest(test_base.DatasetTestBase, parameterized.TestCase): def setUp(self): @@ -116,6 +117,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): num_open -= 1 break + @combinations.generate(test_base.default_test_combinations()) def testPythonImplementation(self): input_lists = [[4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6], [4, 4, 4, 4], [5, 5, 5, 5, 5], [6, 6, 6, 6, 6, 6]] @@ -136,6 +138,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): self.assertEqual(expected, produced, "Values differ at %s. %s != %s" % (index, expected, produced)) + @combinations.generate(test_base.default_test_combinations()) def testPythonImplementationBlockLength(self): input_lists = [[4] * 4, [5] * 5, [6] * 6] * 2 expected_elements = [ @@ -147,6 +150,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): self.assertEqual(expected, produced, "Values differ at %s. %s != %s" % (index, expected, produced)) + @combinations.generate(test_base.default_test_combinations()) def testPythonImplementationEmptyLists(self): input_lists = [[4, 4, 4, 4], [], [6, 6, 6, 6, 6, 6], [4, 4, 4, 4], [], [6, 6, 6, 6, 6, 6]] @@ -189,18 +193,23 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testSingleThreaded(self): self._testSingleThreaded() + @combinations.generate(test_base.default_test_combinations()) def testSingleThreadedSloppy(self): self._testSingleThreaded(sloppy=True) + @combinations.generate(test_base.default_test_combinations()) def testSingleThreadedPrefetch1Itr(self): self._testSingleThreaded(prefetch_input_elements=1) + @combinations.generate(test_base.default_test_combinations()) def testSingleThreadedPrefetch1ItrSloppy(self): self._testSingleThreaded(prefetch_input_elements=1, sloppy=True) + @combinations.generate(test_base.default_test_combinations()) def testSingleThreadedRagged(self): # Tests a sequence with wildly different elements per iterator. 
self.skipTest("b/131722904") @@ -259,9 +268,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContention(self): self._testTwoThreadsNoContention() + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionSloppy(self): self._testTwoThreadsNoContention(sloppy=True) @@ -306,9 +317,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRaces(self): self._testTwoThreadsNoContentionWithRaces() + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRacesSloppy(self): self._testTwoThreadsNoContentionWithRaces(sloppy=True) @@ -343,9 +356,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionBlockLength(self): self._testTwoThreadsNoContentionBlockLength() + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionBlockLengthSloppy(self): self._testTwoThreadsNoContentionBlockLength(sloppy=True) @@ -391,9 +406,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRacesAndBlocking(self): self._testTwoThreadsNoContentionWithRacesAndBlocking() + @combinations.generate(test_base.default_test_combinations()) def testTwoThreadsNoContentionWithRacesAndBlockingSloppy(self): self._testTwoThreadsNoContentionWithRacesAndBlocking(sloppy=True) @@ -411,9 +428,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testEmptyInput(self): self._testEmptyInput() + @combinations.generate(test_base.default_test_combinations()) def testEmptyInputSloppy(self): self._testEmptyInput(sloppy=True) @@ -431,9 +450,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testNonEmptyInputIntoEmptyOutputs(self): self._testNonEmptyInputIntoEmptyOutputs() + @combinations.generate(test_base.default_test_combinations()) def testNonEmptyInputIntoEmptyOutputsSloppy(self): self._testNonEmptyInputIntoEmptyOutputs(sloppy=True) @@ -469,12 +490,15 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): "At index %s: %s expected, got: %s" % (i, expected_element, actual_element)) + @combinations.generate(test_base.default_test_combinations()) def testPartiallyEmptyOutputs(self): self._testPartiallyEmptyOutputs() + @combinations.generate(test_base.default_test_combinations()) def testPartiallyEmptyOutputsSloppy(self): self._testPartiallyEmptyOutputs(sloppy=True, prefetch_input_elements=0) + @combinations.generate(test_base.default_test_combinations()) def testDelayedOutputSloppy(self): # Explicitly control the sequence of events to ensure we correctly avoid # head-of-line blocking. 
@@ -500,6 +524,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testBlockLengthWithContentionSloppy(self): self.skipTest("b/131722904") self._clear_coordination_events() @@ -557,9 +582,11 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): self.read_coordination_events[i].acquire() self.write_coordination_events[i].set() + @combinations.generate(test_base.default_test_combinations()) def testEarlyExit(self): self._testEarlyExit() + @combinations.generate(test_base.default_test_combinations()) def testEarlyExitSloppy(self): self._testEarlyExit(sloppy=True) @@ -584,12 +611,15 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2) self.assertItemsEqual(output_values, expected_values) + @combinations.generate(test_base.default_test_combinations()) def testTooManyReaders(self): self._testTooManyReaders() + @combinations.generate(test_base.default_test_combinations()) def testTooManyReadersSloppy(self): self._testTooManyReaders(sloppy=True) + @combinations.generate(test_base.default_test_combinations()) def testSparse(self): def _map_fn(i): return sparse_tensor.SparseTensor( @@ -610,6 +640,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testErrorsInOutputFn(self): self.skipTest("b/131722904") self._clear_coordination_events() @@ -642,6 +673,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testErrorsInInputFn(self): def map_py_fn(x): @@ -687,6 +719,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testErrorsInInterleaveFn(self): def map_py_fn(x): @@ -730,6 +763,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testShutdownRace(self): dataset = dataset_ops.Dataset.range(20) map_fn = lambda x: dataset_ops.Dataset.range(20 * x, 20 * (x + 1)) diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py index 794f72365df..58cba64617d 100644 --- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py @@ -20,6 +20,7 @@ from __future__ import print_function import copy +from absl.testing import parameterized import numpy as np from tensorflow.core.example import example_pb2 @@ -28,11 +29,11 @@ from tensorflow.python.data.experimental.ops import parsing_ops as contrib_parsi from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.eager import context +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors_impl from tensorflow.python.framework import ops from 
tensorflow.python.framework import sparse_tensor -from tensorflow.python.framework import test_util from tensorflow.python.ops import parsing_ops from tensorflow.python.ops.ragged import ragged_factory_ops from tensorflow.python.platform import test @@ -50,8 +51,8 @@ feature_lists = lambda d: feature_pb2.FeatureLists(feature_list=d) sequence_example = example_pb2.SequenceExample -@test_util.run_all_in_graph_and_eager_modes -class ParseExampleDatasetTest(test_base.DatasetTestBase): +class ParseExampleDatasetTest(test_base.DatasetTestBase, + parameterized.TestCase): def _compare_output_to_expected(self, dict_tensors, expected_tensors): self.assertEqual(set(dict_tensors.keys()), set(expected_tensors.keys())) @@ -107,6 +108,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): self.assertEqual( dataset_ops.get_legacy_output_shapes(dataset)[k].as_list()[1], None) + @combinations.generate(test_base.default_test_combinations()) def testEmptySerializedWithAllDefaults(self): sparse_name = "st_a" a_name = "a" @@ -145,7 +147,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) - @test_util.run_deprecated_v1 + @combinations.generate(test_base.graph_only_combinations()) def testEmptySerializedWithoutDefaultsShouldFail(self): input_features = { "st_a": @@ -179,7 +181,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_err=(errors_impl.InvalidArgumentError, "Feature: c \\(data type: float\\) is required")) - @test_util.run_deprecated_v1 + @combinations.generate(test_base.graph_only_combinations()) def testDenseNotMatchingShapeShouldFail(self): original = [ example(features=features({ @@ -197,6 +199,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_err=(errors_impl.InvalidArgumentError, "Key: a, Index: 1. 
Number of float values")) + @combinations.generate(test_base.default_test_combinations()) def testDenseDefaultNoShapeShouldFail(self): original = [example(features=features({"a": float_feature([1, 1, 3]),})),] @@ -207,6 +210,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): {"a": parsing_ops.FixedLenFeature(None, dtypes.float32)}, expected_err=(ValueError, "Missing shape for feature a")) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingSparse(self): original = [ example(features=features({ @@ -248,6 +252,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingSparseFeature(self): original = [ example(features=features({ @@ -284,6 +289,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingSparseFeatureReuse(self): original = [ example(features=features({ @@ -325,6 +331,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContaining3DSparseFeature(self): original = [ example(features=features({ @@ -370,6 +377,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDense(self): aname = "a" bname = "b*has+a:tricky_name" @@ -407,6 +415,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): # This test is identical as the previous one except # for the creation of 'serialized'. 
+ @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDenseWithConcat(self): aname = "a" bname = "b*has+a:tricky_name" @@ -452,6 +461,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDenseScalar(self): original = [ example(features=features({ @@ -476,6 +486,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingDenseWithDefaults(self): original = [ example(features=features({ @@ -514,6 +525,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedSparseAndSparseFeatureAndDenseWithNoDefault(self): expected_st_a = sparse_tensor.SparseTensorValue( # indices, values, shape np.empty((0, 2), dtype=np.int64), # indices @@ -569,6 +581,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testerializedContainingSparseAndSparseFeatureWithReuse(self): expected_idx = sparse_tensor.SparseTensorValue( # indices, values, shape np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64), @@ -667,11 +680,13 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingVarLenDenseLargerBatch(self): np.random.seed(3456) for batch_size in (1, 10, 20, 100, 256): self._testSerializedContainingVarLenDenseLargerBatch(batch_size) + @combinations.generate(test_base.default_test_combinations()) def testSerializedShapeMismatch(self): aname = "a" bname = "b" @@ -724,7 +739,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_err=(ValueError, "Cannot reshape a tensor with 0 elements to shape")) - @test_util.run_deprecated_v1 + @combinations.generate(test_base.graph_only_combinations()) def testSerializedContainingVarLenDense(self): aname = "a" bname = "b" @@ -877,6 +892,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): "Unsupported: FixedLenSequenceFeature requires " "allow_missing to be True.")) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingRaggedFeatureWithNoPartitions(self): original = [ example( @@ -922,6 +938,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingRaggedFeatureWithOnePartition(self): original = [ example( @@ -1040,6 +1057,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase): expected_values=expected_output, create_iterator_twice=True) + @combinations.generate(test_base.default_test_combinations()) def testSerializedContainingRaggedFeatureWithMultiplePartitions(self): original = [ # rt shape: [(batch), 2, None, None] diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py index f51da6e8b66..8ac4e239881 100644 --- 
a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py @@ -17,11 +17,14 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.experimental.ops import prefetching_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import structure +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops @@ -31,9 +34,9 @@ from tensorflow.python.platform import test # TODO(b/117581999): add eager coverage when supported. -class PrefetchToDeviceTest(test_base.DatasetTestBase): +class PrefetchToDeviceTest(test_base.DatasetTestBase, parameterized.TestCase): - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -57,7 +60,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToSameDevice(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -82,7 +85,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchDictToDevice(self): host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x}) device_dataset = host_dataset.apply( @@ -106,7 +109,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchSparseTensorsToDevice(self): def make_tensor(i): return sparse_tensor.SparseTensorValue( @@ -136,7 +139,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToDeviceGpu(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") @@ -156,7 +159,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToDeviceWithReInit(self): host_dataset = dataset_ops.Dataset.range(10) device_dataset = host_dataset.apply( @@ -184,7 +187,7 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element) - @test_util.deprecated_graph_mode_only + @combinations.generate(test_base.graph_only_combinations()) def testPrefetchToDeviceGpuWithReInit(self): if not test_util.is_gpu_available(): self.skipTest("No GPU available") diff --git 
a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py index abc9eb5f0ad..ff1f1680a76 100644 --- a/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_with_slack_test.py @@ -24,16 +24,17 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import multi_device_iterator_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors from tensorflow.python.framework import ops -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): - @test_util.run_v1_only("b/121264236") + # TODO(b/121264236) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testPrefetchWithSlackOption(self): """Determines slack_period based on num devices attached to iterator.""" dataset = dataset_ops.Dataset.range(10) @@ -60,6 +61,7 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): self.evaluate(elem_on_1) self.evaluate(elem_on_2) + @combinations.generate(test_base.default_test_combinations()) def testPrefetchWithSlackOptionWithoutIterator(self): """Defaults to slack period of 1 without iterator.""" dataset = dataset_ops.Dataset.range(10) @@ -72,6 +74,7 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): dataset.options()._graph_rewrite_configs()) self.assertDatasetProduces(dataset, range(10)) + @combinations.generate(test_base.default_test_combinations()) def testWithPassthroughDataset(self): """Should still work with a passthrough dataset after prefetch().""" dataset = dataset_ops.Dataset.range(10) @@ -82,6 +85,7 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): dataset = dataset.with_options(options) self.assertDatasetProduces(dataset, range(1, 11)) + @combinations.generate(test_base.default_test_combinations()) def testErrorWithoutPrefetch(self): """The rewrite fails if there is no prefetch() in the pipeline.""" dataset = dataset_ops.Dataset.range(10) @@ -92,6 +96,7 @@ class PrefetchWithSlackTest(test_base.DatasetTestBase, parameterized.TestCase): get_next = self.getNext(dataset) self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testErrorWithInvalidDataset(self): """With a nested dataset op after prefetch, the rewrite should fail.""" dataset = dataset_ops.Dataset.range(10) diff --git a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py index 32bcdbe183b..30496658529 100644 --- a/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/rebatch_dataset_test.py @@ -32,8 +32,8 @@ from tensorflow.python.data.experimental.ops import scan_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.util import nest +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes -from tensorflow.python.framework import test_util from 
tensorflow.python.lib.io import python_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -47,13 +47,11 @@ def _flat_shapes(dataset): return nest.flatten(dataset_ops.get_legacy_output_shapes(dataset)) -@test_util.run_all_in_graph_and_eager_modes class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): - drop_remainder_cases = [("WithDropRemainder", True), - ("WithoutDropRemainder", False)] - - @parameterized.named_parameters(drop_remainder_cases) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(drop_remainder=[True, False]))) def testBasic(self, drop_remainder): dataset = dataset_ops.Dataset.range(1024).batch( 32, drop_remainder=drop_remainder) @@ -64,13 +62,16 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[k for k in range(i, i + 8)] for i in range(0, 1024, 8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testScalarInputError(self): dataset = dataset_ops.Dataset.range(1024) distribute._RebatchDataset(dataset.batch(4), num_replicas=4) with self.assertRaisesRegexp(ValueError, "at least one dimension"): distribute._RebatchDataset(dataset, num_replicas=4) - @parameterized.named_parameters(drop_remainder_cases) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(drop_remainder=[True, False]))) def testBatchNotDivisibleByNumReplicas(self, drop_remainder): dataset = dataset_ops.Dataset.range(1024).batch( 32, drop_remainder=drop_remainder) @@ -89,6 +90,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): i += 4 self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testBatchSizeNotDivisibleByNumReplicas2(self): dataset = dataset_ops.Dataset.range(32).batch(16, drop_remainder=True) rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=5) @@ -102,6 +104,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output.extend([[]]) # Last replica gets an empty batch self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testTupleOutput(self): dataset = dataset_ops.Dataset.range(1024).map(lambda x: (x, x)).batch(32) rebatched_dataset = distribute._RebatchDataset(dataset, num_replicas=4) @@ -110,6 +113,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 1024, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testNestedDictionaryOutput(self): dataset = dataset_ops.Dataset.range(1024).map( lambda x: {"a": x, "b": {"c": x}}).batch(32) @@ -119,7 +123,9 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 1024, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) - @parameterized.named_parameters(drop_remainder_cases) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(drop_remainder=[True, False]))) def testFinalPartialBatch(self, drop_remainder): dataset = dataset_ops.Dataset.range(1032).batch( 32, drop_remainder=drop_remainder) @@ -136,7 +142,9 @@ class 
RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): [[k for k in range(i, i + 2)] for i in range(1024, 1032, 2)]) self.assertDatasetProduces(rebatched_dataset, expected_output) - @parameterized.named_parameters(drop_remainder_cases) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(drop_remainder=[True, False]))) def testFinalPartialBatchAfterRebatch(self, drop_remainder): dataset = dataset_ops.Dataset.range(34).batch( 32, drop_remainder=drop_remainder) @@ -150,6 +158,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output += [[32], [33], [], []] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testMultipleBatches(self): dataset = dataset_ops.Dataset.range(128).batch(4).batch(8) self.assertEqual([[None, None]], @@ -170,6 +179,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 128, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testMapAndBatch(self): dataset = dataset_ops.Dataset.range(1024).apply( batching.map_and_batch(math_ops.square, 32)) @@ -180,6 +190,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 1024, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testMapAndBatchWithCapturedInput(self): captured_t = variables.Variable(42) dataset = dataset_ops.Dataset.range(1024).apply( @@ -193,6 +204,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): self.assertDatasetProduces( rebatched_dataset, expected_output, requires_initialization=True) + @combinations.generate(test_base.default_test_combinations()) def testPaddedBatch(self): dataset = dataset_ops.Dataset.range(128).batch( 4, drop_remainder=True).padded_batch( @@ -213,6 +225,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 128, 8)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testConcatenate(self): dataset1 = dataset_ops.Dataset.range(64).batch(8) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -224,6 +237,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): [[i, i + 1] for i in range(0, 32, 2)]) self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testConcatenateDifferentShapes(self): dataset1 = dataset_ops.Dataset.range(64).batch(16) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -235,6 +249,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): [[i, i + 1] for i in range(0, 32, 2)]) self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testZip(self): dataset1 = dataset_ops.Dataset.range(64).batch(8) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -245,6 +260,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [([i, i + 1], [i, i + 1]) for i in range(0, 32, 2)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testZipDifferentShapes(self): dataset1 = 
dataset_ops.Dataset.range(64).batch(16) dataset2 = dataset_ops.Dataset.range(32).batch(8) @@ -256,6 +272,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 32, 2)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testFlatMapBatching(self): dataset = dataset_ops.Dataset.range(2).flat_map( lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda @@ -274,6 +291,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for i in range(0, 32, 8)] # generates 4 elements self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testInterleaveBatching(self): dataset = dataset_ops.Dataset.range(2).interleave( lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda @@ -290,6 +308,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output += expected_output self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testParallelInterleaveBatching(self): dataset = dataset_ops.Dataset.range(2).interleave( lambda _: dataset_ops.Dataset.range(32).batch( # pylint: disable=g-long-lambda @@ -307,6 +326,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output += expected_output self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowStaticBatch(self): dataset = dataset_ops.Dataset.from_tensor_slices( [[array_ops.constant(i, dtype=dtypes.int64)] * 3 for i in range(40)]) @@ -326,6 +346,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): for k in range(2)] self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowDynamicBatch(self): # {0, 1, 0, 1, ...} dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2) @@ -350,6 +371,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[value] * batch_size for batch_size, value in pairs] self.assertDatasetProduces(dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowDynamicBatchWithPartialBatch(self): # {0, 1, 0, 1, ...} dataset = dataset_ops.Dataset.range(40).map(lambda x: x % 2) @@ -371,6 +393,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[value] * batch_size for batch_size, value in pairs] self.assertDatasetProduces(dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testGroupByWindowDynamicBatchWithPartialBatchWithDropRemainder(self): # This test exercises nested batch functionality, dynamic batch size # and drop_remainder=True together. 
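For reference, the pattern these hunks apply across the tf.data kernel tests looks roughly like the sketch below: `@combinations.generate(...)` built from `test_base.default_test_combinations()` replaces the legacy `@test_util` run-mode decorators and `@parameterized.named_parameters`. The class name `ExampleMigratedTest` and its toy pipeline are invented for illustration and do not appear in any patch in this series.

```python
# Minimal sketch of the migrated test style (illustrative only, not part of
# any patch here). Assumes TensorFlow's internal test helpers are importable.
from absl.testing import parameterized

from tensorflow.python.data.kernel_tests import test_base
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import combinations
from tensorflow.python.platform import test


class ExampleMigratedTest(test_base.DatasetTestBase, parameterized.TestCase):

  # combinations.generate() runs the test once per (tf_api_version, mode)
  # combination, so no class-level graph/eager decorator is needed, and
  # combinations.combine() supplies the parameter values that
  # parameterized.named_parameters used to provide.
  @combinations.generate(
      combinations.times(test_base.default_test_combinations(),
                         combinations.combine(drop_remainder=[True, False])))
  def testBatch(self, drop_remainder):
    dataset = dataset_ops.Dataset.range(8).batch(
        4, drop_remainder=drop_remainder)
    # 8 elements split evenly into batches of 4, so drop_remainder is moot.
    self.assertDatasetProduces(dataset, [[0, 1, 2, 3], [4, 5, 6, 7]])


if __name__ == "__main__":
  test.main()
```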
@@ -395,6 +418,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[value] * batch_size for batch_size, value in pairs] self.assertDatasetProduces(dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testScanAfterBatch(self): dataset = dataset_ops.Dataset.range(40).batch(10).apply( scan_ops.scan(np.int64(2), lambda state, value: (state, value * state))) @@ -405,6 +429,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): expected_output = [[i * 2 for i in range(j*5, (j+1)*5)] for j in range(8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testMakeBatchedFeaturesDataset(self): # Set up fn = os.path.join(self.get_temp_dir(), "tf_record.txt") @@ -438,6 +463,7 @@ class RebatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase): } for i in range(0, 1024, 8)] # pylint: disable=g-complex-comprehension self.assertDatasetProduces(rebatched_dataset, expected_output) + @combinations.generate(test_base.default_test_combinations()) def testRaggedTensorDataset(self): # Set up a dataset that produces ragged tensors with a static batch size. row_lengths = np.random.randint(8, size=128) diff --git a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py index 673e77fc3bb..fb1d4ea5d3a 100644 --- a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py @@ -24,9 +24,9 @@ import numpy as np from tensorflow.python.data.experimental.ops import resampling from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import math_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import string_ops @@ -34,12 +34,11 @@ from tensorflow.python.platform import test from tensorflow.python.util import compat -@test_util.run_all_in_graph_and_eager_modes class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase): - @parameterized.named_parameters( - ("InitialDistributionKnown", True), - ("InitialDistributionUnknown", False)) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(initial_known=[True, False]))) def testDistribution(self, initial_known): classes = np.random.randint(5, size=(20000,)) # Uniformly sampled target_dist = [0.9, 0.05, 0.05, 0.0, 0.0] @@ -72,9 +71,9 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase): returned_dist = class_counts / total_returned self.assertAllClose(target_dist, returned_dist, atol=1e-2) - @parameterized.named_parameters( - ("OnlyInitial", True), - ("NotInitial", False)) + @combinations.generate( + combinations.times(test_base.default_test_combinations(), + combinations.combine(only_initial_dist=[True, False]))) def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist): init_dist = [0.5, 0.5] target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0] @@ -99,6 +98,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, 
parameterized.TestCase): while True: returned.append(self.evaluate(get_next())) + @combinations.generate(test_base.default_test_combinations()) def testRandomClasses(self): init_dist = [0.25, 0.25, 0.25, 0.25] target_dist = [0.0, 0.0, 0.0, 1.0] diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py index 92ae528b940..8bb109a6519 100644 --- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py @@ -17,18 +17,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.ops import shuffle_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class ShuffleAndRepeatTest(test_base.DatasetTestBase): +class ShuffleAndRepeatTest(test_base.DatasetTestBase, parameterized.TestCase): def _build_ds(self, seed, count=5, num_elements=20): return dataset_ops.Dataset.range(num_elements).apply( @@ -44,6 +44,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): self.evaluate(get_next()) return outputs + @combinations.generate(test_base.default_test_combinations()) def testCorrectOutput(self): output = self._gen_outputs(lambda: self._build_ds(10), 100) self.assertSequenceEqual( @@ -52,6 +53,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): for i in range(5): self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20)) + @combinations.generate(test_base.default_test_combinations()) def testReshuffling(self): # Check that the output orders of different epochs are indeed different. 
output = self._gen_outputs(lambda: self._build_ds(10), 100) @@ -60,17 +62,20 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): epoch2 = output[(i + 1) * 20:(i + 2) * 20] self.assertNotEqual(epoch1, epoch2) + @combinations.generate(test_base.default_test_combinations()) def testSameOrderForSameSeeds(self): output1 = self._gen_outputs(lambda: self._build_ds(10), 100) output2 = self._gen_outputs(lambda: self._build_ds(10), 100) self.assertEqual(output1, output2) + @combinations.generate(test_base.default_test_combinations()) def testDifferentOrderForDifferentSeeds(self): output1 = self._gen_outputs(lambda: self._build_ds(10), 100) output2 = self._gen_outputs(lambda: self._build_ds(20), 100) self.assertNotEqual(output1, output2) self.assertEqual(sorted(output1), sorted(output2)) + @combinations.generate(test_base.default_test_combinations()) def testCountNone(self): output1 = self._gen_outputs( lambda: self._build_ds(10, count=None), 100, verify_exhausted=False) @@ -79,6 +84,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): self.assertNotEqual(output1, output2) self.assertEqual(sorted(output1), sorted(output2)) + @combinations.generate(test_base.default_test_combinations()) def testCountMinusOne(self): output1 = self._gen_outputs( lambda: self._build_ds(10, count=-1), 100, verify_exhausted=False) @@ -87,6 +93,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): self.assertNotEqual(output1, output2) self.assertEqual(sorted(output1), sorted(output2)) + @combinations.generate(test_base.default_test_combinations()) def testInfiniteOutputs(self): # Asserting the iterator is exhausted after producing 100 items should fail. with self.assertRaises(AssertionError): @@ -94,6 +101,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): with self.assertRaises(AssertionError): self._gen_outputs(lambda: self._build_ds(10, count=-1), 100) + @combinations.generate(test_base.default_test_combinations()) def testInfiniteEmpty(self): with self.assertRaises(errors.OutOfRangeError): self._gen_outputs(lambda: self._build_ds(10, count=None, num_elements=0), @@ -102,12 +110,14 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase): self._gen_outputs(lambda: self._build_ds(10, count=-1, num_elements=0), 100) + @combinations.generate(test_base.default_test_combinations()) def testLargeBufferSize(self): ds = dataset_ops.Dataset.range(20).apply( shuffle_ops.shuffle_and_repeat(buffer_size=21)) get_next = self.getNext(ds) self.evaluate(get_next()) + @combinations.generate(test_base.default_test_combinations()) def testVeryLargeBufferSize(self): num_epochs = 1000 * 1000 # Each element being shuffled and repeated has shape (100,). 
This will OOM diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py index f55f62f5cb0..8e1dd4bd8dc 100644 --- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py @@ -18,18 +18,22 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.experimental.kernel_tests import sql_dataset_test_base +from tensorflow.python.data.kernel_tests import test_base +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): +class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase, + parameterized.TestCase): # Test that SqlDataset can read from a database table. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSet(self): for _ in range(2): # Run twice to verify statelessness of db operations. dataset = self._createSqlDataset( @@ -44,6 +48,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): num_test_iterations=2) # Test that SqlDataset works on a join query. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetJoinQuery(self): get_next = self.getNext( self._createSqlDataset( @@ -60,6 +65,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that SqlDataset can read a database entry with a null-terminator # in the middle of the text and place the entry in a `string` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetNullTerminator(self): get_next = self.getNext( self._createSqlDataset( @@ -76,6 +82,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that SqlDataset works when used on two different queries. # Because the output types of the dataset must be determined at graph-creation # time, the two queries must have the same number and types of columns. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetReuseSqlDataset(self): get_next = self.getNext( self._createSqlDataset( @@ -100,6 +107,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that an `OutOfRangeError` is raised on the first call to # `get_next_str_only` if result set is empty. + @combinations.generate(test_base.default_test_combinations()) def testReadEmptyResultSet(self): get_next = self.getNext( self._createSqlDataset( @@ -110,6 +118,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): self.evaluate(get_next()) # Test that an error is raised when `driver_name` is invalid. 
+ @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithInvalidDriverName(self): with self.assertRaises(errors.InvalidArgumentError): dataset = self._createSqlDataset( @@ -120,6 +129,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): self.assertDatasetProduces(dataset, expected_output=[]) # Test that an error is raised when a column name in `query` is nonexistent + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithInvalidColumnName(self): get_next = self.getNext( self._createSqlDataset( @@ -130,6 +140,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): self.evaluate(get_next()) # Test that an error is raised when there is a syntax error in `query`. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetOfQueryWithSyntaxError(self): get_next = self.getNext( self._createSqlDataset( @@ -141,6 +152,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that an error is raised when the number of columns in `query` # does not match the length of `, output_types`. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithMismatchBetweenColumnsAndOutputTypes(self): get_next = self.getNext( self._createSqlDataset( @@ -154,6 +166,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # than a select query. In particular, the error refers to the number of # output types passed to the op not matching the number of columns in the # result set of the query (namely, 0 for an insert statement.) + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetOfInsertQuery(self): get_next = self.getNext( self._createSqlDataset( @@ -165,6 +178,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in an `int8` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt8(self): get_next = self.getNext( self._createSqlDataset( @@ -178,6 +192,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int8` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt8NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -191,6 +206,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int8` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt8MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -205,6 +221,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in an `int16` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt16(self): get_next = self.getNext( self._createSqlDataset( @@ -218,6 +235,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int16` tensor. 
+ @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt16NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -231,6 +249,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int16` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt16MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -246,6 +265,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in an `int32` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32(self): get_next = self.getNext( self._createSqlDataset( @@ -257,6 +277,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int32` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -270,6 +291,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int32` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -285,6 +307,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a numeric `varchar` from a SQLite database # table and place it in an `int32` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt32VarCharColumnAsInt(self): get_next = self.getNext( self._createSqlDataset( @@ -298,6 +321,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table # and place it in an `int64` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt64(self): get_next = self.getNext( self._createSqlDataset( @@ -311,6 +335,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a negative or 0-valued integer from a # SQLite database table and place it in an `int64` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt64NegativeAndZero(self): get_next = self.getNext( self._createSqlDataset( @@ -324,6 +349,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a large (positive or negative) integer from # a SQLite database table and place it in an `int64` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetInt64MaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -339,6 +365,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table and # place it in a `uint8` tensor. 
+ @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt8(self): get_next = self.getNext( self._createSqlDataset( @@ -352,6 +379,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read the minimum and maximum uint8 values from a # SQLite database table and place them in `uint8` tensors. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt8MinAndMaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -367,6 +395,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer from a SQLite database table # and place it in a `uint16` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt16(self): get_next = self.getNext( self._createSqlDataset( @@ -380,6 +409,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read the minimum and maximum uint16 values from a # SQLite database table and place them in `uint16` tensors. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetUInt16MinAndMaxValues(self): get_next = self.getNext( self._createSqlDataset( @@ -396,6 +426,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a 0-valued and 1-valued integer from a # SQLite database table and place them as `True` and `False` respectively # in `bool` tensors. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetBool(self): get_next = self.getNext( self._createSqlDataset( @@ -409,6 +440,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read an integer that is not 0-valued or 1-valued # from a SQLite database table and place it as `True` in a `bool` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetBoolNotZeroOrOne(self): get_next = self.getNext( self._createSqlDataset( @@ -422,6 +454,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a float from a SQLite database table # and place it in a `float64` tensor. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetFloat64(self): get_next = self.getNext( self._createSqlDataset( @@ -437,6 +470,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # Test that `SqlDataset` can read a float from a SQLite database table beyond # the precision of 64-bit IEEE, without throwing an error. Test that # `SqlDataset` identifies such a value as equal to itself. + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetFloat64OverlyPrecise(self): get_next = self.getNext( self._createSqlDataset( @@ -458,6 +492,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): # representing the largest integer representable as a 64-bit IEEE float # such that the previous integer is also representable as a 64-bit IEEE float. # Test that `SqlDataset` can distinguish these two numbers. 
+ @combinations.generate(test_base.default_test_combinations()) def testReadResultSetFloat64LargestConsecutiveWholeNumbersNotEqual(self): get_next = self.getNext( self._createSqlDataset( @@ -472,6 +507,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase): self.evaluate(get_next()) # Test that SqlDataset can stop correctly when combined with batch + @combinations.generate(test_base.default_test_combinations()) def testReadResultSetWithBatchStop(self): dataset = self._createSqlDataset( query="SELECT * FROM data", output_types=(dtypes.int32)) diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py index 4f04a0a3639..f77f2f21bf7 100644 --- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py @@ -17,6 +17,7 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized import numpy as np from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base @@ -24,7 +25,9 @@ from tensorflow.python.data.experimental.kernel_tests import stats_dataset_test_ from tensorflow.python.data.experimental.ops import batching from tensorflow.python.data.experimental.ops import stats_aggregator from tensorflow.python.data.experimental.ops import stats_ops +from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops @@ -32,8 +35,11 @@ from tensorflow.python.ops import math_ops from tensorflow.python.platform import test -class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): +# TODO(jsimsa): Figure out why are graph tests failing. 
+class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.eager_only_combinations()) def testBytesProduced(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).map( @@ -57,6 +63,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self.assertStatisticsHasCount(handle, "bytes_produced", 100.0, 101) self.assertStatisticsHasSum(handle, "bytes_produced", expected_sum, 101) + @combinations.generate(test_base.eager_only_combinations()) def testLatencyStats(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -76,6 +83,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle = self.getHandle(aggregator) self.assertStatisticsHasCount(handle, "record_latency", 100.0, 101) + @combinations.generate(test_base.eager_only_combinations()) def testPrefetchBufferUtilization(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).map( @@ -117,6 +125,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): 301, offset=2) + @combinations.generate(test_base.eager_only_combinations()) def testPrefetchBufferScalars(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(10).map( @@ -140,6 +149,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.eager_only_combinations()) def testFilteredElementsStats(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(101).filter( @@ -167,6 +177,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle, self.regexForNodeName("FilterDataset", "filtered_elements"), 34.0) + @combinations.generate(test_base.eager_only_combinations()) def testReinitialize(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -187,6 +198,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self.assertStatisticsHasCount(handle, "record_latency", (j + 1) * 100.0, (j * 100) + 101) + @combinations.generate(test_base.eager_only_combinations()) def testNoAggregatorRegistered(self): dataset = dataset_ops.Dataset.range(100).apply( stats_ops.latency_stats("record_latency")) @@ -198,6 +210,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.eager_only_combinations()) def testMultipleTags(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -221,6 +234,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle, "record_latency", 100.0, 201, offset=1) self.assertStatisticsHasCount(handle, "record_latency_2", 100.0, 201) + @combinations.generate(test_base.eager_only_combinations()) def testRepeatedTags(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -239,6 +253,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle = self.getHandle(aggregator) self.assertStatisticsHasCount(handle, "record_latency", 200.0, 201) + @combinations.generate(test_base.eager_only_combinations()) def testMultipleIteratorsSameAggregator(self): aggregator = 
stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -259,6 +274,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): handle = self.getHandle(aggregator) self.assertStatisticsHasCount(handle, "record_latency", 200.0, 201) + @combinations.generate(test_base.eager_only_combinations()) def testMultipleDatasetWithPrefixes(self): aggregator = stats_aggregator.StatsAggregator() dataset = dataset_ops.Dataset.range(100).apply( @@ -289,6 +305,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self.assertStatisticsHasCount(handle, "dataset2::record_latency", 100.0, 201) + @combinations.generate(test_base.eager_only_combinations()) def testMultiplePrefetchStats(self): aggregator = stats_aggregator.StatsAggregator() @@ -314,8 +331,10 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase): self.evaluate(next_element()) -class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): +class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.eager_only_combinations()) def testMapBufferUtilization(self): def dataset_fn(): @@ -326,6 +345,7 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): self.parallelCallsStats( dataset_fn, {"ParallelMapDataset"}, 10, function_processing_time=True) + @combinations.generate(test_base.eager_only_combinations()) def testMapAutoTuneBufferUtilization(self): def dataset_fn(): @@ -336,6 +356,7 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): self.parallelCallsStats( dataset_fn, {"ParallelMapDataset"}, 10, function_processing_time=True) + @combinations.generate(test_base.eager_only_combinations()) def testInterleaveAutoTuneBufferUtilization(self): def dataset_fn(): @@ -351,6 +372,7 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): self.parallelCallsStats(dataset_fn, {"ParallelInterleaveDatasetV2"}, 10) + @combinations.generate(test_base.eager_only_combinations()) def testMapAndBatchAutoTuneBufferUtilization(self): def dataset_fn(): @@ -370,8 +392,10 @@ class ThreadUtilizationStatsTest(stats_dataset_test_base.StatsDatasetTestBase): class FeatureStatsDatasetTest( stats_dataset_test_base.StatsDatasetTestBase, - reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase): + reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase, + parameterized.TestCase): + @combinations.generate(test_base.eager_only_combinations()) def testFeaturesStats(self): num_epochs = 5 total_records = num_epochs * self._num_records diff --git a/tensorflow/python/data/experimental/kernel_tests/take_while_test.py b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py index b2b0effb0df..959837faa24 100644 --- a/tensorflow/python/data/experimental/kernel_tests/take_while_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/take_while_test.py @@ -23,18 +23,21 @@ import numpy as np from tensorflow.python.data.experimental.ops import take_while_ops from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import constant_op from tensorflow.python.framework import errors -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.platform import test 
-@test_util.run_all_in_graph_and_eager_modes class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): - @parameterized.parameters((14, 2), (15, 2), (100, 3)) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(num_elements=[14, 15], window_size=[2]) + + combinations.combine(num_elements=[100], window_size=[3]))) def testTakeWhileDataset(self, num_elements, window_size): def _predicate_func(elem): @@ -49,8 +52,19 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): expected_num_elements = int(num_elements / window_size) * window_size self.assertDatasetProduces(dataset, np.arange(expected_num_elements)) - @parameterized.parameters((10, 2, False), (16, 7, False), (100, 99, False), - (100, 101, True), (0, 1, True)) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine( + num_elements=[10], upper_bound=[2], out_of_bounds=[False]) + + combinations.combine( + num_elements=[16], upper_bound=[7], out_of_bounds=[False]) + + combinations.combine( + num_elements=[100], upper_bound=[99], out_of_bounds=[False]) + + combinations.combine( + num_elements=[100], upper_bound=[101], out_of_bounds=[True]) + + combinations.combine( + num_elements=[0], upper_bound=[1], out_of_bounds=[True]))) def testTakeWhileDatasetRange(self, num_elements, upper_bound, out_of_bounds): dataset = dataset_ops.Dataset.range(num_elements).apply( take_while_ops.take_while(lambda x: x < upper_bound)) @@ -62,6 +76,7 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): else: self.assertDatasetProduces(dataset, np.arange(upper_bound)) + @combinations.generate(test_base.default_test_combinations()) def testTakeWhileDatasetString(self): def not_equal(string): @@ -79,7 +94,13 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.assertEqual(b"test", self.evaluate(next_element())) - @parameterized.parameters((5, 3), (10, 0), (100, 5), (8, 7)) + @combinations.generate( + combinations.times( + test_base.default_test_combinations(), + combinations.combine(size=[5], index=[3]) + + combinations.combine(size=[10], index=[0]) + + combinations.combine(size=[100], index=[5]) + + combinations.combine(size=[8], index=[7]))) def testTakewhileDatasetShortCircuit(self, size, index): def _predicate_func(data_elem): @@ -98,6 +119,7 @@ class TakeWhileTest(test_base.DatasetTestBase, parameterized.TestCase): with self.assertRaises(errors.OutOfRangeError): self.evaluate(next_element()) + @combinations.generate(test_base.default_test_combinations()) def testTakeWhileDatasetWithRepeat(self): dataset = dataset_ops.Dataset.range(10).apply( take_while_ops.take_while(lambda x: x < 2)).repeat(5) diff --git a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py index 136a446bbd8..a327fc82466 100644 --- a/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/tf_record_writer_test.py @@ -19,14 +19,16 @@ from __future__ import print_function import os +from absl.testing import parameterized + from tensorflow.python.data.experimental.ops import grouping from tensorflow.python.data.experimental.ops import writers from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import 
readers from tensorflow.python.eager import function +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes -from tensorflow.python.framework import test_util from tensorflow.python.lib.io import python_io from tensorflow.python.lib.io import tf_record from tensorflow.python.ops import string_ops @@ -34,8 +36,7 @@ from tensorflow.python.platform import test from tensorflow.python.util import compat -@test_util.run_all_in_graph_and_eager_modes -class TFRecordWriterTest(test_base.DatasetTestBase): +class TFRecordWriterTest(test_base.DatasetTestBase, parameterized.TestCase): def setUp(self): super(TFRecordWriterTest, self).setUp() @@ -63,11 +64,13 @@ class TFRecordWriterTest(test_base.DatasetTestBase): def _outputFilename(self): return os.path.join(self.get_temp_dir(), "tf_record.out.txt") + @combinations.generate(test_base.default_test_combinations()) def testWrite(self): self.evaluate(self.writer_fn(self._createFile())) for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())): self.assertAllEqual(self._record(i), r) + @combinations.generate(test_base.default_test_combinations()) def testWriteZLIB(self): options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.ZLIB) self.evaluate( @@ -76,6 +79,7 @@ class TFRecordWriterTest(test_base.DatasetTestBase): tf_record.tf_record_iterator(self._outputFilename(), options=options)): self.assertAllEqual(self._record(i), r) + @combinations.generate(test_base.default_test_combinations()) def testWriteGZIP(self): options = tf_record.TFRecordOptions(tf_record.TFRecordCompressionType.GZIP) self.evaluate( @@ -84,20 +88,24 @@ class TFRecordWriterTest(test_base.DatasetTestBase): tf_record.tf_record_iterator(self._outputFilename(), options=options)): self.assertAllEqual(self._record(i), r) + @combinations.generate(test_base.default_test_combinations()) def testFailDataset(self): with self.assertRaises(TypeError): writers.TFRecordWriter(self._outputFilename(), "").write("whoops") + @combinations.generate(test_base.default_test_combinations()) def testFailDType(self): input_dataset = dataset_ops.Dataset.from_tensors(10) with self.assertRaises(TypeError): writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset) + @combinations.generate(test_base.default_test_combinations()) def testFailShape(self): input_dataset = dataset_ops.Dataset.from_tensors([["hello"], ["world"]]) with self.assertRaises(TypeError): writers.TFRecordWriter(self._outputFilename(), "").write(input_dataset) + @combinations.generate(test_base.default_test_combinations()) def testSideEffect(self): def writer_fn(): input_dataset = readers.TFRecordDataset(self._createFile()) @@ -112,6 +120,7 @@ class TFRecordWriterTest(test_base.DatasetTestBase): for i, r in enumerate(tf_record.tf_record_iterator(self._outputFilename())): self.assertAllEqual(self._record(i), r) + @combinations.generate(test_base.default_test_combinations()) def testShard(self): filename = self._createFile() dataset = readers.TFRecordDataset([filename]) diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py index 42d76a2eb30..9a51c4224ff 100644 --- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py @@ -17,17 +17,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from 
tensorflow.python.data.experimental.ops import unique from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import dtypes -from tensorflow.python.framework import test_util from tensorflow.python.platform import test from tensorflow.python.util import compat -@test_util.run_all_in_graph_and_eager_modes -class UniqueTest(test_base.DatasetTestBase): +class UniqueTest(test_base.DatasetTestBase, parameterized.TestCase): def _testSimpleHelper(self, dtype, test_cases): """Test the `unique()` transformation on a list of test cases. @@ -52,7 +53,7 @@ class UniqueTest(test_base.DatasetTestBase): for element in expected ]) - @test_util.run_deprecated_v1 + @combinations.generate(test_base.graph_only_combinations()) def testSimpleInt(self): for dtype in [dtypes.int32, dtypes.int64]: self._testSimpleHelper(dtype, [ @@ -65,7 +66,7 @@ class UniqueTest(test_base.DatasetTestBase): ([[1, 1], [1, 1], [2, 2], [3, 3], [1, 1]], [[1, 1], [2, 2], [3, 3]]), ]) - @test_util.run_deprecated_v1 + @combinations.generate(test_base.graph_only_combinations()) def testSimpleString(self): self._testSimpleHelper(dtypes.string, [ ([], []), diff --git a/tensorflow/python/data/experimental/kernel_tests/variant_test.py b/tensorflow/python/data/experimental/kernel_tests/variant_test.py index 6a3a1424d12..897aa223371 100644 --- a/tensorflow/python/data/experimental/kernel_tests/variant_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/variant_test.py @@ -17,16 +17,18 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.experimental.ops import cardinality from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.framework import test_util +from tensorflow.python.framework import combinations from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class VariantTest(test_base.DatasetTestBase): +class VariantTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testRoundtripRange(self): dataset = dataset_ops.Dataset.range(10) variant = dataset_ops.to_variant(dataset) @@ -35,6 +37,7 @@ class VariantTest(test_base.DatasetTestBase): self.assertDatasetProduces(dataset, range(10)) self.assertEqual(self.evaluate(cardinality.cardinality(dataset)), 10) + @combinations.generate(test_base.default_test_combinations()) def testRoundtripMap(self): dataset = dataset_ops.Dataset.range(10).map(lambda x: x*x) variant = dataset_ops.to_variant(dataset) diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py index 09627d02994..3fd252ab3ac 100644 --- a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py +++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py @@ -17,18 +17,20 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from absl.testing import parameterized + from tensorflow.python.data.kernel_tests import test_base from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import combinations from tensorflow.python.framework import ops -from tensorflow.python.framework import 
test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_dataset_ops from tensorflow.python.platform import test -@test_util.run_all_in_graph_and_eager_modes -class WrapDatasetVariantTest(test_base.DatasetTestBase): +class WrapDatasetVariantTest(test_base.DatasetTestBase, parameterized.TestCase): + @combinations.generate(test_base.default_test_combinations()) def testBasic(self): ds = dataset_ops.Dataset.range(100) ds_variant = ds._variant_tensor # pylint: disable=protected-access @@ -42,7 +44,9 @@ class WrapDatasetVariantTest(test_base.DatasetTestBase): for i in range(100): self.assertEqual(i, self.evaluate(get_next())) - @test_util.run_v1_only("b/123901304") + # TODO(b/123901304) + @combinations.generate( + combinations.combine(tf_api_version=[1], mode=["graph"])) def testSkipEagerGPU(self): ds = dataset_ops.Dataset.range(100) ds_variant = ds._variant_tensor # pylint: disable=protected-access From d423b6e1bce2bcde8081fa3f07ed78be851a3a2c Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Thu, 5 Dec 2019 13:11:45 -0800 Subject: [PATCH 188/383] Move tf.math.sqrt python api endpoint into math_ops.py Update its one line docstring. Make its example testable. PiperOrigin-RevId: 284036847 Change-Id: I152b3b389d7335c9ed1a1cb4d1659b868cc8f657 --- .../api_def/python_api/api_def_Sqrt.pbtxt | 7 +--- tensorflow/python/ops/math_ops.py | 37 +++++++++++++++++++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt index 16a4d9a7bcc..59e2dfe8366 100644 --- a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt @@ -1,9 +1,4 @@ op { graph_op_name: "Sqrt" - endpoint { - name: "math.sqrt" - } - endpoint { - name: "sqrt" - } + visibility: HIDDEN } diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 340cbf0606b..47f4742f4be 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4303,3 +4303,40 @@ def ceil(x, name=None): @end_compatibility """ return gen_math_ops.ceil(x, name) + + +@tf_export("math.sqrt", "sqrt") +@dispatch.add_dispatch_support +def sqrt(x, name=None): # pylint: disable=redefined-builtin + r"""Computes element-wise square root of the input tensor. + + Note: This operation does not support integer types. + + >>> x = tf.constant([[4.0], [16.0]]) + >>> tf.sqrt(x) + + >>> y = tf.constant([[-4.0], [16.0]]) + >>> tf.sqrt(y) + + >>> z = tf.constant([[-1.0], [16.0]], dtype=tf.complex128) + >>> tf.sqrt(z) + + + Note: In order to support complex complex, please provide an input tensor + of `complex64` or `complex128`. + + Args: + x: A `tf.Tensor` of type `bfloat16`, `half`, `float32`, `float64`, + `complex64`, `complex128` + name: A name for the operation (optional). + + Returns: + A `tf.Tensor` of same size, type and sparsity as `x`. 
+ """ + return gen_math_ops.sqrt(x, name) \ No newline at end of file From f5829c0e31d5f521e19d97dcc9d4ff6ff927598f Mon Sep 17 00:00:00 2001 From: Clayne Robison Date: Thu, 5 Dec 2019 14:25:12 -0700 Subject: [PATCH 189/383] Fixing MKL broken links and updating with 1.15 and 2.0 links --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 51ca43e1571..9cac16619bd 100644 --- a/README.md +++ b/README.md @@ -120,8 +120,8 @@ Build Type **Linux ppc64le CPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_CPU_Release_Build/) **Linux ppc64le GPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/) **Linux ppc64le GPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_GPU_Release_Build/) -**Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) -**Linux CPU with Intel® MKL-DNN**
**Supports Python 2.7, 3.4, 3.5, 3.6 and 3.7** | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.14.0 PyPI](https://pypi.org/project/intel-tensorflow/) +**Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) +**Linux CPU with Intel® MKL-DNN** Stable Release | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/) | Release [1.15](https://pypi.org/project/intel-tensorflow/1.15.0/) / [2.x](https://pypi.org/project/intel-tensorflow/) **Red Hat® Enterprise Linux® 7.6 CPU & GPU**
Python 2.7, 3.6 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 PyPI](https://tensorflow.pypi.thoth-station.ninja/index/) ## Resources From 21e8f2476eb1e50d70bc65bc81a587364a41abe2 Mon Sep 17 00:00:00 2001 From: nmostafa Date: Thu, 5 Dec 2019 13:12:50 -0800 Subject: [PATCH 190/383] Add UnrankedMemRef Type Closes #261 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/261 from nmostafa:nmostafa/unranked 96b6e918f6ed64496f7573b2db33c0b02658ca45 PiperOrigin-RevId: 284037040 Change-Id: I216503415996e1228269233793eb97fee8c0d62f --- .../mlir/g3doc/ConversionToLLVMDialect.md | 17 ++ third_party/mlir/g3doc/Dialects/Standard.md | 15 +- third_party/mlir/g3doc/LangRef.md | 57 ++++- .../StandardToLLVM/ConvertStandardToLLVM.h | 61 ++++-- .../include/mlir/Dialect/StandardOps/Ops.td | 20 +- third_party/mlir/include/mlir/IR/OpBase.td | 9 + .../mlir/include/mlir/IR/StandardTypes.h | 53 ++++- .../StandardToLLVM/ConvertStandardToLLVM.cpp | 195 +++++++++++++++--- .../mlir/lib/Dialect/StandardOps/Ops.cpp | 90 +++++--- third_party/mlir/lib/IR/AsmPrinter.cpp | 7 + third_party/mlir/lib/IR/MLIRContext.cpp | 4 +- third_party/mlir/lib/IR/StandardTypes.cpp | 31 +++ third_party/mlir/lib/IR/TypeDetail.h | 29 ++- third_party/mlir/lib/Parser/Parser.cpp | 30 ++- 14 files changed, 519 insertions(+), 99 deletions(-) diff --git a/third_party/mlir/g3doc/ConversionToLLVMDialect.md b/third_party/mlir/g3doc/ConversionToLLVMDialect.md index fc0572fce98..595049ad440 100644 --- a/third_party/mlir/g3doc/ConversionToLLVMDialect.md +++ b/third_party/mlir/g3doc/ConversionToLLVMDialect.md @@ -90,6 +90,23 @@ memref<10x?x42x?x123 x f32> -> !llvm.type<"{ float*, float*, i64, [5 x i64], [5 memref<1x? x vector<4xf32>> -> !llvm.type<"{ <4 x float>*, <4 x float>*, i64, [1 x i64], [1 x i64] }"> ``` +If the rank of the memref is unknown at compile time, the Memref is converted to +an unranked descriptor that contains: 1. a 64-bit integer representing the +dynamic rank of the memref, followed by 2. a pointer to a ranked memref +descriptor with the contents listed above. + +Dynamic ranked memrefs should be used only to pass arguments to external library +calls that expect a unified memref type. The called functions can parse any +unranked memref descriptor by reading the rank and parsing the enclosed ranked +descriptor pointer. + +Examples: + +```mlir {.mlir} +// unranked descriptor +memref<*xf32> -> !llvm.type<"{i64, i8*}"> +``` + ### Function Types Function types get converted to LLVM function types. The arguments are converted diff --git a/third_party/mlir/g3doc/Dialects/Standard.md b/third_party/mlir/g3doc/Dialects/Standard.md index cbea654a256..9d53eba328e 100644 --- a/third_party/mlir/g3doc/Dialects/Standard.md +++ b/third_party/mlir/g3doc/Dialects/Standard.md @@ -912,12 +912,21 @@ Examples: // Convert to a type with more known dimensions. %4 = memref_cast %3 : memref to memref<4x?xf32> + +// Convert to a type with unknown rank. +%5 = memref_cast %3 : memref to memref<*xf32> + +// Convert to a type with static rank. +%6 = memref_cast %5 : memref<*xf32> to memref ``` Convert a memref from one type to an equivalent type without changing any data -elements. The source and destination types must both be memref types with the -same element type, same mappings, same address space, and same rank. The -operation is invalid if converting to a mismatching constant dimension. +elements. 
The types are equivalent if 1. they both have the same static rank, +same element type, same mappings, same address space. The operation is invalid +if converting to a mismatching constant dimension, or 2. exactly one of the +operands have an unknown rank, and they both have the same element type and same +address space. The operation is invalid if both operands are of dynamic rank or +if converting to a mismatching static rank. ### 'mulf' operation diff --git a/third_party/mlir/g3doc/LangRef.md b/third_party/mlir/g3doc/LangRef.md index d084f0fa149..fa22fa5a35e 100644 --- a/third_party/mlir/g3doc/LangRef.md +++ b/third_party/mlir/g3doc/LangRef.md @@ -760,9 +760,15 @@ TODO: Need to decide on a representation for quantized integers Syntax: ``` {.ebnf} -memref-type ::= `memref` `<` dimension-list-ranked tensor-memref-element-type - (`,` layout-specification)? | - (`,` memory-space)? `>` + +memref-type ::= ranked-memref-type | unranked-memref-type + +ranked-memref-type ::= `memref` `<` dimension-list-ranked tensor-memref-element-type + (`,` layout-specification)? | + (`,` memory-space)? `>` + +unranked-memref-type ::= `memref` `<*x` tensor-memref-element-type + (`,` memory-space)? `>` stride-list ::= `[` (dimension (`,` dimension)*)? `]` strided-layout ::= `offset:` dimension `,` `strides: ` stride-list @@ -774,9 +780,48 @@ A `memref` type is a reference to a region of memory (similar to a buffer pointer, but more powerful). The buffer pointed to by a memref can be allocated, aliased and deallocated. A memref can be used to read and write data from/to the memory region which it references. Memref types use the same shape specifier as -tensor types, but do not allow unknown rank. Note that `memref`, `memref<0 -x f32>`, `memref<1 x 0 x f32>`, and `memref<0 x 1 x f32>` are all different -types. +tensor types. Note that `memref`, `memref<0 x f32>`, `memref<1 x 0 x f32>`, +and `memref<0 x 1 x f32>` are all different types. + +A `memref` is allowed to have an unknown rank (e.g. `memref<*xf32>`). The +purpose of unranked memrefs is to allow external library functions to receive +memref arguments of any rank without versioning the functions based on the rank. +Other uses of this type are disallowed or will have undefined behavior. + +##### Codegen of Unranked Memref + +Using unranked memref in codegen besides the case mentioned above is highly +discouraged. Codegen is concerned with generating loop nests and specialized +instructions for high-performance, unranked memref is concerned with hiding the +rank and thus, the number of enclosing loops required to iterate over the data. +However, if there is a need to code-gen unranked memref, one possible path is to +cast into a static ranked type based on the dynamic rank. Another possible path +is to emit a single while loop conditioned on a linear index and perform +delinearization of the linear index to a dynamic array containing the (unranked) +indices. While this is possible, it is expected to not be a good idea to perform +this during codegen as the cost of the translations is expected to be +prohibitive and optimizations at this level are not expected to be worthwhile. +If expressiveness is the main concern, irrespective of performance, passing +unranked memrefs to an external C++ library and implementing rank-agnostic logic +there is expected to be significantly simpler. + +Unranked memrefs may provide expressiveness gains in the future and help bridge +the gap with unranked tensors. 
Unranked memrefs will not be expected to be +exposed to codegen but one may query the rank of an unranked memref (a special +op will be needed for this purpose) and perform a switch and cast to a ranked +memref as a prerequisite to codegen. + +Example ```mlir {.mlir} // With static ranks, we need a function for each +possible argument type %A = alloc() : memref<16x32xf32> %B = alloc() : +memref<16x32x64xf32> call @helper_2D(%A) : (memref<16x32xf32>)->() call +@helper_3D(%B) : (memref<16x32x64xf32>)->() + +// With unknown rank, the functions can be unified under one unranked type %A = +alloc() : memref<16x32xf32> %B = alloc() : memref<16x32x64xf32> // Remove rank +info %A_u = memref_cast %A : memref<16x32xf32> -> memref<*xf32> %B_u = +memref_cast %B : memref<16x32x64xf32> -> memref<*xf32> // call same function +with dynamic ranks call @helper(%A_u) : (memref<*xf32>)->() call @helper(%B_u) : +(memref<*xf32>)->() ``` The core syntax and representation of a layout specification is a [semi-affine map](Dialects/Affine.md#semi-affine-maps). Additionally, syntactic diff --git a/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h index e50a8a548db..6b02edaa389 100644 --- a/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h +++ b/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h @@ -34,6 +34,9 @@ class Type; } // namespace llvm namespace mlir { + +class UnrankedMemRefType; + namespace LLVM { class LLVMDialect; class LLVMType; @@ -116,6 +119,10 @@ private: // 2. as many index types as memref has dynamic dimensions. Type convertMemRefType(MemRefType type); + // Convert an unranked memref type to an LLVM type that captures the + // runtime rank and a pointer to the static ranked memref desc + Type convertUnrankedMemRefType(UnrankedMemRefType type); + // Convert a 1D vector type into an LLVM vector type. Type convertVectorType(VectorType type); @@ -127,10 +134,34 @@ private: LLVM::LLVMType unwrap(Type type); }; +/// Helper class to produce LLVM dialect operations extracting or inserting +/// values to a struct. +class StructBuilder { +public: + /// Construct a helper for the given value. + explicit StructBuilder(Value *v); + /// Builds IR creating an `undef` value of the descriptor type. + static StructBuilder undef(OpBuilder &builder, Location loc, + Type descriptorType); + + /*implicit*/ operator Value *() { return value; } + +protected: + // LLVM value + Value *value; + // Cached struct type. + Type structType; + +protected: + /// Builds IR to extract a value from the struct at position pos + Value *extractPtr(OpBuilder &builder, Location loc, unsigned pos); + /// Builds IR to set a value in the struct at position pos + void setPtr(OpBuilder &builder, Location loc, unsigned pos, Value *ptr); +}; /// Helper class to produce LLVM dialect operations extracting or inserting /// elements of a MemRef descriptor. Wraps a Value pointing to the descriptor. /// The Value may be null, in which case none of the operations are valid. -class MemRefDescriptor { +class MemRefDescriptor : public StructBuilder { public: /// Construct a helper for the given descriptor value. explicit MemRefDescriptor(Value *descriptor); @@ -169,22 +200,28 @@ public: /// Returns the (LLVM) type this descriptor points to. 
LLVM::LLVMType getElementType(); - /*implicit*/ operator Value *() { return value; } - private: - Value *extractPtr(OpBuilder &builder, Location loc, unsigned pos); - void setPtr(OpBuilder &builder, Location loc, unsigned pos, Value *ptr); - - // Cached descriptor type. - Type structType; - // Cached index type. Type indexType; - - // Actual descriptor. - Value *value; }; +class UnrankedMemRefDescriptor : public StructBuilder { +public: + /// Construct a helper for the given descriptor value. + explicit UnrankedMemRefDescriptor(Value *descriptor); + /// Builds IR creating an `undef` value of the descriptor type. + static UnrankedMemRefDescriptor undef(OpBuilder &builder, Location loc, + Type descriptorType); + + /// Builds IR extracting the rank from the descriptor + Value *rank(OpBuilder &builder, Location loc); + /// Builds IR setting the rank in the descriptor + void setRank(OpBuilder &builder, Location loc, Value *value); + /// Builds IR extracting ranked memref descriptor ptr + Value *memRefDescPtr(OpBuilder &builder, Location loc); + /// Builds IR setting ranked memref descriptor ptr + void setMemRefDescPtr(OpBuilder &builder, Location loc, Value *value); +}; /// Base class for operation conversions targeting the LLVM IR dialect. Provides /// conversion patterns with an access to the containing LLVMLowering for the /// purpose of type conversions. diff --git a/third_party/mlir/include/mlir/Dialect/StandardOps/Ops.td b/third_party/mlir/include/mlir/Dialect/StandardOps/Ops.td index e7439e49502..a231bf8af02 100644 --- a/third_party/mlir/include/mlir/Dialect/StandardOps/Ops.td +++ b/third_party/mlir/include/mlir/Dialect/StandardOps/Ops.td @@ -842,7 +842,8 @@ def MemRefCastOp : CastOp<"memref_cast"> { let description = [{ The "memref_cast" operation converts a memref from one type to an equivalent type with a compatible shape. The source and destination types are - when both are memref types with the same element type, affine mappings, + compatible if: + a. both are ranked memref types with the same element type, affine mappings, address space, and rank but where the individual dimensions may add or remove constant dimensions from the memref type. @@ -850,6 +851,7 @@ def MemRefCastOp : CastOp<"memref_cast"> { acts as an assertion that fails at runtime of the dynamic dimensions disagree with resultant destination size. + Example: Assert that the input dynamic shape matches the destination static shape. %2 = memref_cast %1 : memref to memref<4x4xf32> Erase static shape information, replacing it with dynamic information. @@ -864,10 +866,20 @@ def MemRefCastOp : CastOp<"memref_cast"> { dynamic information. %5 = memref_cast %1 : memref<12x4xf32, offset:5, strides: [4, 1]> to memref<12x4xf32, offset:?, strides: [?, ?]> + + b. either or both memref types are unranked with the same element type, and + address space. + + Example: + Cast to concrete shape. + %4 = memref_cast %1 : memref<*xf32> to memref<4x?xf32> + + Erase rank information. + %5 = memref_cast %1 : memref<4x?xf32> to memref<*xf32> }]; - let arguments = (ins AnyMemRef:$source); - let results = (outs AnyMemRef); + let arguments = (ins AnyRankedOrUnrankedMemRef:$source); + let results = (outs AnyRankedOrUnrankedMemRef); let extraClassDeclaration = [{ /// Return true if `a` and `b` are valid operand and result pairs for @@ -875,7 +887,7 @@ def MemRefCastOp : CastOp<"memref_cast"> { static bool areCastCompatible(Type a, Type b); /// The result of a memref_cast is always a memref. 
- MemRefType getType() { return getResult()->getType().cast(); } + Type getType() { return getResult()->getType(); } }]; } diff --git a/third_party/mlir/include/mlir/IR/OpBase.td b/third_party/mlir/include/mlir/IR/OpBase.td index f81063f1085..6a884f2e948 100644 --- a/third_party/mlir/include/mlir/IR/OpBase.td +++ b/third_party/mlir/include/mlir/IR/OpBase.td @@ -221,6 +221,9 @@ def IsTensorTypePred : CPred<"$_self.isa()">; // Whether a type is a MemRefType. def IsMemRefTypePred : CPred<"$_self.isa()">; +// Whether a type is an IsUnrankedMemRefType +def IsUnrankedMemRefTypePred : CPred<"$_self.isa()">; + // Whether a type is a ShapedType. def IsShapedTypePred : CPred<"$_self.isa()">; @@ -486,6 +489,10 @@ class 2DTensorOf allowedTypes> : TensorRankOf; class 3DTensorOf allowedTypes> : TensorRankOf; class 4DTensorOf allowedTypes> : TensorRankOf; +// Unranked Memref type +def AnyUnrankedMemRef : + ShapedContainerType<[AnyType], + IsUnrankedMemRefTypePred, "unranked.memref">; // Memref type. // Memrefs are blocks of data with fixed type and rank. @@ -494,6 +501,8 @@ class MemRefOf allowedTypes> : def AnyMemRef : MemRefOf<[AnyType]>; +def AnyRankedOrUnrankedMemRef: AnyTypeOf<[AnyUnrankedMemRef, AnyMemRef]>; + // Memref declarations handle any memref, independent of rank, size, (static or // dynamic), layout, or memory space. def I1MemRef : MemRefOf<[I1]>; diff --git a/third_party/mlir/include/mlir/IR/StandardTypes.h b/third_party/mlir/include/mlir/IR/StandardTypes.h index f19c2d276fd..23a1ff2177e 100644 --- a/third_party/mlir/include/mlir/IR/StandardTypes.h +++ b/third_party/mlir/include/mlir/IR/StandardTypes.h @@ -40,6 +40,7 @@ struct VectorTypeStorage; struct RankedTensorTypeStorage; struct UnrankedTensorTypeStorage; struct MemRefTypeStorage; +struct UnrankedMemRefTypeStorage; struct ComplexTypeStorage; struct TupleTypeStorage; @@ -64,6 +65,7 @@ enum Kind { RankedTensor, UnrankedTensor, MemRef, + UnrankedMemRef, Complex, Tuple, None, @@ -243,6 +245,7 @@ public: return type.getKind() == StandardTypes::Vector || type.getKind() == StandardTypes::RankedTensor || type.getKind() == StandardTypes::UnrankedTensor || + type.getKind() == StandardTypes::UnrankedMemRef || type.getKind() == StandardTypes::MemRef; } @@ -370,12 +373,24 @@ public: } }; +/// Base MemRef for Ranked and Unranked variants +class BaseMemRefType : public ShapedType { +public: + using ShapedType::ShapedType; + + /// Methods for support type inquiry through isa, cast, and dyn_cast. + static bool classof(Type type) { + return type.getKind() == StandardTypes::MemRef || + type.getKind() == StandardTypes::UnrankedMemRef; + } +}; + /// MemRef types represent a region of memory that have a shape with a fixed /// number of dimensions. Each shape element can be a non-negative integer or /// unknown (represented by any negative integer). MemRef types also have an /// affine map composition, represented as an array AffineMap pointers. -class MemRefType - : public Type::TypeBase { +class MemRefType : public Type::TypeBase { public: using Base::Base; @@ -426,6 +441,40 @@ private: using Base::getImpl; }; +/// Unranked MemRef type represent multi-dimensional MemRefs that +/// have an unknown rank. 
+class UnrankedMemRefType + : public Type::TypeBase { +public: + using Base::Base; + + /// Get or create a new UnrankedMemRefType of the provided element + /// type and memory space + static UnrankedMemRefType get(Type elementType, unsigned memorySpace); + + /// Get or create a new UnrankedMemRefType of the provided element + /// type and memory space declared at the given, potentially unknown, + /// location. If the UnrankedMemRefType defined by the arguments would be + /// ill-formed, emit errors and return a nullptr-wrapping type. + static UnrankedMemRefType getChecked(Type elementType, unsigned memorySpace, + Location location); + + /// Verify the construction of a unranked memref type. + static LogicalResult + verifyConstructionInvariants(llvm::Optional loc, + MLIRContext *context, Type elementType, + unsigned memorySpace); + + ArrayRef getShape() const { return llvm::None; } + + /// Returns the memory space in which data referred to by this memref resides. + unsigned getMemorySpace() const; + static bool kindof(unsigned kind) { + return kind == StandardTypes::UnrankedMemRef; + } +}; + /// Tuple types represent a collection of other types. Note: This type merely /// provides a common mechanism for representing tuples in MLIR. It is up to /// dialect authors to provides operations for manipulating them, e.g. diff --git a/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp b/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp index 23c7be310a9..5a6282e8d4d 100644 --- a/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp +++ b/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp @@ -193,6 +193,22 @@ Type LLVMTypeConverter::convertMemRefType(MemRefType type) { return LLVM::LLVMType::getStructTy(ptrTy, ptrTy, indexTy); } +// Converts UnrankedMemRefType to LLVMType. The result is a descriptor which +// contains: +// 1. int64_t rank, the dynamic rank of this MemRef +// 2. void* ptr, pointer to the static ranked MemRef descriptor. This will be +// stack allocated (alloca) copy of a MemRef descriptor that got casted to +// be unranked. + +static constexpr unsigned kRankInUnrankedMemRefDescriptor = 0; +static constexpr unsigned kPtrInUnrankedMemRefDescriptor = 1; + +Type LLVMTypeConverter::convertUnrankedMemRefType(UnrankedMemRefType type) { + auto rankTy = LLVM::LLVMType::getInt64Ty(llvmDialect); + auto ptrTy = LLVM::LLVMType::getInt8PtrTy(llvmDialect); + return LLVM::LLVMType::getStructTy(rankTy, ptrTy); +} + // Convert an n-D vector type to an LLVM vector type via (n-1)-D array type when // n > 1. 
// For example, `vector<4 x f32>` converts to `!llvm.type<"<4 x float>">` and @@ -221,6 +237,8 @@ Type LLVMTypeConverter::convertStandardType(Type type) { return convertIndexType(indexType); if (auto memRefType = type.dyn_cast()) return convertMemRefType(memRefType); + if (auto memRefType = type.dyn_cast()) + return convertUnrankedMemRefType(memRefType); if (auto vectorType = type.dyn_cast()) return convertVectorType(vectorType); if (auto llvmType = type.dyn_cast()) @@ -245,22 +263,42 @@ LLVMOpLowering::LLVMOpLowering(StringRef rootOpName, MLIRContext *context, PatternBenefit benefit) : ConversionPattern(rootOpName, benefit, context), lowering(lowering_) {} +/*============================================================================*/ +/* StructBuilder implementation */ +/*============================================================================*/ +StructBuilder::StructBuilder(Value *v) : value(v) { + assert(value != nullptr && "value cannot be null"); + structType = value->getType().cast(); +} + +Value *StructBuilder::extractPtr(OpBuilder &builder, Location loc, + unsigned pos) { + Type type = structType.cast().getStructElementType(pos); + return builder.create(loc, type, value, + builder.getI64ArrayAttr(pos)); +} + +void StructBuilder::setPtr(OpBuilder &builder, Location loc, unsigned pos, + Value *ptr) { + value = builder.create(loc, structType, value, ptr, + builder.getI64ArrayAttr(pos)); +} /*============================================================================*/ /* MemRefDescriptor implementation */ /*============================================================================*/ /// Construct a helper for the given descriptor value. -MemRefDescriptor::MemRefDescriptor(Value *descriptor) : value(descriptor) { - if (value) { - structType = value->getType().cast(); - indexType = value->getType().cast().getStructElementType( - kOffsetPosInMemRefDescriptor); - } +MemRefDescriptor::MemRefDescriptor(Value *descriptor) + : StructBuilder(descriptor) { + assert(value != nullptr && "value cannot be null"); + indexType = value->getType().cast().getStructElementType( + kOffsetPosInMemRefDescriptor); } /// Builds IR creating an `undef` value of the descriptor type. MemRefDescriptor MemRefDescriptor::undef(OpBuilder &builder, Location loc, Type descriptorType) { + Value *descriptor = builder.create(loc, descriptorType.cast()); return MemRefDescriptor(descriptor); @@ -334,24 +372,42 @@ void MemRefDescriptor::setStride(OpBuilder &builder, Location loc, unsigned pos, builder.getI64ArrayAttr({kStridePosInMemRefDescriptor, pos})); } -Value *MemRefDescriptor::extractPtr(OpBuilder &builder, Location loc, - unsigned pos) { - Type type = structType.cast().getStructElementType(pos); - return builder.create(loc, type, value, - builder.getI64ArrayAttr(pos)); -} - -void MemRefDescriptor::setPtr(OpBuilder &builder, Location loc, unsigned pos, - Value *ptr) { - value = builder.create(loc, structType, value, ptr, - builder.getI64ArrayAttr(pos)); -} - LLVM::LLVMType MemRefDescriptor::getElementType() { return value->getType().cast().getStructElementType( kAlignedPtrPosInMemRefDescriptor); } +/*============================================================================*/ +/* UnrankedMemRefDescriptor implementation */ +/*============================================================================*/ + +/// Construct a helper for the given descriptor value. 
+UnrankedMemRefDescriptor::UnrankedMemRefDescriptor(Value *descriptor) + : StructBuilder(descriptor) {} + +/// Builds IR creating an `undef` value of the descriptor type. +UnrankedMemRefDescriptor UnrankedMemRefDescriptor::undef(OpBuilder &builder, + Location loc, + Type descriptorType) { + Value *descriptor = + builder.create(loc, descriptorType.cast()); + return UnrankedMemRefDescriptor(descriptor); +} +Value *UnrankedMemRefDescriptor::rank(OpBuilder &builder, Location loc) { + return extractPtr(builder, loc, kRankInUnrankedMemRefDescriptor); +} +void UnrankedMemRefDescriptor::setRank(OpBuilder &builder, Location loc, + Value *v) { + setPtr(builder, loc, kRankInUnrankedMemRefDescriptor, v); +} +Value *UnrankedMemRefDescriptor::memRefDescPtr(OpBuilder &builder, + Location loc) { + return extractPtr(builder, loc, kPtrInUnrankedMemRefDescriptor); +} +void UnrankedMemRefDescriptor::setMemRefDescPtr(OpBuilder &builder, + Location loc, Value *v) { + setPtr(builder, loc, kPtrInUnrankedMemRefDescriptor, v); +} namespace { // Base class for Standard to LLVM IR op conversions. Matches the Op type // provided as template argument. Carries a reference to the LLVM dialect in @@ -432,7 +488,7 @@ struct FuncOpConversion : public LLVMLegalizationPattern { auto converted = lowering.convertType(t).dyn_cast(); if (!converted) return matchFailure(); - if (t.isa()) { + if (t.isa() || t.isa()) { converted = converted.getPointerTo(); promotedArgIndices.push_back(en.index()); } @@ -983,6 +1039,14 @@ struct CallOpInterfaceLowering : public LLVMLegalizationPattern { Type packedResult; unsigned numResults = callOp.getNumResults(); auto resultTypes = llvm::to_vector<4>(callOp.getResultTypes()); + + for (Type resType : resultTypes) { + assert(!resType.isa() && + "Returning unranked memref is not supported. Pass result as an" + "argument instead."); + (void)resType; + } + if (numResults != 0) { if (!(packedResult = this->lowering.packFunctionResults(resultTypes))) return this->matchFailure(); @@ -1076,11 +1140,26 @@ struct MemRefCastOpLowering : public LLVMLegalizationPattern { PatternMatchResult match(Operation *op) const override { auto memRefCastOp = cast(op); - MemRefType sourceType = - memRefCastOp.getOperand()->getType().cast(); - MemRefType targetType = memRefCastOp.getType(); - return (isSupportedMemRefType(targetType) && - isSupportedMemRefType(sourceType)) + Type srcType = memRefCastOp.getOperand()->getType(); + Type dstType = memRefCastOp.getType(); + + if (srcType.isa() && dstType.isa()) { + MemRefType sourceType = + memRefCastOp.getOperand()->getType().cast(); + MemRefType targetType = memRefCastOp.getType().cast(); + return (isSupportedMemRefType(targetType) && + isSupportedMemRefType(sourceType)) + ? matchSuccess() + : matchFailure(); + } + + // At least one of the operands is unranked type + assert(srcType.isa() || + dstType.isa()); + + // Unranked to unranked cast is disallowed + return !(srcType.isa() && + dstType.isa()) ? matchSuccess() : matchFailure(); } @@ -1089,12 +1168,65 @@ struct MemRefCastOpLowering : public LLVMLegalizationPattern { ConversionPatternRewriter &rewriter) const override { auto memRefCastOp = cast(op); OperandAdaptor transformed(operands); - // memref_cast is defined for source and destination memref types with the - // same element type, same mappings, same address space and same rank. - // Therefore a simple bitcast suffices. If not it is undefined behavior. 
+ + auto srcType = memRefCastOp.getOperand()->getType(); + auto dstType = memRefCastOp.getType(); auto targetStructType = lowering.convertType(memRefCastOp.getType()); - rewriter.replaceOpWithNewOp(op, targetStructType, - transformed.source()); + auto loc = op->getLoc(); + + if (srcType.isa() && dstType.isa()) { + // memref_cast is defined for source and destination memref types with the + // same element type, same mappings, same address space and same rank. + // Therefore a simple bitcast suffices. If not it is undefined behavior. + rewriter.replaceOpWithNewOp(op, targetStructType, + transformed.source()); + } else if (srcType.isa() && dstType.isa()) { + // Casting ranked to unranked memref type + // Set the rank in the destination from the memref type + // Allocate space on the stack and copy the src memref decsriptor + // Set the ptr in the destination to the stack space + auto srcMemRefType = srcType.cast(); + int64_t rank = srcMemRefType.getRank(); + // ptr = AllocaOp sizeof(MemRefDescriptor) + auto ptr = lowering.promoteOneMemRefDescriptor(loc, transformed.source(), + rewriter); + // voidptr = BitCastOp srcType* to void* + auto voidPtr = + rewriter.create(loc, getVoidPtrType(), ptr) + .getResult(); + // rank = ConstantOp srcRank + auto rankVal = rewriter.create( + loc, lowering.convertType(rewriter.getIntegerType(64)), + rewriter.getI64IntegerAttr(rank)); + // undef = UndefOp + UnrankedMemRefDescriptor memRefDesc = + UnrankedMemRefDescriptor::undef(rewriter, loc, targetStructType); + // d1 = InsertValueOp undef, rank, 0 + memRefDesc.setRank(rewriter, loc, rankVal); + // d2 = InsertValueOp d1, voidptr, 1 + memRefDesc.setMemRefDescPtr(rewriter, loc, voidPtr); + rewriter.replaceOp(op, (Value *)memRefDesc); + + } else if (srcType.isa() && dstType.isa()) { + // Casting from unranked type to ranked. + // The operation is assumed to be doing a correct cast. If the destination + // type mismatches the unranked the type, it is undefined behavior. 
+ UnrankedMemRefDescriptor memRefDesc(transformed.source()); + // ptr = ExtractValueOp src, 1 + auto ptr = memRefDesc.memRefDescPtr(rewriter, loc); + // castPtr = BitCastOp i8* to structTy* + auto castPtr = + rewriter + .create( + loc, targetStructType.cast().getPointerTo(), + ptr) + .getResult(); + // struct = LoadOp castPtr + auto loadOp = rewriter.create(loc, castPtr); + rewriter.replaceOp(op, loadOp.getResult()); + } else { + llvm_unreachable("Unsuppored unranked memref to unranked memref cast"); + } } }; @@ -1896,7 +2028,8 @@ SmallVector LLVMTypeConverter::promoteMemRefDescriptors( for (auto it : llvm::zip(opOperands, operands)) { auto *operand = std::get<0>(it); auto *llvmOperand = std::get<1>(it); - if (!operand->getType().isa()) { + if (!operand->getType().isa() && + !operand->getType().isa()) { promotedOperands.push_back(operand); continue; } diff --git a/third_party/mlir/lib/Dialect/StandardOps/Ops.cpp b/third_party/mlir/lib/Dialect/StandardOps/Ops.cpp index 9f6510d0f17..0e2bee063a8 100644 --- a/third_party/mlir/lib/Dialect/StandardOps/Ops.cpp +++ b/third_party/mlir/lib/Dialect/StandardOps/Ops.cpp @@ -1769,46 +1769,70 @@ bool MemRefCastOp::areCastCompatible(Type a, Type b) { auto aT = a.dyn_cast(); auto bT = b.dyn_cast(); - if (!aT || !bT) - return false; - if (aT.getElementType() != bT.getElementType()) - return false; - if (aT.getAffineMaps() != bT.getAffineMaps()) { - int64_t aOffset, bOffset; - SmallVector aStrides, bStrides; - if (failed(getStridesAndOffset(aT, aStrides, aOffset)) || - failed(getStridesAndOffset(bT, bStrides, bOffset)) || - aStrides.size() != bStrides.size()) - return false; + auto uaT = a.dyn_cast(); + auto ubT = b.dyn_cast(); - // Strides along a dimension/offset are compatible if the value in the - // source memref is static and the value in the target memref is the - // same. They are also compatible if either one is dynamic (see description - // of MemRefCastOp for details). - auto checkCompatible = [](int64_t a, int64_t b) { - return (a == MemRefType::getDynamicStrideOrOffset() || - b == MemRefType::getDynamicStrideOrOffset() || a == b); - }; - if (!checkCompatible(aOffset, bOffset)) + if (aT && bT) { + if (aT.getElementType() != bT.getElementType()) return false; - for (auto aStride : enumerate(aStrides)) - if (!checkCompatible(aStride.value(), bStrides[aStride.index()])) + if (aT.getAffineMaps() != bT.getAffineMaps()) { + int64_t aOffset, bOffset; + SmallVector aStrides, bStrides; + if (failed(getStridesAndOffset(aT, aStrides, aOffset)) || + failed(getStridesAndOffset(bT, bStrides, bOffset)) || + aStrides.size() != bStrides.size()) return false; - } - if (aT.getMemorySpace() != bT.getMemorySpace()) - return false; - // They must have the same rank, and any specified dimensions must match. - if (aT.getRank() != bT.getRank()) - return false; - - for (unsigned i = 0, e = aT.getRank(); i != e; ++i) { - int64_t aDim = aT.getDimSize(i), bDim = bT.getDimSize(i); - if (aDim != -1 && bDim != -1 && aDim != bDim) + // Strides along a dimension/offset are compatible if the value in the + // source memref is static and the value in the target memref is the + // same. They are also compatible if either one is dynamic (see + // description of MemRefCastOp for details). 
+ auto checkCompatible = [](int64_t a, int64_t b) { + return (a == MemRefType::getDynamicStrideOrOffset() || + b == MemRefType::getDynamicStrideOrOffset() || a == b); + }; + if (!checkCompatible(aOffset, bOffset)) + return false; + for (auto aStride : enumerate(aStrides)) + if (!checkCompatible(aStride.value(), bStrides[aStride.index()])) + return false; + } + if (aT.getMemorySpace() != bT.getMemorySpace()) return false; + + // They must have the same rank, and any specified dimensions must match. + if (aT.getRank() != bT.getRank()) + return false; + + for (unsigned i = 0, e = aT.getRank(); i != e; ++i) { + int64_t aDim = aT.getDimSize(i), bDim = bT.getDimSize(i); + if (aDim != -1 && bDim != -1 && aDim != bDim) + return false; + } + return true; + } else { + if (!aT && !uaT) + return false; + if (!bT && !ubT) + return false; + // Unranked to unranked casting is unsupported + if (uaT && ubT) + return false; + + auto aEltType = (aT) ? aT.getElementType() : uaT.getElementType(); + auto bEltType = (bT) ? bT.getElementType() : ubT.getElementType(); + if (aEltType != bEltType) + return false; + + auto aMemSpace = (aT) ? aT.getMemorySpace() : uaT.getMemorySpace(); + auto bMemSpace = (bT) ? bT.getMemorySpace() : ubT.getMemorySpace(); + if (aMemSpace != bMemSpace) + return false; + + return true; } - return true; + return false; } OpFoldResult MemRefCastOp::fold(ArrayRef operands) { diff --git a/third_party/mlir/lib/IR/AsmPrinter.cpp b/third_party/mlir/lib/IR/AsmPrinter.cpp index 1d3f9d74403..a3a15dac533 100644 --- a/third_party/mlir/lib/IR/AsmPrinter.cpp +++ b/third_party/mlir/lib/IR/AsmPrinter.cpp @@ -1086,6 +1086,13 @@ void ModulePrinter::printType(Type type) { os << '>'; return; } + case StandardTypes::UnrankedMemRef: { + auto v = type.cast(); + os << "memref<*x"; + printType(v.getElementType()); + os << '>'; + return; + } case StandardTypes::Complex: os << "complex<"; printType(type.cast().getElementType()); diff --git a/third_party/mlir/lib/IR/MLIRContext.cpp b/third_party/mlir/lib/IR/MLIRContext.cpp index be904f8da44..d3feca14477 100644 --- a/third_party/mlir/lib/IR/MLIRContext.cpp +++ b/third_party/mlir/lib/IR/MLIRContext.cpp @@ -90,8 +90,8 @@ struct BuiltinDialect : public Dialect { UnknownLoc>(); addTypes(); + MemRefType, UnrankedMemRefType, NoneType, OpaqueType, + RankedTensorType, TupleType, UnrankedTensorType, VectorType>(); // TODO: These operations should be moved to a different dialect when they // have been fully decoupled from the core. 
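To make the unranked calling convention above concrete, here is a minimal C++ sketch of an external library function consuming the descriptor that this patch lowers `memref<*xf32>` to. It is an illustration rather than part of the change: the type and function names are made up, and it only assumes the layouts documented above ({i64, i8*} for the unranked descriptor, {T*, T*, i64, [rank x i64], [rank x i64]} for the ranked one) plus a 64-bit target with no struct padding.

#include <cstdint>
#include <cstdio>

// Unranked side: dynamic rank plus a pointer to a ranked descriptor.
struct UnrankedF32Descriptor {
  int64_t rank;
  void *ranked;  // points at the stack-allocated ranked descriptor
};

// Fixed-size head of the ranked descriptor; in memory it is followed by
// int64_t sizes[rank] and then int64_t strides[rank].
struct RankedF32DescriptorHead {
  float *allocated;
  float *aligned;
  int64_t offset;
};

// A rank-agnostic entry point such a library could export: it reads the rank,
// then walks the sizes stored right after the ranked descriptor head.
extern "C" void print_memref_shape_f32(UnrankedF32Descriptor *arg) {
  const auto *head = static_cast<const RankedF32DescriptorHead *>(arg->ranked);
  const auto *sizes = reinterpret_cast<const int64_t *>(head + 1);
  std::printf("rank = %lld, sizes =", static_cast<long long>(arg->rank));
  for (int64_t i = 0; i < arg->rank; ++i)
    std::printf(" %lld", static_cast<long long>(sizes[i]));
  std::printf("\n");
}

With a shape-agnostic signature like this, the two memref_cast-to-memref<*xf32> call sites in the LangRef example above can share a single callee instead of needing one helper per rank.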
diff --git a/third_party/mlir/lib/IR/StandardTypes.cpp b/third_party/mlir/lib/IR/StandardTypes.cpp index 8a4b51f215a..7054f6d5ca8 100644 --- a/third_party/mlir/lib/IR/StandardTypes.cpp +++ b/third_party/mlir/lib/IR/StandardTypes.cpp @@ -390,6 +390,37 @@ ArrayRef MemRefType::getAffineMaps() const { unsigned MemRefType::getMemorySpace() const { return getImpl()->memorySpace; } +//===----------------------------------------------------------------------===// +// UnrankedMemRefType +//===----------------------------------------------------------------------===// + +UnrankedMemRefType UnrankedMemRefType::get(Type elementType, + unsigned memorySpace) { + return Base::get(elementType.getContext(), StandardTypes::UnrankedMemRef, + elementType, memorySpace); +} + +UnrankedMemRefType UnrankedMemRefType::getChecked(Type elementType, + unsigned memorySpace, + Location location) { + return Base::getChecked(location, elementType.getContext(), + StandardTypes::UnrankedMemRef, elementType, + memorySpace); +} + +unsigned UnrankedMemRefType::getMemorySpace() const { + return getImpl()->memorySpace; +} + +LogicalResult UnrankedMemRefType::verifyConstructionInvariants( + llvm::Optional loc, MLIRContext *context, Type elementType, + unsigned memorySpace) { + // Check that memref is formed from allowed types. + if (!elementType.isIntOrFloat() && !elementType.isa()) + return emitOptionalError(*loc, "invalid memref element type"); + return success(); +} + /// Given MemRef `sizes` that are either static or dynamic, returns the /// canonical "contiguous" strides AffineExpr. Strides are multiplicative and /// once a dynamic dimension is encountered, all canonical strides become diff --git a/third_party/mlir/lib/IR/TypeDetail.h b/third_party/mlir/lib/IR/TypeDetail.h index 1cf0f5a7190..5bcb0b61aa5 100644 --- a/third_party/mlir/lib/IR/TypeDetail.h +++ b/third_party/mlir/lib/IR/TypeDetail.h @@ -119,8 +119,8 @@ struct FunctionTypeStorage : public TypeStorage { /// Shaped Type Storage. struct ShapedTypeStorage : public TypeStorage { - ShapedTypeStorage(Type elementType, unsigned subclassData = 0) - : TypeStorage(subclassData), elementType(elementType) {} + ShapedTypeStorage(Type elementTy, unsigned subclassData = 0) + : TypeStorage(subclassData), elementType(elementTy) {} /// The hash key used for uniquing. using KeyTy = Type; @@ -252,6 +252,31 @@ struct MemRefTypeStorage : public ShapedTypeStorage { const unsigned memorySpace; }; +/// Unranked MemRef is a MemRef with unknown rank. +/// Only element type and memory space are known +struct UnrankedMemRefTypeStorage : public ShapedTypeStorage { + + UnrankedMemRefTypeStorage(Type elementTy, const unsigned memorySpace) + : ShapedTypeStorage(elementTy), memorySpace(memorySpace) {} + + /// The hash key used for uniquing. + using KeyTy = std::tuple; + bool operator==(const KeyTy &key) const { + return key == KeyTy(elementType, memorySpace); + } + + /// Construction. + static UnrankedMemRefTypeStorage *construct(TypeStorageAllocator &allocator, + const KeyTy &key) { + + // Initialize the memory using placement new. + return new (allocator.allocate()) + UnrankedMemRefTypeStorage(std::get<0>(key), std::get<1>(key)); + } + /// Memory space in which data referenced by memref resides. + const unsigned memorySpace; +}; + /// Complex Type Storage. 
struct ComplexTypeStorage : public TypeStorage { ComplexTypeStorage(Type elementType) : elementType(elementType) {} diff --git a/third_party/mlir/lib/Parser/Parser.cpp b/third_party/mlir/lib/Parser/Parser.cpp index 3ccee376985..ddc8d0191f5 100644 --- a/third_party/mlir/lib/Parser/Parser.cpp +++ b/third_party/mlir/lib/Parser/Parser.cpp @@ -1054,8 +1054,13 @@ ParseResult Parser::parseStridedLayout(int64_t &offset, /// Parse a memref type. /// -/// memref-type ::= `memref` `<` dimension-list-ranked type -/// (`,` semi-affine-map-composition)? (`,` memory-space)? `>` +/// memref-type ::= ranked-memref-type | unranked-memref-type +/// +/// ranked-memref-type ::= `memref` `<` dimension-list-ranked type +/// (`,` semi-affine-map-composition)? (`,` +/// memory-space)? `>` +/// +/// unranked-memref-type ::= `memref` `<*x` type (`,` memory-space)? `>` /// /// semi-affine-map-composition ::= (semi-affine-map `,` )* semi-affine-map /// memory-space ::= integer-literal /* | TODO: address-space-id */ @@ -1066,9 +1071,20 @@ Type Parser::parseMemRefType() { if (parseToken(Token::less, "expected '<' in memref type")) return nullptr; + bool isUnranked; SmallVector dimensions; - if (parseDimensionListRanked(dimensions)) - return nullptr; + + if (consumeIf(Token::star)) { + // This is an unranked memref type. + isUnranked = true; + if (parseXInDimensionList()) + return nullptr; + + } else { + isUnranked = false; + if (parseDimensionListRanked(dimensions)) + return nullptr; + } // Parse the element type. auto typeLoc = getToken().getLoc(); @@ -1093,6 +1109,8 @@ Type Parser::parseMemRefType() { consumeToken(Token::integer); parsedMemorySpace = true; } else { + if (isUnranked) + return emitError("cannot have affine map for unranked memref type"); if (parsedMemorySpace) return emitError("expected memory space to be last in memref type"); if (getToken().is(Token::kw_offset)) { @@ -1131,6 +1149,10 @@ Type Parser::parseMemRefType() { return nullptr; } + if (isUnranked) + return UnrankedMemRefType::getChecked(elementType, memorySpace, + getEncodedSourceLocation(typeLoc)); + return MemRefType::getChecked(dimensions, elementType, affineMapComposition, memorySpace, getEncodedSourceLocation(typeLoc)); } From 8af6ab559762c0de1d2c1f1bfb90b0c8e03c1452 Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Thu, 5 Dec 2019 13:14:40 -0800 Subject: [PATCH 191/383] Docstring fixes for initializers - Removes some 1.x references - Adds examples - Adds a description of what an initializer is for PiperOrigin-RevId: 284037370 Change-Id: I7ad061f6c53d210e97cede28aceb6428a001858a --- tensorflow/python/ops/init_ops_v2.py | 208 ++++++++++++++++++++------- 1 file changed, 153 insertions(+), 55 deletions(-) diff --git a/tensorflow/python/ops/init_ops_v2.py b/tensorflow/python/ops/init_ops_v2.py index c490921a285..9c403d6a7b4 100644 --- a/tensorflow/python/ops/init_ops_v2.py +++ b/tensorflow/python/ops/init_ops_v2.py @@ -91,16 +91,68 @@ class Initializer(object): @tf_export("zeros_initializer", v1=[]) class Zeros(Initializer): - """Initializer that generates tensors initialized to 0.""" + """Initializer that generates tensors initialized to 0. + + Initializers allow you to pre-specify an initialization strategy, encoded in + the Initializer object, without knowing the shape and dtype of the variable + being initialized. + + Examples: + + >>> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k], dtype=tf.float32)), + ... 
tf.Variable(initializer(shape=[k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.zeros_initializer()) + >>> v1 + + >>> v2 + + >>> make_variables(4, tf.random_uniform_initializer(minval=-1., maxval=1.)) + (, >> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.ones_initializer()) + >>> v1 + + >>> v2 + + >>> make_variables(4, tf.random_uniform_initializer(minval=-1., maxval=1.)) + (, >> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.constant_initializer(2.)) + >>> v1 + + >>> v2 + + >>> make_variables(4, tf.random_uniform_initializer(minval=-1., maxval=1.)) + (, >> value = [0, 1, 2, 3, 4, 5, 6, 7] + >>> init = tf.constant_initializer(value) + >>> # Fitting shape + >>> tf.Variable(init(shape=[2, 4], dtype=tf.float32)) + + >>> # Larger shape + >>> tf.Variable(init(shape=[3, 4], dtype=tf.float32)) + Traceback (most recent call last): + ... + TypeError: ...value has 8 elements, shape is (3, 4) with 12 elements... + >>> # Smaller shape + >>> tf.Variable(init(shape=[2, 3], dtype=tf.float32)) + Traceback (most recent call last): + ... + TypeError: ...value has 8 elements, shape is (2, 3) with 6 elements... Args: value: A Python scalar, list or tuple of values, or a N-dimensional numpy @@ -143,36 +233,6 @@ class Constant(Initializer): Raises: TypeError: If the input `value` is not one of the expected types. - - Examples: - The following example can be rewritten using a numpy.ndarray instead - of the `value` list, even reshaped, as shown in the two commented lines - below the `value` list initialization. - - >>> value = [0, 1, 2, 3, 4, 5, 6, 7] - >>> init = tf.compat.v1.constant_initializer(value) - >>> # Fitting shape - >>> with tf.compat.v1.Session(): - ... x = tf.compat.v1.get_variable('x', shape=[2, 4], initializer=init) - ... x.initializer.run() - ... print(x.eval()) - [[0. 1. 2. 3.] - [4. 5. 6. 7.]] - >>> # Larger shape - >>> with tf.compat.v1.Session(): - ... y = tf.compat.v1.get_variable('y', shape=[3, 4], initializer=init) - ... y.initializer.run() - ... print(y.eval()) - [[0. 1. 2. 3.] - [4. 5. 6. 7.] - [7. 7. 7. 7.]] - >>> # Smaller shape - >>> with tf.compat.v1.Session(): - ... z = tf.compat.v1.get_variable('z', shape=[2, 3], initializer=init) - Traceback (most recent call last): - ... - ValueError: Too many elements provided. Needed at most 6, but received 8 - """ def __init__(self, value=0): @@ -207,14 +267,33 @@ class Constant(Initializer): class RandomUniform(Initializer): """Initializer that generates tensors with a uniform distribution. + Initializers allow you to pre-specify an initialization strategy, encoded in + the Initializer object, without knowing the shape and dtype of the variable + being initialized. + + Examples: + + >>> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.ones_initializer()) + >>> v1 + + >>> v2 + + >>> make_variables(4, tf.random_uniform_initializer(minval=-1., maxval=1.)) + (, >> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, + ... 
tf.random_normal_initializer(mean=1., stddev=2.)) + >>> v1 + + >>> v2 + >> make_variables(4, tf.random_uniform_initializer(minval=-1., maxval=1.)) + (, Date: Thu, 5 Dec 2019 13:19:17 -0800 Subject: [PATCH 192/383] [tfdbg] Add CurtHealth mode to DebugNumericSummaryV2Op. - In order to mitigate overflow constraints, we add an optional attribute to DebugNumericSummaryV3Op to set the output to float64. Note float64 is now supported on TPUs in addition to the other devices. Additionally, the attribute is set to float32 by default for backwards compatibility. - The TensorDebugMode added is CURT_HEALTH, a mode that computes a shape-[2] rank-1 tensor given any float-type tensor. The first element is the id of the tensor. The second element is a bit which is set to 1 if the tensor contains any NaN or inf, and 0 otherwise. - The CPU and GPU kernels of the op are added. PiperOrigin-RevId: 284038207 Change-Id: Ifdc700e742ecf86012a9e67cd517a0f9642e6579 --- .../api_def_DebugNumericSummaryV2.pbtxt | 54 +++++- tensorflow/core/kernels/debug_ops.cc | 64 +++++-- tensorflow/core/kernels/debug_ops.h | 160 +++++++++++++----- tensorflow/core/kernels/debug_ops_gpu.cu.cc | 79 +++++++-- tensorflow/core/ops/debug_ops.cc | 11 +- .../python/debug/lib/debug_v2_ops_test.py | 126 +++++++++++++- .../api/golden/v1/tensorflow.raw_ops.pbtxt | 2 +- .../api/golden/v2/tensorflow.raw_ops.pbtxt | 2 +- 8 files changed, 412 insertions(+), 86 deletions(-) diff --git a/tensorflow/core/api_def/base_api/api_def_DebugNumericSummaryV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_DebugNumericSummaryV2.pbtxt index c9097723057..28f0271c7e8 100644 --- a/tensorflow/core/api_def/base_api/api_def_DebugNumericSummaryV2.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_DebugNumericSummaryV2.pbtxt @@ -15,17 +15,67 @@ Tensor debug mode: the mode in which the input tensor is summarized tensorflow/core/protobuf/debug_event.proto for details. Supported values: - 8 (REDUCE_INF_NAN_THREE_SLOTS): Output a float32 tensor of shape + 2 (CURT_HEALTH): Output a float32/64 tensor of shape [2]. The 1st + element is the tensor_id, if provided, and -1 otherwise. The 2nd + element is a bit which is set to 1 if the input tensor has an + infinity or nan value, or zero otherwise. + + 3 (CONCISE_HEALTH): Ouput a float32/64 tensor of shape [5]. The 1st + element is the tensor_id, if provided, and -1 otherwise. The + remaining four slots are the total number of elements, -infs, + +infs, and nans in the input tensor respectively. + + 4 (FULL_HEALTH): Output a float32/64 tensor of shape [11]. The 1st + element is the tensor_id, if provided, and -1 otherwise. The 2nd + element is the device_id, if provided, and -1 otherwise. The 3rd + element holds the datatype value of the input tensor as according + to the enumerated type in tensorflow/core/framework/types.proto. + The remaining elements hold the total number of elements, -infs, + +infs, nans, negative finite numbers, zeros, and positive finite + numbers in the input tensor respectively. + + 5 (SHAPE): Output a float32/64 tensor of shape [10]. The 1st + element is the tensor_id, if provided, and -1 otherwise. The 2nd + element holds the datatype value of the input tensor as according + to the enumerated type in tensorflow/core/framework/types.proto. + The 3rd element holds the rank of the tensor. The 4th element holds + the number of elements within the tensor. Finally the remaining 6 + elements hold the shape of the tensor. If the rank of the tensor + is lower than 6, the shape is right padded with zeros. 
If the rank + is greater than 6, the head of the shape is truncated. + + 6 (FULL_NUMERICS): Output a float32/64 tensor of shape [22]. The 1st + element is the tensor_id, if provided, and -1 otherwise. The 2nd + element is the device_id, if provided, and -1 otherwise. The 3rd + element holds the datatype value of the input tensor as according + to the enumerated type in tensorflow/core/framework/types.proto. + The 4th element holds the rank of the tensor. The 5th to 11th + elements hold the shape of the tensor. If the rank of the tensor + is lower than 6, the shape is right padded with zeros. If the rank + is greater than 6, the head of the shape is truncated. The 12th to + 18th elements hold the number of elements, -infs, +infs, nans, + denormal floats, negative finite numbers, zeros, and positive + finite numbers in the input tensor respectively. The final four + elements hold the min value, max value, mean, and variance of the + input tensor. + + 8 (REDUCE_INF_NAN_THREE_SLOTS): Output a float32/64 tensor of shape [3]. The 1st element is -inf if any elements of the input tensor is -inf, or zero otherwise. The 2nd element is +inf if any elements of the input tensor is +inf, or zero otherwise. The 3rd element is - nan if any element of the input tensor is nan, or zero otherwise + nan if any element of the input tensor is nan, or zero otherwise. END } attr { name: "tensor_id" description: <("T"), \ - DebugNumericSummaryV2Op); -TF_CALL_half(REGISTER_DEBUG_NUMERIC_SUMMARY_V2); -TF_CALL_bfloat16(REGISTER_DEBUG_NUMERIC_SUMMARY_V2); -TF_CALL_float(REGISTER_DEBUG_NUMERIC_SUMMARY_V2); -TF_CALL_double(REGISTER_DEBUG_NUMERIC_SUMMARY_V2); +#define REGISTER_DEBUG_NUMERIC_SUMMARY_V2_FLOAT(type) \ + REGISTER_KERNEL_BUILDER(Name("DebugNumericSummaryV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("output_dtype"), \ + DebugNumericSummaryV2Op); +TF_CALL_half(REGISTER_DEBUG_NUMERIC_SUMMARY_V2_FLOAT); +TF_CALL_bfloat16(REGISTER_DEBUG_NUMERIC_SUMMARY_V2_FLOAT); +TF_CALL_float(REGISTER_DEBUG_NUMERIC_SUMMARY_V2_FLOAT); +TF_CALL_double(REGISTER_DEBUG_NUMERIC_SUMMARY_V2_FLOAT); + +#define REGISTER_DEBUG_NUMERIC_SUMMARY_V2_DOUBLE(type) \ + REGISTER_KERNEL_BUILDER(Name("DebugNumericSummaryV2") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .TypeConstraint("output_dtype"), \ + DebugNumericSummaryV2Op); +TF_CALL_half(REGISTER_DEBUG_NUMERIC_SUMMARY_V2_DOUBLE); +TF_CALL_bfloat16(REGISTER_DEBUG_NUMERIC_SUMMARY_V2_DOUBLE); +TF_CALL_float(REGISTER_DEBUG_NUMERIC_SUMMARY_V2_DOUBLE); +TF_CALL_double(REGISTER_DEBUG_NUMERIC_SUMMARY_V2_DOUBLE); #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM REGISTER_KERNEL_BUILDER(Name("DebugNumericSummaryV2") .Device(DEVICE_GPU) - .TypeConstraint("T"), - DebugNumericSummaryV2Op); -REGISTER_KERNEL_BUILDER( - Name("DebugNumericSummaryV2").Device(DEVICE_GPU).TypeConstraint("T"), - DebugNumericSummaryV2Op); + .TypeConstraint("T") + .TypeConstraint("output_dtype"), + DebugNumericSummaryV2Op); REGISTER_KERNEL_BUILDER(Name("DebugNumericSummaryV2") .Device(DEVICE_GPU) - .TypeConstraint("T"), - DebugNumericSummaryV2Op); + .TypeConstraint("T") + .TypeConstraint("output_dtype"), + DebugNumericSummaryV2Op); +REGISTER_KERNEL_BUILDER(Name("DebugNumericSummaryV2") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .TypeConstraint("output_dtype"), + DebugNumericSummaryV2Op); +REGISTER_KERNEL_BUILDER( + Name("DebugNumericSummaryV2") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .TypeConstraint("output_dtype"), + DebugNumericSummaryV2Op); +REGISTER_KERNEL_BUILDER(Name("DebugNumericSummaryV2") 
+ .Device(DEVICE_GPU) + .TypeConstraint("T") + .TypeConstraint("output_dtype"), + DebugNumericSummaryV2Op); +REGISTER_KERNEL_BUILDER(Name("DebugNumericSummaryV2") + .Device(DEVICE_GPU) + .TypeConstraint("T") + .TypeConstraint("output_dtype"), + DebugNumericSummaryV2Op); #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace tensorflow diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 72a333dd1f0..31f5e1ca6de 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -455,22 +455,38 @@ typedef Eigen::ThreadPoolDevice CPUDevice; typedef Eigen::GpuDevice GPUDevice; #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -template -struct ReduceInfNanTwoSlotsLaunch { - void Run(const GPUDevice& d, const T* data, int size, float output[2]); +template +struct CurtHealthLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[1]); }; -extern template struct ReduceInfNanTwoSlotsLaunch; -extern template struct ReduceInfNanTwoSlotsLaunch; -extern template struct ReduceInfNanTwoSlotsLaunch; +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; +extern template struct CurtHealthLaunch; + +template +struct ReduceInfNanThreeSlotsLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]); +}; + +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; +extern template struct ReduceInfNanThreeSlotsLaunch; + #endif -template +template class DebugNumericSummaryV2Op; // Numeric summary op for tfdbg v2: CPU Kernel. -template -class DebugNumericSummaryV2Op : public OpKernel { +template +class DebugNumericSummaryV2Op : public OpKernel { public: explicit DebugNumericSummaryV2Op(OpKernelConstruction* context) : OpKernel(context) { @@ -481,27 +497,51 @@ class DebugNumericSummaryV2Op : public OpKernel { void Compute(OpKernelContext* context) override { const Tensor& tensor = context->input(0); + auto in = tensor.flat(); + const Tin* data = in.data(); + const int64 size = in.size(); + Tensor* output_tensor; + Tout tensor_id = static_cast(tensor_id_); + // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because + // that mode does not make use of tensor_id. + if (tensor_debug_mode_ != 8) { + OP_REQUIRES( + context, tensor_id_ <= kMaxTensorId, + errors::InvalidArgument("DebugNumericSummaryV2Op requires " + "tensor_id to be less than or equal to " + "(2^", + std::numeric_limits::digits, + "). Given tensor_id:", tensor_id_)); + } - if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. - auto in = tensor.flat(); - const T* data = in.data(); - const int64 size = in.size(); - - Tensor* output_tensor; + if (tensor_debug_mode_ == 2) { // CURT_HEALTH + TensorShape shape({2}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + output_tensor->flat()(0) = tensor_id; // Slot tensor id + output_tensor->flat()(1) = 0.0; // Has inf or nan + int fp_props = + std::accumulate(data, data + size, 0, [](const int x, const Tin& y) { + return Eigen::numext::isfinite(y) ? x : 1; + }); + if (fp_props) { + output_tensor->flat()(1) = 1.0; + } + } else if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. 
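For illustration (not part of the diff): the `kMaxTensorId` bound checked above exists because `tensor_id` is echoed back through a float32/float64 output slot, and every integer up to 2^53 (presumably `std::numeric_limits<double>::digits`, i.e. 53) is exactly representable in an IEEE-754 double while 2^53 + 1 is not. A quick check of the boundary that the Python tests later in this patch probe:

```python
max_tensor_id = 2 ** 53  # mirrors kMaxTensorId, assuming digits of a double (53)
print(float(max_tensor_id) == max_tensor_id)          # True: 2**53 round-trips exactly
print(float(max_tensor_id + 1) == max_tensor_id + 1)  # False: 2**53 + 1 rounds back to 2**53
```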
TensorShape shape({3}); OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor)); - output_tensor->flat()(0) = 0.0; // Slot for -inf. - output_tensor->flat()(1) = 0.0; // Slot for inf. - output_tensor->flat()(2) = 0.0; // Slot for nan. + output_tensor->flat()(0) = 0.0; // Slot for -inf. + output_tensor->flat()(1) = 0.0; // Slot for inf. + output_tensor->flat()(2) = 0.0; // Slot for nan. int fp_props = - std::accumulate(data, data + size, 0, [](const int x, const T& y) { + std::accumulate(data, data + size, 0, [](const int x, const Tin& y) { int result = x; if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) { // Do nothing: common case. } else if (Eigen::numext::isinf(y)) { - result |= y < static_cast(0.f) ? kNegInfBit : kPosInfBit; + result |= y < static_cast(0.f) ? kNegInfBit : kPosInfBit; } else if (Eigen::numext::isnan(y)) { result |= kNaNBit; } @@ -509,18 +549,14 @@ class DebugNumericSummaryV2Op : public OpKernel { }); if (fp_props & kNegInfBit) { - output_tensor->flat()(0) = - -std::numeric_limits::infinity(); + output_tensor->flat()(0) = -std::numeric_limits::infinity(); } if (fp_props & kPosInfBit) { - output_tensor->flat()(1) = - std::numeric_limits::infinity(); + output_tensor->flat()(1) = std::numeric_limits::infinity(); } if (fp_props & kNaNBit) { - output_tensor->flat()(2) = - std::numeric_limits::quiet_NaN(); + output_tensor->flat()(2) = std::numeric_limits::quiet_NaN(); } - } else { // TODO(cais): Implement other tensor debug modes in debug_event.proto. context->SetStatus(errors::Unimplemented( @@ -530,16 +566,17 @@ class DebugNumericSummaryV2Op : public OpKernel { private: int tensor_debug_mode_; - int tensor_id_; + int64 tensor_id_; static constexpr int kNegInfBit = 0x01; static constexpr int kPosInfBit = 0x02; static constexpr int kNaNBit = 0x04; + static constexpr int64 kMaxTensorId = 1L << std::numeric_limits::digits; }; #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM -template -class DebugNumericSummaryV2Op : public AsyncOpKernel { +template +class DebugNumericSummaryV2Op : public AsyncOpKernel { public: typedef GPUDevice Device; @@ -551,8 +588,52 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { } void ComputeAsync(OpKernelContext* context, DoneCallback done) override { - if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. - Tensor* output_tensor; + Tensor* output_tensor; + Tout tensor_id = static_cast(tensor_id_); + // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because + // that mode does not make use of tensor_id. + if (tensor_debug_mode_ != 8) { + OP_REQUIRES_ASYNC( + context, tensor_id_ <= kMaxTensorId, + errors::InvalidArgument("DebugNumericSummaryV2Op requires " + "tensor_id to be less than or equal to " + "(2^", + std::numeric_limits::digits, + "). Given tensor_id:", tensor_id_), + done); + } + + if (tensor_debug_mode_ == 2) { // CURT_HEALTH. + TensorShape shape({2}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream != nullptr, + errors::Internal("No GPU stream available."), done); + + se::DeviceMemoryBase output_tensor_ptr( + output_tensor->flat().data(), + output_tensor->flat().size()); + stream->ThenMemZero(&output_tensor_ptr, 2 * sizeof(Tout)); + // Copy tensor_id to slot zero + stream->ThenMemcpy(&output_tensor_ptr, &tensor_id, sizeof(Tout)); + if (context->input(0).NumElements() == 0) { + done(); + return; + } + + // Call the GPU kernels for the numerical (inf/nan) checks. 
+ const Device& d = context->eigen_device(); + auto input = context->input(0).flat(); + CurtHealthLaunch().Run(d, input.data(), input.size(), + output_tensor->flat().data() + 1); + + auto check_cb = [this, done]() { done(); }; + + context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + stream, std::move(check_cb)); + } else if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. TensorShape shape({3}); OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor)); @@ -562,10 +643,10 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { errors::Internal("No GPU stream available."), done); se::DeviceMemoryBase output_tensor_ptr( - output_tensor->flat().data(), - output_tensor->flat().size()); + output_tensor->flat().data(), + output_tensor->flat().size()); stream->ThenMemset32(&output_tensor_ptr, 0, - output_tensor->flat().size() * sizeof(float)); + output_tensor->flat().size() * sizeof(Tout)); if (context->input(0).NumElements() == 0) { done(); return; @@ -573,9 +654,9 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { // Call the GPU kernels for the numerical (inf/nan) checks. const Device& d = context->eigen_device(); - auto input = context->input(0).flat(); - ReduceInfNanTwoSlotsLaunch().Run(d, input.data(), input.size(), - output_tensor->flat().data()); + auto input = context->input(0).flat(); + ReduceInfNanThreeSlotsLaunch().Run( + d, input.data(), input.size(), output_tensor->flat().data()); auto check_cb = [this, done]() { done(); }; @@ -591,7 +672,8 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { private: int tensor_debug_mode_; - int tensor_id_; + int64 tensor_id_; + static constexpr int64 kMaxTensorId = 1L << std::numeric_limits::digits; }; #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/debug_ops_gpu.cu.cc b/tensorflow/core/kernels/debug_ops_gpu.cu.cc index 42bca1ab59a..5597c12a5ad 100644 --- a/tensorflow/core/kernels/debug_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/debug_ops_gpu.cu.cc @@ -33,13 +33,32 @@ namespace { typedef Eigen::GpuDevice GPUDevice; -// A CUDA kernel that fills a length-2 vector according to whether any of the -// input data contains infinity or NaN. The first element is filled with -// infinity of any of the elements is +/- infinity. The second element is -// filled with NaN if any of the elements is NaN. -template -__global__ void ReduceInfNanTwoSlotsKernel(const T* __restrict__ data, int size, - float output[2]) { +// A CUDA kernel that fills the second element of a vector according +// to whether any of the input data contains infinity or NaN. +template +__global__ void CurtHealthKernel(const Tin* __restrict__ data, int size, + Tout output[1]) { + const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32 total_thread_count = gridDim.x * blockDim.x; + + int32 offset = thread_id; + + while (offset < size) { + if (isinf(data[offset]) || isnan(data[offset])) { + output[0] = 1.0; + } + offset += total_thread_count; + } +} + +// A CUDA kernel that fills a length-3 vector according to whether any of the +// input data contains negative infinity, positive infinity, or NaN. The first +// element is filled with -infinity if any of the elements is -infinity. +// The second element is filled with +infinity if any of the elements is +// +infinity. The last is filled with NaN if any of the elements is NaN. 
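For illustration (not part of the diff), the two debug modes wired up in this change can be exercised from Python; this minimal sketch mirrors the unit tests later in this patch, assuming the same import paths and enum values and that eager execution is enabled:

```python
import numpy as np

from tensorflow.core.protobuf import debug_event_pb2
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import gen_debug_ops

x = constant_op.constant(np.array([3.0, -np.inf, np.nan]))

# CURT_HEALTH (mode 2): shape-[2] output holding [tensor_id, has_inf_or_nan].
curt = gen_debug_ops.debug_numeric_summary_v2(
    x,
    tensor_debug_mode=debug_event_pb2.TensorDebugMode.CURT_HEALTH,
    tensor_id=7,  # illustrative id
    output_dtype=dtypes.float64)
# curt -> [7.0, 1.0]: slot 0 echoes tensor_id, slot 1 flags the -inf/nan in x.

# REDUCE_INF_NAN_THREE_SLOTS (mode 8): shape-[3] output with one slot each for
# -inf, +inf and nan; slots for values that never occur stay 0.
reduced = gen_debug_ops.debug_numeric_summary_v2(
    x,
    tensor_debug_mode=debug_event_pb2.TensorDebugMode.REDUCE_INF_NAN_THREE_SLOTS)
# reduced -> [-inf, 0.0, nan]: x contains -inf and nan but no +inf.
```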
+template +__global__ void ReduceInfNanThreeSlotsKernel(const Tin* __restrict__ data, + int size, Tout output[3]) { const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; const int32 total_thread_count = gridDim.x * blockDim.x; @@ -47,14 +66,14 @@ __global__ void ReduceInfNanTwoSlotsKernel(const T* __restrict__ data, int size, while (offset < size) { if (isinf(data[offset])) { - if (data[offset] < static_cast(0.f)) { - output[0] = -std::numeric_limits::infinity(); + if (data[offset] < static_cast(0.f)) { + output[0] = -std::numeric_limits::infinity(); } else { - output[1] = std::numeric_limits::infinity(); + output[1] = std::numeric_limits::infinity(); } } if (isnan(data[offset])) { - output[2] = std::numeric_limits::quiet_NaN(); + output[2] = std::numeric_limits::quiet_NaN(); } offset += total_thread_count; } @@ -62,22 +81,46 @@ __global__ void ReduceInfNanTwoSlotsKernel(const T* __restrict__ data, int size, } // namespace -template -struct ReduceInfNanTwoSlotsLaunch { - void Run(const GPUDevice& d, const T* data, int size, float output[2]) { +template +struct CurtHealthLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[1]) { const int32 block_size = d.maxGpuThreadsPerBlock(); const int32 num_blocks = (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) / block_size; - TF_CHECK_OK(GpuLaunchKernel(ReduceInfNanTwoSlotsKernel, num_blocks, + TF_CHECK_OK(GpuLaunchKernel(CurtHealthKernel, num_blocks, block_size, 0, d.stream(), data, size, output)); } }; -template struct ReduceInfNanTwoSlotsLaunch; -template struct ReduceInfNanTwoSlotsLaunch; -template struct ReduceInfNanTwoSlotsLaunch; +template struct CurtHealthLaunch; +template struct CurtHealthLaunch; +template struct CurtHealthLaunch; +template struct CurtHealthLaunch; +template struct CurtHealthLaunch; +template struct CurtHealthLaunch; + +template +struct ReduceInfNanThreeSlotsLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]) { + const int32 block_size = d.maxGpuThreadsPerBlock(); + const int32 num_blocks = + (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) / + block_size; + + TF_CHECK_OK(GpuLaunchKernel(ReduceInfNanThreeSlotsKernel, + num_blocks, block_size, 0, d.stream(), data, + size, output)); + } +}; + +template struct ReduceInfNanThreeSlotsLaunch; +template struct ReduceInfNanThreeSlotsLaunch; +template struct ReduceInfNanThreeSlotsLaunch; +template struct ReduceInfNanThreeSlotsLaunch; +template struct ReduceInfNanThreeSlotsLaunch; +template struct ReduceInfNanThreeSlotsLaunch; } // namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/ops/debug_ops.cc b/tensorflow/core/ops/debug_ops.cc index 3d22cbbd4d3..7977974fefc 100644 --- a/tensorflow/core/ops/debug_ops.cc +++ b/tensorflow/core/ops/debug_ops.cc @@ -95,15 +95,10 @@ REGISTER_OP("DebugIdentityV2") REGISTER_OP("DebugNumericSummaryV2") .Input("input: T") - .Output("output: float32") + .Output("output: output_dtype") + .Attr("output_dtype: {float32, float64} = DT_FLOAT") .Attr("T: type") .Attr("tensor_debug_mode: int = -1") .Attr("tensor_id: int = -1") - .SetShapeFn([](shape_inference::InferenceContext *c) { - // The following is for REDUCE_INF_NAN_THREE_SLOTS. - // TODO(cais): Support other tensor_debug_mode values. 
- shape_inference::ShapeHandle output_shape = c->MakeShape({3}); - c->set_output(0, output_shape); - return Status::OK(); - }); + .SetShapeFn(shape_inference::UnknownShape); } // namespace tensorflow diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index 08b0ec17316..474b2330e99 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -29,6 +29,7 @@ from tensorflow.python.debug.lib import dumping_callback_test_lib from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_util from tensorflow.python.framework import test_util @@ -222,7 +223,8 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): next(graph_trace_iter) @test_util.run_in_graph_and_eager_modes - def testDebugNumericSummaryV2OpReduceInfNanTwoSlots(self): + def testDebugNumericSummaryV2OpReduceInfNanThreeSlots(self): + def debug_summary(x): return self.evaluate(gen_debug_ops.debug_numeric_summary_v2( x, tensor_debug_mode=( @@ -265,6 +267,128 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): self.assertAllEqual( debug_summary(constant_op.constant(x)), [0.0, 0.0, np.nan]) + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpLargeTensorIDError(self): + modes = [ + debug_event_pb2.TensorDebugMode.CURT_HEALTH, + ] + # Maximum allowed tensor_id + tensor_id = np.power(2, 53) + for mode in modes: + self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + constant_op.constant(42.0), + tensor_debug_mode=mode, + tensor_id=tensor_id, + output_dtype=dtypes.float64)) + # Incrementing by one should error + tensor_id += 1 + for mode in modes: + with self.assertRaises(errors.InvalidArgumentError): + self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + constant_op.constant(42.0), + tensor_debug_mode=mode, + tensor_id=tensor_id, + output_dtype=dtypes.float64)) + + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpCurtHealthValuesSmall(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=(debug_event_pb2.TensorDebugMode.CURT_HEALTH), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + tensor, tensor_id = debug_summary(constant_op.constant([])) + self.assertAllEqual(tensor, [tensor_id, 0.0]) + + tensor, tensor_id = debug_summary(constant_op.constant(42.0)) + self.assertAllEqual(tensor, [tensor_id, 0.0]) + + tensor, tensor_id = debug_summary(constant_op.constant([3.0, 4.0])) + self.assertAllEqual(tensor, [tensor_id, 0.0]) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([3.0, -np.inf]))) + self.assertAllEqual(tensor, [tensor_id, 1.0]) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([[0, 0], [np.nan, 0]]))) + self.assertAllEqual(tensor, [tensor_id, 1.0]) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([[0, 0], [np.nan, np.inf]]))) + self.assertAllEqual(tensor, [tensor_id, 1.0]) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([[0, np.inf], [np.nan, -np.inf]]))) + self.assertAllEqual(tensor, [tensor_id, 1.0]) + + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpCurtHealthValuesLarge(self): + + def 
debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=(debug_event_pb2.TensorDebugMode.CURT_HEALTH), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + x = np.zeros([100, 100], dtype=np.float16) + x[32, 47] = np.nan + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual(tensor, [tensor_id, 1.0]) + x = np.zeros([97, 97], dtype=np.float32) + x[50, 83] = -np.inf + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual(tensor, [tensor_id, 1.0]) + x[1, 41] = np.nan + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual(tensor, [tensor_id, 1.0]) + x = np.zeros([9701], dtype=np.float64) + x[9700] = np.nan + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual(tensor, [tensor_id, 1.0]) + + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpCurtHealthConsistency(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=(debug_event_pb2.TensorDebugMode.CURT_HEALTH), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + x = np.zeros([100, 100], dtype=np.float16) + x[43, 99] = np.nan + c = constant_op.constant(x) + tensor_1, tensor_id_1 = debug_summary(c) + tensor_2, tensor_id_2 = debug_summary(c) + self.assertAllEqual(tensor_1, tensor_2) + self.assertEqual(tensor_id_1, tensor_id_2) + + x = np.zeros([100, 100, 50], dtype=np.float64) + x[0, 0, 1] = np.inf + c = constant_op.constant(x) + tensor_1, tensor_id_1 = debug_summary(c) + tensor_2, tensor_id_2 = debug_summary(c) + self.assertAllEqual(tensor_1, tensor_2) + self.assertEqual(tensor_id_1, tensor_id_2) + + c = constant_op.constant(np.ones((100, 200), np.double)) + tensor_1, tensor_id_1 = debug_summary(c) + tensor_2, tensor_id_2 = debug_summary(c) + self.assertAllEqual(tensor_1, tensor_2) + self.assertEqual(tensor_id_1, tensor_id_2) + + if __name__ == "__main__": ops.enable_eager_execution() googletest.main() diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index 604f676bf34..9bac746ffc8 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -966,7 +966,7 @@ tf_module { } member_method { name: "DebugNumericSummaryV2" - argspec: "args=[\'input\', \'tensor_debug_mode\', \'tensor_id\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'input\', \'output_dtype\', \'tensor_debug_mode\', \'tensor_id\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'-1\', \'-1\', \'None\'], " } member_method { name: "DecodeAndCropJpeg" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index 604f676bf34..9bac746ffc8 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -966,7 +966,7 @@ tf_module { } member_method { name: "DebugNumericSummaryV2" - argspec: "args=[\'input\', \'tensor_debug_mode\', \'tensor_id\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'-1\', \'None\'], " + argspec: "args=[\'input\', \'output_dtype\', \'tensor_debug_mode\', \'tensor_id\', \'name\'], varargs=None, keywords=None, defaults=[\"\", \'-1\', \'-1\', \'None\'], " } member_method { name: "DecodeAndCropJpeg" From 1f5fd616ed01855add58f79563393bb11ef8ddbd 
Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Thu, 5 Dec 2019 13:22:07 -0800 Subject: [PATCH 193/383] Add helpers to classify TraceMeRecorder::Event PiperOrigin-RevId: 284038760 Change-Id: I4e720ee87260810f0a6dd567ad01733dc1b9d940 --- .../profiler/internal/cpu/host_tracer_test.cc | 1 - .../internal/cpu/host_tracer_utils.cc | 46 +++++++++---------- .../profiler/internal/cpu/host_tracer_utils.h | 16 +++++++ 3 files changed, 38 insertions(+), 25 deletions(-) diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc index 7623a4ec9b8..ffe702ad121 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_test.cc @@ -126,7 +126,6 @@ TEST(HostTracerTest, CollectsTraceMeEventsAsXSpace) { ASSERT_EQ(space.planes_size(), 1); const auto& plane = space.planes(0); - EXPECT_EQ(plane.name(), "Host Threads"); ASSERT_EQ(plane.lines_size(), 1); ASSERT_EQ(plane.event_metadata_size(), 6); ASSERT_EQ(plane.stat_metadata_size(), 2); diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc index 099cd1a45ae..8d669e431ff 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.cc @@ -38,9 +38,9 @@ void MakeCompleteEvents(TraceMeRecorder::Events* events) { std::vector end_events; for (auto& thread : *events) { for (auto& event : thread.events) { - if (event.start_time && !event.end_time) { // ActivityStart + if (IsStartEvent(event)) { start_events.emplace(event.activity_id, &event); - } else if (!event.start_time && event.end_time) { // ActivityEnd + } else if (IsEndEvent(event)) { auto iter = start_events.find(event.activity_id); if (iter != start_events.end()) { // same thread auto* start_event = iter->second; @@ -68,7 +68,6 @@ void ConvertCompleteEventsToXPlane(uint64 start_timestamp_ns, const TraceMeRecorder::Events& events, XPlane* raw_plane) { XPlaneBuilder xplane(raw_plane); - xplane.SetName("Host Threads"); absl::flat_hash_map xevent_metadata_by_name; absl::flat_hash_map xstat_metadata_by_name; for (const auto& thread : events) { @@ -78,28 +77,27 @@ void ConvertCompleteEventsToXPlane(uint64 start_timestamp_ns, xline.SetTimestampNs(start_timestamp_ns); xline.ReserveEvents(thread.events.size()); for (const auto& event : thread.events) { - if (event.start_time && event.end_time) { - Annotation annotation = ParseAnnotation(event.name); - XEventMetadata*& xevent_metadata = - xevent_metadata_by_name[annotation.name]; - if (xevent_metadata == nullptr) { - xevent_metadata = - xplane.GetOrCreateEventMetadata(xevent_metadata_by_name.size()); - xevent_metadata->set_name(string(annotation.name)); - } - XEventBuilder xevent = xline.AddEvent(*xevent_metadata); - xevent.SetTimestampNs(event.start_time); - xevent.SetEndTimestampNs(event.end_time); - xevent.ReserveStats(annotation.metadata.size()); - for (const auto& metadata : annotation.metadata) { - XStatMetadata*& xstat_metadata = xstat_metadata_by_name[metadata.key]; - if (xstat_metadata == nullptr) { - xstat_metadata = - xplane.GetOrCreateStatMetadata(xstat_metadata_by_name.size()); - xstat_metadata->set_name(string(metadata.key)); - } - xevent.ParseAndAddStatValue(*xstat_metadata, metadata.value); + if (!IsCompleteEvent(event)) continue; + Annotation annotation = ParseAnnotation(event.name); + XEventMetadata*& xevent_metadata = + xevent_metadata_by_name[annotation.name]; + 
if (xevent_metadata == nullptr) { + xevent_metadata = + xplane.GetOrCreateEventMetadata(xevent_metadata_by_name.size()); + xevent_metadata->set_name(string(annotation.name)); + } + XEventBuilder xevent = xline.AddEvent(*xevent_metadata); + xevent.SetTimestampNs(event.start_time); + xevent.SetEndTimestampNs(event.end_time); + xevent.ReserveStats(annotation.metadata.size()); + for (const auto& metadata : annotation.metadata) { + XStatMetadata*& xstat_metadata = xstat_metadata_by_name[metadata.key]; + if (xstat_metadata == nullptr) { + xstat_metadata = + xplane.GetOrCreateStatMetadata(xstat_metadata_by_name.size()); + xstat_metadata->set_name(string(metadata.key)); } + xevent.ParseAndAddStatValue(*xstat_metadata, metadata.value); } } } diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.h b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.h index e4227601990..fa5bf382c88 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer_utils.h +++ b/tensorflow/core/profiler/internal/cpu/host_tracer_utils.h @@ -22,6 +22,22 @@ limitations under the License. namespace tensorflow { namespace profiler { +// Returns true if event was created by TraceMe::ActivityStart. +inline bool IsStartEvent(const TraceMeRecorder::Event& event) { + return (event.start_time != 0) && (event.end_time == 0); +} + +// Returns true if event was created by TraceMe::ActivityEnd. +inline bool IsEndEvent(const TraceMeRecorder::Event& event) { + return (event.start_time == 0) && (event.end_time != 0); +} + +// Returns true if event was created by TraceMe::Stop or MakeCompleteEvents +// below. +inline bool IsCompleteEvent(const TraceMeRecorder::Event& event) { + return (event.start_time != 0) && (event.end_time != 0); +} + // Combine events created by TraceMe::ActivityStart and TraceMe::ActivityEnd, // which can be paired up by their activity_id. void MakeCompleteEvents(TraceMeRecorder::Events* events); From 0c531cb8cd15c9533167d02b2099240c52fc1f37 Mon Sep 17 00:00:00 2001 From: Yanhui Liang Date: Thu, 5 Dec 2019 13:36:21 -0800 Subject: [PATCH 194/383] Add testable docstrings to conv1D and conv3D. PiperOrigin-RevId: 284041695 Change-Id: I2f17f47cd4cde640d7975f31904b108a95ce96bc --- .../python/keras/layers/convolutional.py | 273 ++++++++++++------ 1 file changed, 180 insertions(+), 93 deletions(-) diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py index 6a69af0f580..fd7402a28c3 100644 --- a/tensorflow/python/keras/layers/convolutional.py +++ b/tensorflow/python/keras/layers/convolutional.py @@ -78,8 +78,7 @@ class Conv(Layer): Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. 'linear' activation: `a(x) = x`). + If you don't specify anything, no activation is applied. use_bias: Boolean, whether the layer uses a bias. kernel_initializer: An initializer for the convolution kernel. bias_initializer: An initializer for the bias vector. If None, the default @@ -341,6 +340,17 @@ class Conv1D(Conv): `(10, 128)` for sequences of 10 vectors of 128-dimensional vectors, or `(None, 128)` for variable-length sequences of 128-dimensional vectors. + Examples: + + >>> # The inputs are 128-length vectors with 10 timesteps, and the batch size + >>> # is 4. + >>> input_shape = (4, 10, 128) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv1D( + ... 
32, 3, activation='relu',input_shape=input_shape)(x) + >>> print(y.shape) + (4, 8, 32) + Arguments: filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). @@ -351,8 +361,8 @@ class Conv1D(Conv): Specifying any stride value != 1 is incompatible with specifying any `dilation_rate` value != 1. padding: One of `"valid"`, `"causal"` or `"same"` (case-insensitive). - `"causal"` results in causal (dilated) convolutions, e.g. output[t] - does not depend on input[t+1:]. Useful when modeling temporal data + `"causal"` results in causal (dilated) convolutions, e.g. `output[t]` + does not depend on `input[t+1:]`. Useful when modeling temporal data where the model should not violate the temporal order. See [WaveNet: A Generative Model for Raw Audio, section 2.1](https://arxiv.org/abs/1609.03499). @@ -363,11 +373,13 @@ class Conv1D(Conv): Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any `strides` value != 1. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). + If you don't specify anything, no activation is applied ( + see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix. - bias_initializer: Initializer for the bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix ( + see `keras.initializers`). + bias_initializer: Initializer for the bias vector ( + see `keras.initializers`). kernel_regularizer: Regularizer function applied to the `kernel` weights matrix (see `keras.regularizers`). bias_regularizer: Regularizer function applied to the bias vector ( @@ -380,25 +392,19 @@ class Conv1D(Conv): bias_constraint: Constraint function applied to the bias vector ( see `keras.constraints`). - Examples: - ```python - # Small convolutional model for 128-length vectors with 6 timesteps - # model.input_shape == (None, 6, 128) - - model = Sequential() - model.add(Conv1D(32, 3, - activation='relu', - input_shape=(6, 128))) - - # now: model.output_shape == (None, 4, 32) - ``` - Input shape: 3D tensor with shape: `(batch_size, steps, input_dim)` Output shape: 3D tensor with shape: `(batch_size, new_steps, filters)` `steps` value might have changed due to padding or strides. + + Returns: + A tensor of rank 3 representing + `activation(conv1d(inputs, kernel) + bias)`. + + Raises: + ValueError: when both `strides` > 1 and `dilation_rate` > 1. """ def __init__(self, @@ -456,29 +462,30 @@ class Conv2D(Conv): Examples: - >>> # Small convolutional model for 128x128 RGB images with `channels_last` - >>> input_shape = (32, 128, 128, 3) + >>> # The inputs are 28x28 RGB images with `channels_last` and the batch + >>> # size is 4. + >>> input_shape = (4, 28, 28, 3) >>> x = tf.random.normal(input_shape) >>> y = tf.keras.layers.Conv2D( ... 2, 3, activation='relu', input_shape=input_shape)(x) >>> print(y.shape) - (32, 126, 126, 2) + (4, 26, 26, 2) >>> # With `dilation_rate` as 2. - >>> input_shape = (32, 128, 128, 3) + >>> input_shape = (4, 28, 28, 3) >>> x = tf.random.normal(input_shape) >>> y = tf.keras.layers.Conv2D( ... 2, 3, activation='relu', dilation_rate=2, input_shape=input_shape)(x) >>> print(y.shape) - (32, 124, 124, 2) + (4, 24, 24, 2) >>> # With `padding` as "same". 
- >>> input_shape = (32, 128, 128, 3) + >>> input_shape = (4, 28, 28, 3) >>> x = tf.random.normal(input_shape) >>> y = tf.keras.layers.Conv2D( ... 2, 3, activation='relu', padding="same", input_shape=input_shape)(x) >>> print(y.shape) - (32, 128, 128, 2) + (4, 28, 28, 2) Arguments: @@ -512,14 +519,13 @@ class Conv2D(Conv): Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`). Check `keras.activations` for - available activation functions (see `keras.activations`). + If you don't specify anything, no activation is applied ( + see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. - kernel_initializer: Initializer for the `kernel` weights matrix. Check - `keras.initializers` for available initializers. - bias_initializer: Initializer for the bias vector. Check - `keras.initializers` for available initializers. + kernel_initializer: Initializer for the `kernel` weights matrix ( + see `keras.initializers`). + bias_initializer: Initializer for the bias vector ( + see `keras.initializers`). kernel_regularizer: Regularizer function applied to the `kernel` weights matrix (see `keras.regularizers`). bias_regularizer: Regularizer function applied to the bias vector ( @@ -534,15 +540,15 @@ class Conv2D(Conv): Input shape: 4D tensor with shape: - `(samples, channels, rows, cols)` if data_format='channels_first' + `(batch_size, channels, rows, cols)` if data_format='channels_first' or 4D tensor with shape: - `(samples, rows, cols, channels)` if data_format='channels_last'. + `(batch_size, rows, cols, channels)` if data_format='channels_last'. Output shape: 4D tensor with shape: - `(samples, filters, new_rows, new_cols)` if data_format='channels_first' + `(batch_size, filters, new_rows, new_cols)` if data_format='channels_first' or 4D tensor with shape: - `(samples, new_rows, new_cols, filters)` if data_format='channels_last'. + `(batch_size, new_rows, new_cols, filters)` if data_format='channels_last'. `rows` and `cols` values might have changed due to padding. Returns: @@ -550,8 +556,8 @@ class Conv2D(Conv): `activation(conv2d(inputs, kernel) + bias)`. Raises: - ValueError: if `padding` is "causal". - ValueError: when both `strides` > 1 and `dilation_rate` > 1. + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. """ def __init__(self, @@ -608,6 +614,17 @@ class Conv3D(Conv): with a single channel, in `data_format="channels_last"`. + Examples: + + >>> # The inputs are 28x28x28 volumes with a single channel, and the + >>> # batch size is 4 + >>> input_shape =(4, 28, 28, 28, 1) + >>> x = tf.random.normal(input_shape) + >>> y = tf.keras.layers.Conv3D( + ... 2, 3, activation='relu', input_shape=input_shape)(x) + >>> print(y.shape) + (4, 26, 26, 26, 2) + Arguments: filters: Integer, the dimensionality of the output space (i.e. the number of output filters in the convolution). @@ -640,11 +657,13 @@ class Conv3D(Conv): Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). + If you don't specify anything, no activation is applied ( + see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. 
- kernel_initializer: Initializer for the `kernel` weights matrix. - bias_initializer: Initializer for the bias vector. + kernel_initializer: Initializer for the `kernel` weights matrix ( + see `keras.initializers`). + bias_initializer: Initializer for the bias vector ( + see `keras.initializers`). kernel_regularizer: Regularizer function applied to the `kernel` weights matrix ( see `keras.regularizers`). @@ -660,21 +679,29 @@ class Conv3D(Conv): Input shape: 5D tensor with shape: - `(samples, channels, conv_dim1, conv_dim2, conv_dim3)` if + `(batch_size, channels, conv_dim1, conv_dim2, conv_dim3)` if data_format='channels_first' or 5D tensor with shape: - `(samples, conv_dim1, conv_dim2, conv_dim3, channels)` if + `(batch_size, conv_dim1, conv_dim2, conv_dim3, channels)` if data_format='channels_last'. Output shape: 5D tensor with shape: - `(samples, filters, new_conv_dim1, new_conv_dim2, new_conv_dim3)` if + `(batch_size, filters, new_conv_dim1, new_conv_dim2, new_conv_dim3)` if data_format='channels_first' or 5D tensor with shape: - `(samples, new_conv_dim1, new_conv_dim2, new_conv_dim3, filters)` if + `(batch_size, new_conv_dim1, new_conv_dim2, new_conv_dim3, filters)` if data_format='channels_last'. `new_conv_dim1`, `new_conv_dim2` and `new_conv_dim3` values might have changed due to padding. + + Returns: + A tensor of rank 5 representing + `activation(conv3d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. """ def __init__(self, @@ -771,8 +798,8 @@ class Conv2DTranspose(Conv2D): Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). + If you don't specify anything, no activation is applied ( + see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix ( see `keras.initializers`). @@ -791,37 +818,37 @@ class Conv2DTranspose(Conv2D): Input shape: 4D tensor with shape: - `(batch, channels, rows, cols)` if data_format='channels_first' + `(batch_size, channels, rows, cols)` if data_format='channels_first' or 4D tensor with shape: - `(batch, rows, cols, channels)` if data_format='channels_last'. + `(batch_size, rows, cols, channels)` if data_format='channels_last'. Output shape: 4D tensor with shape: - `(batch, filters, new_rows, new_cols)` if data_format='channels_first' + `(batch_size, filters, new_rows, new_cols)` if data_format='channels_first' or 4D tensor with shape: - `(batch, new_rows, new_cols, filters)` if data_format='channels_last'. + `(batch_size, new_rows, new_cols, filters)` if data_format='channels_last'. `rows` and `cols` values might have changed due to padding. 
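For illustration (not part of the diff), plugging numbers into the `new_rows`/`new_cols` formula quoted just below gives a quick sanity check of the transposed-convolution output-shape rule (all values here are made up for the example):

```python
# 5x5 input, 3x3 kernel, stride 2, no padding, no extra output padding.
rows = cols = 5
kernel_size = (3, 3)
strides = (2, 2)
padding = (0, 0)          # 'valid' padding contributes 0 rows/cols
output_padding = (0, 0)

new_rows = (rows - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + output_padding[0]
new_cols = (cols - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + output_padding[1]
print(new_rows, new_cols)  # 11 11
```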
If `output_padding` is specified: ``` - new_rows = ((rows - 1) * strides[0] + kernel_size[0] - - 2 * padding[0] + output_padding[0]) - new_cols = ((cols - 1) * strides[1] + kernel_size[1] - - 2 * padding[1] + output_padding[1]) + new_rows = ((rows - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + + output_padding[0]) + new_cols = ((cols - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + + output_padding[1]) ``` - References: - - [A guide to convolution arithmetic for deep - learning](https://arxiv.org/abs/1603.07285v1) - - [Deconvolutional - Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf) - Returns: A tensor of rank 4 representing `activation(conv2dtranspose(inputs, kernel) + bias)`. Raises: - ValueError: if `padding` is "causal". - ValueError: when both `strides` > 1 and `dilation_rate` > 1. + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. + + References: + - [A guide to convolution arithmetic for deep + learning](https://arxiv.org/abs/1603.07285v1) + - [Deconvolutional + Networks](https://www.matthewzeiler.com/mattzeiler/deconvolutionalnetworks.pdf) """ def __init__(self, @@ -1061,8 +1088,8 @@ class Conv3DTranspose(Conv3D): Currently, specifying any `dilation_rate` value != 1 is incompatible with specifying any stride value != 1. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). + If you don't specify anything, no activation is applied ( + see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. kernel_initializer: Initializer for the `kernel` weights matrix. bias_initializer: Initializer for the bias vector. @@ -1081,18 +1108,35 @@ class Conv3DTranspose(Conv3D): Input shape: 5D tensor with shape: - `(batch, channels, depth, rows, cols)` if data_format='channels_first' + `(batch_size, channels, depth, rows, cols)` if data_format='channels_first' or 5D tensor with shape: - `(batch, depth, rows, cols, channels)` if data_format='channels_last'. + `(batch_size, depth, rows, cols, channels)` if data_format='channels_last'. Output shape: 5D tensor with shape: - `(batch, filters, new_depth, new_rows, new_cols)` if + `(batch_size, filters, new_depth, new_rows, new_cols)` if data_format='channels_first' or 5D tensor with shape: - `(batch, new_depth, new_rows, new_cols, filters)` if + `(batch_size, new_depth, new_rows, new_cols, filters)` if data_format='channels_last'. `depth` and `rows` and `cols` values might have changed due to padding. + If `output_padding` is specified:: + ``` + new_depth = ((depth - 1) * strides[0] + kernel_size[0] - 2 * padding[0] + + output_padding[0]) + new_rows = ((rows - 1) * strides[1] + kernel_size[1] - 2 * padding[1] + + output_padding[1]) + new_cols = ((cols - 1) * strides[2] + kernel_size[2] - 2 * padding[2] + + output_padding[2]) + ``` + + Returns: + A tensor of rank 5 representing + `activation(conv3dtranspose(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. References: - [A guide to convolution arithmetic for deep @@ -1329,8 +1373,8 @@ class SeparableConv(Conv): each input channel. The total number of depthwise convolution output channels will be equal to `num_filters_in * depth_multiplier`. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. 
'linear' activation: `a(x) = x`) (see `keras.activations`). + If you don't specify anything, no activation is applied ( + see `keras.activations`). use_bias: Boolean, whether the layer uses a bias. depthwise_initializer: An initializer for the depthwise convolution kernel. pointwise_initializer: An initializer for the pointwise convolution kernel. @@ -1531,13 +1575,15 @@ class SeparableConv1D(SeparableConv): each input channel. The total number of depthwise convolution output channels will be equal to `num_filters_in * depth_multiplier`. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. 'linear' activation: `a(x) = x`) (see `keras.activations`). + If you don't specify anything, no activation is applied ( + see `keras.activations`). use_bias: Boolean, whether the layer uses a bias. - depthwise_initializer: An initializer for the depthwise convolution kernel. - pointwise_initializer: An initializer for the pointwise convolution kernel. + depthwise_initializer: An initializer for the depthwise convolution kernel ( + see `keras.initializers`). + pointwise_initializer: An initializer for the pointwise convolution kernel ( + see `keras.initializers`). bias_initializer: An initializer for the bias vector. If None, the default - initializer will be used. + initializer will be used (see `keras.initializers`). depthwise_regularizer: Optional regularizer for the depthwise convolution kernel (see `keras.regularizers`). pointwise_regularizer: Optional regularizer for the pointwise @@ -1562,6 +1608,26 @@ class SeparableConv1D(SeparableConv): trainable: Boolean, if `True` the weights of this layer will be marked as trainable (and listed in `layer.trainable_weights`). name: A string, the name of the layer. + + Input shape: + 3D tensor with shape: + `(batch_size, channels, steps)` if data_format='channels_first' + or 5D tensor with shape: + `(batch_size, steps, channels)` if data_format='channels_last'. + + Output shape: + 3D tensor with shape: + `(batch_size, filters, new_steps)` if data_format='channels_first' + or 3D tensor with shape: + `(batch_size, new_steps, filters)` if data_format='channels_last'. + `new_steps` value might have changed due to padding or strides. + + Returns: + A tensor of rank 3 representing + `activation(separableconv1d(inputs, kernel) + bias)`. + + Raises: + ValueError: when both `strides` > 1 and `dilation_rate` > 1. """ def __init__(self, @@ -1700,12 +1766,15 @@ class SeparableConv2D(SeparableConv): The total number of depthwise convolution output channels will be equal to `filters_in * depth_multiplier`. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. "linear" activation: `a(x) = x`) (see `keras.activations`). + If you don't specify anything, no activation is applied ( + see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. - depthwise_initializer: Initializer for the depthwise kernel matrix. - pointwise_initializer: Initializer for the pointwise kernel matrix. - bias_initializer: Initializer for the bias vector. + depthwise_initializer: Initializer for the depthwise kernel matrix ( + see `keras.initializers`). + pointwise_initializer: Initializer for the pointwise kernel matrix ( + see `keras.initializers`). + bias_initializer: Initializer for the bias vector ( + see `keras.initializers`). depthwise_regularizer: Regularizer function applied to the depthwise kernel matrix (see `keras.regularizers`). 
pointwise_regularizer: Regularizer function applied to @@ -1726,16 +1795,24 @@ class SeparableConv2D(SeparableConv): Input shape: 4D tensor with shape: - `(batch, channels, rows, cols)` if data_format='channels_first' + `(batch_size, channels, rows, cols)` if data_format='channels_first' or 4D tensor with shape: - `(batch, rows, cols, channels)` if data_format='channels_last'. + `(batch_size, rows, cols, channels)` if data_format='channels_last'. Output shape: 4D tensor with shape: - `(batch, filters, new_rows, new_cols)` if data_format='channels_first' + `(batch_size, filters, new_rows, new_cols)` if data_format='channels_first' or 4D tensor with shape: - `(batch, new_rows, new_cols, filters)` if data_format='channels_last'. + `(batch_size, new_rows, new_cols, filters)` if data_format='channels_last'. `rows` and `cols` values might have changed due to padding. + + Returns: + A tensor of rank 4 representing + `activation(separableconv2d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. """ def __init__(self, @@ -1845,11 +1922,13 @@ class DepthwiseConv2D(Conv2D): Keras config file at `~/.keras/keras.json`. If you never set it, then it will be 'channels_last'. activation: Activation function to use. - If you don't specify anything, no activation is applied - (ie. 'linear' activation: `a(x) = x`) (see `keras.activations`). + If you don't specify anything, no activation is applied ( + see `keras.activations`). use_bias: Boolean, whether the layer uses a bias vector. - depthwise_initializer: Initializer for the depthwise kernel matrix. - bias_initializer: Initializer for the bias vector. + depthwise_initializer: Initializer for the depthwise kernel matrix ( + see `keras.initializers`). + bias_initializer: Initializer for the bias vector ( + see `keras.initializers`). depthwise_regularizer: Regularizer function applied to the depthwise kernel matrix (see `keras.regularizers`). bias_regularizer: Regularizer function applied to the bias vector ( @@ -1875,6 +1954,14 @@ class DepthwiseConv2D(Conv2D): or 4D tensor with shape: `[batch, new_rows, new_cols, filters]` if data_format='channels_last'. `rows` and `cols` values might have changed due to padding. + + Returns: + A tensor of rank 4 representing + `activation(depthwiseconv2d(inputs, kernel) + bias)`. + + Raises: + ValueError: if `padding` is "causal". + ValueError: when both `strides` > 1 and `dilation_rate` > 1. """ def __init__(self, From 47ba03f54a0b91311610a0a00a6d64c857b9977d Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 5 Dec 2019 13:37:55 -0800 Subject: [PATCH 195/383] Add legalization from tf.SplitV to XLA HLO ops tf.SplitV is similar to tf.Split except that the size of each chunk on the dimension to split is explicitly given as an op operand and they are not necessarily the same. Along the way, tightened verification for tf.SplitV. 
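For illustration (not part of the diff): at the Python level, `tf.SplitV` is the op that `tf.split` emits when the chunk sizes are passed as an explicit list rather than a single count; a minimal sketch with shapes chosen to match the MLIR tests below:

```python
import tensorflow as tf

x = tf.ones([4, 6])
# Explicit, unequal chunk sizes along axis 1; they must sum to the dimension
# size (6), or at most one entry may be -1 and is then inferred.
a, b, c = tf.split(x, [1, 2, 3], axis=1)
print(a.shape, b.shape, c.shape)  # (4, 1) (4, 2) (4, 3)
```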
PiperOrigin-RevId: 284042118 Change-Id: I4f2f256c473277ba022a7383bbfb4d3528a94ef4 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 2 + .../compiler/mlir/tensorflow/ir/tf_ops.cc | 104 ++++++++++++-- .../mlir/tensorflow/tests/tf-ops.mlir | 81 ++++++++++- .../compiler/mlir/xla/tests/legalize-tf.mlir | 46 +++++- .../mlir/xla/transforms/legalize_tf.cc | 134 ++++++++++++++++-- 5 files changed, 340 insertions(+), 27 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index cdc545d5681..ef25e27db12 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -5117,6 +5117,8 @@ def TF_SplitVOp : TF_Op<"SplitV", [NoSideEffect]> { TF_DerivedOperandTypeAttr Tlen = TF_DerivedOperandTypeAttr<1>; TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedResultSizeAttr num_split = TF_DerivedResultSizeAttr<0>; + + let verifier = [{ return Verify(*this); }]; } def TF_SqrtOp : TF_Op<"Sqrt", [NoSideEffect, SameOperandsAndResultType]> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 1bd9accbb78..8d37ef85527 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -1542,17 +1542,23 @@ static LogicalResult Verify(SoftmaxCrossEntropyWithLogitsOp op) { // SplitOp //===----------------------------------------------------------------------===// -static LogicalResult Verify(SplitOp op) { +// Verifies the input and split dimension operands for tf.Split/tf.SplitV. +// Writes the split dimension's index (adjusted with input rank) via `dim_index` +// if it's a constant. +template +LogicalResult VerifySplitInputAndSplitDim(Op op, Optional *dim_index) { + *dim_index = llvm::None; + Value *split_dim = op.split_dim(); - auto split_dim_type = split_dim->getType().dyn_cast(); - if (!split_dim_type) return success(); - if (split_dim_type.getRank() != 0) - return op.emitOpError("split dimension should be an integer scalar tensor"); + if (auto split_dim_type = split_dim->getType().dyn_cast()) + if (split_dim_type.getRank() != 0) + return op.emitOpError( + "split dimension should be an integer scalar tensor"); // We can perform further verification if the input tensor to be split has // known rank and the split dimension tensor is a constant. 
- auto input_type = op.value()->getType().dyn_cast(); + auto input_type = op.value()->getType().template dyn_cast(); if (!input_type) return success(); int64_t input_rank = input_type.getRank(); @@ -1562,21 +1568,95 @@ static LogicalResult Verify(SplitOp op) { DenseIntElementsAttr split_dim_attr; if (!matchPattern(split_dim, m_Constant(&split_dim_attr))) return success(); - int64_t dim_index = (*split_dim_attr.begin()).getSExtValue(); + int64_t index = (*split_dim_attr.begin()).getSExtValue(); - if (dim_index + input_rank < 0 || dim_index >= input_rank) { + if (index + input_rank < 0 || index >= input_rank) { return op.emitOpError("split dimension must be in range [-") << input_rank << ", " << input_rank << ")"; } - if (dim_index < 0) dim_index += input_rank; + if (index < 0) index += input_rank; + *dim_index = index; - int64_t input_dim_size = input_type.getDimSize(dim_index); - if (input_dim_size < 0) return success(); + return success(); +} + +static LogicalResult Verify(SplitOp op) { + Optional dim_index; + if (failed(VerifySplitInputAndSplitDim(op, &dim_index))) return failure(); + if (!dim_index) return success(); + + int64_t input_dim_size = + op.value()->getType().cast().getDimSize(*dim_index); + if (input_dim_size == ShapedType::kDynamicSize) return success(); if (input_dim_size % op.getNumResults() != 0) return op.emitOpError("dimension #") - << dim_index << " not divisible by the number of result tensors"; + << *dim_index << " not divisible by the number of result tensors"; + + return success(); +} + +//===----------------------------------------------------------------------===// +// SplitVOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(SplitVOp op) { + auto split_sizes_type = + op.size_splits()->getType().dyn_cast(); + if (!split_sizes_type) return success(); + + if (split_sizes_type.getRank() != 1 || + split_sizes_type.getDimSize(0) != op.getNumResults()) + return op.emitOpError("split sizes should be a 1D tensor of ") + << op.getNumResults() << " elements"; + + Optional dim_index = 0; + if (failed(VerifySplitInputAndSplitDim(op, &dim_index))) return failure(); + if (!dim_index) return success(); + + int64_t input_dim_size = + op.value()->getType().cast().getDimSize(*dim_index); + if (input_dim_size == ShapedType::kDynamicSize) return success(); + + // If split sizes come from a constant, they must sum to the dimension size + // along split_dim, and we can have no more than one dynamic dimension. + DenseIntElementsAttr split_sizes_attr; + if (!matchPattern(op.size_splits(), m_Constant(&split_sizes_attr))) + return success(); + + int64_t total_dim_size = 0; // Total dimension size assigned to splits + llvm::Optional dynamic_dim_index; + + SmallVector split_sizes; + split_sizes.reserve( + split_sizes_attr.getType().cast().getNumElements()); + + for (auto dim : llvm::enumerate(split_sizes_attr)) { + int64_t dim_val = dim.value().getSExtValue(); + split_sizes.push_back(dim_val); + if (dim_val == ShapedType::kDynamicSize) { + // We cannot have more than one dynamic dimension. 
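For illustration (not part of the diff), the constraints that the new SplitV verifier places on a constant `size_splits` operand can be restated as a plain-Python sketch (this is not the compiler code, just the rules it enforces):

```python
def check_split_sizes(split_sizes, input_dim_size):
  """Mimics the tf.SplitV size checks described above (illustrative only)."""
  dynamic = [i for i, s in enumerate(split_sizes) if s == -1]
  if len(dynamic) > 1:
    raise ValueError("cannot have more than one dynamic dimension in split sizes")
  total = sum(s for s in split_sizes if s != -1)
  if not dynamic and total != input_dim_size:
    raise ValueError("split sizes must sum up to the dimension size, "
                     "found %d vs %d" % (total, input_dim_size))
  if dynamic and total > input_dim_size:
    raise ValueError("split sizes must sum up to at most the dimension size, "
                     "found %d vs %d" % (total, input_dim_size))

check_split_sizes([1, 2, 3], 6)   # OK: sizes sum to the split-dimension size.
check_split_sizes([-1, 4], 4)     # OK: the single -1 entry absorbs the rest.
# check_split_sizes([2, 3], 4)    # would raise: "found 5 vs 4".
```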
+ if (dynamic_dim_index) + return op.emitOpError( + "cannot have more than one dynamic dimension in split sizes"); + dynamic_dim_index = dim.index(); + } else { + total_dim_size += dim_val; + } + } + + if (!dynamic_dim_index && total_dim_size != input_dim_size) + return op.emitOpError( + "split sizes must sum up to the dimension size along split " + "dimension, found ") + << total_dim_size << " vs " << input_dim_size; + + if (dynamic_dim_index && total_dim_size > input_dim_size) + return op.emitOpError( + "split sizes must sum up to be less than or equal to the " + "dimension size along split dimension, found ") + << total_dim_size << " vs " << input_dim_size; return success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index e064c1a53ef..cc16b545c93 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1610,7 +1610,7 @@ func @testSplitUnknownDimInput(%input: tensor<4x?x4xf32>) { // ----- -func @testSplitNonConstSplitDim(%input: tensor<4x4xf32>, %split_dim: tensor<1xi32>) { +func @testSplitNonScalarSplitDim(%input: tensor<4x4xf32>, %split_dim: tensor<1xi32>) { // expected-error @+1 {{split dimension should be an integer scalar tensor}} %0:2 = "tf.Split"(%split_dim, %input) : (tensor<1xi32>, tensor<4x4xf32>) -> (tensor<*xf32>, tensor<*xf32>) return @@ -1674,3 +1674,82 @@ func @testTopKV2WrongKRank(%input: tensor<8xf32>, %k: tensor<5xi32>) { %0:2 = "tf.TopKV2"(%input, %k) : (tensor<8xf32>, tensor<5xi32>) -> (tensor<*xf32>, tensor<*xi32>) return } + +// ----- + +func @testSplitVScalarInput(%input: tensor, %split_sizes: tensor<2xi32>, %split_dim: tensor) { + // expected-error @+1 {{cannot split scalar input tensor}} + %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor, tensor<2xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) + return +} + +// ----- + +func @testSplitVNonScalarSplitDim(%input: tensor<4x4xf32>, %split_sizes: tensor<2xi32>, %split_dim: tensor<1xi32>) { + // expected-error @+1 {{split dimension should be an integer scalar tensor}} + %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x4xf32>, tensor<2xi32>, tensor<1xi32>) -> (tensor<*xf32>, tensor<*xf32>) + return +} + +// ----- + +func @testSplitVSplitDimOutOfRange(%input: tensor<4x4xf32>, %split_sizes: tensor<2xi32>) { + %split_dim = "tf.Const"() {value = dense<100>: tensor} : () -> (tensor) + // expected-error @+1 {{split dimension must be in range [-2, 2)}} + %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x4xf32>, tensor<2xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) + return +} + +// ----- + +func @testSplitVWrongSplitSizesType(%input: tensor<4x4xf32>, %split_sizes: tensor<2x2xi32>, %split_dim: tensor) { + // expected-error @+1 {{op split sizes should be a 1D tensor of 2 elements}} + %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x4xf32>, tensor<2x2xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) + return +} + +// ----- + +func @testSplitVMultipleDynamicSizes(%input: tensor<4x4xf32>) { + %split_dim = "tf.Const"() {value = dense<1>: tensor} : () -> (tensor) + %split_sizes = "tf.Const"() {value = dense<[-1, -1]>: tensor<2xi32>} : () -> (tensor<2xi32>) + // expected-error @+1 {{cannot have more than one dynamic dimension in split sizes}} + %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x4xf32>, tensor<2xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) + return +} + +// ----- + +func 
@testSplitVSplitSizeOutOfRange(%input: tensor<4x4xf32>) { + %split_dim = "tf.Const"() {value = dense<1>: tensor} : () -> (tensor) + %split_sizes = "tf.Const"() {value = dense<[-1, 100]>: tensor<2xi32>} : () -> (tensor<2xi32>) + // expected-error @+1 {{split sizes must sum up to be less than or equal to the dimension size along split dimension, found 100 vs 4}} + %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x4xf32>, tensor<2xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) + return +} + +// ----- + +func @testSplitVSplitSizeOutOfRange(%input: tensor<4x4xf32>) { + %split_dim = "tf.Const"() {value = dense<1>: tensor} : () -> (tensor) + %split_sizes = "tf.Const"() {value = dense<[2, 3]>: tensor<2xi32>} : () -> (tensor<2xi32>) + // expected-error @+1 {{split sizes must sum up to the dimension size along split dimension, found 5 vs 4}} + %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x4xf32>, tensor<2xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) + return +} + +// ----- + +func @testSplitV1(%input: tensor<4x4xf32>) { + %split_dim = "tf.Const"() {value = dense<1>: tensor} : () -> (tensor) + %split_sizes = "tf.Const"() {value = dense<[-1, 4]>: tensor<2xi32>} : () -> (tensor<2xi32>) + %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x4xf32>, tensor<2xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) + return +} + +func @testSplitV2(%input: tensor<4x4xf32>) { + %split_dim = "tf.Const"() {value = dense<1>: tensor} : () -> (tensor) + %split_sizes = "tf.Const"() {value = dense<[3, 1]>: tensor<2xi32>} : () -> (tensor<2xi32>) + %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x4xf32>, tensor<2xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) + return +} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 7bc9614b72e..5620b9012a7 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1965,11 +1965,12 @@ func @split_match_and_split_into_two_dynamic(%input: tensor<4x?xf32>) -> (tensor } // CHECK-LABEL: @split_match_and_split_into_three +// CHECK-SAME: (%[[ARG:.*]]: tensor<4x6xf32>) func @split_match_and_split_into_three(%input: tensor<4x6xf32>) -> (tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>) { %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - // CHECK: %[[ONE:.*]] = "xla_hlo.slice"(%arg0) {limit_indices = dense<[4, 2]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> - // CHECK: %[[TWO:.*]] = "xla_hlo.slice"(%arg0) {limit_indices = dense<4> : tensor<2xi64>, start_indices = dense<[0, 2]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> - // CHECK: %[[THREE:.*]] = "xla_hlo.slice"(%arg0) {limit_indices = dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[0, 4]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> + // CHECK: %[[ONE:.*]] = "xla_hlo.slice"(%[[ARG]]) {limit_indices = dense<[4, 2]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> + // CHECK: %[[TWO:.*]] = "xla_hlo.slice"(%[[ARG]]) {limit_indices = dense<4> : tensor<2xi64>, start_indices = dense<[0, 2]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> + // CHECK: %[[THREE:.*]] = "xla_hlo.slice"(%[[ARG]]) {limit_indices = 
dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[0, 4]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> %0:3 = "tf.Split"(%cst, %input) : (tensor, tensor<4x6xf32>) -> (tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32>) // CHECK: return %[[ONE]], %[[TWO]], %[[THREE]] return %0#0, %0#1, %0#2 : tensor<4x2xf32>, tensor<4x2xf32>, tensor<4x2xf32> @@ -2013,3 +2014,42 @@ func @topk_v2(%input: tensor<16x16xf32>) -> (tensor<16x8xf32>, tensor<16x8xi32>) %0:2 = "tf.TopKV2"(%input, %k): (tensor<16x16xf32>, tensor) -> (tensor<16x8xf32>, tensor<16x8xi32>) return %0#0, %0#1: tensor<16x8xf32>, tensor<16x8xi32> } + +//===----------------------------------------------------------------------===// +// tf.SplitV legalization +//===----------------------------------------------------------------------===// + +// CHECK-LABEL: @splitv_match_and_split_into_three +// CHECK-SAME: (%[[ARG:.*]]: tensor<4x6xf32>) +func @splitv_match_and_split_into_three(%input: tensor<4x6xf32>) -> (tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32>) { + %split_sizes = "tf.Const"() {value = dense<[1, 2, 3]> : tensor<3xi32>} : () -> tensor<3xi32> + %split_dim = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %[[ONE:.*]] = "xla_hlo.slice"(%[[ARG]]) {limit_indices = dense<[4, 1]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x1xf32> + // CHECK: %[[TWO:.*]] = "xla_hlo.slice"(%[[ARG]]) {limit_indices = dense<[4, 3]> : tensor<2xi64>, start_indices = dense<[0, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x2xf32> + // CHECK: %[[THREE:.*]] = "xla_hlo.slice"(%[[ARG]]) {limit_indices = dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[0, 3]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor<4x6xf32>) -> tensor<4x3xf32> + %0:3 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x6xf32>, tensor<3xi32>, tensor) -> (tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32>) + // CHECK: return %[[ONE]], %[[TWO]], %[[THREE]] + return %0#0, %0#1, %0#2 : tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32> +} + +// CHECK-LABEL: @splitv_match_and_split_into_three_dynamic +func @splitv_match_and_split_into_three_dynamic(%input: tensor) -> (tensor, tensor, tensor) { + %split_sizes = "tf.Const"() {value = dense<[1, 2, 3]> : tensor<3xi32>} : () -> tensor<3xi32> + %split_dim = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, 1]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor) -> tensor + // CHECK: "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, 3]> : tensor<2xi64>, start_indices = dense<[0, 1]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor) -> tensor + // CHECK: "xla_hlo.slice"(%{{.*}}) {limit_indices = dense<[-1, 6]> : tensor<2xi64>, start_indices = dense<[0, 3]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} : (tensor) -> tensor + %0:3 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor, tensor<3xi32>, tensor) -> (tensor, tensor, tensor) + return %0#0, %0#1, %0#2 : tensor, tensor, tensor +} + +// CHECK-LABEL: @splitv_dynamic_dim_in_split_sizes +func @splitv_dynamic_dim_in_split_sizes(%input: tensor<4x6xf32>) -> (tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32>) { + %split_sizes = "tf.Const"() {value = dense<[1, -1, 3]> : tensor<3xi32>} : () -> tensor<3xi32> + %split_dim = "tf.Const"() {value = 
dense<1> : tensor} : () -> tensor + // CHECK: limit_indices = dense<[4, 1]> : tensor<2xi64>, start_indices = dense<0> : tensor<2xi64> + // CHECK: limit_indices = dense<[4, 3]> : tensor<2xi64>, start_indices = dense<[0, 1]> : tensor<2xi64> + // CHECK: limit_indices = dense<[4, 6]> : tensor<2xi64>, start_indices = dense<[0, 3]> : tensor<2xi64> + %0:3 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x6xf32>, tensor<3xi32>, tensor) -> (tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32>) + return %0#0, %0#1, %0#2 : tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32> +} diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index b427e0124c0..57299525019 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -925,7 +925,7 @@ class ConvertSizeOp : public OpRewritePattern { }; // Converts the tf.Split op into a series of HLO slice ops when the tensor to be -// split has fuly static shape and the dimension to split is a constant. +// split has fully static shape and the dimension to split is a constant. // // The main logic of this pattern is to calculate the index start and end range // for each slice. And this happens only on the dimension to be split; for all @@ -1016,6 +1016,118 @@ class ConvertSplitOp : public OpRewritePattern { } }; +// Converts the tf.SplitV op into a series of HLO slice ops when the tensor to +// be split has fully static shape and the dimension to split and split sizes +// are constants. +// +// This is similar to the conversion for tf.Split op other than that the size of +// each chunk on the dimension to split is explicitly given as an op operand +// and they are not necessarily the same. +// +// For example, given the following IR: +// +// %split_sizes = "tf.Const"() {value = dense<[1, -1, 3]> : tensor<3xi32>} +// %split_dim = "tf.Const"() {value = dense<1> : tensor} +// %0:3 = "tf.SplitV"(%input, %split_sizes, %split_dim) : +// (tensor<4x6xf32>, tensor<3xi32>, tensor) -> +// (tensor<4x1xf32>, tensor<4x2xf32>, tensor<4x3xf32>) +// +// We will generate slices following slices: +// %0 = "xla_hlo.slice"(%input) { +// limit_indices = dense<[4, 1]> : tensor<2xi64>, +// start_indices = dense<0> : tensor<2xi64>, +// strides = dense<1> : tensor<2xi64>} : +// (tensor<4x6xf32>) -> tensor<4x1xf32> +// %1 = "xla_hlo.slice"(%input) { +// limit_indices = dense<[4, 3]> : tensor<2xi64>, +// start_indices = dense<[0, 1]> : tensor<2xi64>, +// strides = dense<1> : tensor<2xi64>} : +// (tensor<4x6xf32>) -> tensor<4x2xf32> +// %2 = "xla_hlo.slice"(%input) { +// limit_indices = dense<[4, 6]> : tensor<2xi64>, +// start_indices = dense<[0, 3]> : tensor<2xi64>, +// strides = dense<1> : tensor<2xi64>} : +// (tensor<4x6xf32>) -> tensor<4x3xf32> +class ConvertSplitVOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(TF::SplitVOp op, + PatternRewriter &rewriter) const override { + // We can only split along static dimensions. + // TODO(b/145731001): enhance to support dynamic-shaped inputs. + auto input_type = op.value()->getType().dyn_cast(); + if (!input_type) return matchFailure(); + + // We can only match when the split dimension is a constant scalar. + DenseIntElementsAttr split_dim_attr; + if (!matchPattern(op.split_dim(), m_Constant(&split_dim_attr))) + return matchFailure(); + + // We can only match when the split sizes is a constant int vector. 
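+    // (If the size_splits operand is not produced by a constant op, the pattern below simply fails to match and the op is left untouched by this pass.)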
+ DenseIntElementsAttr split_sizes_attr; + if (!matchPattern(op.size_splits(), m_Constant(&split_sizes_attr))) + return matchFailure(); + + // Get each chunck's size along the dimension to split. It may contain + // dynamic sizes and we need to update it if so. + SmallVector split_sizes; + int64_t total_dim_size = 0; // Total dimension size assigned to splits + llvm::Optional dynamic_dim_index; + split_sizes.reserve( + split_sizes_attr.getType().cast().getNumElements()); + for (auto dim : llvm::enumerate(split_sizes_attr)) { + int64_t dim_val = dim.value().getSExtValue(); + split_sizes.push_back(dim_val); + if (dim_val == ShapedType::kDynamicSize) { + // We cannot have more than one dynamic dimension. + assert(!dynamic_dim_index && "invalid split sizes"); + dynamic_dim_index = dim.index(); + } else { + total_dim_size += dim_val; + } + } + + // Get the dimension we are splitting at. Offset properly if it's negative. + int64_t input_rank = input_type.getRank(); + int64_t dim_index = (*split_dim_attr.begin()).getSExtValue(); + if (dim_index < 0) dim_index += input_rank; + + int64_t input_dim_size = input_type.getDimSize(dim_index); + if (TensorType::isDynamic(input_dim_size)) return matchFailure(); + + assert(((dynamic_dim_index && total_dim_size <= input_dim_size) || + (!dynamic_dim_index && total_dim_size == input_dim_size)) && + "invalid split sizes"); + + // Update the dynamic dimension with calculated concrete size. + if (dynamic_dim_index) + split_sizes[*dynamic_dim_index] = input_dim_size - total_dim_size; + + // Parameters for constructing each slice. + SmallVector begin_indices(input_rank, 0); + auto end_indices = llvm::to_vector<4>(input_type.getShape()); + SmallVector strides(input_rank, 1); + + // All HLO slice results used to replace the original tf.Split op. + SmallVector slices; + slices.reserve(op.getNumResults()); + + for (int i = 0; i < op.getNumResults(); ++i) { + end_indices[dim_index] = begin_indices[dim_index] + split_sizes[i]; + slices.push_back(rewriter.create( + op.getLoc(), op.value(), GetI64ElementsAttr(begin_indices, &rewriter), + GetI64ElementsAttr(end_indices, &rewriter), + GetI64ElementsAttr(strides, &rewriter))); + // Prepare the begin indice for the next slice. + begin_indices[dim_index] = end_indices[dim_index]; + } + + rewriter.replaceOp(op, slices); + return matchSuccess(); + } +}; + // Converts StridedSlice op to HLO Slice op along with Reverse op to handle // negative strides and Reshape op to update the output shape. Indices and // strides operands are converted to attributes with non-negative indexing. @@ -2018,16 +2130,16 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { // level TensorFlow ops. So, we don't have to target all the TensorFlow ops // here for lowering to HLO. 
TF::PopulateLoweringTFPatterns(context, &patterns); - patterns - .insert, - ConvertSoftmaxOp, ConvertSplitOp, - ConvertStridedSliceOp, ConvertTopKV2Op, ConvertMeanOp, - ConvertSumOp, ConvertMaxOp, ConvertTileOp, ConvertMaxPoolGradOp, - ConvertOneHotOp, ConvertConv2DBackpropInputOp, - ConvertConv2DBackpropFilterOp>(op->getContext()); + patterns.insert< + ConvertArgMaxOp, ConvertBF16FloorDivOp, ConvertConv2D, ConvertEinsumOp, + ConvertMaxPoolOp, ConvertRangeOp, ConvertSigmoidOp, ConvertSizeOp, + ConvertMaxPoolOp, ConvertRangeOp, ConvertSigmoidOp, + ConvertSoftmaxOp, + ConvertSoftmaxOp, ConvertSplitOp, ConvertSplitVOp, + ConvertStridedSliceOp, ConvertTopKV2Op, ConvertMeanOp, ConvertSumOp, + ConvertMaxOp, ConvertTileOp, ConvertMaxPoolGradOp, ConvertOneHotOp, + ConvertConv2DBackpropInputOp, ConvertConv2DBackpropFilterOp>( + op->getContext()); ConversionTarget target(*context); target.addLegalDialect(); From c7792f8c83e3bea4b902204e7c9a2122e74ea091 Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Thu, 5 Dec 2019 13:38:29 -0800 Subject: [PATCH 196/383] Add more detailed documentation to MaxPooling2D including examples. PiperOrigin-RevId: 284042230 Change-Id: I0e1a16650a8df17720928cc5d079402286c607f0 --- tensorflow/python/keras/layers/pooling.py | 77 +++++++++++++++++++++-- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py index 36858674646..4b3083c9143 100644 --- a/tensorflow/python/keras/layers/pooling.py +++ b/tensorflow/python/keras/layers/pooling.py @@ -280,18 +280,81 @@ class Pooling2D(Layer): @keras_export('keras.layers.MaxPool2D', 'keras.layers.MaxPooling2D') class MaxPooling2D(Pooling2D): - """Max pooling operation for spatial data. + """Max pooling operation for 2D spatial data. + + Downsamples the input representation by taking the maximum value over the + window defined by `pool_size` for each dimension along the features axis. + The window is shifted by `strides` in each dimension. The resulting output + when using "valid" padding option has a shape(number of rows or columns) of: + `output_shape = (input_shape - pool_size + 1) / strides)` + + The resulting output shape when using the "same" padding option is: + `output_shape = input_shape / strides` + + For example, for stride=(1,1) and padding="valid": + + >>> x = tf.constant([[1., 2., 3.], + ... [4., 5., 6.], + ... [7., 8., 9.]]) + >>> x = tf.reshape(x, [1, 3, 3, 1]) + >>> max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), + ... strides=(1, 1), padding='valid') + >>> max_pool_2d(x) + + + For example, for stride=(2,2) and padding="valid": + + >>> x = tf.constant([[1., 2., 3., 4.], + ... [5., 6., 7., 8.], + ... [9., 10., 11., 12.]]) + >>> x = tf.reshape(x, [1, 3, 4, 1]) + >>> max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), + ... strides=(1, 1), padding='valid') + >>> max_pool_2d(x) + + + For example, for stride=(1,1) and padding="same": + + >>> x = tf.constant([[1., 2., 3.], + ... [4., 5., 6.], + ... [7., 8., 9.]]) + >>> x = tf.reshape(x, [1, 3, 3, 1]) + >>> max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(2, 2), + ... strides=(1, 1), padding='same') + >>> max_pool_2d(x) + Arguments: pool_size: integer or tuple of 2 integers, - factors by which to downscale (vertical, horizontal). - `(2, 2)` will halve the input in both spatial dimension. + window size over which to take the maximum. + `(2, 2)` will take the max value over a 2x2 pooling window. 
If only one integer is specified, the same window length will be used for both dimensions. strides: Integer, tuple of 2 integers, or None. - Strides values. - If None, it will default to `pool_size`. + Strides values. Specifies how far the pooling window moves + for each pooling step. If None, it will default to `pool_size`. padding: One of `"valid"` or `"same"` (case-insensitive). + "valid" adds no zero padding. "same" adds padding such that if the stride + is 1, the output shape is the same as input shape. data_format: A string, one of `channels_last` (default) or `channels_first`. The ordering of the dimensions in the inputs. @@ -314,6 +377,10 @@ class MaxPooling2D(Pooling2D): 4D tensor with shape `(batch_size, pooled_rows, pooled_cols, channels)`. - If `data_format='channels_first'`: 4D tensor with shape `(batch_size, channels, pooled_rows, pooled_cols)`. + + Returns: + A tensor of rank 4 representing the maximum pooled values. See above for + output shape. """ def __init__(self, From 76c66a73569eb70cd6e6dc4831ddc65ef62fbc2c Mon Sep 17 00:00:00 2001 From: Clayne Robison Date: Thu, 5 Dec 2019 14:58:42 -0700 Subject: [PATCH 197/383] Removing the link to public CI release artifacts. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9cac16619bd..88c5c248917 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ Build Type **Linux ppc64le GPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/) **Linux ppc64le GPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_GPU_Release_Build/) **Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) -**Linux CPU with Intel® MKL-DNN** Stable Release | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/) | Release [1.15](https://pypi.org/project/intel-tensorflow/1.15.0/) / [2.x](https://pypi.org/project/intel-tensorflow/) +**Linux CPU with Intel® MKL-DNN** Stable Release | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)] | Release [1.15](https://pypi.org/project/intel-tensorflow/1.15.0/) / [2.x](https://pypi.org/project/intel-tensorflow/) **Red Hat® Enterprise Linux® 7.6 CPU & GPU**
Python 2.7, 3.6 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 PyPI](https://tensorflow.pypi.thoth-station.ninja/index/) ## Resources From df097f30d9e9ee952a3f679d1810e0f31f142a93 Mon Sep 17 00:00:00 2001 From: Gunhan Gulsoy Date: Thu, 5 Dec 2019 13:50:32 -0800 Subject: [PATCH 198/383] Update logical_xor and logical_and documentation. -Move logical_and endpoint to python. -Expand their docstrings. -Write extensive testable examples for these two ops. PiperOrigin-RevId: 284044955 Change-Id: I9f727d3e33de457a27ae4336b659d5d6da090962 --- .../python_api/api_def_LogicalAnd.pbtxt | 7 +- tensorflow/python/ops/math_ops.py | 79 ++++++++++++++++--- 2 files changed, 69 insertions(+), 17 deletions(-) diff --git a/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt b/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt index d4e6a7a380e..3be990d47e1 100644 --- a/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_LogicalAnd.pbtxt @@ -1,9 +1,4 @@ op { graph_op_name: "LogicalAnd" - endpoint { - name: "math.logical_and" - } - endpoint { - name: "logical_and" - } + visibility: HIDDEN } diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 47f4742f4be..c3f453a0275 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -1268,24 +1268,39 @@ def logical_xor(x, y, name="LogicalXor"): x ^ y = (x | y) & ~(x & y) - Inputs are tensor and if the tensors contains more than one element, an - element-wise logical XOR is computed. + The operation works for the following input types: + + - Two single elements of type `bool` + - One `tf.Tensor` of type `bool` and one single `bool`, where the result will + be calculated by applying logical XOR with the single element to each + element in the larger Tensor. + - Two `tf.Tensor` objects of type `bool` of the same shape. In this case, + the result will be the element-wise logical XOR of the two input tensors. Usage: - ```python - x = tf.constant([False, False, True, True], dtype = tf.bool) - y = tf.constant([False, True, False, True], dtype = tf.bool) - z = tf.logical_xor(x, y, name="LogicalXor") - # here z = [False True True False] - ``` + >>> a = tf.constant([True]) + >>> b = tf.constant([False]) + >>> tf.math.logical_xor(a, b) + + + >>> c = tf.constant([True]) + >>> x = tf.constant([False, True, True, False]) + >>> tf.math.logical_xor(c, x) + + + >>> y = tf.constant([False, False, True, True]) + >>> z = tf.constant([False, True, False, True]) + >>> tf.math.logical_xor(y, z) + Args: - x: A `Tensor` type bool. - y: A `Tensor` of type bool. + x: A `tf.Tensor` type bool. + y: A `tf.Tensor` of type bool. + name: A name for the operation (optional). Returns: - A `Tensor` of type bool with the same size as that of x or y. + A `tf.Tensor` of type bool with the same size as that of x or y. """ # TODO(alemi) Make this a cwise op if people end up relying on it. return gen_math_ops.logical_and( @@ -1294,6 +1309,48 @@ def logical_xor(x, y, name="LogicalXor"): name=name) +@tf_export("math.logical_and", "logical_and") +@dispatch.add_dispatch_support +def logical_and(x, y, name=None): + """Logical AND function. 
+ + The operation works for the following input types: + + - Two single elements of type `bool` + - One `tf.Tensor` of type `bool` and one single `bool`, where the result will + be calculated by applying logical AND with the single element to each + element in the larger Tensor. + - Two `tf.Tensor` objects of type `bool` of the same shape. In this case, + the result will be the element-wise logical AND of the two input tensors. + + Usage: + + >>> a = tf.constant([True]) + >>> b = tf.constant([False]) + >>> tf.math.logical_and(a, b) + + + >>> c = tf.constant([True]) + >>> x = tf.constant([False, True, True, False]) + >>> tf.math.logical_and(c, x) + + + >>> y = tf.constant([False, False, True, True]) + >>> z = tf.constant([False, True, False, True]) + >>> tf.math.logical_and(y, z) + + + Args: + x: A `tf.Tensor` type bool. + y: A `tf.Tensor` of type bool. + name: A name for the operation (optional). + + Returns: + A `tf.Tensor` of type bool with the same size as that of x or y. + """ + return gen_math_ops.logical_and(x, y, name) + + _OverrideBinaryOperatorHelper(gen_math_ops.logical_and, "and") _OverrideBinaryOperatorHelper(gen_math_ops.logical_or, "or") _OverrideBinaryOperatorHelper(logical_xor, "xor") From 3cc04ef616b3b1dbbef3ad1d0635f9e1824173bb Mon Sep 17 00:00:00 2001 From: Allen Lavoie Date: Thu, 5 Dec 2019 13:55:06 -0800 Subject: [PATCH 199/383] More docfixes for initializers Adds a doctested example for the remaining initializers. It's mostly the same example with tweaks (which makes sense; the whole point of an initializer is to have a consistent interface). Removes some additional compat.v1s and makes a couple other minor tweaks. PiperOrigin-RevId: 284045909 Change-Id: I97e1fc81c49ac32d48596fb0a3043043f5bd5106 --- tensorflow/python/ops/init_ops_v2.py | 266 ++++++++++++++++++++++----- 1 file changed, 224 insertions(+), 42 deletions(-) diff --git a/tensorflow/python/ops/init_ops_v2.py b/tensorflow/python/ops/init_ops_v2.py index 9c403d6a7b4..4999c4d8aac 100644 --- a/tensorflow/python/ops/init_ops_v2.py +++ b/tensorflow/python/ops/init_ops_v2.py @@ -392,10 +392,29 @@ class RandomNormal(Initializer): class TruncatedNormal(Initializer): """Initializer that generates a truncated normal distribution. - These values are similar to values from a `random_normal_initializer` - except that values more than two standard deviations from the mean - are discarded and re-drawn. This is the recommended initializer for - neural network weights and filters. + Initializers allow you to pre-specify an initialization strategy, encoded in + the Initializer object, without knowing the shape and dtype of the variable + being initialized. + + These values are similar to values from a `tf.initializers.RandomNormal` + except that values more than two standard deviations from the mean are + discarded and re-drawn. This is the recommended initializer for neural network + weights and filters. + + Examples: + + >>> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables( + ... 3, tf.initializers.TruncatedNormal(mean=1., stddev=2.)) + >>> v1 + + >>> v2 + >> make_variables(4, tf.initializers.RandomUniform(minval=-1., maxval=1.)) + (, >> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k], dtype=tf.float32)), + ... 
tf.Variable(initializer(shape=[k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.initializers.VarianceScaling(scale=1.)) + >>> v1 + + >>> v2 + >> make_variables(4, tf.initializers.VarianceScaling(distribution='uniform')) + (, >> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k, k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.initializers.Orthogonal()) + >>> v1 + >> v2 + >> make_variables(4, tf.initializers.Orthogonal(gain=0.5)) + (>> def make_variable(k, initializer): + ... return tf.Variable(initializer(shape=[k, k], dtype=tf.float32)) + >>> make_variable(2, tf.initializers.Identity()) + + >>> make_variable(3, tf.initializers.Identity(gain=0.5)) + Args: gain: Multiplicative factor to apply to the identity matrix. @@ -623,6 +693,7 @@ class Identity(Initializer): Raises: ValueError: If the dtype is not floating point + ValueError: If the requested shape does not have exactly two axes. """ partition_info = None # Keeps logic so can be readded later if necessary dtype = _assert_float_dtype(dtype) @@ -643,15 +714,32 @@ class Identity(Initializer): class GlorotUniform(VarianceScaling): """The Glorot uniform initializer, also called Xavier uniform initializer. - It draws samples from a uniform distribution within [-limit, limit] - where `limit` is `sqrt(6 / (fan_in + fan_out))` - where `fan_in` is the number of input units in the weight tensor - and `fan_out` is the number of output units in the weight tensor. + Initializers allow you to pre-specify an initialization strategy, encoded in + the Initializer object, without knowing the shape and dtype of the variable + being initialized. + + Draws samples from a uniform distribution within [-limit, limit] where `limit` + is `sqrt(6 / (fan_in + fan_out))` where `fan_in` is the number of input units + in the weight tensor and `fan_out` is the number of output units in the weight + tensor. + + Examples: + + >>> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k, k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.initializers.GlorotUniform()) + >>> v1 + >> v2 + >> make_variables(4, tf.initializers.RandomNormal()) + (>> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k, k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.initializers.GlorotNormal()) + >>> v1 + >> v2 + >> make_variables(4, tf.initializers.RandomNormal()) + (>> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k, k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.initializers.lecun_normal()) + >>> v1 + >> v2 + >> make_variables(4, tf.initializers.RandomNormal()) + (>> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k, k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.initializers.lecun_uniform()) + >>> v1 + >> v2 + >> make_variables(4, tf.initializers.RandomNormal()) + (>> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k, k], dtype=tf.float32)), + ... 
tf.Variable(initializer(shape=[k, k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.initializers.he_normal()) + >>> v1 + >> v2 + >> make_variables(4, tf.initializers.RandomNormal()) + (>> def make_variables(k, initializer): + ... return (tf.Variable(initializer(shape=[k, k], dtype=tf.float32)), + ... tf.Variable(initializer(shape=[k, k, k], dtype=tf.float32))) + >>> v1, v2 = make_variables(3, tf.initializers.he_uniform()) + >>> v1 + >> v2 + >> make_variables(4, tf.initializers.RandomNormal()) + ( Date: Thu, 5 Dec 2019 14:00:27 -0800 Subject: [PATCH 200/383] Add fully quantize test for tranpose operator. PiperOrigin-RevId: 284047075 Change-Id: I83b211bc911567dae4a97c50817374b42052d13a --- tensorflow/lite/testing/BUILD | 1 + tensorflow/lite/testing/op_tests/transpose.py | 18 +++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/testing/BUILD b/tensorflow/lite/testing/BUILD index 25da7cedf01..7ccf30d45fd 100644 --- a/tensorflow/lite/testing/BUILD +++ b/tensorflow/lite/testing/BUILD @@ -513,6 +513,7 @@ edgetpu_ops = [ "sub", "sum", # high error "tanh", + "transpose", "transpose_conv", ] diff --git a/tensorflow/lite/testing/op_tests/transpose.py b/tensorflow/lite/testing/op_tests/transpose.py index ba32783e3ac..9b7e026269f 100644 --- a/tensorflow/lite/testing/op_tests/transpose.py +++ b/tensorflow/lite/testing/op_tests/transpose.py @@ -34,16 +34,31 @@ def make_transpose_tests(options): "input_shape": [[2, 2, 3]], "perm": [[0, 1, 2], [0, 2, 1]], "constant_perm": [True, False], + "fully_quantize": [False], }, { "dtype": [tf.float32], "input_shape": [[1, 2, 3, 4]], "perm": [[0, 1, 2, 3], [3, 0, 1, 2]], "constant_perm": [True, False], + "fully_quantize": [False], }, { "dtype": [tf.float32], "input_shape": [[1, 2, 3, 4, 5]], "perm": [[4, 3, 2, 1, 0]], "constant_perm": [True, False], + "fully_quantize": [False], + }, { + "dtype": [tf.float32], + "input_shape": [[2, 2, 3]], + "perm": [[0, 1, 2], [0, 2, 1]], + "constant_perm": [True], + "fully_quantize": [True], + }, { + "dtype": [tf.float32], + "input_shape": [[1, 2, 3, 4]], + "perm": [[0, 1, 2, 3], [3, 0, 1, 2]], + "constant_perm": [True], + "fully_quantize": [True], }] def build_graph(parameters): @@ -66,7 +81,8 @@ def make_transpose_tests(options): def build_inputs(parameters, sess, inputs, outputs): values = [ - create_tensor_data(parameters["dtype"], parameters["input_shape"]) + create_tensor_data(parameters["dtype"], parameters["input_shape"], + min_value=-1, max_value=1) ] if not parameters["constant_perm"]: values.append(np.array(parameters["perm"])) From 8d3a2cbf7892176c5bced17df03499e2618d8297 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 14:19:41 -0800 Subject: [PATCH 201/383] Set async flag correctly for memcpy device events when using CuptiDriverApiHookWithCudaEvent. 
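Previously the CUDA-event recorder reported every memcpy as synchronous (memcpy_info.async was hard-coded to false); MemcpyRecord now carries whether the driver call was one of the Async variants, and that flag is forwarded when the event is handed to the collector.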
PiperOrigin-RevId: 284052255 Change-Id: I29d3e3b50597cbda0a984b57bdefb65920419ab3 --- .../profiler/internal/gpu/cupti_tracer.cc | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index 17b4362eca2..8736f777da4 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -644,6 +644,7 @@ struct MemcpyRecord { CUcontext context; CUstream stream; uint32 correlation_id; + bool async; CUevent start_event; CUevent stop_event; uint64 start_timestamp; @@ -744,9 +745,10 @@ class CudaEventRecorder { // Registers the start of a copy operation. The returned index should be // passed to StopMemcpy() after the memcpy has completed. size_t StartMemcpy(CuptiTracerEventType type, size_t size_bytes, - CUcontext context, CUstream stream, - uint32 correlation_id) { - MemcpyRecord record = {type, size_bytes, context, stream, correlation_id}; + CUcontext context, CUstream stream, uint32 correlation_id, + bool async) { + MemcpyRecord record = {type, size_bytes, context, + stream, correlation_id, async}; record.start_timestamp = CuptiTracer::GetTimestamp(); LogIfError(CreateAndRecordEvent(&record.start_event, stream)); absl::MutexLock lock(&mutex_); @@ -953,8 +955,7 @@ class CudaEventRecorder { event.memcpy_info.num_bytes = record.size_bytes; // TODO: support MemcpyD2D where destination != source; event.memcpy_info.destination = ordinal_; - // TODO: support differentiate sync and async memcpy. - event.memcpy_info.async = false; + event.memcpy_info.async = record.async; // TODO: set src_mem_kind and dst_mem_kind. collector_->AddEvent(std::move(event)); return Status::OK(); @@ -1173,16 +1174,16 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook { auto params = static_cast(cbdata->functionParams); *cbdata->correlationData = recorder->StartMemcpy(type, params->ByteCount, cbdata->context, nullptr, - cbdata->correlationId); + cbdata->correlationId, /*async*/ false); } template static void StartMemcpyAsync(CuptiTracerEventType type, const CUpti_CallbackData *cbdata, CudaEventRecorder *recorder) { auto params = static_cast(cbdata->functionParams); - *cbdata->correlationData = - recorder->StartMemcpy(type, params->ByteCount, cbdata->context, - params->hStream, cbdata->correlationId); + *cbdata->correlationData = recorder->StartMemcpy( + type, params->ByteCount, cbdata->context, params->hStream, + cbdata->correlationId, /*async*/ true); } static CUmemorytype GetMemoryType(CUdeviceptr ptr) { From c7061962262c8cacc4017b352f32d2134216ca4a Mon Sep 17 00:00:00 2001 From: Pavithra Vijay Date: Thu, 5 Dec 2019 14:31:12 -0800 Subject: [PATCH 202/383] Add/Fix docs for losses. PiperOrigin-RevId: 284054999 Change-Id: I21cd8286c367ec291b176a6ff7be7663a7f0485d --- tensorflow/python/keras/losses.py | 344 ++++++++++++++++++++---------- 1 file changed, 229 insertions(+), 115 deletions(-) diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py index 8370fdf03fc..4ddf176094a 100644 --- a/tensorflow/python/keras/losses.py +++ b/tensorflow/python/keras/losses.py @@ -48,7 +48,7 @@ class Loss(object): * `call()`: Contains the logic for loss calculation using `y_true`, `y_pred`. Example subclass implementation: - ``` + ```python class MeanSquaredError(Loss): def call(self, y_true, y_pred): y_pred = ops.convert_to_tensor(y_pred) @@ -66,7 +66,7 @@ class Loss(object): details on this. 
You can implement 'SUM_OVER_BATCH_SIZE' using global batch size like: - ``` + ```python with strategy.scope(): loss_obj = tf.keras.losses.CategoricalCrossentropy( reduction=tf.keras.losses.Reduction.NONE) @@ -236,11 +236,15 @@ class MeanSquaredError(LossFunctionWrapper): Usage: - ```python - mse = tf.keras.losses.MeanSquaredError() - loss = mse([0., 0., 1., 1.], [1., 1., 1., 0.]) - print('Loss: ', loss.numpy()) # Loss: 0.75 - ``` + >>> mse = tf.keras.losses.MeanSquaredError() + >>> loss = mse([[0., 1.], [0., 0.]], [[1., 1.], [1., 0.]]) + >>> loss.numpy() + 0.5 + + >>> loss = mse([[0., 1.], [0., 0.]], [[1., 1.], [1., 0.]], + ... sample_weight=[0.7, 0.3]) + >>> loss.numpy() + 0.25 Usage with the `compile` API: @@ -265,11 +269,15 @@ class MeanAbsoluteError(LossFunctionWrapper): Usage: - ```python - mae = tf.keras.losses.MeanAbsoluteError() - loss = mae([0., 0., 1., 1.], [1., 1., 1., 0.]) - print('Loss: ', loss.numpy()) # Loss: 0.75 - ``` + >>> mae = tf.keras.losses.MeanAbsoluteError() + >>> loss = mae([[0., 1.], [0., 0.]], [[1., 1.], [1., 0.]]) + >>> loss.numpy() + 0.5 + + >>> loss = mae([[0., 1.], [0., 0.]], [[1., 1.], [1., 0.]], + ... sample_weight=[0.7, 0.3]) + >>> loss.numpy() + 0.25 Usage with the `compile` API: @@ -294,11 +302,15 @@ class MeanAbsolutePercentageError(LossFunctionWrapper): Usage: - ```python - mape = tf.keras.losses.MeanAbsolutePercentageError() - loss = mape([0., 0., 1., 1.], [1., 1., 1., 0.]) - print('Loss: ', loss.numpy()) # Loss: 5e+08 - ``` + >>> mape = tf.keras.losses.MeanAbsolutePercentageError() + >>> loss = mape([[0., 1.], [0., 0.]], [[1., 1.], [1., 0.]]) + >>> loss.numpy() + 500000000.0 + + >>> loss = mape([[0., 1.], [0., 0.]], [[1., 1.], [1., 0.]], + ... sample_weight=[0.7, 0.3]) + >>> loss.numpy() + 250000000.0 Usage with the `compile` API: @@ -323,11 +335,15 @@ class MeanSquaredLogarithmicError(LossFunctionWrapper): Usage: - ```python - msle = tf.keras.losses.MeanSquaredLogarithmicError() - loss = msle([0., 0., 1., 1.], [1., 1., 1., 0.]) - print('Loss: ', loss.numpy()) # Loss: 0.36034 - ``` + >>> msle = tf.keras.losses.MeanSquaredLogarithmicError() + >>> loss = msle([[0., 1.], [0., 0.]], [[1., 1.], [1., 0.]]) + >>> loss.numpy() + 0.24022643 + + >>> loss = msle([[0., 1.], [0., 0.]], [[1., 1.], [1., 0.]], + ... sample_weight=[0.7, 0.3]) + >>> loss.numpy() + 0.12011322 Usage with the `compile` API: @@ -357,12 +373,15 @@ class BinaryCrossentropy(LossFunctionWrapper): `[batch_size]`. Usage: + >>> bce = tf.keras.losses.BinaryCrossentropy() + >>> loss = bce([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> loss.numpy() + 0.81492424 - ```python - bce = tf.keras.losses.BinaryCrossentropy() - loss = bce([0., 0., 1., 1.], [1., 1., 1., 0.]) - print('Loss: ', loss.numpy()) # Loss: 11.522857 - ``` + >>> loss = bce([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> loss.numpy() + 0.45814526 Usage with the `tf.keras` API: @@ -421,13 +440,17 @@ class CategoricalCrossentropy(LossFunctionWrapper): Usage: - ```python - cce = tf.keras.losses.CategoricalCrossentropy() - loss = cce( - [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]], - [[.9, .05, .05], [.05, .89, .06], [.05, .01, .94]]) - print('Loss: ', loss.numpy()) # Loss: 0.0945 - ``` + >>> cce = tf.keras.losses.CategoricalCrossentropy() + >>> loss = cce([[0, 1, 0], [0, 0, 1]], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + >>> loss.numpy() + 1.1769392 + + >>> loss = cce([[0, 1, 0], [0, 0, 1]], + ... [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], + ... 
sample_weight=tf.constant([0.3, 0.7])) + >>> loss.numpy() + 0.8135988 Usage with the `compile` API: @@ -439,7 +462,7 @@ class CategoricalCrossentropy(LossFunctionWrapper): Args: from_logits: Whether `y_pred` is expected to be a logits tensor. By default, we assume that `y_pred` encodes a probability distribution. - Note: Using from_logits=True may be more numerically stable. + **Note: Using from_logits=True is more numerically stable.** label_smoothing: Float in [0, 1]. When > 0, label values are smoothed, meaning the confidence on label values are relaxed. e.g. `label_smoothing=0.2` means that we will use a value of `0.1` for label @@ -486,13 +509,15 @@ class SparseCategoricalCrossentropy(LossFunctionWrapper): Usage: - ```python - cce = tf.keras.losses.SparseCategoricalCrossentropy() - loss = cce( - tf.convert_to_tensor([0, 1, 2]), - tf.convert_to_tensor([[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]])) - print('Loss: ', loss.numpy()) # Loss: 0.3239 - ``` + >>> scce = tf.keras.losses.SparseCategoricalCrossentropy() + >>> loss = scce([1, 2], [[0.05, 0.95, 0], [0.1, 0.8, 0.1]]) + >>> loss.numpy() + 1.1769392 + + >>> loss = scce([1, 2], [[0.05, 0.95, 0], [0.1, 0.8, 0.1]], + ... sample_weight=tf.constant([0.3, 0.7])) + >>> loss.numpy() + 0.8135988 Usage with the `compile` API: @@ -539,14 +564,14 @@ class Hinge(LossFunctionWrapper): Usage: - ```python - h = tf.keras.losses.Hinge() - loss = h([-1., 1., 1.], [0.6, -0.7, -0.5]) + >>> h = tf.keras.losses.Hinge() + >>> loss = h([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> loss.numpy() + 1.3 - # loss = max(0, 1 - y_true * y_pred) = [1.6 + 1.7 + 1.5] / 3 - - print('Loss: ', loss.numpy()) # Loss: 1.6 - ``` + >>> loss = h([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], sample_weight=[1, 0]) + >>> loss.numpy() + 0.55 Usage with the `compile` API: @@ -571,14 +596,14 @@ class SquaredHinge(LossFunctionWrapper): Usage: - ```python - sh = tf.keras.losses.SquaredHinge() - loss = sh([-1., 1., 1.], [0.6, -0.7, -0.5]) + >>> h = tf.keras.losses.SquaredHinge() + >>> loss = h([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> loss.numpy() + 1.86 - # loss = (max(0, 1 - y_true * y_pred))^2 = [1.6^2 + 1.7^2 + 1.5^2] / 3 - - print('Loss: ', loss.numpy()) # Loss: 2.566666 - ``` + >>> loss = h([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], sample_weight=[1, 0]) + >>> loss.numpy() + 0.73 Usage with the `compile` API: @@ -604,11 +629,14 @@ class CategoricalHinge(LossFunctionWrapper): Usage: - ```python - ch = tf.keras.losses.CategoricalHinge() - loss = ch([0., 1., 1.], [1., 0., 1.]) - print('Loss: ', loss.numpy()) # Loss: 1.0 - ``` + >>> h = tf.keras.losses.CategoricalHinge() + >>> loss = h([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> loss.numpy() + 1.4000001 + + >>> loss = h([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], sample_weight=[1, 0]) + >>> loss.numpy() + 0.6 Usage with the `compile` API: @@ -633,11 +661,15 @@ class Poisson(LossFunctionWrapper): Usage: - ```python - p = tf.keras.losses.Poisson() - loss = p([1., 9., 2.], [4., 8., 12.]) - print('Loss: ', loss.numpy()) # Loss: -0.35702705 - ``` + >>> p = tf.keras.losses.Poisson() + >>> loss = p([[0., 1.], [0., 0.]], [[1., 1.], [0., 0.]]) + >>> loss.numpy() + 0.49999997 + + >>> loss = p([[0., 1.], [0., 0.]], [[1., 1.], [0., 0.]], + ... 
sample_weight=[1., 0.]) + >>> loss.numpy() + 0.49999997 Usage with the `compile` API: @@ -660,11 +692,15 @@ class LogCosh(LossFunctionWrapper): Usage: - ```python - l = tf.keras.losses.LogCosh() - loss = l([0., 1., 1.], [1., 0., 1.]) - print('Loss: ', loss.numpy()) # Loss: 0.289 - ``` + >>> l = tf.keras.losses.LogCosh() + >>> loss = l([[0., 1.], [0., 0.]], [[1., 1.], [0., 0.]]) + >>> loss.numpy() + 0.10844523 + + >>> loss = l([[0., 1.], [0., 0.]], [[1., 1.], [0., 0.]], + ... sample_weight=[1., 0.]) + >>> loss.numpy() + 0.10844523 Usage with the `compile` API: @@ -688,11 +724,15 @@ class KLDivergence(LossFunctionWrapper): Usage: - ```python - k = tf.keras.losses.KLDivergence() - loss = k([.4, .9, .2], [.5, .8, .12]) - print('Loss: ', loss.numpy()) # Loss: 0.11891246 - ``` + >>> kl = tf.keras.losses.KLDivergence() + >>> loss = kl([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> loss.numpy() + 0.45814306 + + >>> loss = kl([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> loss.numpy() + 0.4581446 Usage with the `compile` API: @@ -723,11 +763,15 @@ class Huber(LossFunctionWrapper): Usage: - ```python - l = tf.keras.losses.Huber() - loss = l([0., 1., 1.], [1., 0., 1.]) - print('Loss: ', loss.numpy()) # Loss: 0.333 - ``` + >>> h = tf.keras.losses.Huber() + >>> loss = h([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]]) + >>> loss.numpy() + 0.155 + + >>> loss = h([[0, 1], [0, 0]], [[0.6, 0.4], [0.4, 0.6]], + ... sample_weight=[1, 0]) + >>> loss.numpy() + 0.09 Usage with the `compile` API: @@ -766,6 +810,17 @@ class Huber(LossFunctionWrapper): 'keras.losses.mse', 'keras.losses.MSE') def mean_squared_error(y_true, y_pred): + """Computes the mean squared error between labels and predictions. + + `loss = square(y_true - y_pred)` + + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + + Returns: + Mean squared error values. shape = `[batch_size, d0, .. dN-1]`. + """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) return K.mean(math_ops.squared_difference(y_pred, y_true), axis=-1) @@ -778,6 +833,17 @@ def mean_squared_error(y_true, y_pred): 'keras.losses.mae', 'keras.losses.MAE') def mean_absolute_error(y_true, y_pred): + """Computes the mean absolute error between labels and predictions. + + `loss = abs(y_true - y_pred)` + + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + + Returns: + Mean absolute error values. shape = `[batch_size, d0, .. dN-1]`. + """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) return K.mean(math_ops.abs(y_pred - y_true), axis=-1) @@ -789,7 +855,18 @@ def mean_absolute_error(y_true, y_pred): 'keras.losses.mean_absolute_percentage_error', 'keras.losses.mape', 'keras.losses.MAPE') -def mean_absolute_percentage_error(y_true, y_pred): # pylint: disable=missing-docstring +def mean_absolute_percentage_error(y_true, y_pred): + """Computes the mean absolute percentage error between `y_true` and `y_pred`. + + `loss = 100 * abs(y_true - y_pred) / y_true` + + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + + Returns: + Mean absolute percentage error values. shape = `[batch_size, d0, .. dN-1]`. 
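+ + Note: to keep the division well-defined, the implementation divides by `max(abs(y_true), epsilon)` rather than by `y_true` directly, where `epsilon` is the small Keras backend constant.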
+ """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) diff = math_ops.abs( @@ -803,7 +880,18 @@ def mean_absolute_percentage_error(y_true, y_pred): # pylint: disable=missing-d 'keras.losses.mean_squared_logarithmic_error', 'keras.losses.msle', 'keras.losses.MSLE') -def mean_squared_logarithmic_error(y_true, y_pred): # pylint: disable=missing-docstring +def mean_squared_logarithmic_error(y_true, y_pred): + """Computes the mean squared logarithmic error between `y_true` and `y_pred`. + + `loss = square(log(y_true) - log(y_pred))` + + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + + Returns: + Mean squared logarithmic error values. shape = `[batch_size, d0, .. dN-1]`. + """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) first_log = math_ops.log(K.maximum(y_pred, K.epsilon()) + 1.) @@ -830,13 +918,16 @@ def _maybe_convert_labels(y_true): def squared_hinge(y_true, y_pred): """Computes the squared hinge loss between `y_true` and `y_pred`. + `loss = square(maximum(1 - y_true * y_pred, 0))` + Args: y_true: The ground truth values. `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are provided we will convert them to -1 or 1. - y_pred: The predicted values. + shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Tensor with one scalar loss entry per sample. + Squared hinge loss values. shape = `[batch_size, d0, .. dN-1]`. """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) @@ -849,13 +940,16 @@ def squared_hinge(y_true, y_pred): def hinge(y_true, y_pred): """Computes the hinge loss between `y_true` and `y_pred`. + `loss = maximum(1 - y_true * y_pred, 0)` + Args: y_true: The ground truth values. `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are provided they will be converted to -1 or 1. - y_pred: The predicted values. + shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Tensor with one scalar loss entry per sample. + Hinge loss values. shape = `[batch_size, d0, .. dN-1]`. """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) @@ -867,13 +961,16 @@ def hinge(y_true, y_pred): def categorical_hinge(y_true, y_pred): """Computes the categorical hinge loss between `y_true` and `y_pred`. + `loss = maximum(neg - pos + 1, 0)` + where `neg = sum(y_true * y_pred)` and `pos = maximum(1 - y_true)` + Args: y_true: The ground truth values. `y_true` values are expected to be -1 or 1. If binary (0 or 1) labels are provided they will be converted to -1 or 1. y_pred: The predicted values. Returns: - A tensor. + Categorical hinge loss values. """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) @@ -924,12 +1021,12 @@ def logcosh(y_true, y_pred): like the mean squared error, but will not be so strongly affected by the occasional wildly incorrect prediction. - Arguments: - y_true: tensor of true targets. - y_pred: tensor of predicted targets. + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - Tensor with one scalar loss entry per sample. + Logcosh error values. shape = `[batch_size, d0, .. dN-1]`. 
""" y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) @@ -974,6 +1071,19 @@ def categorical_crossentropy(y_true, @keras_export('keras.metrics.sparse_categorical_crossentropy', 'keras.losses.sparse_categorical_crossentropy') def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1): + """Computes the sparse categorical crossentropy loss. + + Args: + y_true: Ground truth values. + y_pred: The predicted values. + from_logits: Whether `y_pred` is expected to be a logits tensor. By default, + we assume that `y_pred` encodes a probability distribution. + axis: (Optional) Defaults to -1. The dimension along which the entropy is + computed. + + Returns: + Sparse categorical crossentropy loss value. + """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) return K.sparse_categorical_crossentropy( @@ -982,7 +1092,19 @@ def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False, axis=-1): @keras_export('keras.metrics.binary_crossentropy', 'keras.losses.binary_crossentropy') -def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0): # pylint: disable=missing-docstring +def binary_crossentropy(y_true, y_pred, from_logits=False, label_smoothing=0): + """Computes the binary crossentropy loss. + + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + from_logits: Whether `y_pred` is expected to be a logits tensor. By default, + we assume that `y_pred` encodes a probability distribution. + label_smoothing: Float in [0, 1]. If > `0` then smooth the labels. + + Returns: + Binary crossentropy loss value. shape = `[batch_size, d0, .. dN-1]`. + """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) label_smoothing = ops.convert_to_tensor(label_smoothing, dtype=K.floatx()) @@ -1025,7 +1147,6 @@ def kullback_leibler_divergence(y_true, y_pred): Raises: TypeError: If `y_true` cannot be cast to the `y_pred.dtype`. - """ y_pred = ops.convert_to_tensor(y_pred) y_true = math_ops.cast(y_true, y_pred.dtype) @@ -1041,19 +1162,12 @@ def poisson(y_true, y_pred): The Poisson loss is the mean of the elements of the `Tensor` `y_pred - y_true * log(y_pred)`. - Usage: - - ```python - loss = tf.keras.losses.poisson([1.4, 9.3, 2.2], [4.3, 8.2, 12.2]) - print('Loss: ', loss.numpy()) # Loss: -0.8045559 - ``` - Args: - y_true: Tensor of true targets. - y_pred: Tensor of predicted targets. + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. Returns: - A `Tensor` with the mean Poisson loss. + Poisson loss value. shape = `[batch_size, d0, .. dN-1]`. Raises: InvalidArgumentError: If `y_true` and `y_pred` have incompatible shapes. @@ -1099,19 +1213,19 @@ def cosine_similarity(y_true, y_pred, axis=-1): class CosineSimilarity(LossFunctionWrapper): """Computes the cosine similarity between `y_true` and `y_pred`. + `loss = -sum(y_true * y_pred)` + Usage: - ```python - cosine_loss = tf.keras.losses.CosineSimilarity(axis=1) - loss = cosine_loss([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]]) - # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]] - # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]] - # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]] - # loss = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1)) - = ((0. + 0.) 
+ (0.5 + 0.5)) / 2 - - print('Loss: ', loss.numpy()) # Loss: 0.5 - ``` + >>> cosine_loss = tf.keras.losses.CosineSimilarity(axis=1) + >>> loss = cosine_loss([[0., 1.], [1., 1.]], [[1., 0.], [1., 1.]]) + >>> # l2_norm(y_true) = [[0., 1.], [1./1.414], 1./1.414]]] + >>> # l2_norm(y_pred) = [[1., 0.], [1./1.414], 1./1.414]]] + >>> # l2_norm(y_true) . l2_norm(y_pred) = [[0., 0.], [0.5, 0.5]] + >>> # loss = mean(sum(l2_norm(y_true) . l2_norm(y_pred), axis=1)) + >>> # = ((0. + 0.) + (0.5 + 0.5)) / 2 + >>> loss.numpy() + -0.49999997 Usage with the `compile` API: From 400df32a6266e540166ccdad18ab0013fc2a1e54 Mon Sep 17 00:00:00 2001 From: Clayne Robison Date: Thu, 5 Dec 2019 15:38:02 -0700 Subject: [PATCH 203/383] Fixing funky link. We need to show the status of the release builds out on the public CI, but those aren't actually the bits that people should use. So rather than take them to the Jenkins job and its artifacts, if they click on the badge, it will just take them to the badge icon on the public CI, which isn't terribly useful. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 88c5c248917..46cddb32cd4 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ Build Type **Linux ppc64le GPU** Nightly | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Build/) | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/) **Linux ppc64le GPU** Stable Release | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) | Release [1.15](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/) / [2.x](https://powerci.osuosl.org/job/TensorFlow2_PPC64LE_GPU_Release_Build/) **Linux CPU with Intel® MKL-DNN** Nightly | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/) -**Linux CPU with Intel® MKL-DNN** Stable Release | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)] | Release [1.15](https://pypi.org/project/intel-tensorflow/1.15.0/) / [2.x](https://pypi.org/project/intel-tensorflow/) +**Linux CPU with Intel® MKL-DNN** Stable Release | ![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon) | Release [1.15](https://pypi.org/project/intel-tensorflow/1.15.0/) / [2.x](https://pypi.org/project/intel-tensorflow/) **Red Hat® Enterprise Linux® 7.6 CPU & GPU**
Python 2.7, 3.6 | [![Build Status](https://jenkins-tensorflow.apps.ci.centos.org/buildStatus/icon?job=tensorflow-rhel7-3.6&build=2)](https://jenkins-tensorflow.apps.ci.centos.org/job/tensorflow-rhel7-3.6/2/) | [1.13.1 PyPI](https://tensorflow.pypi.thoth-station.ninja/index/) ## Resources From 34901a724810c1e664f1cfc5285d13e015a877cf Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 14:32:08 -0800 Subject: [PATCH 204/383] genop: fix template error Rewrite a conditional in a template to be syntactically valid. PiperOrigin-RevId: 284055201 Change-Id: Ie0129e7553857076e60e4d5d85e912a7893168d9 --- tensorflow/go/genop/internal/genop.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/go/genop/internal/genop.go b/tensorflow/go/genop/internal/genop.go index 15c125e3cf7..230462b6e7d 100644 --- a/tensorflow/go/genop/internal/genop.go +++ b/tensorflow/go/genop/internal/genop.go @@ -265,7 +265,7 @@ func {{$.Op.Name}}{{CamelCase .RenameTo}}(value {{GoType .Type}}) {{$.Op.Name}}A {{- else }} {{- if .DescribeOutputs}} // -{{- if ((len .OutArgs) eq 1) }} +{{- if eq (len .OutArgs) 1 }} // Returns {{range .OutArgs}}{{MakeComment .Description}}{{end}} {{- else }} // Returns: From 77fde6149586c9f1efd53e95821cbcf95613287e Mon Sep 17 00:00:00 2001 From: David Soergel Date: Thu, 5 Dec 2019 14:39:57 -0800 Subject: [PATCH 205/383] Update tensorboard dependency to 2.1.x TensorBoard release: https://pypi.org/project/tensorboard/2.1.0/ PiperOrigin-RevId: 284056869 Change-Id: I0645574f14dd4280420a074ef095818f2f07fae4 --- tensorflow/tools/pip_package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py index 22c8308f3eb..42ffaa857fc 100644 --- a/tensorflow/tools/pip_package/setup.py +++ b/tensorflow/tools/pip_package/setup.py @@ -61,7 +61,7 @@ REQUIRED_PACKAGES = [ 'numpy >= 1.16.0, < 2.0', 'opt_einsum >= 2.3.2', 'protobuf >= 3.8.0', - 'tensorboard >= 2.0.0, < 2.1.0', + 'tensorboard >= 2.1.0, < 2.2.0', 'tensorflow_estimator >= 2.0.0, < 2.1.0', 'termcolor >= 1.1.0', 'wrapt >= 1.11.1', From 47c66659de0f00d581d0f19799392ccc3dee8852 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 5 Dec 2019 14:44:48 -0800 Subject: [PATCH 206/383] Delete brittle unit test for matmul GPU benchmark The assertion on GraphDef started failing after a recent commit changing random_uniform function behavior. Benchmarks for unit tests are generally not added so it is better to delete this test instead of fixing or disabling it. 
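Only the GraphDef-comparison test and its BUILD target are removed here; the matmul_benchmark library itself is left in place.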
PiperOrigin-RevId: 284057912 Change-Id: I38bf72b051de3e6dcac96959498e1d876b9e3421 --- tensorflow/python/BUILD | 23 ---- .../python/ops/matmul_benchmark_test.py | 125 ------------------ 2 files changed, 148 deletions(-) delete mode 100644 tensorflow/python/ops/matmul_benchmark_test.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index e9e74e85ffa..12a28007912 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -7073,29 +7073,6 @@ py_library( ], ) -cuda_py_test( - name = "matmul_benchmark_test", - size = "medium", - srcs = ["ops/matmul_benchmark_test.py"], - additional_deps = [ - ":math_ops", - ":random_ops", - ":client", - ":client_testlib", - ":control_flow_ops", - ":framework_for_generated_wrappers", - ":platform", - ":platform_benchmark", - ":matmul_benchmark", - ":variables", - "//third_party/py/numpy", - "//tensorflow/core:protos_all_py", - ], - main = "ops/matmul_benchmark_test.py", - python_version = "PY3", - tags = ["no_pip"], -) - cuda_py_test( name = "session_benchmark", srcs = ["client/session_benchmark.py"], diff --git a/tensorflow/python/ops/matmul_benchmark_test.py b/tensorflow/python/ops/matmul_benchmark_test.py deleted file mode 100644 index 3df0c66ef9c..00000000000 --- a/tensorflow/python/ops/matmul_benchmark_test.py +++ /dev/null @@ -1,125 +0,0 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# ============================================================================== -"""Tests for matmul_benchmark.py.""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import itertools -import numpy as np - -from tensorflow.core.framework import graph_pb2 -from tensorflow.core.framework import node_def_pb2 -from tensorflow.python.framework import ops -from tensorflow.python.ops import matmul_benchmark -from tensorflow.python.platform import test as googletest -from tensorflow.python.platform import tf_logging - - -def BuildGraphTest(n, m, k, transpose_a, transpose_b, dtype): - - def Test(self): - if not googletest.is_gpu_available(): - tf_logging.info("Skipping BuildGraphTest %s", - (n, m, k, transpose_a, transpose_b)) - return - tf_logging.info("Testing BuildGraphTest %s", - (n, m, k, transpose_a, transpose_b)) - self._VerifyBuildGraph(n, m, k, transpose_a, transpose_b, dtype) - - return Test - - -def RunGraphTest(n, m, k, transpose_a, transpose_b, dtype): - - def Test(self): - if not googletest.is_gpu_available(): - tf_logging.info("Skipping RunGraphTest %s", - (n, m, k, transpose_a, transpose_b)) - return - tf_logging.info("Testing RunGraphTest %s", - (n, m, k, transpose_a, transpose_b)) - self._VerifyRunGraph(n, m, k, transpose_a, transpose_b, dtype) - - return Test - - -class MatmulBenchmarkTest(googletest.TestCase): - - def _StripNode(self, nd): - snode = node_def_pb2.NodeDef(name=nd.name, op=nd.op, input=nd.input) - if nd.device: - snode.device = nd.device - return snode - - def _StripGraph(self, gd): - return graph_pb2.GraphDef(node=[self._StripNode(nd) for nd in gd.node]) - - def _VerifyBuildGraph(self, n, m, k, transpose_a, transpose_b, dtype): - graph = ops.Graph() - with graph.as_default(): - matmul_benchmark.build_graph(googletest.gpu_device_name(), n, m, k, - transpose_a, transpose_b, dtype) - gd = graph.as_graph_def() - dev = googletest.gpu_device_name() - proto_expected = """ - node { name: "random_uniform/shape" op: "Const" device: \"""" + dev + """\" } - node { name: "random_uniform/min" op: "Const" device: \"""" + dev + """\" } - node { name: "random_uniform/max" op: "Const" device: \"""" + dev + """\" } - node { name: "random_uniform/RandomUniform" op: "RandomUniform" input: "random_uniform/shape" device: \"""" + dev + """\" } - node { name: "random_uniform/sub" op: "Sub" input: "random_uniform/max" input: "random_uniform/min" device: \"""" + dev + """\" } - node { name: "random_uniform/mul" op: "Mul" input: "random_uniform/RandomUniform" input: "random_uniform/sub" device: \"""" + dev + """\" } - node { name: "random_uniform" op: "Add" input: "random_uniform/mul" input: "random_uniform/min" device: \"""" + dev + """\" } - node { name: "Variable" op: "VariableV2" device: \"""" + dev + """\" } - node { name: "Variable/Assign" op: "Assign" input: "Variable" input: "random_uniform" device: \"""" + dev + """\" } - node { name: "Variable/read" op: "Identity" input: "Variable" device: \"""" + dev + """\" } - node { name: "random_uniform_1/shape" op: "Const" device: \"""" + dev + """\" } - node { name: "random_uniform_1/min" op: "Const" device: \"""" + dev + """\" } - node { name: "random_uniform_1/max" op: "Const" device: \"""" + dev + """\" } - node { name: "random_uniform_1/RandomUniform" op: "RandomUniform" input: "random_uniform_1/shape" device: \"""" + dev + """\" } - node { name: "random_uniform_1/sub" op: "Sub" input: "random_uniform_1/max" input: "random_uniform_1/min" device: \"""" + dev + """\" } - 
node { name: "random_uniform_1/mul" op: "Mul" input: "random_uniform_1/RandomUniform" input: "random_uniform_1/sub" device: \"""" + dev + """\" } - node { name: "random_uniform_1" op: "Add" input: "random_uniform_1/mul" input: "random_uniform_1/min" device: \"""" + dev + """\" } - node { name: "Variable_1" op: "VariableV2" device: \"""" + dev + """\" } - node { name: "Variable_1/Assign" op: "Assign" input: "Variable_1" input: "random_uniform_1" device: \"""" + dev + """\" } - node { name: "Variable_1/read" op: "Identity" input: "Variable_1" device: \"""" + dev + """\" } - node { name: "MatMul" op: "MatMul" input: "Variable/read" input: "Variable_1/read" device: \"""" + dev + """\" } - node { name: "group_deps" op: "NoOp" input: "^MatMul" device: \"""" + dev + """\" } - """ - self.assertProtoEquals(str(proto_expected), self._StripGraph(gd)) - - def _VerifyRunGraph(self, n, m, k, transpose_a, transpose_b, dtype): - benchmark_instance = matmul_benchmark.MatmulBenchmark() - duration = benchmark_instance.run_graph(googletest.gpu_device_name(), n, m, - k, transpose_a, transpose_b, 1, - dtype) - self.assertTrue(duration > 1e-6) - - -if __name__ == "__main__": - dtypes = [np.float32, np.float64] - index = 0 - for _dtype in dtypes: - for _n, _m, (_transpose_a, _transpose_b) in itertools.product( - [512, 1024], [1, 8, 16, 128], [(False, False), (True, False), - (False, True)]): - _k = _n - setattr(MatmulBenchmarkTest, "testBuildGraph_" + str(index), - BuildGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype)) - setattr(MatmulBenchmarkTest, "testRunGraph_" + str(index), - RunGraphTest(_n, _m, _k, _transpose_a, _transpose_b, _dtype)) - index += 1 - googletest.main() From f0b969732d7857d5f97b9398b86b2a4571127c6b Mon Sep 17 00:00:00 2001 From: River Riddle Date: Thu, 5 Dec 2019 14:52:28 -0800 Subject: [PATCH 207/383] Refactor the IRPrinting instrumentation to take a derivable config. This allows for more interesting behavior from users, e.g. enabling the ability to dump the IR to a separate file for each pass invocation. PiperOrigin-RevId: 284059447 Change-Id: I9a762c0f9b48b67cc3849210aca0329981c38487 --- .../mlir/include/mlir/Pass/PassManager.h | 63 +++++++- third_party/mlir/lib/Pass/IRPrinting.cpp | 141 ++++++++++++------ .../mlir/lib/Pass/PassManagerOptions.cpp | 11 +- 3 files changed, 159 insertions(+), 56 deletions(-) diff --git a/third_party/mlir/include/mlir/Pass/PassManager.h b/third_party/mlir/include/mlir/Pass/PassManager.h index 5580806422f..724ee0a31cd 100644 --- a/third_party/mlir/include/mlir/Pass/PassManager.h +++ b/third_party/mlir/include/mlir/Pass/PassManager.h @@ -159,16 +159,63 @@ public: /// Add the provided instrumentation to the pass manager. void addInstrumentation(std::unique_ptr pi); - /// Add an instrumentation to print the IR before and after pass execution. + //===--------------------------------------------------------------------===// + // IR Printing + + /// A configuration struct provided to the IR printer instrumentation. + class IRPrinterConfig { + public: + using PrintCallbackFn = function_ref; + + /// Initialize the configuration. + /// * 'printModuleScope' signals if the top-level module IR should always be + /// printed. This should only be set to true when multi-threading is + /// disabled, otherwise we may try to print IR that is being modified + /// asynchronously. 
+ explicit IRPrinterConfig(bool printModuleScope = false); + virtual ~IRPrinterConfig(); + + /// A hook that may be overridden by a derived config that checks if the IR + /// of 'operation' should be dumped *before* the pass 'pass' has been + /// executed. If the IR should be dumped, 'printCallback' should be invoked + /// with the stream to dump into. + virtual void printBeforeIfEnabled(Pass *pass, Operation *operation, + PrintCallbackFn printCallback); + + /// A hook that may be overridden by a derived config that checks if the IR + /// of 'operation' should be dumped *after* the pass 'pass' has been + /// executed. If the IR should be dumped, 'printCallback' should be invoked + /// with the stream to dump into. + virtual void printAfterIfEnabled(Pass *pass, Operation *operation, + PrintCallbackFn printCallback); + + /// Returns true if the IR should always be printed at the top-level scope. + bool shouldPrintAtModuleScope() const { return printModuleScope; } + + private: + /// A flag that indicates if the IR should be printed at module scope. + bool printModuleScope; + }; + + /// Add an instrumentation to print the IR before and after pass execution, + /// using the provided configuration. + void enableIRPrinting(std::unique_ptr config); + + /// Add an instrumentation to print the IR before and after pass execution, + /// using the provided fields to generate a default configuration: /// * 'shouldPrintBeforePass' and 'shouldPrintAfterPass' correspond to filter - /// functions that take a 'Pass *'. These function should return true if the - /// IR should be printed or not. - /// * 'printModuleScope' signals if the module IR should be printed, even for - /// non module passes. + /// functions that take a 'Pass *' and `Operation *`. These function should + /// return true if the IR should be printed or not. + /// * 'printModuleScope' signals if the module IR should be printed, even + /// for non module passes. /// * 'out' corresponds to the stream to output the printed IR to. - void enableIRPrinting(std::function shouldPrintBeforePass, - std::function shouldPrintAfterPass, - bool printModuleScope, raw_ostream &out); + void enableIRPrinting( + std::function shouldPrintBeforePass, + std::function shouldPrintAfterPass, + bool printModuleScope, raw_ostream &out); + + //===--------------------------------------------------------------------===// + // Pass Timing /// Add an instrumentation to time the execution of passes and the computation /// of analyses. diff --git a/third_party/mlir/lib/Pass/IRPrinting.cpp b/third_party/mlir/lib/Pass/IRPrinting.cpp index 7cf32f89da2..19e69feb5d8 100644 --- a/third_party/mlir/lib/Pass/IRPrinting.cpp +++ b/third_party/mlir/lib/Pass/IRPrinting.cpp @@ -27,19 +27,8 @@ using namespace mlir::detail; namespace { class IRPrinterInstrumentation : public PassInstrumentation { public: - /// A filter function to decide if the given pass should be printed. Returns - /// true if the pass should be printed, false otherwise. 
- using ShouldPrintFn = std::function; - - IRPrinterInstrumentation(ShouldPrintFn &&shouldPrintBeforePass, - ShouldPrintFn &&shouldPrintAfterPass, - bool printModuleScope, raw_ostream &out) - : shouldPrintBeforePass(shouldPrintBeforePass), - shouldPrintAfterPass(shouldPrintAfterPass), - printModuleScope(printModuleScope), out(out) { - assert((shouldPrintBeforePass || shouldPrintAfterPass) && - "expected atleast one valid filter function"); - } + IRPrinterInstrumentation(std::unique_ptr config) + : config(std::move(config)) {} private: /// Instrumentation hooks. @@ -47,14 +36,8 @@ private: void runAfterPass(Pass *pass, Operation *op) override; void runAfterPassFailed(Pass *pass, Operation *op) override; - /// Filter functions for before and after pass execution. - ShouldPrintFn shouldPrintBeforePass, shouldPrintAfterPass; - - /// Flag to toggle if the printer should always print at module scope. - bool printModuleScope; - - /// The stream to output to. - raw_ostream &out; + /// Configuration to use. + std::unique_ptr config; }; } // end anonymous namespace @@ -96,45 +79,117 @@ static void printIR(Operation *op, bool printModuleScope, raw_ostream &out, /// Instrumentation hooks. void IRPrinterInstrumentation::runBeforePass(Pass *pass, Operation *op) { - // Skip hidden passes and passes that the user filtered out. - if (!shouldPrintBeforePass || isHiddenPass(pass) || - !shouldPrintBeforePass(pass)) + if (isHiddenPass(pass)) return; - out << formatv("*** IR Dump Before {0} ***", pass->getName()); - printIR(op, printModuleScope, out, OpPrintingFlags()); - out << "\n\n"; + config->printBeforeIfEnabled(pass, op, [&](raw_ostream &out) { + out << formatv("*** IR Dump Before {0} ***", pass->getName()); + printIR(op, config->shouldPrintAtModuleScope(), out, OpPrintingFlags()); + out << "\n\n"; + }); } void IRPrinterInstrumentation::runAfterPass(Pass *pass, Operation *op) { - // Skip hidden passes and passes that the user filtered out. - if (!shouldPrintAfterPass || isHiddenPass(pass) || - !shouldPrintAfterPass(pass)) + if (isHiddenPass(pass)) return; - out << formatv("*** IR Dump After {0} ***", pass->getName()); - printIR(op, printModuleScope, out, OpPrintingFlags()); - out << "\n\n"; + config->printAfterIfEnabled(pass, op, [&](raw_ostream &out) { + out << formatv("*** IR Dump After {0} ***", pass->getName()); + printIR(op, config->shouldPrintAtModuleScope(), out, OpPrintingFlags()); + out << "\n\n"; + }); } void IRPrinterInstrumentation::runAfterPassFailed(Pass *pass, Operation *op) { - // Skip adaptor passes and passes that the user filtered out. - if (!shouldPrintAfterPass || isAdaptorPass(pass) || - !shouldPrintAfterPass(pass)) + if (isAdaptorPass(pass)) return; - out << formatv("*** IR Dump After {0} Failed ***", pass->getName()); - printIR(op, printModuleScope, out, OpPrintingFlags().printGenericOpForm()); - out << "\n\n"; + config->printAfterIfEnabled(pass, op, [&](raw_ostream &out) { + out << formatv("*** IR Dump After {0} Failed ***", pass->getName()); + printIR(op, config->shouldPrintAtModuleScope(), out, + OpPrintingFlags().printGenericOpForm()); + out << "\n\n"; + }); +} + +//===----------------------------------------------------------------------===// +// IRPrinterConfig +//===----------------------------------------------------------------------===// + +/// Initialize the configuration. +/// * 'printModuleScope' signals if the module IR should be printed, even +/// for non module passes. 
+PassManager::IRPrinterConfig::IRPrinterConfig(bool printModuleScope) + : printModuleScope(printModuleScope) {} +PassManager::IRPrinterConfig::~IRPrinterConfig() {} + +/// A hook that may be overridden by a derived config that checks if the IR +/// of 'operation' should be dumped *before* the pass 'pass' has been +/// executed. If the IR should be dumped, 'printCallback' should be invoked +/// with the stream to dump into. +void PassManager::IRPrinterConfig::printBeforeIfEnabled( + Pass *pass, Operation *operation, PrintCallbackFn printCallback) { + // By default, never print. +} + +/// A hook that may be overridden by a derived config that checks if the IR +/// of 'operation' should be dumped *after* the pass 'pass' has been +/// executed. If the IR should be dumped, 'printCallback' should be invoked +/// with the stream to dump into. +void PassManager::IRPrinterConfig::printAfterIfEnabled( + Pass *pass, Operation *operation, PrintCallbackFn printCallback) { + // By default, never print. } //===----------------------------------------------------------------------===// // PassManager //===----------------------------------------------------------------------===// +namespace { +/// Simple wrapper config that allows for the simpler interface defined above. +struct BasicIRPrinterConfig : public PassManager::IRPrinterConfig { + BasicIRPrinterConfig( + std::function shouldPrintBeforePass, + std::function shouldPrintAfterPass, + bool printModuleScope, raw_ostream &out) + : IRPrinterConfig(printModuleScope), + shouldPrintBeforePass(shouldPrintBeforePass), + shouldPrintAfterPass(shouldPrintAfterPass), out(out) { + assert((shouldPrintBeforePass || shouldPrintAfterPass) && + "expected at least one valid filter function"); + } + + void printBeforeIfEnabled(Pass *pass, Operation *operation, + PrintCallbackFn printCallback) final { + if (shouldPrintBeforePass && shouldPrintBeforePass(pass, operation)) + printCallback(out); + } + + void printAfterIfEnabled(Pass *pass, Operation *operation, + PrintCallbackFn printCallback) final { + if (shouldPrintAfterPass && shouldPrintAfterPass(pass, operation)) + printCallback(out); + } + + /// Filter functions for before and after pass execution. + std::function shouldPrintBeforePass; + std::function shouldPrintAfterPass; + + /// The stream to output to. + raw_ostream &out; +}; +} // end anonymous namespace + +/// Add an instrumentation to print the IR before and after pass execution, +/// using the provided configuration. +void PassManager::enableIRPrinting(std::unique_ptr config) { + addInstrumentation( + std::make_unique(std::move(config))); +} + /// Add an instrumentation to print the IR before and after pass execution. void PassManager::enableIRPrinting( - std::function shouldPrintBeforePass, - std::function shouldPrintAfterPass, bool printModuleScope, - raw_ostream &out) { - addInstrumentation(std::make_unique( + std::function shouldPrintBeforePass, + std::function shouldPrintAfterPass, + bool printModuleScope, raw_ostream &out) { + enableIRPrinting(std::make_unique( std::move(shouldPrintBeforePass), std::move(shouldPrintAfterPass), printModuleScope, out)); } diff --git a/third_party/mlir/lib/Pass/PassManagerOptions.cpp b/third_party/mlir/lib/Pass/PassManagerOptions.cpp index c9b19a61556..1416dfe3e8c 100644 --- a/third_party/mlir/lib/Pass/PassManagerOptions.cpp +++ b/third_party/mlir/lib/Pass/PassManagerOptions.cpp @@ -104,16 +104,17 @@ static llvm::ManagedStatic> options; /// Add an IR printing instrumentation if enabled by any 'print-ir' flags. 
void PassManagerOptions::addPrinterInstrumentation(PassManager &pm) { - std::function shouldPrintBeforePass, shouldPrintAfterPass; + std::function shouldPrintBeforePass; + std::function shouldPrintAfterPass; // Handle print-before. if (printBeforeAll) { // If we are printing before all, then just return true for the filter. - shouldPrintBeforePass = [](Pass *) { return true; }; + shouldPrintBeforePass = [](Pass *, Operation *) { return true; }; } else if (printBefore.hasAnyOccurrences()) { // Otherwise if there are specific passes to print before, then check to see // if the pass info for the current pass is included in the list. - shouldPrintBeforePass = [&](Pass *pass) { + shouldPrintBeforePass = [&](Pass *pass, Operation *) { auto *passInfo = pass->lookupPassInfo(); return passInfo && printBefore.contains(passInfo); }; @@ -122,11 +123,11 @@ void PassManagerOptions::addPrinterInstrumentation(PassManager &pm) { // Handle print-after. if (printAfterAll) { // If we are printing after all, then just return true for the filter. - shouldPrintAfterPass = [](Pass *) { return true; }; + shouldPrintAfterPass = [](Pass *, Operation *) { return true; }; } else if (printAfter.hasAnyOccurrences()) { // Otherwise if there are specific passes to print after, then check to see // if the pass info for the current pass is included in the list. - shouldPrintAfterPass = [&](Pass *pass) { + shouldPrintAfterPass = [&](Pass *pass, Operation *) { auto *passInfo = pass->lookupPassInfo(); return passInfo && printAfter.contains(passInfo); }; From 69bfdcabdc47ca06f5dc3ae070bd2b72dc458ef3 Mon Sep 17 00:00:00 2001 From: Daniel Situnayake Date: Thu, 5 Dec 2019 15:06:06 -0800 Subject: [PATCH 208/383] Add training scripts for microcontroller gesture recognition model, rename classes PiperOrigin-RevId: 284062383 Change-Id: I36effc5f09a4345f2259fd2162a9fbf37ae1f3b2 --- .../micro/examples/magic_wand/BUILD | 8 +- .../micro/examples/magic_wand/Makefile.inc | 8 +- .../examples/magic_wand/gesture_predictor.cc | 2 +- .../examples/magic_wand/magic_wand_test.cc | 8 +- ...es_data.cc => ring_micro_features_data.cc} | 8 +- ...ures_data.h => ring_micro_features_data.h} | 12 +- ...s_data.cc => slope_micro_features_data.cc} | 8 +- ...res_data.h => slope_micro_features_data.h} | 12 +- .../micro/examples/magic_wand/train/README.md | 195 ++++++++++++++ .../magic_wand/train/data_augmentation.py | 73 +++++ .../train/data_augmentation_test.py | 58 ++++ .../examples/magic_wand/train/data_load.py | 105 ++++++++ .../magic_wand/train/data_load_test.py | 95 +++++++ .../examples/magic_wand/train/data_prepare.py | 164 ++++++++++++ .../magic_wand/train/data_prepare_test.py | 75 ++++++ .../examples/magic_wand/train/data_split.py | 90 +++++++ .../magic_wand/train/data_split_person.py | 75 ++++++ .../train/data_split_person_test.py | 54 ++++ .../magic_wand/train/data_split_test.py | 77 ++++++ .../magic_wand/train/netmodels/CNN/weights.h5 | Bin 0 -> 40512 bytes .../magic_wand/train/requirements.txt | 2 + .../micro/examples/magic_wand/train/train.py | 202 ++++++++++++++ .../train/train_magic_wand_model.ipynb | 252 ++++++++++++++++++ .../examples/magic_wand/train/train_test.py | 77 ++++++ 24 files changed, 1627 insertions(+), 33 deletions(-) rename tensorflow/lite/experimental/micro/examples/magic_wand/{circle_micro_features_data.cc => ring_micro_features_data.cc} (95%) rename tensorflow/lite/experimental/micro/examples/magic_wand/{angle_micro_features_data.h => ring_micro_features_data.h} (65%) rename 
tensorflow/lite/experimental/micro/examples/magic_wand/{angle_micro_features_data.cc => slope_micro_features_data.cc} (95%) rename tensorflow/lite/experimental/micro/examples/magic_wand/{circle_micro_features_data.h => slope_micro_features_data.h} (65%) create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/README.md create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_augmentation.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_augmentation_test.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_load.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_load_test.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_prepare.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_prepare_test.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_person.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_person_test.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_test.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/netmodels/CNN/weights.h5 create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/requirements.txt create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/train.py create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/train_magic_wand_model.ipynb create mode 100644 tensorflow/lite/experimental/micro/examples/magic_wand/train/train_test.py diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/BUILD b/tensorflow/lite/experimental/micro/examples/magic_wand/BUILD index bed81676304..20eacf37dfb 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/BUILD +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/BUILD @@ -23,12 +23,12 @@ cc_library( cc_library( name = "sample_feature_data", srcs = [ - "angle_micro_features_data.cc", - "circle_micro_features_data.cc", + "ring_micro_features_data.cc", + "slope_micro_features_data.cc", ], hdrs = [ - "angle_micro_features_data.h", - "circle_micro_features_data.h", + "ring_micro_features_data.h", + "slope_micro_features_data.h", ], ) diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/Makefile.inc b/tensorflow/lite/experimental/micro/examples/magic_wand/Makefile.inc index e1fcbe59be4..f739aefa074 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/Makefile.inc +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/Makefile.inc @@ -40,13 +40,13 @@ tensorflow/lite/experimental/micro/examples/magic_wand/gesture_predictor.h \ magic_wand_TEST_SRCS := \ tensorflow/lite/experimental/micro/examples/magic_wand/magic_wand_test.cc \ tensorflow/lite/experimental/micro/examples/magic_wand/magic_wand_model_data.cc \ -tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.cc \ -tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.cc +tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.cc \ +tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.cc magic_wand_TEST_HDRS := \ 
tensorflow/lite/experimental/micro/examples/magic_wand/magic_wand_model_data.h \ -tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.h \ -tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.h +tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.h \ +tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.h magic_wand_SRCS := \ tensorflow/lite/experimental/micro/examples/magic_wand/main.cc \ diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/gesture_predictor.cc b/tensorflow/lite/experimental/micro/examples/magic_wand/gesture_predictor.cc index ea1cf3046af..865016785ad 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/gesture_predictor.cc +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/gesture_predictor.cc @@ -23,7 +23,7 @@ int continuous_count = 0; int last_predict = -1; // Return the result of the last prediction -// 0: wing("W"), 1: ring("O"), 2: slope("angle"), 3: unknown +// 0: wing, 1: ring, 2: slope, 3: unknown int PredictGesture(float* output) { // Find whichever output has a probability > 0.8 (they sum to 1) int this_predict = -1; diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/magic_wand_test.cc b/tensorflow/lite/experimental/micro/examples/magic_wand/magic_wand_test.cc index 395881ff3cf..1bf26b4d34c 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/magic_wand_test.cc +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/magic_wand_test.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.h" -#include "tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.h" #include "tensorflow/lite/experimental/micro/examples/magic_wand/magic_wand_model_data.h" +#include "tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.h" +#include "tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.h" #include "tensorflow/lite/experimental/micro/kernels/micro_ops.h" #include "tensorflow/lite/experimental/micro/micro_error_reporter.h" #include "tensorflow/lite/experimental/micro/micro_interpreter.h" @@ -89,7 +89,7 @@ TF_LITE_MICRO_TEST(LoadModelAndPerformInference) { TF_LITE_MICRO_EXPECT_EQ(kTfLiteFloat32, input->type); // Provide an input value - const float* ring_features_data = g_circle_micro_f9643d42_nohash_4_data; + const float* ring_features_data = g_ring_micro_f9643d42_nohash_4_data; error_reporter->Report("%d", input->bytes); for (int i = 0; i < (input->bytes / sizeof(float)); ++i) { input->data.f[i] = ring_features_data[i]; @@ -127,7 +127,7 @@ TF_LITE_MICRO_TEST(LoadModelAndPerformInference) { TF_LITE_MICRO_EXPECT_GT(ring_score, negative_score); // Now test with a different input, from a recording of "Slope". 
- const float* slope_features_data = g_angle_micro_f2e59fea_nohash_1_data; + const float* slope_features_data = g_slope_micro_f2e59fea_nohash_1_data; for (int i = 0; i < (input->bytes / sizeof(float)); ++i) { input->data.f[i] = slope_features_data[i]; } diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.cc b/tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.cc similarity index 95% rename from tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.cc rename to tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.cc index 24d44aabd5e..aa579b43457 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.cc +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.h" +#include "tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.h" -const int g_circle_micro_f9643d42_nohash_4_length = 128; -const int g_circle_micro_f9643d42_nohash_4_dim = 3; +const int g_ring_micro_f9643d42_nohash_4_length = 128; +const int g_ring_micro_f9643d42_nohash_4_dim = 3; // Raw accelerometer data with a sample rate of 25Hz -const float g_circle_micro_f9643d42_nohash_4_data[] = { +const float g_ring_micro_f9643d42_nohash_4_data[] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.h b/tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.h similarity index 65% rename from tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.h rename to tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.h index 6060f128ed2..d1d0b602165 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.h +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/ring_micro_features_data.h @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_ANGLE_MICRO_FEATURES_DATA_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_ANGLE_MICRO_FEATURES_DATA_H_ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_RING_MICRO_FEATURES_DATA_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_RING_MICRO_FEATURES_DATA_H_ -extern const int g_angle_micro_f2e59fea_nohash_1_length; -extern const int g_angle_micro_f2e59fea_nohash_1_dim; -extern const float g_angle_micro_f2e59fea_nohash_1_data[]; +extern const int g_ring_micro_f9643d42_nohash_4_length; +extern const int g_ring_micro_f9643d42_nohash_4_dim; +extern const float g_ring_micro_f9643d42_nohash_4_data[]; -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_ANGLE_MICRO_FEATURES_DATA_H_ +#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_RING_MICRO_FEATURES_DATA_H_ diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.cc b/tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.cc similarity index 95% rename from tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.cc rename to tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.cc index 922f0797032..68b3e40052b 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.cc +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.cc @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/lite/experimental/micro/examples/magic_wand/angle_micro_features_data.h" +#include "tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.h" -const int g_angle_micro_f2e59fea_nohash_1_length = 128; -const int g_angle_micro_f2e59fea_nohash_1_dim = 3; +const int g_slope_micro_f2e59fea_nohash_1_length = 128; +const int g_slope_micro_f2e59fea_nohash_1_dim = 3; // Raw accelerometer data with a sample rate of 25Hz -const float g_angle_micro_f2e59fea_nohash_1_data[] = { +const float g_slope_micro_f2e59fea_nohash_1_data[] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.h b/tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.h similarity index 65% rename from tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.h rename to tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.h index f9c69f90ef4..ade97683d79 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/circle_micro_features_data.h +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/slope_micro_features_data.h @@ -13,11 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_CIRCLE_MICRO_FEATURES_DATA_H_ -#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_CIRCLE_MICRO_FEATURES_DATA_H_ +#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_SLOPE_MICRO_FEATURES_DATA_H_ +#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_SLOPE_MICRO_FEATURES_DATA_H_ -extern const int g_circle_micro_f9643d42_nohash_4_length; -extern const int g_circle_micro_f9643d42_nohash_4_dim; -extern const float g_circle_micro_f9643d42_nohash_4_data[]; +extern const int g_slope_micro_f2e59fea_nohash_1_length; +extern const int g_slope_micro_f2e59fea_nohash_1_dim; +extern const float g_slope_micro_f2e59fea_nohash_1_data[]; -#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_CIRCLE_MICRO_FEATURES_DATA_H_ +#endif // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MAGIC_WAND_SLOPE_MICRO_FEATURES_DATA_H_ diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/README.md b/tensorflow/lite/experimental/micro/examples/magic_wand/train/README.md new file mode 100644 index 00000000000..6bd45375341 --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/README.md @@ -0,0 +1,195 @@ +# Gesture Recognition Magic Wand Training Scripts + +## Introduction + +The scripts in this directory can be used to train a TensorFlow model that +classifies gestures based on accelerometer data. The code uses Python 3.7 and +TensorFlow 2.0. The resulting model is less than 20KB in size. + +The following document contains instructions on using the scripts to train a +model, and capturing your own training data. + +This project was inspired by the [Gesture Recognition Magic Wand](https://github.com/jewang/gesture-demo) +project by Jennifer Wang. + +## Training + +### Data and pre-trained model + +Three magic gestures were chosen, and data were collected from 7 +different people. Some random long movement sequences were collected and divided +into shorter pieces, which made up "negative" data along with some other +automatically generated random data. + +The dataset can be downloaded from the following URL: + +[download.tensorflow.org/models/tflite/magic_wand/data.tar.gz](http://download.tensorflow.org/models/tflite/magic_wand/data.tar.gz) + +A pre-trained, quantized model can be downloaded from the following URL: + +[download.tensorflow.org/models/tflite/magic_wand/model_quantized.tflite](http://download.tensorflow.org/models/tflite/magic_wand/model_quantized.tflite) + +### Training in Colab + +The following [Google Colaboratory](https://colab.research.google.com) +notebook demonstrates how to train the model. It's the easiest way to get +started: + + + + +
+ Run in Google Colab + + View source on GitHub +
+ +If you'd prefer to run the scripts locally, use the following instructions. + +### Running the scripts + +Use the following command to install the required dependencies: + +```shell +pip install -r requirements.txt +``` + +There are two ways to train the model: + +- Random data split, which mixes different people's data together and randomly + splits them into training, validation, and test sets +- Person data split, which splits the data by person + +#### Random data split + +Using a random split results in higher training accuracy than a person split, +but inferior performance on new data. + +```shell +$ python data_prepare.py + +$ python data_split.py + +$ python train.py --model CNN --person false +``` + +#### Person data split + +Using a person data split results in lower training accuracy but better +performance on new data. + +```shell +$ python data_prepare.py + +$ python data_split_person.py + +$ python train.py --model CNN --person true +``` + +#### Model type + +In the `--model` argument, you can can provide `CNN` or `LSTM`. The CNN +model has a smaller size and lower latency. + +## Collecting new data + +To obtain new training data using the +[SparkFun Edge development board](https://sparkfun.com/products/15170), you can +modify one of the examples in the [SparkFun Edge BSP](https://github.com/sparkfun/SparkFun_Edge_BSP) +and deploy it using the Ambiq SDK. + +### Install the Ambiq SDK and SparkFun Edge BSP + +Follow SparkFun's +[Using SparkFun Edge Board with Ambiq Apollo3 SDK](https://learn.sparkfun.com/tutorials/using-sparkfun-edge-board-with-ambiq-apollo3-sdk/all) +guide to set up the Ambiq SDK and SparkFun Edge BSP. + +#### Modify the example code + +First, `cd` into +`AmbiqSuite-Rel2.2.0/boards/SparkFun_Edge_BSP/examples/example1_edge_test`. + +##### Modify `src/tf_adc/tf_adc.c` + +Add `true` in line 62 as the second parameter of function +`am_hal_adc_samples_read`. + +##### Modify `src/main.c` + +Add the line below in `int main(void)`, just before the line `while(1)`: + +```cc +am_util_stdio_printf("-,-,-\r\n"); +``` + +Change the following lines in `while(1){...}` + +```cc +am_util_stdio_printf("Acc [mg] %04.2f x, %04.2f y, %04.2f z, Temp [deg C] %04.2f, MIC0 [counts / 2^14] %d\r\n", acceleration_mg[0], acceleration_mg[1], acceleration_mg[2], temperature_degC, (audioSample) ); +``` + +to: + +```cc +am_util_stdio_printf("%04.2f,%04.2f,%04.2f\r\n", acceleration_mg[0], acceleration_mg[1], acceleration_mg[2]); +``` + +#### Flash the binary + +Follow the instructions in +[SparkFun's guide](https://learn.sparkfun.com/tutorials/using-sparkfun-edge-board-with-ambiq-apollo3-sdk/all#example-applications) +to flash the binary to the device. + +#### Collect accelerometer data + +First, in a new terminal window, run the following command to begin logging +output to `output.txt`: + +```shell +$ script output.txt +``` + +Next, in the same window, use `screen` to connect to the device: + +```shell +$ screen ${DEVICENAME} 115200 +``` + +Output information collected from accelerometer sensor will be shown on the +screen and saved in `output.txt`, in the format of "x,y,z" per line. + +Press the `RST` button to start capturing a new gesture, then press Button 14 +when it ends. New data will begin with a line "-,-,-". + +To exit `screen`, hit +Ctrl\\+A+, immediately followed by the +K+ key, +then hit the +Y+ key. Then run + +```shell +$ exit +``` + +to stop logging data. Data will be saved in `output.txt`. 
For compatibility +with the training scripts, change the file name to include person's name and +the gesture name, in the following format: + +``` +output_{gesture_name}_{person_name}.txt +``` + +#### Edit and run the scripts + +Edit the following files to include your new gesture names (replacing +"wing", "ring", and "slope") + +- `data_load.py` +- `data_prepare.py` +- `data_split.py` + +Edit the following files to include your new person names (replacing "hyw", +"shiyun", "tangsy", "dengyl", "jiangyh", "xunkai", "lsj", "pengxl", "liucx", +and "zhangxy"): + +- `data_prepare.py` +- `data_split_person.py` + +Finally, run the commands described earlier to train a new model. diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_augmentation.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_augmentation.py new file mode 100644 index 00000000000..45700b9e4a8 --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_augmentation.py @@ -0,0 +1,73 @@ +# Lint as: python3 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# pylint: disable=g-bad-import-order + +"""Data augmentation that will be used in data_load.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import random +import numpy as np + + +def time_wrapping(molecule, denominator, data): + """Generate (molecule/denominator)x speed data.""" + tmp_data = [[0 + for i in range(len(data[0]))] + for j in range((int(len(data) / molecule) - 1) * denominator)] + for i in range(int(len(data) / molecule) - 1): + for j in range(len(data[i])): + for k in range(denominator): + tmp_data[denominator * i + + k][j] = (data[molecule * i + k][j] * (denominator - k) + + data[molecule * i + k + 1][j] * k) / denominator + return tmp_data + + +def augment_data(original_data, original_label): + """Perform data augmentation.""" + new_data = [] + new_label = [] + for idx, (data, label) in enumerate(zip(original_data, original_label)): # pylint: disable=unused-variable + # Original data + new_data.append(data) + new_label.append(label) + # Sequence shift + for num in range(5): # pylint: disable=unused-variable + new_data.append((np.array(data, dtype=np.float32) + + (random.random() - 0.5) * 200).tolist()) + new_label.append(label) + # Random noise + tmp_data = [[0 for i in range(len(data[0]))] for j in range(len(data))] + for num in range(5): + for i in range(len(tmp_data)): + for j in range(len(tmp_data[i])): + tmp_data[i][j] = data[i][j] + 5 * random.random() + new_data.append(tmp_data) + new_label.append(label) + # Time warping + fractions = [(3, 2), (5, 3), (2, 3), (3, 4), (9, 5), (6, 5), (4, 5)] + for molecule, denominator in fractions: + new_data.append(time_wrapping(molecule, denominator, data)) + new_label.append(label) + # Movement amplification + for molecule, denominator in 
fractions: + new_data.append( + (np.array(data, dtype=np.float32) * molecule / denominator).tolist()) + new_label.append(label) + return new_data, new_label diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_augmentation_test.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_augmentation_test.py new file mode 100644 index 00000000000..76bac6585d6 --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_augmentation_test.py @@ -0,0 +1,58 @@ +# Lint as: python3 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# pylint: disable=g-bad-import-order + +"""Test for data_augmentation.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest + +import numpy as np + +from data_augmentation import augment_data +from data_augmentation import time_wrapping + + +class TestAugmentation(unittest.TestCase): + + def test_time_wrapping(self): + original_data = np.random.rand(10, 3).tolist() + wrapped_data = time_wrapping(4, 5, original_data) + self.assertEqual(len(wrapped_data), int(len(original_data) / 4 - 1) * 5) + self.assertEqual(len(wrapped_data[0]), len(original_data[0])) + + def test_augment_data(self): + original_data = [ + np.random.rand(128, 3).tolist(), + np.random.rand(66, 2).tolist(), + np.random.rand(9, 1).tolist() + ] + original_label = ["data", "augmentation", "test"] + augmented_data, augmented_label = augment_data(original_data, + original_label) + self.assertEqual(25 * len(original_data), len(augmented_data)) + self.assertIsInstance(augmented_data, list) + self.assertEqual(25 * len(original_label), len(augmented_label)) + self.assertIsInstance(augmented_label, list) + for i in range(len(original_label)): + self.assertEqual(augmented_label[25 * i], original_label[i]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_load.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_load.py new file mode 100644 index 00000000000..321b9c7ea0a --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_load.py @@ -0,0 +1,105 @@ +# Lint as: python3 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# pylint: disable=g-bad-import-order + +"""Load data from the specified paths and format them for training.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import numpy as np +import tensorflow as tf + +from data_augmentation import augment_data + +LABEL_NAME = "gesture" +DATA_NAME = "accel_ms2_xyz" + + +class DataLoader(object): + """Loads data and prepares for training.""" + + def __init__(self, train_data_path, valid_data_path, test_data_path, + seq_length): + self.dim = 3 + self.seq_length = seq_length + self.label2id = {"wing": 0, "ring": 1, "slope": 2, "negative": 3} + self.train_data, self.train_label, self.train_len = self.get_data_file( + train_data_path, "train") + self.valid_data, self.valid_label, self.valid_len = self.get_data_file( + valid_data_path, "valid") + self.test_data, self.test_label, self.test_len = self.get_data_file( + test_data_path, "test") + + def get_data_file(self, data_path, data_type): + """Get train, valid and test data from files.""" + data = [] + label = [] + with open(data_path, "r") as f: + lines = f.readlines() + for idx, line in enumerate(lines): # pylint: disable=unused-variable + dic = json.loads(line) + data.append(dic[DATA_NAME]) + label.append(dic[LABEL_NAME]) + if data_type == "train": + data, label = augment_data(data, label) + length = len(label) + print(data_type + "_data_length:" + str(length)) + return data, label, length + + def pad(self, data, seq_length, dim): + """Get neighboor padding.""" + noise_level = 20 + padded_data = [] + # Before- Neighbour padding + tmp_data = (np.random.rand(seq_length, dim) - 0.5) * noise_level + data[0] + tmp_data[(seq_length - + min(len(data), seq_length)):] = data[:min(len(data), seq_length)] + padded_data.append(tmp_data) + # After- Neighbour padding + tmp_data = (np.random.rand(seq_length, dim) - 0.5) * noise_level + data[-1] + tmp_data[:min(len(data), seq_length)] = data[:min(len(data), seq_length)] + padded_data.append(tmp_data) + return padded_data + + def format_support_func(self, padded_num, length, data, label): + """Support function for format.(Helps format train, valid and test.)""" + # Add 2 padding, initialize data and label + length *= padded_num + features = np.zeros((length, self.seq_length, self.dim)) + labels = np.zeros(length) + # Get padding for train, valid and test + for idx, (data, label) in enumerate(zip(data, label)): + padded_data = self.pad(data, self.seq_length, self.dim) + for num in range(padded_num): + features[padded_num * idx + num] = padded_data[num] + labels[padded_num * idx + num] = self.label2id[label] + # Turn into tf.data.Dataset + dataset = tf.data.Dataset.from_tensor_slices( + (features, labels.astype("int32"))) + return length, dataset + + def format(self): + """Format data(including padding, etc.) 
and get the dataset for the model.""" + padded_num = 2 + self.train_len, self.train_data = self.format_support_func( + padded_num, self.train_len, self.train_data, self.train_label) + self.valid_len, self.valid_data = self.format_support_func( + padded_num, self.valid_len, self.valid_data, self.valid_label) + self.test_len, self.test_data = self.format_support_func( + padded_num, self.test_len, self.test_data, self.test_label) diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_load_test.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_load_test.py new file mode 100644 index 00000000000..8a4ef45c7c4 --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_load_test.py @@ -0,0 +1,95 @@ +# Lint as: python3 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# pylint: disable=g-bad-import-order + +"""Test for data_load.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +from data_load import DataLoader + +import tensorflow as tf + + +class TestLoad(unittest.TestCase): + + def setUp(self): # pylint: disable=g-missing-super-call + self.loader = DataLoader( + "./data/train", "./data/valid", "./data/test", seq_length=512) + + def test_get_data(self): + self.assertIsInstance(self.loader.train_data, list) + self.assertIsInstance(self.loader.train_label, list) + self.assertIsInstance(self.loader.valid_data, list) + self.assertIsInstance(self.loader.valid_label, list) + self.assertIsInstance(self.loader.test_data, list) + self.assertIsInstance(self.loader.test_label, list) + self.assertEqual(self.loader.train_len, len(self.loader.train_data)) + self.assertEqual(self.loader.train_len, len(self.loader.train_label)) + self.assertEqual(self.loader.valid_len, len(self.loader.valid_data)) + self.assertEqual(self.loader.valid_len, len(self.loader.valid_label)) + self.assertEqual(self.loader.test_len, len(self.loader.test_data)) + self.assertEqual(self.loader.test_len, len(self.loader.test_label)) + + def test_pad(self): + original_data1 = [[2, 3], [1, 1]] + expected_data1_0 = [[2, 3], [2, 3], [2, 3], [2, 3], [1, 1]] + expected_data1_1 = [[2, 3], [1, 1], [1, 1], [1, 1], [1, 1]] + original_data2 = [[-2, 3], [-77, -681], [5, 6], [9, -7], [22, 3333], + [9, 99], [-100, 0]] + expected_data2 = [[-2, 3], [-77, -681], [5, 6], [9, -7], [22, 3333]] + padding_data1 = self.loader.pad(original_data1, seq_length=5, dim=2) + padding_data2 = self.loader.pad(original_data2, seq_length=5, dim=2) + for i in range(len(padding_data1[0])): + for j in range(len(padding_data1[0].tolist()[0])): + self.assertLess( + abs(padding_data1[0].tolist()[i][j] - expected_data1_0[i][j]), + 10.001) + for i in range(len(padding_data1[1])): + for j in range(len(padding_data1[1].tolist()[0])): + self.assertLess( + 
abs(padding_data1[1].tolist()[i][j] - expected_data1_1[i][j]), + 10.001) + self.assertEqual(padding_data2[0].tolist(), expected_data2) + self.assertEqual(padding_data2[1].tolist(), expected_data2) + + def test_format(self): + self.loader.format() + expected_train_label = int(self.loader.label2id[self.loader.train_label[0]]) + expected_valid_label = int(self.loader.label2id[self.loader.valid_label[0]]) + expected_test_label = int(self.loader.label2id[self.loader.test_label[0]]) + for feature, label in self.loader.train_data: # pylint: disable=unused-variable + format_train_label = label.numpy() + break + for feature, label in self.loader.valid_data: + format_valid_label = label.numpy() + break + for feature, label in self.loader.test_data: + format_test_label = label.numpy() + break + self.assertEqual(expected_train_label, format_train_label) + self.assertEqual(expected_valid_label, format_valid_label) + self.assertEqual(expected_test_label, format_test_label) + self.assertIsInstance(self.loader.train_data, tf.data.Dataset) + self.assertIsInstance(self.loader.valid_data, tf.data.Dataset) + self.assertIsInstance(self.loader.test_data, tf.data.Dataset) + + +if __name__ == "__main__": + unittest.main() diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_prepare.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_prepare.py new file mode 100644 index 00000000000..b5f1fcfdd01 --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_prepare.py @@ -0,0 +1,164 @@ +# Lint as: python3 +# coding=utf-8 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Prepare data for further process. + +Read data from "/slope", "/ring", "/wing", "/negative" and save them +in "/data/complete_data" in python dict format. 
+ +It will generate a new file with the following structure: +├── data +│   └── complete_data +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import csv +import json +import os +import random + +LABEL_NAME = "gesture" +DATA_NAME = "accel_ms2_xyz" +folders = ["wing", "ring", "slope"] +names = [ + "hyw", "shiyun", "tangsy", "dengyl", "zhangxy", "pengxl", "liucx", + "jiangyh", "xunkai" +] + + +def prepare_original_data(folder, name, data, file_to_read): # pylint: disable=redefined-outer-name + """Read collected data from files.""" + if folder != "negative": + with open(file_to_read, "r") as f: + lines = csv.reader(f) + data_new = {} + data_new[LABEL_NAME] = folder + data_new[DATA_NAME] = [] + data_new["name"] = name + for idx, line in enumerate(lines): # pylint: disable=unused-variable,redefined-outer-name + if len(line) == 3: + if line[2] == "-" and data_new[DATA_NAME]: + data.append(data_new) + data_new = {} + data_new[LABEL_NAME] = folder + data_new[DATA_NAME] = [] + data_new["name"] = name + elif line[2] != "-": + data_new[DATA_NAME].append([float(i) for i in line[0:3]]) + data.append(data_new) + else: + with open(file_to_read, "r") as f: + lines = csv.reader(f) + data_new = {} + data_new[LABEL_NAME] = folder + data_new[DATA_NAME] = [] + data_new["name"] = name + for idx, line in enumerate(lines): + if len(line) == 3 and line[2] != "-": + if len(data_new[DATA_NAME]) == 120: + data.append(data_new) + data_new = {} + data_new[LABEL_NAME] = folder + data_new[DATA_NAME] = [] + data_new["name"] = name + else: + data_new[DATA_NAME].append([float(i) for i in line[0:3]]) + data.append(data_new) + + +def generate_negative_data(data): # pylint: disable=redefined-outer-name + """Generate negative data labeled as 'negative6~8'.""" + # Big movement -> around straight line + for i in range(100): + if i > 80: + dic = {DATA_NAME: [], LABEL_NAME: "negative", "name": "negative8"} + elif i > 60: + dic = {DATA_NAME: [], LABEL_NAME: "negative", "name": "negative7"} + else: + dic = {DATA_NAME: [], LABEL_NAME: "negative", "name": "negative6"} + start_x = (random.random() - 0.5) * 2000 + start_y = (random.random() - 0.5) * 2000 + start_z = (random.random() - 0.5) * 2000 + x_increase = (random.random() - 0.5) * 10 + y_increase = (random.random() - 0.5) * 10 + z_increase = (random.random() - 0.5) * 10 + for j in range(128): + dic[DATA_NAME].append([ + start_x + j * x_increase + (random.random() - 0.5) * 6, + start_y + j * y_increase + (random.random() - 0.5) * 6, + start_z + j * z_increase + (random.random() - 0.5) * 6 + ]) + data.append(dic) + # Random + for i in range(100): + if i > 80: + dic = {DATA_NAME: [], LABEL_NAME: "negative", "name": "negative8"} + elif i > 60: + dic = {DATA_NAME: [], LABEL_NAME: "negative", "name": "negative7"} + else: + dic = {DATA_NAME: [], LABEL_NAME: "negative", "name": "negative6"} + for j in range(128): + dic[DATA_NAME].append([(random.random() - 0.5) * 1000, + (random.random() - 0.5) * 1000, + (random.random() - 0.5) * 1000]) + data.append(dic) + # Stay still + for i in range(100): + if i > 80: + dic = {DATA_NAME: [], LABEL_NAME: "negative", "name": "negative8"} + elif i > 60: + dic = {DATA_NAME: [], LABEL_NAME: "negative", "name": "negative7"} + else: + dic = {DATA_NAME: [], LABEL_NAME: "negative", "name": "negative6"} + start_x = (random.random() - 0.5) * 2000 + start_y = (random.random() - 0.5) * 2000 + start_z = (random.random() - 0.5) * 2000 + for j in range(128): + dic[DATA_NAME].append([ + start_x + 
(random.random() - 0.5) * 40, + start_y + (random.random() - 0.5) * 40, + start_z + (random.random() - 0.5) * 40 + ]) + data.append(dic) + + +# Write data to file +def write_data(data_to_write, path): + with open(path, "w") as f: + for idx, item in enumerate(data_to_write): # pylint: disable=unused-variable,redefined-outer-name + dic = json.dumps(item, ensure_ascii=False) + f.write(dic) + f.write("\n") + + +if __name__ == "__main__": + data = [] # pylint: disable=redefined-outer-name + for idx1, folder in enumerate(folders): + for idx2, name in enumerate(names): + prepare_original_data(folder, name, data, + "./%s/output_%s_%s.txt" % (folder, folder, name)) + for idx in range(5): + prepare_original_data("negative", "negative%d" % (idx + 1), data, + "./negative/output_negative_%d.txt" % (idx + 1)) + generate_negative_data(data) + print("data_length: " + str(len(data))) + if not os.path.exists("./data"): + os.makedirs("./data") + write_data(data, "./data/complete_data") diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_prepare_test.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_prepare_test.py new file mode 100644 index 00000000000..a2af0992a9d --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_prepare_test.py @@ -0,0 +1,75 @@ +# Lint as: python3 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Test for data_prepare.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import csv +import json +import os +import unittest +from data_prepare import generate_negative_data +from data_prepare import prepare_original_data +from data_prepare import write_data + + +class TestPrepare(unittest.TestCase): + + def setUp(self): # pylint: disable=g-missing-super-call + self.file = "./%s/output_%s_%s.txt" % (folders[0], folders[0], names[0]) # pylint: disable=undefined-variable + self.data = [] + prepare_original_data(folders[0], names[0], self.data, self.file) # pylint: disable=undefined-variable + + def test_prepare_data(self): + num = 0 + with open(self.file, "r") as f: + lines = csv.reader(f) + for idx, line in enumerate(lines): # pylint: disable=unused-variable + if len(line) == 3 and line[2] == "-": + num += 1 + self.assertEqual(len(self.data), num) + self.assertIsInstance(self.data, list) + self.assertIsInstance(self.data[0], dict) + self.assertEqual(list(self.data[-1]), ["gesture", "accel_ms2_xyz", "name"]) + self.assertEqual(self.data[0]["name"], names[0]) # pylint: disable=undefined-variable + + def test_generate_negative(self): + original_len = len(self.data) + generate_negative_data(self.data) + self.assertEqual(original_len + 300, len(self.data)) + generated_num = 0 + for idx, data in enumerate(self.data): # pylint: disable=undefined-variable, unused-variable + if data["name"] == "negative6" or data["name"] == "negative7" or data[ + "name"] == "negative8": + generated_num += 1 + self.assertEqual(generated_num, 300) + + def test_write_data(self): + data_path_test = "./data/data0" + write_data(self.data, data_path_test) + with open(data_path_test, "r") as f: + lines = f.readlines() + self.assertEqual(len(lines), len(self.data)) + self.assertEqual(json.loads(lines[0]), self.data[0]) + self.assertEqual(json.loads(lines[-1]), self.data[-1]) + os.remove(data_path_test) + + +if __name__ == "__main__": + unittest.main() diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split.py new file mode 100644 index 00000000000..3bf47472d5c --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split.py @@ -0,0 +1,90 @@ +# Lint as: python3 +# coding=utf-8 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Mix and split data. + +Mix different people's data together and randomly split them into train, +validation and test. These data would be saved separately under "/data". 
+It will generate new files with the following structure: + +├── data +│   ├── complete_data +│   ├── test +│   ├── train +│   └── valid +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import random +from data_prepare import write_data + + +# Read data +def read_data(path): + data = [] # pylint: disable=redefined-outer-name + with open(path, "r") as f: + lines = f.readlines() + for idx, line in enumerate(lines): # pylint: disable=unused-variable + dic = json.loads(line) + data.append(dic) + print("data_length:" + str(len(data))) + return data + + +def split_data(data, train_ratio, valid_ratio): # pylint: disable=redefined-outer-name + """Splits data into train, validation and test according to ratio.""" + train_data = [] # pylint: disable=redefined-outer-name + valid_data = [] # pylint: disable=redefined-outer-name + test_data = [] # pylint: disable=redefined-outer-name + num_dic = {"wing": 0, "ring": 0, "slope": 0, "negative": 0} + for idx, item in enumerate(data): # pylint: disable=unused-variable + for i in num_dic: + if item["gesture"] == i: + num_dic[i] += 1 + print(num_dic) + train_num_dic = {} + valid_num_dic = {} + for i in num_dic: + train_num_dic[i] = int(train_ratio * num_dic[i]) + valid_num_dic[i] = int(valid_ratio * num_dic[i]) + random.seed(30) + random.shuffle(data) + for idx, item in enumerate(data): + for i in num_dic: + if item["gesture"] == i: + if train_num_dic[i] > 0: + train_data.append(item) + train_num_dic[i] -= 1 + elif valid_num_dic[i] > 0: + valid_data.append(item) + valid_num_dic[i] -= 1 + else: + test_data.append(item) + print("train_length:" + str(len(train_data))) + print("test_length:" + str(len(test_data))) + return train_data, valid_data, test_data + + +if __name__ == "__main__": + data = read_data("./data/complete_data") + train_data, valid_data, test_data = split_data(data, 0.6, 0.2) + write_data(train_data, "./data/train") + write_data(valid_data, "./data/valid") + write_data(test_data, "./data/test") diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_person.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_person.py new file mode 100644 index 00000000000..be05213411a --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_person.py @@ -0,0 +1,75 @@ +# Lint as: python3 +# coding=utf-8 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Split data into train, validation and test dataset according to person. + +That is, use some people's data as train, some other people's data as +validation, and the rest ones' data as test. These data would be saved +separately under "/person_split". 
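+The name lists that control the split (train_names, valid_names and
+test_names) are defined in the __main__ block of this script.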
+ +It will generate new files with the following structure: +├──person_split +│   ├── test +│   ├── train +│   └──valid +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import random +from data_split import read_data +from data_split import write_data + + +def person_split(whole_data, train_names, valid_names, test_names): # pylint: disable=redefined-outer-name + """Split data by person.""" + random.seed(30) + random.shuffle(whole_data) + train_data = [] # pylint: disable=redefined-outer-name + valid_data = [] # pylint: disable=redefined-outer-name + test_data = [] # pylint: disable=redefined-outer-name + for idx, data in enumerate(whole_data): # pylint: disable=redefined-outer-name,unused-variable + if data["name"] in train_names: + train_data.append(data) + elif data["name"] in valid_names: + valid_data.append(data) + elif data["name"] in test_names: + test_data.append(data) + print("train_length:" + str(len(train_data))) + print("valid_length:" + str(len(valid_data))) + print("test_length:" + str(len(test_data))) + return train_data, valid_data, test_data + + +if __name__ == "__main__": + data = read_data("./data/complete_data") + train_names = [ + "hyw", "shiyun", "tangsy", "dengyl", "jiangyh", "xunkai", "negative3", + "negative4", "negative5", "negative6" + ] + valid_names = ["lsj", "pengxl", "negative2", "negative7"] + test_names = ["liucx", "zhangxy", "negative1", "negative8"] + train_data, valid_data, test_data = person_split(data, train_names, + valid_names, test_names) + if not os.path.exists("./person_split"): + os.makedirs("./person_split") + write_data(train_data, "./person_split/train") + write_data(valid_data, "./person_split/valid") + write_data(test_data, "./person_split/test") diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_person_test.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_person_test.py new file mode 100644 index 00000000000..3a91ce429ed --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_person_test.py @@ -0,0 +1,54 @@ +# Lint as: python3 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Test for data_split_person.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +from data_split_person import person_split +from data_split_person import read_data + + +class TestSplitPerson(unittest.TestCase): + + def setUp(self): # pylint: disable=g-missing-super-call + self.data = read_data("./data/complete_data") + + def test_person_split(self): + train_names = ["dengyl"] + valid_names = ["liucx"] + test_names = ["tangsy"] + dengyl_num = 63 + liucx_num = 63 + tangsy_num = 30 + train_data, valid_data, test_data = person_split(self.data, train_names, + valid_names, test_names) + self.assertEqual(len(train_data), dengyl_num) + self.assertEqual(len(valid_data), liucx_num) + self.assertEqual(len(test_data), tangsy_num) + self.assertIsInstance(train_data, list) + self.assertIsInstance(valid_data, list) + self.assertIsInstance(test_data, list) + self.assertIsInstance(train_data[0], dict) + self.assertIsInstance(valid_data[0], dict) + self.assertIsInstance(test_data[0], dict) + + +if __name__ == "__main__": + unittest.main() diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_test.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_test.py new file mode 100644 index 00000000000..9a8f1519faf --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/data_split_test.py @@ -0,0 +1,77 @@ +# Lint as: python3 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Test for data_split.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json +import unittest +from data_split import read_data +from data_split import split_data + + +class TestSplit(unittest.TestCase): + + def setUp(self): # pylint: disable=g-missing-super-call + self.data = read_data("./data/complete_data") + self.num_dic = {"wing": 0, "ring": 0, "slope": 0, "negative": 0} + with open("./data/complete_data", "r") as f: + lines = f.readlines() + self.num = len(lines) + + def test_read_data(self): + self.assertEqual(len(self.data), self.num) + self.assertIsInstance(self.data, list) + self.assertIsInstance(self.data[0], dict) + self.assertEqual( + set(list(self.data[-1])), set(["gesture", "accel_ms2_xyz", "name"])) + + def test_split_data(self): + with open("./data/complete_data", "r") as f: + lines = f.readlines() + for idx, line in enumerate(lines): # pylint: disable=unused-variable + dic = json.loads(line) + for ges in self.num_dic: + if dic["gesture"] == ges: + self.num_dic[ges] += 1 + train_data_0, valid_data_0, test_data_100 = split_data(self.data, 0, 0) + train_data_50, valid_data_50, test_data_0 = split_data(self.data, 0.5, 0.5) + train_data_60, valid_data_20, test_data_20 = split_data(self.data, 0.6, 0.2) + len_60 = int(self.num_dic["wing"] * 0.6) + int( + self.num_dic["ring"] * 0.6) + int(self.num_dic["slope"] * 0.6) + int( + self.num_dic["negative"] * 0.6) + len_50 = int(self.num_dic["wing"] * 0.5) + int( + self.num_dic["ring"] * 0.5) + int(self.num_dic["slope"] * 0.5) + int( + self.num_dic["negative"] * 0.5) + len_20 = int(self.num_dic["wing"] * 0.2) + int( + self.num_dic["ring"] * 0.2) + int(self.num_dic["slope"] * 0.2) + int( + self.num_dic["negative"] * 0.2) + self.assertEqual(len(train_data_0), 0) + self.assertEqual(len(train_data_50), len_50) + self.assertEqual(len(train_data_60), len_60) + self.assertEqual(len(valid_data_0), 0) + self.assertEqual(len(valid_data_50), len_50) + self.assertEqual(len(valid_data_20), len_20) + self.assertEqual(len(test_data_100), self.num) + self.assertEqual(len(test_data_0), (self.num - 2 * len_50)) + self.assertEqual(len(test_data_20), (self.num - len_60 - len_20)) + + +if __name__ == "__main__": + unittest.main() diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/netmodels/CNN/weights.h5 b/tensorflow/lite/experimental/micro/examples/magic_wand/train/netmodels/CNN/weights.h5 new file mode 100644 index 0000000000000000000000000000000000000000..1d825b3aaf7c391757c4a95f1975498a2e47861c GIT binary patch literal 40512 zcmeFa30#fcw>bVZ&vlyTIuSyWgw(Tk$CMB_wH<^I zG9?KiWJ;)nj2_At6V%#v-3$S{h`Z)Ha9`Mmb_`EqR;sqGcV zj~z0EVPqL;`_izz+8+(@+cOH{+9FPW0Q`#lPsV`zkO7{*%`dlolEU`TYeZW_MzV7M z)pa(ytvt3JUPc1-f5K_EuSfCktusvZ*ml77KwAFux>je(Bt0^cw$vGA$?6WCu`FPb z&-@wl0v7*XQ&-Z7)^Fu9Z411Oz!*jv`Yo6r=HTDn3B$~r5$+SRU_tQg`Llk@^l8{Pp|g$pMZUOwF66w1A}LTh6c=+*7XmVzc@ge_@iYXhqkQWOQkh`%Ti|q z?ekJ*^dvKzIm2&mz|b;{r%l{cQjcu>e#$ROqmcIzi+W_0U{A)+Q za{fnSK+0dd+Gi3+h~B1tnz>E_jmlL)}*an<@R;?ck8^MdmEgveHecS zS7!HTxOQ-6=%CRazt5$8!qRz3`ILl{EMBlEFnGbzwi@l*x`Z>d{T%EZ>>PW921?s2 z_>JlP`u)PdFAV&`z%LB^!oV*K{KCL54E(~tFAV&`z%LB^!oV*K{I|luxKX1A%Pp$xgV?kcGoC%$B2u{U_H<&qlG3*wuH+9pKwCz%M_ZURsx1s} zhZAmZi%8qcYEPH!Xv^of!p?Ys*Ap)d zm^~{f^j{RkKkJF5`W0!|rf=#gQJ>Ba2zIt_PtKe@W3jV++a#r$sy|OzYIl)-NSSs` 
z?u0()_Mp2}FF0{D5shME*vFl7G4X8!#Md<(fImI13`ya1r$$ z`39TC2cS$_32#wVym!l-CcPGesw2g@eYL2Jx5S=xtpS^&P@;2Zh^$2bt~qE*j%dnI zua<={`5=$T790kruLiWJI3EVT8wW|#6lP?1AL_ z^Xsr``&9VWOO@0wOW{1use;z?Ld@$^1eqThdT3Q8OuDB)cuPuIg`Yh>vWFaqOm%*jgh)gE0>J=6r&4j#nY6S0yIg*#rTl8pP*;9yw%o4p%-& zz?bV(Npi_P%->!OFVa&Ws`DPm6+0kfpIDf>;}TanYdci*Im+Igvk_DJTae}tC&6i0 zGdtpjB=<=XZE?{WY||p-K7vkjZ%sc+n5P7pW@ zg=+_+{lWp5m0HJf#0DZ&R8UxQ%z6IetFUskJo)k}1AX&DIFY&nDN5t8v*I3{dccZQ z=en{_t{PLjO;6$B;xkwxcZ=gUDX^N))8MgzC28951xEeI2SJ9&B`;i#7WFy}CwFB6 zlk(bGm^l%3Zz~bIrt6ScB@e!Zek^myTvQJoq5iNUo0?hX!fvWXPn$>V5SAg#q8yQj zNSmG=t4ln>^x?D7QB2Z#jD-iyN%(|Lh!PoophpX+o1Z|tC0EdX$Wbn6W)f&#u7jAl zEAfy?G&s4{pw%5E+Bip!#?N{U{&TlM_I4f-ys?L;pE5Ciio`yui=yc6r#LkgJ{fDc z3kQyA!BdkC;G?4kRLTE1p1vWm+Z<8?+s(sJdB`U=@Zf2D`?e#P6rBY1*mzuVKm~o1 z`0SNgIncF9iSUZ=ak0h;Z1#RDBEM@m=*?PyatS=rXNf*(QF6pLrwxek*e20%#bfX( zQJFT*Qlfd`Wf(GYE^9Mr5b$j)Mfc`eQBAw8@On-Y_U&2=wTCxyS;k>dTx&p$JhZ|0 zOFRgC=D6HBEBQMytRJO}e5+0htb$xHxQsG}|>9>A(aSa#a&( z&uTb7$O+}wnxwK~OH^?Gwv!JHqD=!gfXvQ>_0`-c(pDI=5{(b3DIy-kpS=gC?VaJu zFabTGG8-lP{D7*;!Qv1P%xg%L&T-}Zu4^%PF43Ze4`Q)3V!-Zql%iSnFQ{;f#q=gA?%HLHtIYZA zz@l<+|KcJ;t(T!=kP(-MAqDr)-|>fs0@_>uXACS%z$|+Kd&X3sj#$ALd2Js_$Imoi zw{tlg1{2MoSbrSy+UqEB7%S4!>4G`;V?e%cE)%)*IF~P@&6+okMrR*gcI?9dIBD=b zF8ipK5r>~;tSy9ej^`3wxR6Ju?3;^%(dCrPBn&51d&Av%LiUom4xY;?#>{bCF0ex# zr5$=1Bd)HOre^5T1Gn7A+5$7yWzT-_uujC&N2}3m#{tAp1(sy}ia8A#a3snG(oVE7 z7oRj>p>_$Z-c-Sam97QPTuXY|nnsX}kwQ|o3CnA$rYx%7VZGaiqNC9;daB`WG(0sM z`8Da3Xx$Dp%%6nX3xpWN<)&0^90UhP=R&AWEDkY>VY&-!(B)PL_{C?UcHv;GXwXH8 z=6BR&xdj`$Di5kkzJ&`ddi0tnmUPYXSWJ;wfL8EbV1$#QJ^mDxJ-rgLj%~)Y2S-JM zPmG~+Un-Mk)hG(8w8ui6q&DnHN}b7z%N{$n?ThaqS8>q@BR zwk~EReG`37CgQFc9;o4MbHa{hA$(>x_!{fN1V0m2uFKWWBsqaqtRGHQjG&_gj!qvhz)lJN7rmuH-70t9Ap+>j%>o@`3EM0xl2e-55B${tC6!&4{IDjY6lK z49Kd}V_V0^!TfEbApYELSTWR`c8e>3%ouZYPjH1rk(Z#&r2;3)&ZB$yA!_-LL+M0m zI-_f!gsqB4AZxydR?}T@hJ7#=b-zRFgcN&;SpMHgV0=&>)Rt6!d{$N#c z8K(G*#AN|8s(0P^%fdHn7b>NDvk7s zbCINEB>V4`E3$vV*Qx>abBNeyv#NhO5l$T+N@eC``ah4ve$@N5{~LUqzhrzaJ`UF% zgy3?h5%R& zgqNsX*j9CJ^mV>8CoB2;&zG0{!QRlvtDi3dm7FfB{X@hi_DX1iX+?~J!~\n", + " \n", + " Run in Google Colab\n", + " \n", + " \n", + " View source on GitHub\n", + " \n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xXgS6rxyT7Qk", + "colab_type": "text" + }, + "source": [ + "Training is much faster using GPU acceleration. Before you proceed, ensure you are using a GPU runtime by going to **Runtime -> Change runtime type** and selecting **GPU**. Training will take around 5 minutes on a GPU runtime." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "LG6ErX5FRIaV", + "colab_type": "text" + }, + "source": [ + "## Install dependencies\n", + "\n", + "Run the following cell to ensure the required dependencies are installed." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "h3sE3keZZnMX", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!pip uninstall -y tensorflow\n", + "!pip install -q tensorflow-gpu==2.0.0-beta1" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "STNft9TrfoVh", + "colab_type": "text" + }, + "source": [ + "We'll also clone the TensorFlow repository, which contains the training scripts, and copy them into our workspace." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "ygkWw73dRNda", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Clone the repository from GitHub\n", + "!git clone --depth 1 -q https://github.com/tensorflow/tensorflow\n", + "# Copy the training scripts into our workspace\n", + "!cp -r tensorflow/tensorflow/lite/experimental/micro/magic_wand/train train" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "pXI7R4RehFdU", + "colab_type": "text" + }, + "source": [ + "## Prepare the data\n", + "\n", + "Next, we'll download the data and extract it into the expected location within the training scripts' directory." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "W2Sg2AKzVr2L", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Download the data we will use to train the model\n", + "!wget http://download.tensorflow.org/models/tflite/magic_wand/data.tar.gz\n", + "# Extract the data into the train directory\n", + "!tar xvzf data.tar.gz -C train 1>/dev/null" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DNjukI1Sgl2C", + "colab_type": "text" + }, + "source": [ + "We'll then run the scripts that split the data into training, validation, and test sets." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "XBqSVpi6Vxss", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# The scripts must be run from within the train directory\n", + "%cd train\n", + "# Prepare the data\n", + "!python data_prepare.py\n", + "# Split the data by person\n", + "!python data_split_person.py" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5-cmVbFvhTvy", + "colab_type": "text" + }, + "source": [ + "## Load TensorBoard\n", + "\n", + "Now, we set up TensorBoard so that we can graph our accuracy and loss as training proceeds." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "CCx6SN9NWRPw", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Load TensorBoard\n", + "%load_ext tensorboard\n", + "%tensorboard --logdir logs/scalars" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ERC2Cr4PhaOl", + "colab_type": "text" + }, + "source": [ + "## Begin training\n", + "\n", + "The following cell will begin the training process. Training will take around 5 minutes on a GPU runtime. You'll see the metrics in TensorBoard after a few epochs." + ] + }, + { + "cell_type": "code", + "metadata": { + "id": "DXmQZgbuWQFO", + "colab_type": "code", + "colab": {} + }, + "source": [ + "!python train.py --model CNN --person true" + ], + "execution_count": 0, + "outputs": [] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4gXbVzcXhvGD", + "colab_type": "text" + }, + "source": [ + "## Create a C source file\n", + "\n", + "The `train.py` script writes a quantized model, `model_quantized.tflite`, to the training scripts' directory.\n", + "\n", + "In the following cell, we convert this model into a C++ source file we can use with TensorFlow Lite for Microcontrollers." 
+ ] + }, + { + "cell_type": "code", + "metadata": { + "id": "8wgei4OGe3Nz", + "colab_type": "code", + "colab": {} + }, + "source": [ + "# Install xxd if it is not available\n", + "!apt-get -qq install xxd\n", + "# Save the file as a C source file\n", + "!xxd -i model_quantized.tflite > /content/model_quantized.cc\n", + "# Print the source file\n", + "!cat /content/model_quantized.cc" + ], + "execution_count": 0, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/train_test.py b/tensorflow/lite/experimental/micro/examples/magic_wand/train/train_test.py new file mode 100644 index 00000000000..18467abeae3 --- /dev/null +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/train_test.py @@ -0,0 +1,77 @@ +# Lint as: python3 +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Test for train.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import numpy as np +import tensorflow as tf +from train import build_cnn +from train import build_lstm +from train import load_data +from train import reshape_function + + +class TestTrain(unittest.TestCase): + + def setUp(self): # pylint: disable=g-missing-super-call + self.seq_length = 128 + self.train_len, self.train_data, self.valid_len, self.valid_data, \ + self.test_len, self.test_data = \ + load_data("./data/train", "./data/valid", "./data/test", + self.seq_length) + + def test_load_data(self): + self.assertIsInstance(self.train_data, tf.data.Dataset) + self.assertIsInstance(self.valid_data, tf.data.Dataset) + self.assertIsInstance(self.test_data, tf.data.Dataset) + + def test_build_net(self): + cnn, cnn_path = build_cnn(self.seq_length) + lstm, lstm_path = build_lstm(self.seq_length) + cnn_data = np.random.rand(60, 128, 3, 1) + lstm_data = np.random.rand(60, 128, 3) + cnn_prob = cnn(tf.constant(cnn_data, dtype="float32")).numpy() + lstm_prob = lstm(tf.constant(lstm_data, dtype="float32")).numpy() + self.assertIsInstance(cnn, tf.keras.Sequential) + self.assertIsInstance(lstm, tf.keras.Sequential) + self.assertEqual(cnn_path, "./netmodels/CNN") + self.assertEqual(lstm_path, "./netmodels/LSTM") + self.assertEqual(cnn_prob.shape, (60, 4)) + self.assertEqual(lstm_prob.shape, (60, 4)) + + def test_reshape_function(self): + for data, label in self.train_data: + original_data_shape = data.numpy().shape + original_label_shape = label.numpy().shape + break + self.train_data = self.train_data.map(reshape_function) + for data, label in self.train_data: + reshaped_data_shape = data.numpy().shape + reshaped_label_shape = label.numpy().shape + break + self.assertEqual( + reshaped_data_shape, + (int(original_data_shape[0] * original_data_shape[1] / 3), 3, 1)) + self.assertEqual(reshaped_label_shape, original_label_shape) + + +if __name__ == 
"__main__": + unittest.main() From fe28ea37087df028455dd49e3d1d9c15116707a3 Mon Sep 17 00:00:00 2001 From: Jose Ignacio Gomez Date: Thu, 5 Dec 2019 15:14:22 -0800 Subject: [PATCH 209/383] [Linalg] Add permutation information to tiling This patch closes issue #271. It adds an optional permutation map to declarative tiling transformations. The map is expressed as a list of integers. Closes #288 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/288 from tetuante:issue271 2df2938d6a1f01b3bc404ded08dea2dd1e10b588 PiperOrigin-RevId: 284064151 Change-Id: Ice4bdd1060c3607e2d59262475954c24090d5e30 --- .../mlir/xla/transforms/lhlo_fuse_linalg.cc | 3 +- .../Transforms/LinalgTransformPatterns.td | 8 ++- .../Linalg/Transforms/LinalgTransforms.h | 15 ++++- .../include/mlir/Dialect/Linalg/Utils/Utils.h | 28 ++++++-- third_party/mlir/include/mlir/IR/AffineMap.h | 9 +++ .../Linalg/Transforms/LinalgTransforms.cpp | 10 +-- .../lib/Dialect/Linalg/Transforms/Tiling.cpp | 47 +++++++++++--- third_party/mlir/lib/IR/AffineMap.cpp | 14 ++++ third_party/mlir/test/BUILD | 21 +++++- .../lib/DeclarativeTransforms/CMakeLists.txt | 4 ++ .../TestLinalgTilePermutePatterns.td | 57 +++++++++++++++++ .../mlir/test/lib/Transforms/CMakeLists.txt | 2 + .../TestLinalgTilePermuteTransforms.cpp | 64 +++++++++++++++++++ 13 files changed, 255 insertions(+), 27 deletions(-) create mode 100644 third_party/mlir/test/lib/DeclarativeTransforms/TestLinalgTilePermutePatterns.td create mode 100644 third_party/mlir/test/lib/Transforms/TestLinalgTilePermuteTransforms.cpp diff --git a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc index 4dabe0dea42..928bfc20cdb 100644 --- a/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc +++ b/tensorflow/compiler/mlir/xla/transforms/lhlo_fuse_linalg.cc @@ -54,7 +54,8 @@ struct LhloFuseLinalg : public FunctionPass { auto op = cast(generic_op.getOperation()); for (const Value* result : op.getOutputs()) { if (!func_args.count(result)) continue; - if (linalg::tileLinalgOp(b, op, tile_sizes, &folder)) { + if (linalg::tileLinalgOp(b, op, tile_sizes, /*permutation=*/{}, + &folder)) { generic_op.erase(); return; } diff --git a/third_party/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td b/third_party/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td index 8bc0eaf2097..f558fa5da48 100644 --- a/third_party/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td +++ b/third_party/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td @@ -57,9 +57,13 @@ class TileAndFuseLinalgOp< // In the future, tile sizes should be derived from op properties + machine // description but we do not need to wait on this to start having useful // patterns. -class TileLinalgOp sizes, string value> : NativeCodeCall< +// `permutation` is an optional parameter to specify the ordering of the +// tiled loops. If provided, it must be a list of integers with the same number +// of elements as `sizes`. 
+class TileLinalgOp sizes, string value, list permutation=[]> : NativeCodeCall< "if (failed(tileLinalgOpAndSetMarker($_builder, $0, {" # - StrJoinInt.result # "}, \"" # value # "\")))" # + StrJoinInt.result # "}, \"" # value # "\", {" # + StrJoinInt.result # "})))" # " return matchFailure();">; //===----------------------------------------------------------------------===// diff --git a/third_party/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransforms.h b/third_party/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransforms.h index 966b8f93135..89615e113c7 100644 --- a/third_party/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransforms.h +++ b/third_party/mlir/include/mlir/Dialect/Linalg/Transforms/LinalgTransforms.h @@ -58,11 +58,20 @@ bool isProducedByOpOfType(Operation *consumerOp, Value *consumedView) { // success. //////////////////////////////////////////////////////////////////////////////// -// Tiles `op` by `sizes` and sets the attribute `kLinalgTransformMarker` to -// `linalgMarker`. +/// Tiles `op` by `sizes` permuting the looops according to `permutation` +/// and sets the attribute `kLinalgTransformMarker` to `linalgMarker`. +/// The permutation is expressed as a list of integers that specify +/// the new ordering of the loop nest. The length of `permutation` +/// must be equal to the length of `tileSizes`. +/// E.g. the permutation `(i,j,k) -> (j,k,i)` will be expressed with +/// `permutation = [1,2,0]`. All values in `permutation` must be +/// integers, in the range 0..`tileSizes.size()` without duplications +/// (i.e. `[1,1,2]` is an invalid permutation). An empty list +/// states for the identity permutation. LogicalResult tileLinalgOpAndSetMarker(PatternRewriter &rewriter, Operation *op, ArrayRef sizes, - StringRef linalgMarker); + StringRef linalgMarker, + ArrayRef permutation); // Tiles `op` by `sizes`, fuses the producers of `operandIndicesToFuse` and sets // the attribute `kLinalgTransformMarker` to `linalgMarker`. diff --git a/third_party/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h b/third_party/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h index 91c7082b264..8dc78458c87 100644 --- a/third_party/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h +++ b/third_party/mlir/include/mlir/Dialect/Linalg/Utils/Utils.h @@ -134,23 +134,43 @@ struct TiledLinalgOp { }; /// Performs standalone tiling of a single LinalgOp by `tileSizes`. -/// Returns a struct containing the tiled loops and the cloned op if successful, -/// llvm::None otherwise. +/// and permute the loop nest according to `permutation` +/// The permutation is expressed as a list of integers that specify +/// the new ordering of the loop nest. The length of `permutation` +/// must be equal to the length of `tileSizes`. +/// E.g. the permutation `(i,j,k) -> (j,k,i)` will be expressed with +/// `permutation = [1,2,0]`. All values in `permutation` must be +/// integers, in the range 0..`tileSizes.size()` without duplications +/// (i.e. `[1,1,2]` is an invalid permutation). An empty list +/// states for the identity permutation. +/// Returns a struct containing the tiled loops in the specified order +/// and the cloned op if successful, llvm::None otherwise. /// When non-null, the optional pointer `folder` is used to call into the /// `createAndFold` builder method. If `folder` is null, the regular `create` /// method is called. 
llvm::Optional tileLinalgOp(OpBuilder &b, LinalgOp op, ArrayRef tileSizes, + ArrayRef permutation = {}, OperationFolder *folder = nullptr); /// Performs standalone tiling of a single LinalgOp by constant `tileSizes`. -/// Returns a struct containing the tiled loops and the cloned op if successful, -/// llvm::None otherwise. +/// and permute the loop nest according to `permutation` +/// The permutation is expressed as a list of integers that specify +/// the new ordering of the loop nest. The length of `permutation` +/// must be equal to the length of `tileSizes`. +/// E.g. the permutation `(i,j,k) -> (j,k,i)` will be expressed with +/// `permutation = [1,2,0]`. All values in `permutation` must be +/// integers, in the range 0..`tileSizes.size()` without duplications +/// (i.e. `[1,1,2]` is an invalid permutation). An empty list +/// states for the identity permutation. +/// Returns a struct containing the tiled loops in the specified order +/// and the cloned op if successful, llvm::None otherwise. /// When non-null, the optional pointer `folder` is used to call into the /// `createAndFold` builder method. If `folder` is null, the regular `create` /// method is called. llvm::Optional tileLinalgOp(OpBuilder &b, LinalgOp op, ArrayRef tileSizes, + ArrayRef permutation = {}, OperationFolder *folder = nullptr); template diff --git a/third_party/mlir/include/mlir/IR/AffineMap.h b/third_party/mlir/include/mlir/IR/AffineMap.h index 9b30f15628a..e42173d5a2b 100644 --- a/third_party/mlir/include/mlir/IR/AffineMap.h +++ b/third_party/mlir/include/mlir/IR/AffineMap.h @@ -65,6 +65,15 @@ public: static AffineMap getMultiDimIdentityMap(unsigned numDims, MLIRContext *context); + /// Returns an AffineMap representing a permutation. + /// The permutation is expressed as a non-empty vector of integers. + /// E.g. the permutation `(i,j,k) -> (j,k,i)` will be expressed with + /// `permutation = [1,2,0]`. All values in `permutation` must be + /// integers, in the range 0..`permutation.size()-1` without duplications + /// (i.e. `[1,1,2]` is an invalid permutation). 
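+  /// For example, `getPermutationMap({1, 2, 0}, context)` returns the map
+  /// `(d0, d1, d2) -> (d1, d2, d0)`.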
+ static AffineMap getPermutationMap(ArrayRef permutation, + MLIRContext *context); + MLIRContext *getContext() const; explicit operator bool() { return map != nullptr; } diff --git a/third_party/mlir/lib/Dialect/Linalg/Transforms/LinalgTransforms.cpp b/third_party/mlir/lib/Dialect/Linalg/Transforms/LinalgTransforms.cpp index 0e4aaa7ac83..1b4509ffc11 100644 --- a/third_party/mlir/lib/Dialect/Linalg/Transforms/LinalgTransforms.cpp +++ b/third_party/mlir/lib/Dialect/Linalg/Transforms/LinalgTransforms.cpp @@ -33,11 +33,11 @@ using namespace mlir::linalg; const StringLiteral mlir::linalg::LinalgTransforms::kLinalgTransformMarker = "__internal_linalg_transform__"; -LogicalResult mlir::linalg::tileLinalgOpAndSetMarker(PatternRewriter &rewriter, - Operation *op, - ArrayRef sizes, - StringRef linalgMarker) { - auto tileRes = tileLinalgOperation(rewriter, op, sizes); +LogicalResult mlir::linalg::tileLinalgOpAndSetMarker( + PatternRewriter &rewriter, Operation *op, ArrayRef sizes, + StringRef linalgMarker, ArrayRef permutation) { + assert(permutation.empty() || permutation.size() == sizes.size()); + auto tileRes = tileLinalgOperation(rewriter, op, sizes, permutation); if (!tileRes) return failure(); tileRes->op.setAttr(LinalgTransforms::kLinalgTransformMarker, diff --git a/third_party/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/third_party/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp index 09a1ba6b332..2c84eeecbba 100644 --- a/third_party/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp +++ b/third_party/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp @@ -215,10 +215,17 @@ makeTiledViews(OpBuilder &b, Location loc, LinalgOp linalgOp, return res; } -llvm::Optional -mlir::linalg::tileLinalgOp(OpBuilder &b, LinalgOp op, - ArrayRef tileSizes, - OperationFolder *folder) { +void applyPermutationToLoopRanges(SmallVector &loopRanges, + ArrayRef permutation) { + SmallVector auxVec(loopRanges.size()); + for (unsigned i = 0; i < permutation.size(); ++i) + auxVec[i] = loopRanges[permutation[i]]; + loopRanges = auxVec; +} + +llvm::Optional mlir::linalg::tileLinalgOp( + OpBuilder &b, LinalgOp op, ArrayRef tileSizes, + ArrayRef permutation, OperationFolder *folder) { // 1. Enforce the convention that "tiling by zero" skips tiling a particular // dimension. This convention is significantly simpler to handle instead of // adjusting affine maps to account for missing dimensions. @@ -226,6 +233,15 @@ mlir::linalg::tileLinalgOp(OpBuilder &b, LinalgOp op, op.getNumWindowLoops() == tileSizes.size() && "expected matching number of tile sizes and loops"); + + // If permutation is empty, use the identity. Build the permutation map + // otherwise. + auto invPermutationMap = AffineMap::getMultiDimIdentityMap( + tileSizes.size(), ScopedContext::getContext()); + if (!permutation.empty()) + invPermutationMap = inversePermutation( + AffineMap::getPermutationMap(permutation, ScopedContext::getContext())); + OpBuilder::InsertionGuard g(b); b.setInsertionPoint(op); ScopedContext scope(b, op.getLoc()); @@ -239,6 +255,8 @@ mlir::linalg::tileLinalgOp(OpBuilder &b, LinalgOp op, auto loopRanges = makeTiledLoopRanges(b, scope.getLocation(), viewSizesToLoopsMap, viewSizes, tileSizes, folder); + if (!permutation.empty()) + applyPermutationToLoopRanges(loopRanges, permutation); // 3. Create the tiled loops. 
LinalgOp res = op; @@ -248,6 +266,15 @@ mlir::linalg::tileLinalgOp(OpBuilder &b, LinalgOp op, auto b = ScopedContext::getBuilder(); auto loc = ScopedContext::getLocation(); SmallVector ivValues(ivs.begin(), ivs.end()); + + // If we have to apply a permutation to the tiled loop nest, we have to + // reorder the induction variables This permutation is the right one + // assuming that loopRanges have previously been permuted by + // (i,j,k)->(k,i,j) So this permutation should be the inversePermutation of + // that one: (d0,d1,d2)->(d2,d0,d1) + if (!permutation.empty()) + ivValues = applyMapToValues(b, loc, invPermutationMap, ivValues, folder); + auto views = makeTiledViews(b, loc, op, ivValues, tileSizes, viewSizes, folder); auto operands = getAssumedNonViewOperands(op); @@ -264,10 +291,9 @@ mlir::linalg::tileLinalgOp(OpBuilder &b, LinalgOp op, return TiledLinalgOp{res, loops}; } -llvm::Optional -mlir::linalg::tileLinalgOp(OpBuilder &b, LinalgOp op, - ArrayRef tileSizes, - OperationFolder *folder) { +llvm::Optional mlir::linalg::tileLinalgOp( + OpBuilder &b, LinalgOp op, ArrayRef tileSizes, + ArrayRef permutation, OperationFolder *folder) { if (tileSizes.empty()) return llvm::None; @@ -297,14 +323,15 @@ mlir::linalg::tileLinalgOp(OpBuilder &b, LinalgOp op, tileSizeValues.push_back(constant_index(folder, 0)); } - return tileLinalgOp(b, op, tileSizeValues, folder); + return tileLinalgOp(b, op, tileSizeValues, permutation, folder); } static void tileLinalgOps(FuncOp f, ArrayRef tileSizes) { OpBuilder b(f); OperationFolder folder(f.getContext()); f.walk([tileSizes, &b, &folder](LinalgOp op) { - auto opLoopsPair = tileLinalgOp(b, op, tileSizes, &folder); + auto opLoopsPair = + tileLinalgOp(b, op, tileSizes, /*permutation=*/{}, &folder); // If tiling occurred successfully, erase old op. if (opLoopsPair) op.erase(); diff --git a/third_party/mlir/lib/IR/AffineMap.cpp b/third_party/mlir/lib/IR/AffineMap.cpp index e56d0e83f65..98357b1348b 100644 --- a/third_party/mlir/lib/IR/AffineMap.cpp +++ b/third_party/mlir/lib/IR/AffineMap.cpp @@ -106,6 +106,20 @@ AffineMap AffineMap::getConstantMap(int64_t val, MLIRContext *context) { {getAffineConstantExpr(val, context)}); } +/// Returns an AffineMap representing a permutation. 
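+/// The result has `max(permutation) + 1` dims and no symbols, with one result
+/// expression per entry; the `isPermutation()` assertion below rejects
+/// duplicate or out-of-range values.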
+AffineMap AffineMap::getPermutationMap(ArrayRef permutation, + MLIRContext *context) { + assert(!permutation.empty() && + "Cannot create permutation map from empty permutation vector"); + SmallVector affExprs; + for (auto index : permutation) + affExprs.push_back(getAffineDimExpr(index, context)); + auto m = std::max_element(permutation.begin(), permutation.end()); + auto permutationMap = AffineMap::get(*m + 1, 0, affExprs); + assert(permutationMap.isPermutation() && "Invalid permutation vector"); + return permutationMap; +} + AffineMap AffineMap::getMultiDimIdentityMap(unsigned numDims, MLIRContext *context) { SmallVector dimExprs; diff --git a/third_party/mlir/test/BUILD b/third_party/mlir/test/BUILD index 63138125ed0..693b66f1dfd 100644 --- a/third_party/mlir/test/BUILD +++ b/third_party/mlir/test/BUILD @@ -18,7 +18,7 @@ cc_library( ) gentbl( - name = "TestTransformPatternsIncGen", + name = "TestLinalgTransformPatternsIncGen", tbl_outs = [ ( "-gen-rewriters", @@ -32,6 +32,21 @@ gentbl( ], ) +gentbl( + name = "TestLinalgTilePermuteTransformPatternsIncGen", + tbl_outs = [ + ( + "-gen-rewriters", + "lib/DeclarativeTransforms/TestLinalgTilePermutePatterns.h.inc", + ), + ], + tblgen = "@local_config_mlir//:mlir-tblgen", + td_file = "lib/DeclarativeTransforms/TestLinalgTilePermutePatterns.td", + td_srcs = [ + "@local_config_mlir//:LinalgTransformPatternsTdFiles", + ], +) + gentbl( name = "TestOpsIncGen", strip_include_prefix = "lib/TestDialect", @@ -130,6 +145,7 @@ cc_library( "lib/Transforms/TestCallGraph.cpp", "lib/Transforms/TestConstantFold.cpp", "lib/Transforms/TestInlining.cpp", + "lib/Transforms/TestLinalgTilePermuteTransforms.cpp", "lib/Transforms/TestLinalgTransforms.cpp", "lib/Transforms/TestLoopFusion.cpp", "lib/Transforms/TestLoopMapping.cpp", @@ -143,7 +159,8 @@ cc_library( includes = ["lib/TestDialect"], deps = [ ":TestDialect", - ":TestTransformPatternsIncGen", + ":TestLinalgTilePermuteTransformPatternsIncGen", + ":TestLinalgTransformPatternsIncGen", "@llvm//:support", "@local_config_mlir//:AffineOps", "@local_config_mlir//:Analysis", diff --git a/third_party/mlir/test/lib/DeclarativeTransforms/CMakeLists.txt b/third_party/mlir/test/lib/DeclarativeTransforms/CMakeLists.txt index 06e81a098f4..1ee62d82129 100644 --- a/third_party/mlir/test/lib/DeclarativeTransforms/CMakeLists.txt +++ b/third_party/mlir/test/lib/DeclarativeTransforms/CMakeLists.txt @@ -1,3 +1,7 @@ set(LLVM_TARGET_DEFINITIONS TestLinalgTransformPatterns.td) mlir_tablegen(TestLinalgTransformPatterns.h.inc -gen-rewriters) add_public_tablegen_target(MLIRTestLinalgTransformPatternsIncGen) + +set(LLVM_TARGET_DEFINITIONS TestLinalgTilePermutePatterns.td) +mlir_tablegen(TestLinalgTilePermutePatterns.h.inc -gen-rewriters) +add_public_tablegen_target(MLIRTestLinalgTilePermutePatternsIncGen) diff --git a/third_party/mlir/test/lib/DeclarativeTransforms/TestLinalgTilePermutePatterns.td b/third_party/mlir/test/lib/DeclarativeTransforms/TestLinalgTilePermutePatterns.td new file mode 100644 index 00000000000..6d7bfffdf71 --- /dev/null +++ b/third_party/mlir/test/lib/DeclarativeTransforms/TestLinalgTilePermutePatterns.td @@ -0,0 +1,57 @@ +//===- TestLinalgTilePermutePatterns.td - Test patterns --*- tablegen ----*-===// +// +// Copyright 2019 The MLIR Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= +// +// This is the pattern definition file for declarative Linalg transformations +// tests. +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_LINALG_TILEPERMUTE_PATTERNS +#define TEST_LINALG_TILEPERMUTE_PATTERNS + +include "mlir/Dialect/Linalg/Transforms/LinalgTransformPatterns.td" + +//===----------------------------------------------------------------------===// +// Linalg tiling and permutation patterns. +//===----------------------------------------------------------------------===// +def : Pat<(MatmulOp:$op $A, $B, $C), + (TileLinalgOp<[2000, 3000, 4000], "L2", [1,2,0]> $op), + [(Constraint]>> $op)]>; +def : Pat<(MatmulOp:$op $A, $B, $C), + (TileLinalgOp<[200, 300, 400], "L1", [1,0,2]> $op), + [(Constraint> $op)]>; +def : Pat<(MatmulOp:$op $A, $B, $C), + (TileLinalgOp<[20, 30, 40], "REG"> $op), + [(Constraint> $op)]>; + + +def : Pattern<(MatvecOp:$op $A, $b, $c), + [(TileLinalgOp<[5, 6], "L1", [1,0]> $op)], + [(Constraint $op)]>; + +def : Pattern<(DotOp:$op $a, $b, $c), + [(TileLinalgOp<[8000], "L1"> $op)], + [(Constraint, + HasLinalgTransformMarker<"L3">, + HasLinalgTransformMarker<"L2">]>> $op)]>; +def : Pattern<(DotOp:$op $a, $b, $c), + [(TileLinalgOp<[8], "REG"> $op)], + [(Constraint> $op)]>; + +#endif // TEST_LINALG_TILEPERMUTE_PATTERNS diff --git a/third_party/mlir/test/lib/Transforms/CMakeLists.txt b/third_party/mlir/test/lib/Transforms/CMakeLists.txt index 8bc9c736187..8a7933451b8 100644 --- a/third_party/mlir/test/lib/Transforms/CMakeLists.txt +++ b/third_party/mlir/test/lib/Transforms/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_library(MLIRTestTransforms TestLoopFusion.cpp TestInlining.cpp TestLinalgTransforms.cpp + TestLinalgTilePermuteTransforms.cpp TestLoopMapping.cpp TestLoopParametricTiling.cpp TestOpaqueLoc.cpp @@ -21,6 +22,7 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../DeclarativeTransforms) include_directories(${CMAKE_CURRENT_BINARY_DIR}/../DeclarativeTransforms) add_dependencies(MLIRTestTransforms MLIRStandardOpsIncGen) add_dependencies(MLIRTestTransforms MLIRTestLinalgTransformPatternsIncGen) +add_dependencies(MLIRTestTransforms MLIRTestLinalgTilePermutePatternsIncGen) target_link_libraries(MLIRTestTransforms MLIRAffineOps MLIRAnalysis diff --git a/third_party/mlir/test/lib/Transforms/TestLinalgTilePermuteTransforms.cpp b/third_party/mlir/test/lib/Transforms/TestLinalgTilePermuteTransforms.cpp new file mode 100644 index 00000000000..ec7fa4e71b4 --- /dev/null +++ b/third_party/mlir/test/lib/Transforms/TestLinalgTilePermuteTransforms.cpp @@ -0,0 +1,64 @@ +//===- TestLinalgTilePermuteTransforms.cpp - Test Linalg tile + permute ---===// +// +// Copyright 2019 The MLIR Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// ============================================================================= +// +// This file implements logic for testing Linalg transformations. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Linalg/IR/LinalgOps.h" +#include "mlir/Dialect/Linalg/Transforms/LinalgTransforms.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; +using namespace mlir::linalg; + +namespace mlir { +namespace linalg { +namespace { +#include "TestLinalgTilePermutePatterns.h.inc" +} // end namespace +} // end namespace linalg +} // end namespace mlir + +namespace { +struct TestLinalgTilePermuteTransforms + : public FunctionPass { + void runOnFunction() override; +}; +} // end anonymous namespace + +/// Apply transformations specified as patterns. +void TestLinalgTilePermuteTransforms::runOnFunction() { + OwningRewritePatternList patterns; + auto funcOp = getFunction(); + + // Add the generated patterns to the list. + linalg::populateWithGenerated(&getContext(), &patterns); + applyPatternsGreedily(funcOp, patterns); + + // Drop the marker. + funcOp.walk([](LinalgOp op) { + op.removeAttr(LinalgTransforms::kLinalgTransformMarker); + }); +} + +static PassRegistration + pass("test-linalg-tile-and-permute-patterns", + "Test Linalg transformation with permutation patterns by applying " + "them greedily."); From 4198b1c7315aef7b86b2c07c8b29d5c42791aaf1 Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Thu, 5 Dec 2019 15:18:26 -0800 Subject: [PATCH 210/383] Produce better error message on SavedModel import failure This will now produce a useful error message like this instead of crashing: ``` Invalid argument: While importing SavedModel function 'terminal.consume': in input signature: Unhandled structured value kind 12 at index path: .1 This likely means that you have @tf.function on an exported function instead of @tf.function(input_signature=[...]). Consider narrowing your set of exported names. ``` PiperOrigin-RevId: 284065012 Change-Id: I0cdcfc95b9d4d5f2950f79e5124bcc2fae84e75c --- .../tf_saved_model/exported_python_args.py | 41 +++++++++++++++ .../mlir/tensorflow/translate/import_model.cc | 52 ++++++++++++++++--- 2 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/exported_python_args.py diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/exported_python_args.py b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/exported_python_args.py new file mode 100644 index 00000000000..f73aa83a76c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/exported_python_args.py @@ -0,0 +1,41 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +# RUN: (! %p/exported_python_args 2>&1) | FileCheck %s + +# pylint: disable=missing-docstring,line-too-long,dangerous-default-value +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow.compat.v2 as tf +from tensorflow.compiler.mlir.tensorflow.tests.tf_saved_model import common + + +class TestModule(tf.Module): + + @tf.function(input_signature=[tf.TensorSpec([], tf.float32)]) + def some_function(self, x): + return self.callee(x) + + # CHECK: While importing SavedModel function 'callee': in input signature: + # CHECK-SAME: Unhandled structured value kind {{.*}} at index path: .1.foo + @tf.function + def callee(self, x, n={'foo': 42}): + return x + + +if __name__ == '__main__': + common.do_test(TestModule) diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 247e5f59de2..3659a6b5a2b 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -2280,7 +2280,11 @@ class StructuredValueLinearizer { // Returns the list of index paths to each leaf of the StructuredValue, // in a linearized order matching `tf.nest.flatten`. - llvm::ArrayRef GetLeafIndexPaths() const; + // + // If an error ocurred during the linearization process, an error message with + // `error_context` prepended will be included in the returned status. + StatusOr> GetLeafIndexPaths( + llvm::StringRef error_context) const; private: // Main function that recursively traverses the StructuredValue. @@ -2292,6 +2296,8 @@ class StructuredValueLinearizer { llvm::SmallVector current_index_path_; // The list of leaf index paths we have discovered so far. llvm::SmallVector leaf_index_paths_; + // If non-empty, an error message to report. + std::string error_message_; }; StructuredValueLinearizer::StructuredValueLinearizer( @@ -2300,9 +2306,19 @@ StructuredValueLinearizer::StructuredValueLinearizer( RecursivelyFindLeaves(value); } -llvm::ArrayRef StructuredValueLinearizer::GetLeafIndexPaths() - const { - return leaf_index_paths_; +StatusOr> +StructuredValueLinearizer::GetLeafIndexPaths( + llvm::StringRef error_context) const { + if (error_message_.empty()) { + return llvm::makeArrayRef(leaf_index_paths_); + } + return errors::InvalidArgument( + error_context.str(), error_message_, + "This likely means that you have @tf.function " + "on an exported function instead of " + "@tf.function(input_signature=[...]). Consider annotating an " + "input_signature or narrowing your set of " + "exported names to not include this function."); } void StructuredValueLinearizer::RecursivelyFindLeaves( @@ -2358,7 +2374,20 @@ void StructuredValueLinearizer::RecursivelyFindLeaves( return; } default: { - llvm_unreachable("Unhandled StructuredValue kind!"); + llvm::raw_string_ostream os(error_message_); + // TODO(silvasean): Use an enumerant name string instead of a number. 
+ os << "Unhandled structured value kind " << value.kind_case() + << " at index path: "; + for (auto path_element : current_index_path_) { + os << "."; + if (auto integer = path_element.dyn_cast()) { + os << integer.getValue(); + } else { + auto str = path_element.cast(); + os << str.getValue(); + } + } + os << "\n"; } } } @@ -2452,6 +2481,9 @@ Status CreateSavedModelIR( if (object_names.GetExportedNames(node_id).empty()) { continue; } + std::string error_context = + "While importing SavedModel function '" + + object_names.GetExportedNames(node_id)[0].str() + "': "; const SavedFunction& function = object.function(); auto orig_func = symbol_table.lookup( tf_name_to_mlir_name.find(function.concrete_functions(0))->second); @@ -2500,9 +2532,12 @@ Status CreateSavedModelIR( int bound_input_base = func.getNumArguments() - concrete_function.bound_inputs_size(); - auto input_index_paths = input_linearizer.GetLeafIndexPaths(); + TF_ASSIGN_OR_RETURN(auto input_index_paths, + input_linearizer.GetLeafIndexPaths( + error_context + "in input signature: ")); if (bound_input_base != input_index_paths.size()) { return errors::InvalidArgument( + error_context, "Argument mismatch between concrete function input signature " "vs underlying FunctionDef for concrete function '", function.concrete_functions(0), "' (", input_index_paths.size(), @@ -2523,9 +2558,12 @@ Status CreateSavedModelIR( StructuredValueLinearizer output_linearizer( concrete_function.output_signature(), builder.getContext()); - auto output_index_paths = output_linearizer.GetLeafIndexPaths(); + TF_ASSIGN_OR_RETURN(auto output_index_paths, + output_linearizer.GetLeafIndexPaths( + error_context + "in output signature: ")); if (func.getNumResults() != output_index_paths.size()) { return errors::InvalidArgument( + error_context, "Result mismatch between concrete function output signature " "vs underlying FunctionDef for concrete function '", function.concrete_functions(0), "' (", output_index_paths.size(), From 9acd3202ddcfa6518f21a657e0dd88c34af4b70e Mon Sep 17 00:00:00 2001 From: Karmel Allison Date: Thu, 5 Dec 2019 15:20:38 -0800 Subject: [PATCH 211/383] Add docstrings for get/set layer weights. PiperOrigin-RevId: 284065436 Change-Id: I11035ec0fbe39afc44cec0a3dcb45f087d6442a2 --- tensorflow/python/keras/engine/base_layer.py | 61 +++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py index 276431b615b..b5d1d31219b 100644 --- a/tensorflow/python/keras/engine/base_layer.py +++ b/tensorflow/python/keras/engine/base_layer.py @@ -124,7 +124,7 @@ class Layer(module.Module): using Python control flow. If `False`, we assume that the layer can safely be used to generate a static computation graph. - Read-only properties: + Attributes (read-only properties): name: The name of the layer (string). dtype: The dtype of the layer's computations and weights. If mixed precision is used with a `tf.keras.mixed_precision.experimental.Policy`, @@ -1264,6 +1264,36 @@ class Layer(module.Module): def set_weights(self, weights): """Sets the weights of the layer, from Numpy arrays. + The weights of a layer represent the state of the layer. This function + sets the weight values from numpy arrays. The weight values should be + passed in the order they are created by the layer. Note that the layer's + weights must be instantiated before calling this function by calling + the layer. 
+ + For example, a Dense layer returns a list of two values-- per-output + weights and the bias value. These can be used to set the weights of another + Dense layer: + + >>> a = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(1.)) + >>> a_out = a(tf.convert_to_tensor([[1., 2., 3.]])) + >>> a.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + >>> b = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(2.)) + >>> b_out = b(tf.convert_to_tensor([[10., 20., 30.]])) + >>> b.get_weights() + [array([[2.], + [2.], + [2.]], dtype=float32), array([0.], dtype=float32)] + >>> b.set_weights(a.get_weights()) + >>> b.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + Arguments: weights: a list of Numpy arrays. The number of arrays and their shape must match @@ -1314,6 +1344,35 @@ class Layer(module.Module): def get_weights(self): """Returns the current weights of the layer. + The weights of a layer represent the state of the layer. This function + returns both trainable and non-trainable weight values associated with this + layer as a list of Numpy arrays, which can in turn be used to load state + into similarly parameterized layers. + + For example, a Dense layer returns a list of two values-- per-output + weights and the bias value. These can be used to set the weights of another + Dense layer: + + >>> a = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(1.)) + >>> a_out = a(tf.convert_to_tensor([[1., 2., 3.]])) + >>> a.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + >>> b = tf.keras.layers.Dense(1, + ... kernel_initializer=tf.constant_initializer(2.)) + >>> b_out = b(tf.convert_to_tensor([[10., 20., 30.]])) + >>> b.get_weights() + [array([[2.], + [2.], + [2.]], dtype=float32), array([0.], dtype=float32)] + >>> b.set_weights(a.get_weights()) + >>> b.get_weights() + [array([[1.], + [1.], + [1.]], dtype=float32), array([0.], dtype=float32)] + Returns: Weights values as a list of numpy arrays. """ From c6a005e1bfd1cb9c285fa8f79b858565db8a95a4 Mon Sep 17 00:00:00 2001 From: Yuanzhong Xu Date: Thu, 5 Dec 2019 15:27:31 -0800 Subject: [PATCH 212/383] [MLIR:TF/XLA] Add a cross-function resource device inference pass. 
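
At the TensorFlow level, the situation this pass targets looks roughly like the
Python sketch below (illustrative only: the device string, shapes, and variable
names are placeholders, not taken from this change). A resource variable pinned
to a device is used inside functional control flow, so the generated
tf.If/tf.While branch functions receive the resource as an argument and its
device assignment has to be propagated from the caller.

```
import tensorflow as tf

# A resource variable created under an explicit device scope; the device
# attribute lives on the caller's resource handle.
with tf.device("/TPU:0"):
  v = tf.Variable(tf.zeros([32]))

@tf.function
def step(pred):
  # With a tensor predicate this lowers to functional control flow (tf.If);
  # the branch functions take v's resource handle as an argument, which is
  # where the new pass fills in the inferred device attribute.
  if pred:
    v.assign_add(tf.ones([32]))
  return v.read_value()
```
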
PiperOrigin-RevId: 284066714 Change-Id: I159d9da60b3f2c34efe36512cca4e5109404db08 --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 + .../tests/resource-device-inference.mlir | 244 +++++++++++++++ .../mlir/tensorflow/transforms/bridge.cc | 1 + .../mlir/tensorflow/transforms/passes.h | 3 + .../transforms/resource_device_inference.cc | 278 ++++++++++++++++++ 5 files changed, 527 insertions(+) create mode 100644 tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir create mode 100644 tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 0532c929658..28b94818567 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -213,6 +213,7 @@ cc_library( "transforms/raise_control_flow.cc", "transforms/replicate_invariant_op_hoisting.cc", "transforms/replicate_to_island.cc", + "transforms/resource_device_inference.cc", "transforms/resource_op_lifting.cc", "transforms/shape_inference.cc", "transforms/shape_inference_pass.cc", diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir new file mode 100644 index 00000000000..c98e40fed05 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource-device-inference.mlir @@ -0,0 +1,244 @@ +// RUN: tf-opt -split-input-file -verify-diagnostics -tf-resource-device-inference %s | FileCheck %s --dump-input=fail + +// Tests that the pass can correctly propagate device attributes inside the same +// function. + +// CHECK-LABEL: func @propagate_in_function +func @propagate_in_function( + %arg0: tensor<*x!tf.resource>> {tf.device = "/TPU:0"}, + %arg1: tensor<*x!tf.resource>> {tf.device = "/TPU:1"}) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/CPU:0"} + : () -> tensor<*x!tf.resource>> + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id1 = "tf.Identity"(%id0) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/CPU:0"} + %id2 = "tf.Identity"(%var_handle) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id2) : (tensor<*x!tf.resource>>) -> tensor<32xf32> + %id3 = "tf.Identity"(%read) : (tensor<32xf32>) -> tensor<32xf32> + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// ----- + +// Tesets that the pass can propagate through tf.If's branches. 
+ +// CHECK-LABEL: func @propagate_if_op +func @propagate_if_op( + %arg0: tensor<*x!tf.resource>> {tf.device = "/TPU:0"}, + %arg1: tensor) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} + : () -> tensor<*x!tf.resource>> + // CHECK-NEXT: "tf.If" + "tf.If"(%arg1, %id0, %var_handle) { + then_branch = @if_then, + else_branch = @if_else, + output_shapes = [], is_stateless = false} + : (tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) -> () + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// CHECK-LABEL: func @if_then +func @if_then( + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor<*x!tf.resource>>) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:1"} + %id1 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// CHECK-LABEL: func @if_else +func @if_else( + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor<*x!tf.resource>>) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + + +// ----- + +// Tesets that the pass can propagate through tf.While's branches. 
+ +// CHECK-LABEL: func @propagate_while_op +func @propagate_while_op( + %arg0: tensor<*x!tf.resource>> {tf.device = "/TPU:0"}, + %arg1: tensor) { + tf_executor.graph { + // CHECK: tf_executor.island + %island = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + // CHECK-NEXT: "tf.VarHandleOp" + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} + : () -> tensor<*x!tf.resource>> + // CHECK-NEXT: "tf.While" + "tf.While"(%arg1, %id0, %var_handle) { + body = @while_body, + cond = @while_cond, + output_shapes = [], is_stateless = false} + : (tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) -> + (tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +// CHECK-LABEL: func @while_body +func @while_body( + %arg0: tensor, + %arg1: tensor<*x!tf.resource>>, + %arg2: tensor<*x!tf.resource>>) -> + (tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) { + %graph:3 = tf_executor.graph { + // CHECK: tf_executor.island + %island:4 = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:1"} + %id1 = "tf.Identity"(%arg2) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + tf_executor.yield %arg0, %id0, %id1 + : tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> + } + tf_executor.fetch %island#0, %island#1, %island#2 + : tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> + } + return %graph#0, %graph#1, %graph#2 + : tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>> +} + +// CHECK-LABEL: func @while_cond +func @while_cond( + %arg0: tensor, + %arg1: tensor<*x!tf.resource>>, + %arg2: tensor<*x!tf.resource>>) -> tensor<32xf32> { + %graph = tf_executor.graph { + // CHECK: tf_executor.island + %island:2 = tf_executor.island { + // CHECK-NEXT: "tf.Identity" + // CHECK-SAME: {device = "/TPU:0"} + %id0 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + %read = "tf.ReadVariableOp"(%id0) + : (tensor<*x!tf.resource>>) -> tensor<32xf32> + tf_executor.yield %read : tensor<32xf32> + } + tf_executor.fetch %island#0 : tensor<32xf32> + } + return %graph : tensor<32xf32> +} + +// ----- + +// Tesets that the pass reports error on conflicting assignments from multiple +// callers. 
+ +func @error_on_conflict_multiple_callers( + %arg0: tensor<*x!tf.resource>> {tf.device = "/TPU:0"}, + %arg1: tensor) { + tf_executor.graph { + %island = tf_executor.island { + %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + %var_handle = "tf.VarHandleOp"() {container = "c", shared_name = "v0", device = "/TPU:1"} + : () -> tensor<*x!tf.resource>> + "tf.If"(%arg1, %id0, %var_handle) { + then_branch = @if_then_and_else, + else_branch = @if_then_and_else, + output_shapes = [], is_stateless = false} + : (tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) -> () + "tf.If"(%arg1, %var_handle, %id0) { + // expected-error@above {{Conflicting device assignment for resource}} + then_branch = @if_then_and_else, + else_branch = @if_then_and_else, + output_shapes = [], is_stateless = false} + : (tensor, tensor<*x!tf.resource>>, + tensor<*x!tf.resource>>) -> () + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} + +func @if_then_and_else( + %arg0: tensor<*x!tf.resource>>, + %arg1: tensor<*x!tf.resource>>) { + tf_executor.graph { + %island = tf_executor.island { + %id0 = "tf.Identity"(%arg0) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + %id1 = "tf.Identity"(%arg1) : (tensor<*x!tf.resource>>) + -> tensor<*x!tf.resource>> + tf_executor.yield + } + tf_executor.fetch %island : !tf_executor.control + } + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index a7f45c41f15..d964b9cf087 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -34,6 +34,7 @@ void CreateTPUBridge(OpPassManager &pm) { func_pm.addPass(tf_executor::CreateTFExecutorConstantSinkingPass()); func_pm.addPass(TFDevice::CreateResourceOpLiftingPass()); + pm.addPass(TF::CreateResourceDeviceInferencePass()); pm.addPass(TFDevice::CreateClusterOutliningPass()); pm.addPass(CreateTPURewritePass()); pm.addNestedPass(TFDevice::CreateReplicateInvariantOpHoistingPass()); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 30ee91f4aea..f64d0b00406 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -57,6 +57,9 @@ struct StandardPipelineOptions : public PassOptions { // NOLINTNEXTLINE - MLIR contract is pass by mutable reference. void CreateTFStandardPipeline(OpPassManager& pm, const StandardPipelineOptions& options); + +// Propagates device attributes of resources from callers to callees. +std::unique_ptr> CreateResourceDeviceInferencePass(); } // namespace TF namespace TFControlFlow { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc new file mode 100644 index 00000000000..616c2cb10e8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_device_inference.cc @@ -0,0 +1,278 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/iterator_range.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Attributes.h" // TF:local_config_mlir +#include "mlir/IR/Builders.h" // TF:local_config_mlir +#include "mlir/IR/Function.h" // TF:local_config_mlir +#include "mlir/IR/Operation.h" // TF:local_config_mlir +#include "mlir/IR/Types.h" // TF:local_config_mlir +#include "mlir/IR/Value.h" // TF:local_config_mlir +#include "mlir/IR/Visitors.h" // TF:local_config_mlir +#include "mlir/Pass/Pass.h" // TF:local_config_mlir +#include "mlir/Pass/PassRegistry.h" // TF:local_config_mlir +#include "mlir/Support/LogicalResult.h" // TF:local_config_mlir +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace mlir { +namespace TF { + +namespace { +constexpr char kDeviceAttr[] = "device"; +constexpr char kFuncDeviceAttr[] = "tf.device"; + +// A pass that propagates device assignment of resources on a module. It +// performs in-function propagation, as well as cross-function propagation from +// callers to callees. +// +// This pass changes the module by adding "tf.device" attribute to function +// arguments and adding "device" attribute to TF ops. +struct ResourceDeviceInference : public ModulePass { + void runOnModule() override; +}; + +// A class that records each resource's device assignment in a function. +class PerFunctionResult { + public: + explicit PerFunctionResult(FuncOp func_op) : alias_analysis_(func_op) {} + + // Returns the recorded device assignment for a resource, if any. + llvm::Optional DeviceForResource( + const Value* resource) const { + llvm::Optional result; + if (alias_analysis_.IsUnknownResource(resource)) return result; + for (int64_t id : alias_analysis_.GetResourceUniqueIds(resource)) { + auto it = resource_id_to_device_.find(id); + if (it == resource_id_to_device_.end()) continue; + if (!result) { + result = it->getSecond(); + continue; + } + if (result != it->getSecond()) { + // Got conflicting assignments, clear the result. + result.reset(); + return result; + } + } + return result; + } + + // Records the device assignment for a resource. If the new assignment + // conflicts with an existing one, returns an error. + // + // If `changed` is provided, assign *changed to true if anything is modified. 
+ LogicalResult AddResourceDevice(const Value* resource, llvm::StringRef device, + bool* changed = nullptr) { + if (alias_analysis_.IsUnknownResource(resource)) return success(); + for (int64_t id : alias_analysis_.GetResourceUniqueIds(resource)) { + auto emplace_res = resource_id_to_device_.try_emplace(id, device); + if (emplace_res.second) { + if (changed) *changed = true; + } else if (emplace_res.first->getSecond() != device) { + // Existing assignment does not equal the new assignment. + return failure(); + } + } + return success(); + } + + private: + llvm::SmallDenseMap resource_id_to_device_; + TF::ResourceAliasAnalysis alias_analysis_; +}; + +// Tries to record device assignment for a resource. +LogicalResult AddResourceDeviceAndEmitError(const Value* resource, + llvm::StringRef device, + Operation* error_reporting_op, + PerFunctionResult* result, + bool* changed = nullptr) { + auto res = result->AddResourceDevice(resource, device, changed); + if (failed(res)) { + error_reporting_op->emitError() + << "Conflicting device assignment for resource"; + } + return res; +} + +// Propagates device assignment inside a function. +LogicalResult ComputeResourceDevicesInComputation(FuncOp func_op, + PerFunctionResult* result) { + OpBuilder builder(func_op); + // Function arguments. + for (auto arg : func_op.getArguments()) { + if (!mlir::getElementTypeOrSelf(arg->getType()).isa()) { + continue; + } + auto device_attr = func_op.getArgAttrOfType( + arg->getArgNumber(), kFuncDeviceAttr); + if (!device_attr || device_attr.getValue() == "") { + // If device_attr does not exist, try to construct it from any recorded + // assignment. + if (auto device = result->DeviceForResource(arg)) { + func_op.setArgAttr(arg->getArgNumber(), kFuncDeviceAttr, + builder.getStringAttr(*device)); + } + continue; + } + // Record the attribute. + auto res = AddResourceDeviceAndEmitError(arg, device_attr.getValue(), + func_op, result); + if (failed(res)) return res; + } + auto walk_res = func_op.walk([&](Operation* op) { + if (auto var_handle = llvm::dyn_cast(op)) { + // Record VarHanldeOp's device attribute. + auto device_attr = + var_handle.getAttrOfType(kDeviceAttr); + if (!device_attr || device_attr.getValue().empty()) { + return WalkResult::advance(); + } + auto res = AddResourceDeviceAndEmitError( + var_handle.resource(), device_attr.getValue(), op, result); + if (failed(res)) return WalkResult::interrupt(); + } + if (auto identity = llvm::dyn_cast(op)) { + // Try to construct IdentityOp's attribute from recorded assignment. + if (!mlir::getElementTypeOrSelf(identity.output()->getType()) + .isa()) { + return WalkResult::advance(); + } + if (auto device = result->DeviceForResource(identity.output())) { + auto device_attr = + identity.getAttrOfType(kDeviceAttr); + if (!device_attr || device_attr.getValue().empty()) { + identity.setAttr(kDeviceAttr, builder.getStringAttr(*device)); + } + } + return WalkResult::advance(); + } + // Propagate and record output device assignment for other ops based on + // existing recording. E.g., IdentityN. 
+ for (auto output : op->getResults()) { + if (!mlir::getElementTypeOrSelf(output->getType()) + .isa()) { + continue; + } + if (auto device = result->DeviceForResource(output)) { + auto res = AddResourceDeviceAndEmitError(output, *device, op, result); + if (failed(res)) return WalkResult::interrupt(); + } + } + return WalkResult::advance(); + }); + return failure(walk_res.wasInterrupted()); +} + +void ResourceDeviceInference::runOnModule() { + auto module = getModule(); + llvm::SmallDenseMap per_function_results; + llvm::SetVector worklist; + module.walk([&](FuncOp func_op) { + worklist.insert(func_op); + per_function_results.try_emplace(func_op, func_op); + }); + // Helper that propagates an op's recorded operand device assignments to its + // called function's arguments. + auto propagate_operands_to_callee_arguments = + [&](Operation* caller, + llvm::iterator_range caller_operands, + llvm::StringRef called_func_name, + const PerFunctionResult& caller_res) { + auto callee = + llvm::dyn_cast(module.lookupSymbol(called_func_name)); + assert(callee); + auto& callee_res = per_function_results.find(callee)->getSecond(); + bool callee_needs_recompute = false; + for (auto operand_and_argument : + llvm::zip(caller_operands, callee.getArguments())) { + if (!mlir::getElementTypeOrSelf( + std::get<0>(operand_and_argument)->getType()) + .isa()) { + continue; + } + auto device = + caller_res.DeviceForResource(std::get<0>(operand_and_argument)); + if (!device) continue; + if (failed(AddResourceDeviceAndEmitError( + std::get<1>(operand_and_argument), *device, caller, + &callee_res, &callee_needs_recompute))) { + return failure(); + } + } + // If the callee recording is modified, make sure that it will be + // reprocessed. + if (callee_needs_recompute) { + worklist.insert(callee); + } + return success(); + }; + while (!worklist.empty()) { + auto func_op = worklist.back(); + worklist.pop_back(); + auto& func_res = per_function_results.find(func_op)->getSecond(); + // In-function propagation. + if (failed(ComputeResourceDevicesInComputation(func_op, &func_res))) { + return signalPassFailure(); + } + // Propagation to callees. + auto walk_res = func_op.walk([&](Operation* op) { + if (auto while_op = llvm::dyn_cast(op)) { + if (failed(propagate_operands_to_callee_arguments( + while_op, while_op.getOperands(), while_op.body(), func_res)) || + failed(propagate_operands_to_callee_arguments( + while_op, while_op.getOperands(), while_op.cond(), func_res))) { + return WalkResult::interrupt(); + } + } else if (auto if_op = llvm::dyn_cast(op)) { + if (failed(propagate_operands_to_callee_arguments( + if_op, if_op.input(), if_op.then_branch(), func_res)) || + failed(propagate_operands_to_callee_arguments( + if_op, if_op.input(), if_op.else_branch(), func_res))) { + return WalkResult::interrupt(); + } + } + return WalkResult::advance(); + }); + if (walk_res.wasInterrupted()) return signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr> CreateResourceDeviceInferencePass() { + return std::make_unique(); +} + +static PassRegistration pass( + "tf-resource-device-inference", + "Propagates the device attribute on resources from callers to callees."); + +} // namespace TF +} // namespace mlir From 3734535402a9d131f5106884d5bb5d67ba107894 Mon Sep 17 00:00:00 2001 From: Yu-Cheng Ling Date: Thu, 5 Dec 2019 15:32:14 -0800 Subject: [PATCH 213/383] Introduc `_USE_EXPERIMENTAL_NEW_CONVERTER` in lite module. 
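
The new module-level constant only seeds the default of the per-converter
`experimental_new_converter` attribute; individual converter instances can
still opt in or out explicitly. A rough usage sketch (the SavedModel path
below is a placeholder):

```
import tensorflow as tf

# Each converter instance picks up the module-level default...
converter = tf.lite.TFLiteConverter.from_saved_model("/tmp/my_saved_model")
print(converter.experimental_new_converter)  # False unless the default is flipped.

# ...and can still be overridden per conversion.
converter.experimental_new_converter = True
tflite_model = converter.convert()
```
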
PiperOrigin-RevId: 284067708 Change-Id: Ic6b52d518de6e85a2a779ac2a27f89dc791c5ad6 --- tensorflow/lite/python/lite.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py index 57a0f21e72e..52628218997 100644 --- a/tensorflow/lite/python/lite.py +++ b/tensorflow/lite/python/lite.py @@ -76,6 +76,10 @@ from tensorflow.python.util import deprecation as _deprecation from tensorflow.python.util.tf_export import tf_export as _tf_export +# The default value of `experimental_new_converter`. +_USE_EXPERIMENTAL_NEW_CONVERTER = False + + @_tf_export("lite.Optimize") class Optimize(enum.Enum): """Enum defining the optimizations to apply when generating tflite graphs. @@ -167,7 +171,7 @@ class TFLiteConverterBase(object): self.target_spec = TargetSpec() self.optimizations = [] self.representative_dataset = None - self.experimental_new_converter = False + self.experimental_new_converter = _USE_EXPERIMENTAL_NEW_CONVERTER self.experimental_new_quantizer = False # The 'GraphDebugInfo' contains the stack traces of all the original nodes # in the `GraphDef` to the converter. From a811aba5ce25c7b5a75e76739951dc6a448198d1 Mon Sep 17 00:00:00 2001 From: River Riddle Date: Thu, 5 Dec 2019 15:32:59 -0800 Subject: [PATCH 214/383] Add include path to the TestDialect to fix broken build. PiperOrigin-RevId: 284067891 Change-Id: I4014753aed53b6b90dc4f8bd141e2eceff2b9f49 --- third_party/mlir/test/lib/IR/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/third_party/mlir/test/lib/IR/CMakeLists.txt b/third_party/mlir/test/lib/IR/CMakeLists.txt index 9e3b8fbf369..439d3a403b3 100644 --- a/third_party/mlir/test/lib/IR/CMakeLists.txt +++ b/third_party/mlir/test/lib/IR/CMakeLists.txt @@ -4,6 +4,8 @@ add_llvm_library(MLIRTestIR ADDITIONAL_HEADER_DIRS ) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../TestDialect) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/../TestDialect) target_link_libraries(MLIRTestIR MLIRPass ) From 5ccbaeeef543858f47f0f8c4b1ce2f9de3c9ddc9 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Thu, 5 Dec 2019 15:38:48 -0800 Subject: [PATCH 215/383] Pull register_extension_info up into core/platform/build_config_root.bzl PiperOrigin-RevId: 284069053 Change-Id: Ie9ee3e46a707c9ca109ae1192c6d419bb30cf4a2 --- tensorflow/core/platform/build_config_root.bzl | 2 ++ tensorflow/core/platform/default/build_config_root.bzl | 3 +++ tensorflow/tensorflow.bzl | 4 +--- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/platform/build_config_root.bzl b/tensorflow/core/platform/build_config_root.bzl index 7da423bfefa..6a09333e4c5 100644 --- a/tensorflow/core/platform/build_config_root.bzl +++ b/tensorflow/core/platform/build_config_root.bzl @@ -5,6 +5,7 @@ load( _if_dynamic_kernels = "if_dynamic_kernels", _if_static = "if_static", _if_static_and_not_mobile = "if_static_and_not_mobile", + _register_extension_info = "register_extension_info", _tf_additional_grpc_deps_py = "tf_additional_grpc_deps_py", _tf_additional_license_deps = "tf_additional_license_deps", _tf_additional_plugin_deps = "tf_additional_plugin_deps", @@ -18,6 +19,7 @@ load( if_dynamic_kernels = _if_dynamic_kernels if_static = _if_static if_static_and_not_mobile = _if_static_and_not_mobile +register_extension_info = _register_extension_info tf_additional_grpc_deps_py = _tf_additional_grpc_deps_py tf_additional_license_deps = _tf_additional_license_deps tf_additional_plugin_deps = _tf_additional_plugin_deps diff --git 
a/tensorflow/core/platform/default/build_config_root.bzl b/tensorflow/core/platform/default/build_config_root.bzl index 2c0f73c8e90..c74ccdc506a 100644 --- a/tensorflow/core/platform/default/build_config_root.bzl +++ b/tensorflow/core/platform/default/build_config_root.bzl @@ -67,3 +67,6 @@ def if_dynamic_kernels(extra_deps, otherwise = []): str(Label("//tensorflow:dynamic_loaded_kernels")): extra_deps, "//conditions:default": otherwise, }) + +def register_extension_info(**kwargs): + pass diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index e3cfbb63d34..3adf5fe9a4e 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -4,6 +4,7 @@ load( "//tensorflow/core/platform:build_config_root.bzl", "if_dynamic_kernels", "if_static", + "register_extension_info", "tf_additional_grpc_deps_py", "tf_additional_xla_deps_py", "tf_exec_compatible_with", @@ -47,9 +48,6 @@ load( "if_ngraph", ) -def register_extension_info(**kwargs): - pass - # version for the shared libraries, can # not contain rc or alpha, only numbers. # Also update tensorflow/core/public/version.h From f7ccdb449c69bb092b25eb5e0d07bb2c79e8d6db Mon Sep 17 00:00:00 2001 From: Robert David Date: Thu, 5 Dec 2019 16:09:29 -0800 Subject: [PATCH 216/383] Skip VectorVectorCwiseProduct when cell_state is known to be all zeros. PiperOrigin-RevId: 284075550 Change-Id: I4d6bdef76e5306671b3ee165e0b496095d2bf83a --- tensorflow/lite/kernels/lstm_eval.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/kernels/lstm_eval.cc b/tensorflow/lite/kernels/lstm_eval.cc index 6e1dc28ce34..a18af287b7d 100644 --- a/tensorflow/lite/kernels/lstm_eval.cc +++ b/tensorflow/lite/kernels/lstm_eval.cc @@ -702,8 +702,10 @@ inline void LstmStepWithAuxInput( forget_gate_scratch); // For each batch and cell: update the cell. - tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, - n_batch * n_cell, cell_state_ptr); + if (!is_cell_state_all_zeros) { + tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr, + n_batch * n_cell, cell_state_ptr); + } if (is_layer_norm_lstm) { tensor_utils::MeanStddevNormalization(cell_scratch, cell_scratch, n_cell, n_batch); From 36ba3b17c55002a76044390a9c7a38ab21bc1e32 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Thu, 5 Dec 2019 16:23:22 -0800 Subject: [PATCH 217/383] Fixes docstring for tf.math.add_n PiperOrigin-RevId: 284078231 Change-Id: Ia71fc572fb085f0a9c92b37b7f49d39319a0b939 --- tensorflow/python/ops/math_ops.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index c3f453a0275..b29d413e194 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -3093,8 +3093,6 @@ def _as_indexed_slices_list(inputs, optimize=True): def add_n(inputs, name=None): """Adds all input tensors element-wise. - Converts `IndexedSlices` objects into dense tensors prior to adding. - `tf.math.add_n` performs the same operation as `tf.math.accumulate_n`, but it waits for all of its inputs to be ready before beginning to sum. 
This buffering can result in higher memory consumption when inputs are ready @@ -3108,19 +3106,21 @@ def add_n(inputs, name=None): For example: - ```python - a = tf.constant([[3, 5], [4, 8]]) - b = tf.constant([[1, 6], [2, 9]]) - tf.math.add_n([a, b, a]) # [[7, 16], [10, 25]] - ``` + >>> a = tf.constant([[3, 5], [4, 8]]) + >>> b = tf.constant([[1, 6], [2, 9]]) + >>> tf.math.add_n([a, b, a]) + Args: - inputs: A list of `tf.Tensor` or `tf.IndexedSlices` objects, each with same - shape and type. + inputs: A list of `tf.Tensor` or `tf.IndexedSlices` objects, each with the + same shape and type. `tf.IndexedSlices` objects will be converted into + dense tensors prior to adding. name: A name for the operation (optional). Returns: - A `Tensor` of same shape and type as the elements of `inputs`. + A `tf.Tensor` of the same shape and type as the elements of `inputs`. Raises: ValueError: If `inputs` don't all have same shape and dtype or the shape @@ -4396,4 +4396,4 @@ def sqrt(x, name=None): # pylint: disable=redefined-builtin Returns: A `tf.Tensor` of same size, type and sparsity as `x`. """ - return gen_math_ops.sqrt(x, name) \ No newline at end of file + return gen_math_ops.sqrt(x, name) From c73c99ca3e0bacf2bca313f270bb3eae28869530 Mon Sep 17 00:00:00 2001 From: Thomas O'Malley Date: Thu, 5 Dec 2019 16:24:01 -0800 Subject: [PATCH 218/383] Remove single path fallbacks in v2 and start to clean up compile. PiperOrigin-RevId: 284078361 Change-Id: Iaa163487b4acee59077b5bb4c2232f5742d0e9ba --- .../keras/distribute/keras_utils_test.py | 39 +- tensorflow/python/keras/engine/training.py | 468 ++++-------------- .../python/keras/engine/training_test.py | 43 +- tensorflow/python/keras/engine/training_v1.py | 4 +- tensorflow/python/keras/engine/training_v2.py | 23 +- .../python/keras/keras_parameterized.py | 11 +- .../python/keras/keras_parameterized_test.py | 60 +-- .../experimental/keras_test.py | 7 +- tensorflow/python/keras/models.py | 4 + tensorflow/python/keras/models_test.py | 19 +- .../golden/v1/tensorflow.keras.-model.pbtxt | 6 +- .../v1/tensorflow.keras.-sequential.pbtxt | 6 +- ...low.keras.experimental.-linear-model.pbtxt | 6 +- ....keras.experimental.-wide-deep-model.pbtxt | 6 +- .../v1/tensorflow.keras.models.-model.pbtxt | 6 +- .../tensorflow.keras.models.-sequential.pbtxt | 6 +- .../golden/v2/tensorflow.keras.-model.pbtxt | 6 +- .../v2/tensorflow.keras.-sequential.pbtxt | 6 +- ...low.keras.experimental.-linear-model.pbtxt | 6 +- ....keras.experimental.-wide-deep-model.pbtxt | 6 +- .../v2/tensorflow.keras.models.-model.pbtxt | 6 +- .../tensorflow.keras.models.-sequential.pbtxt | 6 +- 22 files changed, 204 insertions(+), 546 deletions(-) diff --git a/tensorflow/python/keras/distribute/keras_utils_test.py b/tensorflow/python/keras/distribute/keras_utils_test.py index bf328e447c1..f88783c1d15 100644 --- a/tensorflow/python/keras/distribute/keras_utils_test.py +++ b/tensorflow/python/keras/distribute/keras_utils_test.py @@ -349,32 +349,6 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase): verbose=0, callbacks=[keras.callbacks.ReduceLROnPlateau()]) - @combinations.generate( - combinations.combine( - distribution=[strategy_combinations.one_device_strategy], - mode=['eager'], - experimental_run_tf_function=[True, False])) - def test_distribution_strategy_with_run_eagerly(self, distribution, - experimental_run_tf_function): - with distribution.scope(): - x = keras.layers.Input(shape=(1,)) - y = keras.layers.Dense(1, kernel_initializer='ones')(x) - model = 
keras.models.Model(x, y) - - if experimental_run_tf_function: - model.compile( - 'sgd', - run_eagerly=True, - experimental_run_tf_function=experimental_run_tf_function) - else: - err_msg = ('We currently do not support enabling `run_eagerly` with ' - 'distribution strategy.') - with self.assertRaisesRegex(ValueError, err_msg): - model.compile( - 'sgd', - run_eagerly=True, - experimental_run_tf_function=experimental_run_tf_function) - @combinations.generate( combinations.combine( distribution=[ @@ -631,11 +605,8 @@ class TestDistributionStrategyValidation(test.TestCase, parameterized.TestCase): experimental_run_tf_function=experimental_run_tf_function) @combinations.generate( - combinations.times( - keras_test_lib.all_strategy_combinations_minus_default(), - combinations.combine(experimental_run_tf_function=[True, False]))) - def test_model_outside_scope(self, distribution, - experimental_run_tf_function): + keras_test_lib.all_strategy_combinations_minus_default()) + def test_model_outside_scope(self, distribution): with self.cached_session(): with self.assertRaisesRegexp( ValueError, 'was not created in the distribution strategy'): @@ -646,11 +617,7 @@ class TestDistributionStrategyValidation(test.TestCase, parameterized.TestCase): optimizer = gradient_descent.GradientDescentOptimizer(0.001) loss = 'mse' metrics = ['mae', keras.metrics.CategoricalAccuracy()] - model.compile( - optimizer, - loss, - metrics=metrics, - experimental_run_tf_function=experimental_run_tf_function) + model.compile(optimizer, loss, metrics=metrics) class TestDistributionStrategyWithStaticShapes(test.TestCase, diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 36570e36cc8..86930a4cfad 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -21,10 +21,9 @@ from __future__ import print_function import collections import numpy as np -from tensorflow.python import tf2 from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import iterator_ops -from tensorflow.python.distribute import distribution_strategy_context +from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.eager import monitoring @@ -43,10 +42,7 @@ from tensorflow.python.keras import metrics as metrics_module from tensorflow.python.keras import optimizers from tensorflow.python.keras.distribute import distributed_training_utils from tensorflow.python.keras.engine import network -from tensorflow.python.keras.engine import training_arrays from tensorflow.python.keras.engine import training_distributed -from tensorflow.python.keras.engine import training_eager -from tensorflow.python.keras.engine import training_generator from tensorflow.python.keras.engine import training_utils from tensorflow.python.keras.engine import training_v2 from tensorflow.python.keras.engine import training_v2_utils @@ -66,7 +62,6 @@ from tensorflow.python.training.tracking import layer_utils as trackable_layer_u from tensorflow.python.util import deprecation from tensorflow.python.util import nest from tensorflow.python.util import tf_inspect -from tensorflow.python.util.compat import collections_abc from tensorflow.python.util.tf_export import keras_export try: @@ -146,27 +141,14 @@ class Model(network.Network, version_utils.VersionSelector): def __init__(self, *args, **kwargs): super(Model, 
self).__init__(*args, **kwargs) _keras_api_gauge.get_cell('model').set(True) - # initializing _distribution_strategy here since it is possible to call - # predict on a model without compiling it. - self._distribution_strategy = None - self._compile_time_distribution_strategy = None - if (ops.executing_eagerly_outside_functions() and - distribution_strategy_context.has_strategy()): - self._set_strategy( - distribution_strategy_context.get_strategy()) - - # This flag is used to track if the user is using the deprecated path of - # passing distribution strategy to compile rather than creating the model - # under distribution strategy scope. - self._compile_distribution = False + # Model must be created under scope of DistStrat it will be trained with. + if ds_context.has_strategy(): + self._distribution_strategy = ds_context.get_strategy() + else: + self._distribution_strategy = None + # Defaults to value of `tf.config.experimental_functions_run_eagerly`. self._run_eagerly = None - self._experimental_run_tf_function = ( - ops.executing_eagerly_outside_functions()) - - @trackable.no_automatic_dependency_tracking - def _set_strategy(self, strategy): - self._compile_time_distribution_strategy = strategy def get_weights(self): """Retrieves the weights of the model. @@ -174,12 +156,8 @@ class Model(network.Network, version_utils.VersionSelector): Returns: A flat list of Numpy arrays. """ - strategy = (self._distribution_strategy or - self._compile_time_distribution_strategy) - if strategy: - with strategy.scope(): - return super(Model, self).get_weights() - return super(Model, self).get_weights() + with self.distribute_strategy.scope(): + return super(Model, self).get_weights() def load_weights(self, filepath, by_name=False, skip_mismatch=False): """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. @@ -242,8 +220,6 @@ class Model(network.Network, version_utils.VersionSelector): loss_weights=None, sample_weight_mode=None, weighted_metrics=None, - target_tensors=None, - distribute=None, **kwargs): """Configures the model for training. @@ -283,84 +259,15 @@ class Model(network.Network, version_utils.VersionSelector): dictionary or a list of modes. weighted_metrics: List of metrics to be evaluated and weighted by sample_weight or class_weight during training and testing. - target_tensors: By default, Keras will create placeholders for the - model's target, which will be fed with the target data during - training. If instead you would like to use your own - target tensors (in turn, Keras will not expect external - Numpy data for these targets at training time), you - can specify them via the `target_tensors` argument. It can be - a single tensor (for a single-output model), a list of tensors, - or a dict mapping output names to target tensors. - distribute: NOT SUPPORTED IN TF 2.0, please create and compile the - model under distribution strategy scope instead of passing it to - compile. **kwargs: Any additional arguments. Raises: ValueError: In case of invalid arguments for `optimizer`, `loss`, `metrics` or `sample_weight_mode`. """ + self._validate_compile(optimizer, **kwargs) self._run_eagerly = kwargs.pop('run_eagerly', None) - self._experimental_run_tf_function = kwargs.pop( - 'experimental_run_tf_function', True) - - # Prepare Session arguments (legacy). - kwargs.pop('cloning', None) # Legacy DistStrat argument, never used. 
- allowed_kwargs = {'feed_dict', 'fetches', 'options', 'run_metadata'} - unknown_kwargs = set(kwargs.keys()) - allowed_kwargs - if unknown_kwargs: - raise TypeError( - 'Invalid keyword argument(s) in `compile`: %s' % (unknown_kwargs,)) - self._function_kwargs = kwargs - if self._function_kwargs: - self._experimental_run_tf_function = False - if self.run_eagerly: - raise ValueError( - 'Session keyword arguments are not supported ' - 'when `run_eagerly=True`. You passed the following ' - 'Session arguments: %s' % (self._function_kwargs,)) - self._set_optimizer(optimizer) - is_any_keras_optimizer_v1 = any( - (isinstance(opt, optimizers.Optimizer) - and not isinstance(opt, optimizers.TFOptimizer) - ) for opt in nest.flatten(self.optimizer)) - - if is_any_keras_optimizer_v1 and ops.executing_eagerly_outside_functions(): - raise ValueError('`tf.compat.v1.keras` Optimizer (', optimizer, ') is ' - 'not supported when eager execution is enabled. Use a ' - '`tf.keras` Optimizer instead, or disable eager ' - 'execution.') - - if ((target_tensors is not None) - or not ops.executing_eagerly_outside_functions()): - # Fallback out of things that aren't supported with v2 loops - self._experimental_run_tf_function = False - - if distribute is not None: - if tf2.enabled() or self._experimental_run_tf_function: - raise ValueError( - 'Distribute argument in compile is not available in TF 2.0 please ' - 'create the model under the distribution strategy scope.') - logging.warning('Distribute argument in compile is deprecated please ' - 'create the model under the distribution strategy scope.') - self._distribution_strategy = distribute - self._compile_distribution = True - else: - if distribution_strategy_context.has_strategy(): - # When the user builds the model in the DS scope and cross replica - # context we want distribution strategy to be set but when building the - # replica copies of the models internally we should not be compiling - # with distribution strategy and use the default compilation path. - if distribution_strategy_context.in_cross_replica_context(): - self._distribution_strategy = ( - distribution_strategy_context.get_strategy()) - - if not self._experimental_run_tf_function: - self._validate_compile_param_for_distribution_strategy(self.run_eagerly, - sample_weight_mode, - target_tensors, - weighted_metrics) # We've disabled automatic dependency tracking for this method, but do want # to add a checkpoint dependency on the optimizer if it's trackable. if isinstance(self.optimizer, trackable.Trackable): @@ -371,10 +278,6 @@ class Model(network.Network, version_utils.VersionSelector): self.sample_weight_mode = sample_weight_mode self._compile_metrics = metrics or [] self._compile_weighted_metrics = weighted_metrics - if self.run_eagerly and target_tensors is not None: - raise ValueError( - 'target_tensors argument is not supported when ' - 'running a model eagerly.') # _training_endpoints contains a list of _TrainingEndpoint object, which has # all the model output/target/loss and related metadata. @@ -387,14 +290,9 @@ class Model(network.Network, version_utils.VersionSelector): self._distributed_model_cache = {} self._distributed_function_cache = {} - # Clear any `_eager_losses` that was added. + # Clear any `_eager_losses` cached from a previous `Model.__call__`. self._clear_losses() - if (not context.executing_eagerly() and - self._distribution_strategy is not None): - # Ensures a Session is created and configured correctly for Distribution - # Strategy. 
- K.configure_and_create_distributed_session(self._distribution_strategy) # Initialize model metric attributes. self._init_metric_attributes() if not self.built or not self.inputs or not self.outputs: @@ -409,8 +307,7 @@ class Model(network.Network, version_utils.VersionSelector): self.loss_functions = training_utils.prepare_loss_functions( self.loss, self.output_names) - target_tensors = self._process_target_tensor_for_compile(target_tensors) - + target_tensors = self._process_target_tensor_for_compile(None) for o, n, l, t in zip(self.outputs, self.output_names, self.loss_functions, target_tensors): endpoint = _TrainingEndpoint(o, n, l) @@ -456,21 +353,6 @@ class Model(network.Network, version_utils.VersionSelector): # Collected trainable weights, sorted in topological order. self._collected_trainable_weights = self.trainable_weights - # Validate all variables were correctly created in distribution scope. - if self._distribution_strategy and not self._compile_distribution: - for v in self.variables: - strategy = self._distribution_strategy - if not strategy.extended.variable_created_in_scope(v): - raise ValueError( - 'Variable (%s) was not created in the distribution strategy ' - 'scope of (%s). It is most likely due to not all layers or ' - 'the model or optimizer being created outside the distribution ' - 'strategy scope. Try to make sure your code looks similar ' - 'to the following.\n' - 'with strategy.scope():\n' - ' model=_create_model()\n' - ' model.compile(...)'% (v, strategy)) - @trackable.no_automatic_dependency_tracking def _init_distributed_function_cache_if_not_compiled(self): if not hasattr(self, '_distributed_function_cache'): @@ -506,6 +388,13 @@ class Model(network.Network, version_utils.VersionSelector): metrics_names += [m.name for m in self.metrics] return metrics_names + @property + def distribute_strategy(self): + """The `tf.distribute.Strategy` this model was created under.""" + if self._distribution_strategy is None: + return ds_context._get_default_strategy() # pylint: disable=protected-access + return self._distribution_strategy + @property def run_eagerly(self): """Settable attribute indicating whether the model should run eagerly. @@ -563,36 +452,11 @@ class Model(network.Network, version_utils.VersionSelector): 'original `Dataset` object instead of passing in ' '`iter(dataset)`.') - # Experiment training loop with default DS path. - if context.executing_eagerly() and self._experimental_run_tf_function: - if self._in_multi_worker_mode(): - return training_distributed.DistributionMultiWorkerTrainingLoop( - training_v2.Loop()) - else: - return training_v2.Loop() - - # Case 1: distribution strategy. - if self._distribution_strategy: - if self._in_multi_worker_mode(): - return training_distributed.DistributionMultiWorkerTrainingLoop( - training_distributed.DistributionSingleWorkerTrainingLoop()) - else: - return training_distributed.DistributionSingleWorkerTrainingLoop() - - # Case 2: generator-like. Input is Python generator, or Sequence object, - # or a non-distributed Dataset or iterator in eager execution. - if data_utils.is_generator_or_sequence(inputs): - return training_generator.GeneratorOrSequenceTrainingLoop() - if training_utils.is_eager_dataset_or_iterator(inputs): - return training_generator.EagerDatasetOrIteratorTrainingLoop() - - # Case 3: Symbolic tensors or Numpy array-like. - # This includes Datasets and iterators in graph mode (since they - # generate symbolic tensors). 
- if self.run_eagerly: - return training_generator.GeneratorLikeTrainingLoop() + if self._in_multi_worker_mode(): + return training_distributed.DistributionMultiWorkerTrainingLoop( + training_v2.Loop()) else: - return training_arrays.ArrayLikeTrainingLoop() + return training_v2.Loop() def fit(self, x=None, @@ -985,10 +849,6 @@ class Model(network.Network, version_utils.VersionSelector): for m in metrics: m.reset_states() - # Reset metrics on all the distributed (cloned) models. - if self._distribution_strategy: - distributed_training_utils._reset_metrics(self) # pylint: disable=protected-access - def train_on_batch(self, x, y=None, @@ -1038,62 +898,19 @@ class Model(network.Network, version_utils.VersionSelector): """ self._assert_compile_was_called() self._check_call_args('train_on_batch') - if self._experimental_run_tf_function: - outputs = training_v2_utils.train_on_batch( - self, x, y=y, sample_weight=sample_weight, - class_weight=class_weight, reset_metrics=reset_metrics, - standalone=True) - outputs = (outputs['total_loss'] + outputs['output_losses'] + - outputs['metrics']) - outputs = [ - training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - if len(outputs) == 1: - outputs = outputs[0] - return outputs - - # If at this point we are in the replica context, then it is okay to execute - # the Eager code path. The expected way to get here is to call `fit` that - # calls `train_on_batch` on each replica. - if (self._distribution_strategy and - distribution_strategy_context.in_cross_replica_context()): - raise NotImplementedError('`train_on_batch` is not supported for models ' - 'distributed with tf.distribute.Strategy.') - # Validate and standardize user data. - x, y, sample_weights = self._standardize_user_data( - x, y, sample_weight=sample_weight, class_weight=class_weight, - extract_tensors_from_dataset=True) - - # If `self._distribution_strategy` is True, then we are in a replica context - # at this point because of the check above. `train_on_batch` is being run - # for each replica by `self._distribution_strategy` and the same code path - # as Eager is expected to be taken. - if self.run_eagerly or self._distribution_strategy: - output_dict = training_eager.train_on_batch( - self, - x, - y, - sample_weights=sample_weights, - output_loss_metrics=self._output_loss_metrics) - outputs = (output_dict['total_loss'] + output_dict['output_losses'] - + output_dict['metrics']) - outputs = [ - training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - else: - x = training_utils.ModelInputs(x).as_list() - ins = x + list(y or []) + list(sample_weights or []) - - if not isinstance(K.symbolic_learning_phase(), int): - ins += [True] # Add learning phase value. 
- - self._update_sample_weight_modes(sample_weights=sample_weights) - self._make_train_function() - outputs = self.train_function(ins) # pylint: disable=not-callable - - if reset_metrics: - self.reset_metrics() - + outputs = training_v2_utils.train_on_batch( + self, + x, + y=y, + sample_weight=sample_weight, + class_weight=class_weight, + reset_metrics=reset_metrics, + standalone=True) + outputs = ( + outputs['total_loss'] + outputs['output_losses'] + outputs['metrics']) + outputs = [training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access if len(outputs) == 1: - return outputs[0] + outputs = outputs[0] return outputs def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True): @@ -1136,52 +953,18 @@ class Model(network.Network, version_utils.VersionSelector): """ self._assert_compile_was_called() self._check_call_args('test_on_batch') - if self._experimental_run_tf_function: - outputs = training_v2_utils.test_on_batch( - self, x, y=y, sample_weight=sample_weight, - reset_metrics=reset_metrics, standalone=True) - outputs = (outputs['total_loss'] + outputs['output_losses'] + - outputs['metrics']) - outputs = [ - training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - if len(outputs) == 1: - outputs = outputs[0] - return outputs - - if (self._distribution_strategy and - distribution_strategy_context.in_cross_replica_context()): - raise NotImplementedError('`test_on_batch` is not supported for models ' - 'distributed with tf.distribute.Strategy.') - # Validate and standardize user data. - x, y, sample_weights = self._standardize_user_data( - x, y, sample_weight=sample_weight, extract_tensors_from_dataset=True) - - # If `self._distribution_strategy` is True, then we are in a replica context - # at this point. - if self.run_eagerly or self._distribution_strategy: - output_dict = training_eager.test_on_batch( - self, - x, - y, - sample_weights=sample_weights, - output_loss_metrics=self._output_loss_metrics) - outputs = (output_dict['total_loss'] + output_dict['output_losses'] - + output_dict['metrics']) - outputs = [ - training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access - else: - x = training_utils.ModelInputs(x).as_list() - inputs = x + list(y or []) + list(sample_weights or []) - - self._update_sample_weight_modes(sample_weights=sample_weights) - self._make_test_function() - outputs = self.test_function(inputs) # pylint: disable=not-callable - - if reset_metrics: - self.reset_metrics() - + outputs = training_v2_utils.test_on_batch( + self, + x, + y=y, + sample_weight=sample_weight, + reset_metrics=reset_metrics, + standalone=True) + outputs = ( + outputs['total_loss'] + outputs['output_losses'] + outputs['metrics']) + outputs = [training_v2_utils._non_none_constant_value(v) for v in outputs] # pylint: disable=protected-access if len(outputs) == 1: - return outputs[0] + outputs = outputs[0] return outputs def predict_on_batch(self, x): @@ -1203,34 +986,7 @@ class Model(network.Network, version_utils.VersionSelector): expectations of the model. 
""" self._check_call_args('predict_on_batch') - if self._experimental_run_tf_function: - return training_v2_utils.predict_on_batch(self, x, standalone=True) - - if (self._distribution_strategy and - distribution_strategy_context.in_cross_replica_context()): - raise NotImplementedError( - '`predict_on_batch` is not supported for models distributed with' - ' tf.distribute.Strategy.') - # Validate and standardize user data. - inputs, _, _ = self._standardize_user_data( - x, extract_tensors_from_dataset=True) - # If `self._distribution_strategy` is True, then we are in a replica context - # at this point. - if self.run_eagerly or self._distribution_strategy: - inputs = training_utils.cast_if_floating_dtype(inputs) - if isinstance(inputs, collections_abc.Sequence): - # Unwrap lists with only one input, as we do when training on batch - if len(inputs) == 1: - inputs = inputs[0] - - return self(inputs) # pylint: disable=not-callable - - self._make_predict_function() - outputs = self.predict_function(inputs) - - if len(outputs) == 1: - return outputs[0] - return outputs + return training_v2_utils.predict_on_batch(self, x, standalone=True) @deprecation.deprecated( None, 'Please use Model.fit, which supports generators.') @@ -1386,6 +1142,48 @@ class Model(network.Network, version_utils.VersionSelector): % (self.optimizer.loss_scale, self._dtype_policy.loss_scale)) + def _validate_compile(self, optimizer, **kwargs): + """Performs validation checks for the default `compile`.""" + is_any_keras_optimizer_v1 = any( + (isinstance(opt, optimizers.Optimizer) and + not isinstance(opt, optimizers.TFOptimizer)) + for opt in nest.flatten(optimizer)) + if is_any_keras_optimizer_v1: + raise ValueError( + '`tf.compat.v1.keras` Optimizer (', optimizer, ') is ' + 'not supported when eager execution is enabled. Use a ' + '`tf.keras` Optimizer instead, or disable eager ' + 'execution.') + + kwargs.pop('cloning', None) # Legacy DistStrat argument, never used. + kwargs.pop('experimental_run_tf_function', None) # Always `True`. + if kwargs.pop('distribute', None) is not None: + raise ValueError( + 'Distribute argument in compile is not available in TF 2.0 please ' + 'create the model under the distribution strategy scope.') + if kwargs.pop('target_tensors', None) is not None: + raise ValueError( + 'target_tensors argument is not supported when executing eagerly.') + invalid_kwargs = set(kwargs) - {'run_eagerly'} + if invalid_kwargs: + raise TypeError('Invalid keyword argument(s) in `compile`: %s' % + (invalid_kwargs,)) + + # Model must be created and compiled with the same DistStrat. + if self.built and ds_context.has_strategy(): + strategy = ds_context.get_strategy() + for v in self.variables: + if not strategy.extended.variable_created_in_scope(v): + raise ValueError( + 'Variable (%s) was not created in the distribution strategy ' + 'scope of (%s). It is most likely due to not all layers or ' + 'the model or optimizer being created outside the distribution ' + 'strategy scope. 
Try to make sure your code looks similar ' + 'to the following.\n' + 'with strategy.scope():\n' + ' model=_create_model()\n' + ' model.compile(...)' % (v, strategy)) + def _prepare_validation_data(self, validation_data, batch_size, validation_steps): """Unpack and check the validation data.""" @@ -1399,33 +1197,6 @@ class Model(network.Network, version_utils.VersionSelector): steps=validation_steps, steps_name='validation_steps') - def _validate_compile_param_for_distribution_strategy( - self, run_eagerly, sample_weight_mode, target_tensors, weighted_metrics): - # Validate that arguments passed by the user to `compile` are supported by - # tf.distribute.Strategy. - if self._distribution_strategy: - if sample_weight_mode: - raise NotImplementedError('sample_weight_mode is not supported with ' - 'tf.distribute.Strategy.') - if weighted_metrics: - raise NotImplementedError('weighted_metrics is not supported with ' - 'tf.distribute.Strategy.') - if target_tensors: - raise ValueError('target_tensors is not supported with ' - 'tf.distribute.Strategy.') - - if run_eagerly: - raise ValueError( - 'We currently do not support enabling `run_eagerly` with ' - 'distribution strategy.') - - if (distributed_training_utils.is_distributing_by_cloning(self) and - (not self.built or not self.inputs or not self.outputs)): - raise ValueError( - 'We currently do not support distribution strategy with a ' - '`Sequential` model that is created without `input_shape`/' - '`input_dim` set in its first layer or a subclassed model.') - def _process_target_tensor_for_compile(self, target_tensors): if self.run_eagerly: # target tensor is not supported with run_eagerly. Create a list with None @@ -1683,14 +1454,6 @@ class Model(network.Network, version_utils.VersionSelector): return self.callback_model return self - @trackable.no_automatic_dependency_tracking - def _make_callback_model(self, grouped_model): - first_replicated_model = self._distribution_strategy.unwrap( - grouped_model)[0] - # We initialize the callback model with the first replicated model. - self._replicated_model = DistributedCallbackModel(first_replicated_model) - self._replicated_model.set_original_model(self) - def _validate_or_infer_batch_size(self, batch_size, steps, x): """Validates that the `batch_size` provided is consistent with InputLayer. @@ -2037,8 +1800,7 @@ class Model(network.Network, version_utils.VersionSelector): fn = K.function( inputs, [self.total_loss] + metrics_tensors, updates=updates, - name='train_function', - **self._function_kwargs) + name='train_function') setattr(self, 'train_function', fn) # Restore the current trainable state @@ -2067,8 +1829,7 @@ class Model(network.Network, version_utils.VersionSelector): fn = K.function( inputs, [self.total_loss] + metrics_tensors, updates=updates, - name='test_function', - **self._function_kwargs) + name='test_function') setattr(self, 'test_function', fn) def _make_predict_function(self): @@ -2581,8 +2342,7 @@ class Model(network.Network, version_utils.VersionSelector): loss_weights=self.loss_weights, target_tensors=target_tensors, sample_weight_mode=self.sample_weight_mode, - run_eagerly=self.run_eagerly, - experimental_run_tf_function=self._experimental_run_tf_function) + run_eagerly=self.run_eagerly) # TODO(omalleyt): Consider changing to a more descriptive function name. 
def _set_inputs(self, inputs, outputs=None, training=None): @@ -2822,8 +2582,8 @@ class Model(network.Network, version_utils.VersionSelector): strategy = self._distribution_strategy # Otherwise, use the strategy whose scope this is in. - if not strategy and distribution_strategy_context.has_strategy(): - strategy = distribution_strategy_context.get_strategy() + if not strategy and ds_context.has_strategy(): + strategy = ds_context.get_strategy() return strategy @@ -2832,46 +2592,6 @@ class Model(network.Network, version_utils.VersionSelector): return model_serialization.ModelSavedModelSaver(self) -class DistributedCallbackModel(Model): - """Model that is used for callbacks with tf.distribute.Strategy.""" - - def __init__(self, model): - super(DistributedCallbackModel, self).__init__() - self.optimizer = model.optimizer - - def set_original_model(self, orig_model): - self._original_model = orig_model - - def save_weights(self, filepath, overwrite=True, save_format=None): - self._replicated_model.save_weights(filepath, overwrite=overwrite, - save_format=save_format) - - def save(self, filepath, overwrite=True, include_optimizer=True): - # save weights from the distributed model to the original model - distributed_model_weights = self.get_weights() - self._original_model.set_weights(distributed_model_weights) - # TODO(anjalisridhar): Do we need to save the original model here? - # Saving the first replicated model works as well. - self._original_model.save(filepath, overwrite=True, include_optimizer=False) - - def load_weights(self, filepath, by_name=False): - self._original_model.load_weights(filepath, by_name=False) - # Copy the weights from the original model to each of the replicated models. - orig_model_weights = self._original_model.get_weights() - distributed_training_utils.set_weights( - self._original_model._distribution_strategy, self, # pylint: disable=protected-access - orig_model_weights) - - def __getattr__(self, item): - # Whitelisted atttributes of the model that can be accessed by the user - # during a callback. - if item not in ('_setattr_tracking', '_layers'): - logging.warning('You are accessing attribute ' + item + ' of the ' - 'DistributedCallbackModel that may not have been set ' - 'correctly.') - return super(DistributedCallbackModel, self).__getattr__(item) - - class _TrainingEndpoint(object): """A container for the training output/target and related entities. 
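The training.py changes above converge on a single TF2 workflow: build and compile the model inside the `tf.distribute.Strategy` scope it will train under, and let the new read-only `distribute_strategy` property report that strategy (falling back to the default strategy when none was set). A minimal sketch of the intended pattern, using only public `tf.keras`/`tf.distribute` APIs; the model and data here are hypothetical placeholders, not part of the patch:

import numpy as np
import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  # Layers, variables and the optimizer are all created in scope, so the
  # variable check in `_validate_compile` passes.
  model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
  model.compile(optimizer='sgd', loss='mse')

# `compile(..., distribute=...)` and `target_tensors=...` now raise; the
# strategy is instead picked up from the creation scope.
print(model.distribute_strategy)  # the MirroredStrategy created above

model.fit(np.ones((8, 4)), np.ones((8, 1)), epochs=1, verbose=0)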
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index fc3f3413f7b..dc2495c6661 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -244,7 +244,7 @@ class CompileTest(keras_parameterized.TestCase): run_eagerly=testing_utils.should_run_eagerly(), experimental_run_tf_function=testing_utils.should_run_tf_function()) - @keras_parameterized.run_all_keras_modes + @tf_test_util.run_deprecated_v1 def test_compile_with_session_kwargs(self): model = testing_utils.get_small_sequential_mlp( num_hidden=10, num_classes=2, input_dim=3) @@ -258,24 +258,6 @@ class CompileTest(keras_parameterized.TestCase): loss='mse', foo=True) - if testing_utils.should_run_eagerly(): - # Test that Session kwargs cannot be used with run_eagerly - with self.assertRaisesRegexp( - ValueError, - r'not supported when `run_eagerly=True`'): - model.compile( - optimizer='adam', - loss='mse', - run_eagerly=True, - feed_dict={}) - else: - # Test that Session kwargs trigger legacy path execution - model.compile( - optimizer='adam', - loss='mse', - feed_dict={}) - self.assertFalse(model._experimental_run_tf_function) - class TrainingTest(keras_parameterized.TestCase): @@ -1675,23 +1657,10 @@ class TestExceptionsAndWarnings(keras_parameterized.TestCase): experimental_run_tf_function=False) err_msg = 'When passing input data as arrays, do not specify' - if testing_utils.should_run_eagerly(): - with self.assertRaisesRegex(ValueError, err_msg): - model.fit(x=np.zeros((100, 1)), y=np.ones((100, 1)), steps_per_epoch=4) - - with self.assertRaisesRegex(ValueError, err_msg): - model.evaluate(x=np.zeros((100, 1)), y=np.ones((100, 1)), steps=4) - - with self.assertRaisesRegex(ValueError, err_msg): - model.predict(np.zeros((100, 1)), steps=4) - else: - with test.mock.patch.object(logging, 'warning') as mock_log: - model._standardize_user_data( - np.zeros((100, 1)), - np.ones((100, 1)), - check_steps=True, - steps=4) - self.assertRegexpMatches(str(mock_log.call_args), err_msg) + with test.mock.patch.object(logging, 'warning') as mock_log: + model._standardize_user_data( + np.zeros((100, 1)), np.ones((100, 1)), check_steps=True, steps=4) + self.assertRegexpMatches(str(mock_log.call_args), err_msg) @keras_parameterized.run_with_all_model_types @keras_parameterized.run_all_keras_modes @@ -2971,7 +2940,7 @@ class TestTrainingWithDataTensors(keras_parameterized.TestCase): self.assertEqual(out[0].shape, (10 * 3, 4)) self.assertEqual(out[1].shape, (10 * 3, 4)) - @keras_parameterized.run_all_keras_modes + @tf_test_util.run_deprecated_v1 def test_target_tensors(self): with self.cached_session(): # single-output, as list diff --git a/tensorflow/python/keras/engine/training_v1.py b/tensorflow/python/keras/engine/training_v1.py index 1afd525d202..69acc360054 100644 --- a/tensorflow/python/keras/engine/training_v1.py +++ b/tensorflow/python/keras/engine/training_v1.py @@ -175,8 +175,8 @@ class Model(training_lib.Model): self._compile_time_distribution_strategy) if strategy: with strategy.scope(): - return super(Model, self).get_weights() - return super(Model, self).get_weights() + return network.Network.get_weights(self) + return network.Network.get_weights(self) def load_weights(self, filepath, by_name=False, skip_mismatch=False): """Loads all layer weights, either from a TensorFlow or an HDF5 weight file. 
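One plausible reading of the `training_v1.py` change above: `training_v1.Model` subclasses the v2 `training.Model` (see the `class Model(training_lib.Model)` hunk header), so `super(Model, self).get_weights()` would now dispatch through the v2 implementation, which enters `self.distribute_strategy.scope()`; the v1 class instead tracks `self._compile_time_distribution_strategy` and opens that scope itself, so it pins the call to the shared `network.Network` base class. A small self-contained sketch (hypothetical class names, not TensorFlow code) of why the explicit base-class call differs from `super()`:

class Network(object):
  def get_weights(self):
    return ['weights']

class ModelV2(Network):            # stands in for training.Model
  def get_weights(self):
    # The v2 model wraps the call in its own strategy scope.
    return ['v2 scope entered'] + super(ModelV2, self).get_weights()

class ModelV1(ModelV2):            # stands in for training_v1.Model
  def get_weights(self):
    # super(ModelV1, self).get_weights() would hit ModelV2 first;
    # naming the base class explicitly skips the v2 behavior.
    return Network.get_weights(self)

print(ModelV1().get_weights())     # ['weights'] -- no v2 scope involved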
diff --git a/tensorflow/python/keras/engine/training_v2.py b/tensorflow/python/keras/engine/training_v2.py index 476da84bcf7..ad176a99a2e 100644 --- a/tensorflow/python/keras/engine/training_v2.py +++ b/tensorflow/python/keras/engine/training_v2.py @@ -28,7 +28,7 @@ import functools import numpy as np from tensorflow.python.data.ops import dataset_ops -from tensorflow.python.distribute import distribution_strategy_context +from tensorflow.python.distribute import distribution_strategy_context as ds_context from tensorflow.python.framework import errors from tensorflow.python.keras import callbacks as cbks from tensorflow.python.keras.distribute import distributed_training_utils as dist_utils @@ -204,7 +204,7 @@ class Loop(training_utils.TrainingLoop): batch_size = model._validate_or_infer_batch_size( batch_size, steps_per_epoch, x) - strategy = _get_distribution_strategy(model) + strategy = model.distribute_strategy batch_size, steps_per_epoch = dist_utils.process_batch_and_step_size( strategy, x, @@ -320,8 +320,7 @@ class Loop(training_utils.TrainingLoop): with training_context.on_epoch(epoch, ModeKeys.TRAIN) as epoch_logs: model.reset_metrics() if training_data_iter is None or recreate_training_iterator: - if (training_data_iter is not None and - distribution_strategy_context.has_strategy()): + if training_data_iter is not None and ds_context.has_strategy(): # TODO(kaftan): remove this when MultiDeviceIterator is a ## compositetensor (unless this is more efficient) training_data_iter._initializer # pylint: disable=pointless-statement @@ -353,8 +352,7 @@ class Loop(training_utils.TrainingLoop): if (do_validation and training_utils.should_run_validation(validation_freq, epoch) and not training_callbacks.model.stop_training): - if (eval_data_iter is not None and - distribution_strategy_context.has_strategy()): + if eval_data_iter is not None and ds_context.has_strategy(): # TODO(kaftan): remove this when MultiDeviceIterator is a ## compositetensor (unless this is more efficient) eval_data_iter._initializer # pylint: disable=pointless-statement @@ -405,7 +403,7 @@ class Loop(training_utils.TrainingLoop): batch_size = model._validate_or_infer_batch_size( batch_size, steps, x) - strategy = _get_distribution_strategy(model) + strategy = model.distribute_strategy batch_size, steps = dist_utils.process_batch_and_step_size( strategy, x, batch_size, steps, mode) dist_utils.validate_callbacks(input_callbacks=callbacks, @@ -498,17 +496,6 @@ class Loop(training_utils.TrainingLoop): workers=workers, use_multiprocessing=use_multiprocessing, **kwargs) -def _get_distribution_strategy(model): - """Get the model's distribution strategy.""" - if model._compile_time_distribution_strategy: - strategy = model._compile_time_distribution_strategy - else: - # Grab the active strategy if the model was never compiled - # but it is now predicting. - strategy = distribution_strategy_context.get_strategy() - return strategy - - def _process_training_inputs(model, x, y, diff --git a/tensorflow/python/keras/keras_parameterized.py b/tensorflow/python/keras/keras_parameterized.py index 40117909ecc..f4691e2fe04 100644 --- a/tensorflow/python/keras/keras_parameterized.py +++ b/tensorflow/python/keras/keras_parameterized.py @@ -370,7 +370,7 @@ def run_all_keras_modes(test_or_class=None, a target dependency. 
""" - params = [('_v2_function', 'v2_function'), ('_v2_funcgraph', 'v2_funcgraph')] + params = [('_v2_function', 'v2_function')] if not always_skip_eager: params.append(('_v2_eager', 'v2_eager')) if not (always_skip_v1 or tf2.enabled()): @@ -386,8 +386,6 @@ def run_all_keras_modes(test_or_class=None, """A run of a single test case w/ specified run mode.""" if run_mode == 'v1_session': _v1_session_test(f, self, config, *args, **kwargs) - elif run_mode == 'v2_funcgraph': - _v2_graph_functions_test(f, self, *args, **kwargs) elif run_mode == 'v2_eager': _v2_eager_test(f, self, *args, **kwargs) elif run_mode == 'v2_function': @@ -407,13 +405,6 @@ def _v1_session_test(f, test_or_class, config, *args, **kwargs): f(test_or_class, *args, **kwargs) -def _v2_graph_functions_test(f, test_or_class, *args, **kwargs): - with context.eager_mode(): - with testing_utils.run_eagerly_scope(False): - with testing_utils.experimental_run_tf_function_scope(False): - f(test_or_class, *args, **kwargs) - - def _v2_eager_test(f, test_or_class, *args, **kwargs): with context.eager_mode(): with testing_utils.run_eagerly_scope(True): diff --git a/tensorflow/python/keras/keras_parameterized_test.py b/tensorflow/python/keras/keras_parameterized_test.py index 0017fcb6e3e..b750f4ca9f5 100644 --- a/tensorflow/python/keras/keras_parameterized_test.py +++ b/tensorflow/python/keras/keras_parameterized_test.py @@ -217,27 +217,13 @@ class KerasParameterizedTest(keras_parameterized.TestCase): if not tf2.enabled(): e.testBody_v1_session() e.testBody_v2_eager() - e.testBody_v2_funcgraph() e.testBody_v2_function() if not tf2.enabled(): - self.assertLen(l, 4) + self.assertLen(l, 3) self.assertAllEqual(l, [ ("graph", False, False), ("eager", True, True), - ("eager", False, False), - ("eager", False, True), - ]) - - ts = unittest.makeSuite(ExampleTest) - res = unittest.TestResult() - ts.run(res) - self.assertLen(l, 8) - else: - self.assertLen(l, 3) - self.assertAllEqual(l, [ - ("eager", True, True), - ("eager", False, False), ("eager", False, True), ]) @@ -245,6 +231,17 @@ class KerasParameterizedTest(keras_parameterized.TestCase): res = unittest.TestResult() ts.run(res) self.assertLen(l, 6) + else: + self.assertLen(l, 2) + self.assertAllEqual(l, [ + ("eager", True, True), + ("eager", False, True), + ]) + + ts = unittest.makeSuite(ExampleTest) + res = unittest.TestResult() + ts.run(res) + self.assertLen(l, 4) def test_run_all_keras_modes_extra_params(self): l = [] @@ -272,18 +269,14 @@ class KerasParameterizedTest(keras_parameterized.TestCase): e.testBody_1_v1_session() e.testBody_0_v2_eager() - e.testBody_0_v2_funcgraph() e.testBody_0_v2_function() e.testBody_1_v2_eager() - e.testBody_1_v2_funcgraph() e.testBody_1_v2_function() expected_combinations = { ("with_brackets", "eager", True, True), - ("with_brackets", "eager", False, False), ("with_brackets", "eager", False, True), ("without_brackets", "eager", True, True), - ("without_brackets", "eager", False, False), ("without_brackets", "eager", False, True), } @@ -322,16 +315,13 @@ class KerasParameterizedTest(keras_parameterized.TestCase): e.testBody_v1_session() if hasattr(e, "testBody_v2_eager"): e.testBody_v2_eager() - if hasattr(e, "testBody_v2_funcgraph"): - e.testBody_v2_funcgraph() if hasattr(e, "testBody_v2_function"): e.testBody_v2_function() - self.assertLen(l, 3) + self.assertLen(l, 2) self.assertEqual( set(l), { ("eager", True, True), - ("eager", False, False), ("eager", False, True), }) @@ -354,13 +344,10 @@ class KerasParameterizedTest(keras_parameterized.TestCase): e = 
ExampleTest() e.testBody_v2_eager_functional() - e.testBody_v2_funcgraph_functional() e.testBody_v2_function_functional() e.testBody_v2_eager_sequential() - e.testBody_v2_funcgraph_sequential() e.testBody_v2_function_sequential() e.testBody_v2_eager_subclass() - e.testBody_v2_funcgraph_subclass() e.testBody_v2_function_subclass() if not tf2.enabled(): @@ -370,13 +357,10 @@ class KerasParameterizedTest(keras_parameterized.TestCase): expected_combinations = { ("eager", True, True, "functional"), - ("eager", False, False, "functional"), ("eager", False, True, "functional"), ("eager", True, True, "sequential"), - ("eager", False, False, "sequential"), ("eager", False, True, "sequential"), ("eager", True, True, "subclass"), - ("eager", False, False, "subclass"), ("eager", False, True, "subclass"), } @@ -415,13 +399,10 @@ class KerasParameterizedTest(keras_parameterized.TestCase): e = ExampleTest() e.testBody_functional_v2_eager() - e.testBody_functional_v2_funcgraph() e.testBody_functional_v2_function() e.testBody_sequential_v2_eager() - e.testBody_sequential_v2_funcgraph() e.testBody_sequential_v2_function() e.testBody_subclass_v2_eager() - e.testBody_subclass_v2_funcgraph() e.testBody_subclass_v2_function() if not tf2.enabled(): @@ -431,13 +412,10 @@ class KerasParameterizedTest(keras_parameterized.TestCase): expected_combinations = { ("eager", True, True, "functional"), - ("eager", False, False, "functional"), ("eager", False, True, "functional"), ("eager", True, True, "sequential"), - ("eager", False, False, "sequential"), ("eager", False, True, "sequential"), ("eager", True, True, "subclass"), - ("eager", False, False, "subclass"), ("eager", False, True, "subclass"), } @@ -478,13 +456,10 @@ class KerasParameterizedTest(keras_parameterized.TestCase): e = ExampleTest() e.testBody_arg_v2_eager_functional() - e.testBody_arg_v2_funcgraph_functional() e.testBody_arg_v2_function_functional() e.testBody_arg_v2_eager_sequential() - e.testBody_arg_v2_funcgraph_sequential() e.testBody_arg_v2_function_sequential() e.testBody_arg_v2_eager_subclass() - e.testBody_arg_v2_funcgraph_subclass() e.testBody_arg_v2_function_subclass() if not tf2.enabled(): @@ -494,13 +469,10 @@ class KerasParameterizedTest(keras_parameterized.TestCase): expected_combinations = { ("eager", True, True, "functional"), - ("eager", False, False, "functional"), ("eager", False, True, "functional"), ("eager", True, True, "sequential"), - ("eager", False, False, "sequential"), ("eager", False, True, "sequential"), ("eager", True, True, "subclass"), - ("eager", False, False, "subclass"), ("eager", False, True, "subclass"), } @@ -541,13 +513,10 @@ class KerasParameterizedTest(keras_parameterized.TestCase): e = ExampleTest() e.testBody_arg_v2_eager_functional() - e.testBody_arg_v2_funcgraph_functional() e.testBody_arg_v2_function_functional() e.testBody_arg_v2_eager_sequential() - e.testBody_arg_v2_funcgraph_sequential() e.testBody_arg_v2_function_sequential() e.testBody_arg_v2_eager_subclass() - e.testBody_arg_v2_funcgraph_subclass() e.testBody_arg_v2_function_subclass() if not tf2.enabled(): @@ -557,13 +526,10 @@ class KerasParameterizedTest(keras_parameterized.TestCase): expected_combinations = { ("eager", True, True, "functional"), - ("eager", False, False, "functional"), ("eager", False, True, "functional"), ("eager", True, True, "sequential"), - ("eager", False, False, "sequential"), ("eager", False, True, "sequential"), ("eager", True, True, "subclass"), - ("eager", False, False, "subclass"), ("eager", False, True, "subclass"), } 
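With the `v2_funcgraph` parameter gone, `run_all_keras_modes` expands each test into at most three variants: the legacy `v1_session` mode (only when TF1 behavior is active), `v2_eager`, and `v2_function`; the expected-combination counts in the test above drop accordingly. A hedged sketch of a test that relies on the decorator, modeled on the `CompileTest` cases earlier in this patch (it assumes the usual module-level imports of `numpy`, `keras_parameterized` and `testing_utils`):

@keras_parameterized.run_all_keras_modes
class SmokeTest(keras_parameterized.TestCase):

  def test_fit_runs_in_each_remaining_mode(self):
    model = testing_utils.get_small_sequential_mlp(
        num_hidden=10, num_classes=2, input_dim=3)
    model.compile(
        'sgd', 'mse',
        run_eagerly=testing_utils.should_run_eagerly())
    # One test body; the decorator generates the v1_session / v2_eager /
    # v2_function variants automatically.
    model.fit(np.ones((8, 3)), np.zeros((8, 2)), epochs=1, verbose=0)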
diff --git a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py index 9c34b46f9d0..6bb73cdfdef 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/keras_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/keras_test.py @@ -841,8 +841,11 @@ class KerasModelTest(keras_parameterized.TestCase): x = layers.Input(shape=(1,)) y = mp_test_util.AddLayer()(x) model = models.Model(x, y) - with self.assertRaisesRegexp(ValueError, - 'optimizer" must be an instance of '): + if context.executing_eagerly(): + error_msg = 'Use a `tf.keras` Optimizer instead' + else: + error_msg = 'optimizer" must be an instance of ' + with self.assertRaisesRegexp(ValueError, error_msg): model.compile(optimizers.SGD(1.), 'mse') @test_util.run_in_graph_and_eager_modes diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py index 81a3e5e0f11..74634e110e0 100644 --- a/tensorflow/python/keras/models.py +++ b/tensorflow/python/keras/models.py @@ -25,6 +25,7 @@ from tensorflow.python.keras import optimizers from tensorflow.python.keras.engine import network from tensorflow.python.keras.engine import sequential from tensorflow.python.keras.engine import training +from tensorflow.python.keras.engine import training_v1 from tensorflow.python.keras.engine.base_layer import AddMetric from tensorflow.python.keras.engine.base_layer import Layer from tensorflow.python.keras.engine.input_layer import Input @@ -33,6 +34,7 @@ from tensorflow.python.keras.engine.network import Network from tensorflow.python.keras.saving import model_config from tensorflow.python.keras.saving import save from tensorflow.python.keras.utils import generic_utils +from tensorflow.python.keras.utils import version_utils from tensorflow.python.keras.utils.generic_utils import CustomObjectScope from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest @@ -447,6 +449,8 @@ def _in_place_subclassed_model_reset(model): ValueError: In case the model uses a subclassed model as inner layer. """ assert not model._is_graph_network # Only makes sense for subclassed networks + # Select correct base class for new Model. + version_utils.swap_class(model.__class__, training.Model, training_v1.Model) # Retrieve all layers tracked by the model as well as their attribute names attributes_cache = {} for name in dir(model): diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py index d2f1a8a646f..81f419c02c7 100644 --- a/tensorflow/python/keras/models_test.py +++ b/tensorflow/python/keras/models_test.py @@ -386,12 +386,13 @@ class TestCloneAndBuildModel(keras_parameterized.TestCase): experimental_run_tf_function=testing_utils.should_run_tf_function()) new_model.train_on_batch(inp, out) - # Create new tensors for inputs and targets + # Create new tensors for inputs. 
input_a = keras.Input(shape=(4,)) - target_a = keras.Input(shape=(4,)) new_model = models.clone_and_build_model( - model, input_tensors=input_a, target_tensors=[target_a], - compile_clone=False, in_place_reset=is_subclassed) + model, + input_tensors=input_a, + compile_clone=False, + in_place_reset=is_subclassed) with self.assertRaisesRegexp(RuntimeError, 'must compile'): new_model.evaluate(inp, out) with self.assertRaisesRegexp(RuntimeError, 'must compile'): @@ -428,7 +429,7 @@ class TestCloneAndBuildModel(keras_parameterized.TestCase): new_model.train_on_batch(inp, out) new_model.evaluate(inp, out) - # Create new tensors for inputs and targets + # Create new tensors for inputs. input_a = keras.Input(shape=(4,), name='a') new_model = models.clone_and_build_model( model, input_tensors=input_a, compile_clone=True, @@ -437,10 +438,12 @@ class TestCloneAndBuildModel(keras_parameterized.TestCase): new_model.train_on_batch(inp, out) new_model.evaluate(inp, out) - target_a = keras.Input(shape=(4,), name='b') new_model = models.clone_and_build_model( - model, input_tensors=input_a, target_tensors=[target_a], - compile_clone=True, in_place_reset=is_subclassed) + model, + input_tensors=input_a, + target_tensors=None, + compile_clone=True, + in_place_reset=is_subclassed) self._assert_same_compile_params(new_model) new_model.train_on_batch(inp, out) new_model.evaluate(inp, out) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt index 7df20bf331b..bb823d92022 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt @@ -12,6 +12,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -170,7 +174,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt index 2b46b140b65..0afd4145ac5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt @@ -13,6 +13,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -175,7 +179,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', 
\'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt index 4862a93c628..aba2d4cddee 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-linear-model.pbtxt @@ -13,6 +13,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -171,7 +175,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 45edc2e8d46..e1960038187 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -13,6 +13,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -171,7 +175,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt index 6fb8f3891c0..33d54f64a86 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt @@ -12,6 +12,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -170,7 +174,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } 
member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt index 069025cd8e2..d38ac47b167 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt @@ -13,6 +13,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -175,7 +179,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt index 7df20bf331b..bb823d92022 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt @@ -12,6 +12,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -170,7 +174,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt index 2b46b140b65..0afd4145ac5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt @@ -13,6 +13,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -175,7 +179,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt index 4862a93c628..aba2d4cddee 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-linear-model.pbtxt @@ -13,6 +13,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -171,7 +175,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt index 45edc2e8d46..e1960038187 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-wide-deep-model.pbtxt @@ -13,6 +13,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -171,7 +175,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt index 6fb8f3891c0..33d54f64a86 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt @@ -12,6 +12,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -170,7 +174,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt 
b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt index 069025cd8e2..d38ac47b167 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt @@ -13,6 +13,10 @@ tf_class { name: "activity_regularizer" mtype: "" } + member { + name: "distribute_strategy" + mtype: "" + } member { name: "dtype" mtype: "" @@ -175,7 +179,7 @@ tf_class { } member_method { name: "compile" - argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], " + argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\'], varargs=None, keywords=kwargs, defaults=[\'rmsprop\', \'None\', \'None\', \'None\', \'None\', \'None\'], " } member_method { name: "compute_mask" From 4a23c9b2e3b7bbff0eb988c30dab076448707035 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Thu, 5 Dec 2019 16:29:48 -0800 Subject: [PATCH 219/383] Rollback of rollback of changelist 279340363: adding backtrace logging to status with a fix for the Windows platform. PiperOrigin-RevId: 284079459 Change-Id: Id03f13d71b583ae9aec256aa8666388e2791f334 --- tensorflow/core/platform/BUILD | 3 +++ tensorflow/core/platform/status.cc | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tensorflow/core/platform/BUILD b/tensorflow/core/platform/BUILD index 001f7827a46..21dcfdb5583 100644 --- a/tensorflow/core/platform/BUILD +++ b/tensorflow/core/platform/BUILD @@ -542,6 +542,7 @@ cc_library( ":logging", ":macros", ":mutex", + ":stacktrace", ":str_util", ":strcat", ":stringpiece", @@ -784,6 +785,7 @@ filegroup( "**/rocm.h", "**/rocm_rocdl_path.cc", "**/subprocess.cc", + "**/stacktrace.cc", "**/tracing.cc", "**/unbounded_work_queue.cc", "**/windows_file_system.cc", @@ -797,6 +799,7 @@ filegroup( "protobuf.cc", "random.cc", "scanner.cc", + "stacktrace.cc", "strcat.cc", "stringprintf.cc", ], diff --git a/tensorflow/core/platform/status.cc b/tensorflow/core/platform/status.cc index d9cd02a27fb..a7fd3e693a1 100644 --- a/tensorflow/core/platform/status.cc +++ b/tensorflow/core/platform/status.cc @@ -22,6 +22,7 @@ limitations under the License. #include "absl/base/call_once.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/stacktrace.h" #include "tensorflow/core/platform/str_util.h" #include "tensorflow/core/platform/strcat.h" #include "tensorflow/core/platform/stringprintf.h" @@ -91,6 +92,8 @@ Status::Status(tensorflow::error::Code code, StringPiece msg) { state_ = std::unique_ptr(new State); state_->code = code; state_->msg = string(msg); + VLOG(5) << "Generated non-OK status: \"" << *this << "\". 
" + << CurrentStackTrace(); } void Status::Update(const Status& new_status) { From c5816f6baa408587e59114e025c3004cbcd4767f Mon Sep 17 00:00:00 2001 From: David Chen Date: Thu, 5 Dec 2019 16:31:20 -0800 Subject: [PATCH 220/383] Allow ctpu_delete to use a different project PiperOrigin-RevId: 284079788 Change-Id: I70991416174891a121bb34098e26a3fa9d8d7741 --- tensorflow/tools/ci_build/ctpu/ctpu.sh | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/ctpu/ctpu.sh b/tensorflow/tools/ci_build/ctpu/ctpu.sh index 9888f876371..782a2b0bddb 100644 --- a/tensorflow/tools/ci_build/ctpu/ctpu.sh +++ b/tensorflow/tools/ci_build/ctpu/ctpu.sh @@ -99,6 +99,7 @@ function ctpu_up { if [[ -v project ]]; then args+=("--project=${project}") + echo "${project}" > "${TF_ARTIFACTS_DIR}/tpu_project" fi ./ctpu up "${args[@]}" @@ -108,13 +109,18 @@ function ctpu_up { function ctpu_delete { export TPU_NAME="$(cat "${TF_GFILE_DIR}/tpu_name")" export TPU_ZONE="$(cat "${TF_GFILE_DIR}/tpu_zone")" - # TODO(rsopher): conditionally save (and load) TPU_PROJECT if it was specified. + TPU_PROJECT_FILE="${TF_GFILE_DIR}/tpu_project" + if [ -f "${TPU_PROJECT_FILE}" ]; then + export TPU_PROJECT="$(cat ${TPU_PROJECT_FILE})" + else + export TPU_PROJECT="tensorflow-testing" + fi # Retry due to rare race condition where TPU creation hasn't propagated by # the time we try to delete it. for i in 1 2 3; do ./ctpu delete \ - --project=tensorflow-testing \ + --project=${TPU_PROJECT} \ --zone="${TPU_ZONE}" \ --name="${TPU_NAME}" \ --tpu-only \ From 793d13580934062923261269000c9ec748e062ed Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 5 Dec 2019 16:39:47 -0800 Subject: [PATCH 221/383] Python 3 Migration. //tensorflow/python/distribute PiperOrigin-RevId: 284081338 Change-Id: I494673b3c3543dbcefb35247cdc2aa9e03cb031f --- tensorflow/python/distribute/BUILD | 41 +++++------------------------- 1 file changed, 6 insertions(+), 35 deletions(-) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 16a49b08a7c..ee820d0ead2 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -56,7 +56,6 @@ tf_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:state_ops", ], - python_version = "PY3", ) py_library( @@ -119,7 +118,6 @@ cuda_py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:framework_ops", ], - python_version = "PY3", ) py_library( @@ -172,7 +170,7 @@ py_test( name = "distribute_lib_test", size = "small", srcs = ["distribute_lib_test.py"], - python_version = "PY3", + python_version = "PY2", srcs_version = "PY2AND3", tags = [ "no_rocm", @@ -222,7 +220,7 @@ py_test( name = "distribute_coordinator_test", size = "medium", srcs = ["distribute_coordinator_test.py"], - python_version = "PY3", + python_version = "PY2", srcs_version = "PY2AND3", tags = ["no_oss_py2"], # b/138443278 deps = [ @@ -391,7 +389,6 @@ tf_py_test( "//tensorflow/python/eager:def_function", "//tensorflow/python/eager:test", ], - python_version = "PY3", tags = ["no_pip"], ) @@ -429,7 +426,6 @@ cuda_py_test( "//tensorflow/python/eager:test", "//tensorflow/python:framework_ops", ], - python_version = "PY3", ) py_library( @@ -451,7 +447,7 @@ py_test( name = "numpy_dataset_test", size = "small", srcs = ["numpy_dataset_test.py"], - python_version = "PY3", + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":numpy_dataset", @@ -502,13 +498,12 @@ cuda_py_test( "//tensorflow/python:io_ops", "//tensorflow/python:util", ], - 
python_version = "PY3", ) py_test( name = "multi_worker_util_test", srcs = ["multi_worker_util_test.py"], - python_version = "PY3", + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":multi_worker_util", @@ -589,7 +584,7 @@ py_library( py_test( name = "shared_variable_creator_test", srcs = ["shared_variable_creator_test.py"], - python_version = "PY3", + python_version = "PY2", srcs_version = "PY2AND3", deps = [ ":shared_variable_creator", @@ -663,7 +658,7 @@ py_library( py_test( name = "strategy_combinations_test", srcs = ["strategy_combinations_test.py"], - python_version = "PY3", + python_version = "PY2", deps = [ ":combinations", ":reduce_util", @@ -717,7 +712,6 @@ cuda_py_test( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", ], - python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -727,7 +721,6 @@ distribute_py_test( name = "checkpointing_test", srcs = ["checkpointing_test.py"], main = "checkpointing_test.py", - python_version = "PY3", deps = [ ":tpu_strategy", "//tensorflow/compiler/tests:xla_test", @@ -781,7 +774,6 @@ cuda_py_test( "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], - python_version = "PY3", ) cuda_py_test( @@ -803,7 +795,6 @@ cuda_py_test( "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], - python_version = "PY3", tags = [ # TODO(b/138143527): Re-enable after fixing Guitar failure. # "multi_and_single_gpu", @@ -820,7 +811,6 @@ cuda_py_test( "//tensorflow/python/eager:test", ], grpc_enabled = True, - python_version = "PY3", ) py_library( @@ -847,7 +837,6 @@ tf_py_test( "//tensorflow/python/compat:v2_compat", "//tensorflow/python/training/tracking:util", ], - python_version = "PY3", ) py_library( @@ -883,7 +872,6 @@ distribute_py_test( name = "values_test", srcs = ["values_test.py"], main = "values_test.py", - python_version = "PY3", tags = [ "no_oss", # http://b/119349471 ], @@ -914,7 +902,6 @@ distribute_py_test( name = "moving_averages_test", srcs = ["moving_averages_test.py"], main = "moving_averages_test.py", - python_version = "PY3", deps = [ "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", @@ -932,7 +919,6 @@ distribute_py_test( name = "custom_training_loop_test", srcs = ["custom_training_loop_test.py"], main = "custom_training_loop_test.py", - python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -951,7 +937,6 @@ distribute_py_test( name = "minimize_loss_test", srcs = ["minimize_loss_test.py"], main = "minimize_loss_test.py", - python_version = "PY3", tags = [ "multi_and_single_gpu", "no_oss", # TODO(b/139815303): enable after this is fixed. 
@@ -1007,7 +992,6 @@ distribute_py_test( name = "step_fn_test", srcs = ["step_fn_test.py"], main = "step_fn_test.py", - python_version = "PY3", tags = [ "multi_and_single_gpu", "no_rocm", @@ -1038,7 +1022,6 @@ cuda_py_test( "//tensorflow/python:variable_scope", "//tensorflow/python:variables", ], - python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -1065,7 +1048,6 @@ cuda_py_test( "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], - python_version = "PY3", ) cuda_py_test( @@ -1091,7 +1073,6 @@ cuda_py_test( "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], - python_version = "PY3", shard_count = 5, tags = [ "guitar", @@ -1120,7 +1101,6 @@ cuda_py_test( "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", ], - python_version = "PY3", tags = [ "guitar", "multi_and_single_gpu", @@ -1131,7 +1111,6 @@ distribute_py_test( name = "metrics_v1_test", srcs = ["metrics_v1_test.py"], main = "metrics_v1_test.py", - python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -1152,7 +1131,6 @@ distribute_py_test( name = "keras_metrics_test", srcs = ["keras_metrics_test.py"], main = "keras_metrics_test.py", - python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -1173,7 +1151,6 @@ distribute_py_test( name = "zero_batch_test", srcs = ["zero_batch_test.py"], main = "zero_batch_test.py", - python_version = "PY3", deps = [ ":combinations", ":multi_worker_test_base", @@ -1218,7 +1195,6 @@ distribute_py_test( srcs = ["saved_model_save_load_test.py"], full_precision = True, main = "saved_model_save_load_test.py", - python_version = "PY3", shard_count = 5, deps = [ ":saved_model_test_base", @@ -1232,7 +1208,6 @@ distribute_py_test( srcs = ["keras_save_load_test.py"], full_precision = True, main = "keras_save_load_test.py", - python_version = "PY3", shard_count = 5, deps = [ ":saved_model_test_base", @@ -1246,7 +1221,6 @@ distribute_py_test( srcs = ["saved_model_mixed_api_test.py"], full_precision = True, main = "saved_model_mixed_api_test.py", - python_version = "PY3", shard_count = 5, deps = [ ":saved_model_test_base", @@ -1259,7 +1233,6 @@ distribute_py_test( name = "ctl_correctness_test", srcs = ["ctl_correctness_test.py"], main = "ctl_correctness_test.py", - python_version = "PY3", shard_count = 10, tags = [ "multi_and_single_gpu", @@ -1304,7 +1277,6 @@ cuda_py_test( "//tensorflow/python/estimator:estimator_py", "//tensorflow/python/keras/mixed_precision/experimental:test_util", ], - python_version = "PY3", tags = [ "multi_and_single_gpu", ], @@ -1339,7 +1311,6 @@ cuda_py_test( "//tensorflow/python/eager:context", "//tensorflow/python/estimator:estimator_py", ], - python_version = "PY3", tags = [ "multi_and_single_gpu", "no_oss", # TODO(b/133330625) From 67edc16326d6328e7ef096e1b06f81dae1bfb816 Mon Sep 17 00:00:00 2001 From: Sami Date: Thu, 5 Dec 2019 16:49:20 -0800 Subject: [PATCH 222/383] Make nccl bindings compilable with cuda 10.2 --- third_party/nccl/build_defs.bzl.tpl | 16 +++++++++------- third_party/nccl/nccl_configure.bzl | 18 ++++++++++++++++-- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/third_party/nccl/build_defs.bzl.tpl b/third_party/nccl/build_defs.bzl.tpl index 57191398553..e734e49f9dc 100644 --- a/third_party/nccl/build_defs.bzl.tpl +++ b/third_party/nccl/build_defs.bzl.tpl @@ -104,19 +104,21 @@ def _device_link_impl(ctx): tmp_fatbin = ctx.actions.declare_file("%s.fatbin" % name) fatbin_h = ctx.actions.declare_file("%s_fatbin.h" % name) bin2c = ctx.file._bin2c - ctx.actions.run( - 
outputs = [tmp_fatbin, fatbin_h], - inputs = cubins, - executable = ctx.file._fatbinary, - arguments = [ + arguments_list = [ "-64", "--cmdline=--compile-only", "--link", "--compress-all", - "--bin2c-path=%s" % bin2c.dirname, "--create=%s" % tmp_fatbin.path, "--embedded-fatbin=%s" % fatbin_h.path, - ] + images, + ] + if %{use_bin2c_path}: + arguments_list.append("--bin2c-path=%s" % bin2c.dirname) + ctx.actions.run( + outputs = [tmp_fatbin, fatbin_h], + inputs = cubins, + executable = ctx.file._fatbinary, + arguments = arguments_list + images, tools = [bin2c], mnemonic = "fatbinary", ) diff --git a/third_party/nccl/nccl_configure.bzl b/third_party/nccl/nccl_configure.bzl index 4b341d02d22..68e093c1e7b 100644 --- a/third_party/nccl/nccl_configure.bzl +++ b/third_party/nccl/nccl_configure.bzl @@ -72,6 +72,11 @@ def _nccl_configure_impl(repository_ctx): nccl_version = repository_ctx.os.environ[_TF_NCCL_VERSION].strip() nccl_version = nccl_version.split(".")[0] + cuda_config = find_cuda_config(repository_ctx, ["cuda"]) + cuda_version = cuda_config["cuda_version"].split(".") + cuda_major = cuda_version[0] + cuda_minor = cuda_version[1] + if nccl_version == "": # Alias to open source build from @nccl_archive. repository_ctx.file("BUILD", _NCCL_ARCHIVE_BUILD_CONTENT) @@ -84,9 +89,18 @@ def _nccl_configure_impl(repository_ctx): # Round-about way to make the list unique. gpu_architectures = dict(zip(gpu_architectures, gpu_architectures)).keys() - repository_ctx.template("build_defs.bzl", _label("build_defs.bzl.tpl"), { + config_wrap = { "%{gpu_architectures}": str(gpu_architectures), - }) + "%{use_bin2c_path}": "False", + } + if (int(cuda_major), int(cuda_minor)) <= (10, 1): + config_wrap["%{use_bin2c_path}"] = "True" + + repository_ctx.template( + "build_defs.bzl", + _label("build_defs.bzl.tpl"), + config_wrap, + ) else: # Create target for locally installed NCCL. config = find_cuda_config(repository_ctx, ["nccl"]) From 275372da1945b6ed52f55e51f017d830b04d9158 Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Thu, 5 Dec 2019 16:48:21 -0800 Subject: [PATCH 223/383] [XLA GPU] Group subsequent reduction input dimensions Groups subsequent dimensions in reduction input, e.g. reducing dimensions (0,1) on [100,200] turns into reducing dimension (0) on [20000]. This change allows a considerable simplification and generalization of the pattern-matching logic in reduction emitter, as after this pass we simply have three cases: - Reducing a single, non-minor dimension: tiled column reduction - Reducing a minor dimension, and optionally another non-minor batch dimension: tiled row reduction. 
- Everything else: slow loop reduction PiperOrigin-RevId: 284082960 Change-Id: I049dcb61391ba8096ee97174d071434839fcec73 --- .../service/dfs_hlo_visitor_with_default.h | 12 ++ tensorflow/compiler/xla/service/gpu/BUILD | 44 ++++++ .../xla/service/gpu/nvptx_compiler.cc | 4 + .../gpu/reduction_degenerate_dim_remover.cc | 15 +- .../gpu/reduction_dimension_grouper.cc | 107 +++++++++++++++ .../service/gpu/reduction_dimension_grouper.h | 54 ++++++++ .../gpu/reduction_layout_normalizer.cc | 129 ++++++++++++++++++ .../service/gpu/reduction_layout_normalizer.h | 50 +++++++ .../compiler/xla/service/gpu/tests/BUILD | 54 ++++++++ .../gpu/tests/gpu_kernel_tiling_test.cc | 2 +- .../reduction_degenerate_dim_remover_test.cc | 9 +- .../tests/reduction_dimension_grouper_test.cc | 70 ++++++++++ .../tests/reduction_layout_normalizer_test.cc | 69 ++++++++++ 13 files changed, 604 insertions(+), 15 deletions(-) create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.cc create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.h create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.cc create mode 100644 tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.h create mode 100644 tensorflow/compiler/xla/service/gpu/tests/reduction_dimension_grouper_test.cc create mode 100644 tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h index c5ed810c917..37a54f86d3d 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h @@ -22,7 +22,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" @@ -254,6 +256,16 @@ using ConstDfsHloVisitorWithDefault = // visiting. class DfsHloRewriteVisitor : public DfsHloVisitorWithDefault { public: + // Runs a visitor on the module and returns whether the module has changed. + StatusOr RunOnModule(HloModule* module) { + bool is_changed = false; + for (const auto& computation : module->computations()) { + TF_RETURN_IF_ERROR(computation->Accept(this)); + is_changed |= changed(); + } + return is_changed; + } + // Default visitor action is to do nothing and return OK. 
Status DefaultAction(HloInstruction* /*hlo_instruction*/) override { return Status::OK(); diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 96cf0c5c22b..eb8b848fc3f 100755 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -1197,6 +1197,8 @@ cc_library( ":gpu_conv_rewriter", ":gpu_layout_assignment", ":reduction_degenerate_dim_remover", + ":reduction_dimension_grouper", + ":reduction_layout_normalizer", ":stream_executor_util", ":target_constants", "//tensorflow/compiler/xla:status_macros", @@ -1686,3 +1688,45 @@ cc_library( "@com_google_absl//absl/types:optional", ], ) + +cc_library( + name = "reduction_dimension_grouper", + srcs = ["reduction_dimension_grouper.cc"], + hdrs = ["reduction_dimension_grouper.h"], + deps = [ + ":ir_emission_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:pattern_matcher", + "//tensorflow/core:lib", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) + +cc_library( + name = "reduction_layout_normalizer", + srcs = ["reduction_layout_normalizer.cc"], + hdrs = ["reduction_layout_normalizer.h"], + deps = [ + ":ir_emission_utils", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_casting_utils", + "//tensorflow/compiler/xla/service:hlo_pass", + "//tensorflow/compiler/xla/service:pattern_matcher", + "//tensorflow/core:lib", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/algorithm:container", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:optional", + ], +) diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 0c46910e86e..6635b68899d 100755 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -33,6 +33,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h" #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h" #include "tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.h" +#include "tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.h" +#include "tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.h" #include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/service/gpu/target_constants.h" #include "tensorflow/compiler/xla/service/hlo_constant_folding.h" @@ -156,6 +158,8 @@ Status NVPTXCompiler::OptimizeHloPostLayoutAssignment( LayoutAssignment::InstructionCanChangeLayout); pipeline.AddPass(); + pipeline.AddPass(); + pipeline.AddPass(); // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. 
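As a rough sketch of how the two reduction passes registered above compose (using the same notation as the pass header comments below; the shapes and layout here are purely illustrative and not taken from this patch):

   f[] out = reduce(f[10,20,30]{1,0,2} input, dimensions={0,1,2})

becomes, after reduction-layout-normalizer enforces the default layout on the operand,

   f[30,10,20] tmp1 = f[30,10,20] bitcast(f[10,20,30]{1,0,2} input)
   f[] out = reduce(f[30,10,20] tmp1, dimensions={0,1,2})

and then, after reduction-dimension-grouper merges the now-adjacent reduced dimensions,

   f[6000] tmp2 = f[6000] bitcast(f[30,10,20] tmp1)
   f[] out = reduce(f[6000] tmp2, dimensions={0})
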
diff --git a/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.cc b/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.cc index e3762aaef3a..2c786b577fc 100644 --- a/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.cc +++ b/tensorflow/compiler/xla/service/gpu/reduction_degenerate_dim_remover.cc @@ -82,20 +82,9 @@ class ReductionDegenerateDimRemoverVisitor : public DfsHloRewriteVisitor { } }; -template -static Status RunVisitor(HloModule *module, bool *changed) { - for (const auto &computation : module->computations()) { - Visitor visitor; - TF_RETURN_IF_ERROR(computation->Accept(&visitor)); - *changed |= visitor.changed(); - } - return Status::OK(); -} - StatusOr ReductionDegenerateDimRemover::Run(HloModule *module) { - bool changed = false; - TF_RETURN_IF_ERROR( - RunVisitor(module, &changed)); + TF_ASSIGN_OR_RETURN( + bool changed, ReductionDegenerateDimRemoverVisitor().RunOnModule(module)); return changed; } diff --git a/tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.cc b/tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.cc new file mode 100644 index 00000000000..66b458e1ba4 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.cc @@ -0,0 +1,107 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.h" + +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace gpu { + +class ReduceDimensionGroupVisitor : public DfsHloRewriteVisitor { + public: + Status HandleReduce(HloInstruction *reduce) override { + VLOG(4) << "Input: " << reduce->ToString(); + + if (!reduce->shape().IsArray()) { + // TODO(cheshire): Handle variadic reduction. 
+ return Status::OK(); + } + + std::vector new_grouped_dims; + std::vector reduced_dims_grouped; + HloInstruction *operand = reduce->mutable_operand(0); + const Shape &shape = operand->shape(); + CHECK(shape == LayoutUtil::GetWithDefaultLayout(shape)) + << "Default layout should be enforced on reduction operand"; + auto is_reduced = [&](int dim) { + return absl::c_linear_search(reduce->dimensions(), dim); + }; + + bool changed = false; + int64 next_dim_size = 1; + + // Since we have enforced the standard layout, iteration over logical + // dimensions is equivalent to iteration over the major-to-minor order. + for (int logical_dim = 0; logical_dim < shape.rank(); logical_dim++) { + VLOG(5) << "Processing dimension " << logical_dim << " of size " + << shape.dimensions(logical_dim); + if (is_reduced(logical_dim) && logical_dim < shape.rank() - 1 && + is_reduced(logical_dim + 1)) { + VLOG(5) << "This and consecutive dimension are reduced, merging"; + changed = true; + next_dim_size *= shape.dimensions(logical_dim); + continue; + } + + if (is_reduced(logical_dim)) { + new_grouped_dims.push_back(next_dim_size * + shape.dimensions(logical_dim)); + reduced_dims_grouped.push_back(new_grouped_dims.size() - 1); + next_dim_size = 1; + } else { + new_grouped_dims.push_back(shape.dimensions(logical_dim)); + } + } + + if (!changed) { + return Status::OK(); + } + + Shape grouped_shape = + ShapeUtil::MakeShape(shape.element_type(), new_grouped_dims); + HloInstruction *reduce_input_grouped = reduce->parent()->AddInstruction( + HloInstruction::CreateBitcast(grouped_shape, operand)); + + std::unique_ptr new_reduce = HloInstruction::CreateReduce( + reduce->shape(), reduce_input_grouped, reduce->mutable_operand(1), + reduced_dims_grouped, reduce->to_apply()); + VLOG(5) << "Generated new reduction: " << new_reduce->ToString(); + return ReplaceWithNewInstruction(reduce, std::move(new_reduce)); + } +}; + +StatusOr ReductionDimensionGrouper::Run(HloModule *module) { + TF_ASSIGN_OR_RETURN(bool changed, + ReduceDimensionGroupVisitor().RunOnModule(module)); + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.h b/tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.h new file mode 100644 index 00000000000..8a78d3fca07 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/reduction_dimension_grouper.h @@ -0,0 +1,54 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_DIMENSION_GROUPER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_DIMENSION_GROUPER_H_ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { +namespace gpu { + +// Groups adjacent (logically and physically) reduced dimensions in reduction +// input. +// +// Precondition: ReductionLayoutNormalizer has been run (physical proximity and +// logical proximity become the same). +// +// For example, +// +// f[] out = reduce(f[10,20,30] input, dimensions={0,1,2}) +// +// becomes: +// +// f[600] tmp = f[600] bitcast(f[10,20,30] input) +// f[] out = reduce(f[600] tmp, dimensions={0}) +// +// TODO(cheshire): handle variadic reduction +class ReductionDimensionGrouper : public HloModulePass { + public: + absl::string_view name() const override { + return "reduction-dimension-grouper"; + } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_DIMENSION_GROUPER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.cc b/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.cc new file mode 100644 index 00000000000..295ccebd442 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.cc @@ -0,0 +1,129 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.h" + +#include + +#include "absl/algorithm/container.h" +#include "absl/strings/str_join.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/stream_executor/lib/statusor.h" + +namespace xla { +namespace gpu { + +class EnforceMinorToMajorReduceOpVisitor : public DfsHloRewriteVisitor { + Status HandleReduce(HloInstruction *reduce) override { + VLOG(5) << "Input: " << reduce->ToString(); + HloInstruction *operand = reduce->mutable_operand(0); + const Shape &operand_shape = operand->shape(); + const Layout &operand_layout = operand_shape.layout(); + const Shape &reduce_shape = reduce->shape(); + + if (!reduce_shape.IsArray()) { + // TODO(cheshire): Handle variadic reduction. + return Status::OK(); + } + + std::vector new_reduce_dimensions; + std::vector new_operand_shape_data; + std::vector new_reduce_shape_data; + + // The layout order of the reduction output can be different to the + // ordering of kept dimensions in the input operand, thus we need to + // calculate the new layout. + std::vector new_reduce_shape_layout(reduce_shape.rank()); + std::vector reduce_shape_logical_to_physical = + LayoutUtil::MakeLogicalToPhysical(reduce_shape.layout()); + + auto to_reduce_logical_dim = [&](int64 op_logical_dim) { + return op_logical_dim - + absl::c_count_if(reduce->dimensions(), [&](int64 dim) { + CHECK(dim != op_logical_dim); + return dim < op_logical_dim; + }); + }; + + for (int i = 0; i < operand_shape.rank(); i++) { + // Process the dimensions in the major-to-minor order in order to enforce + // the default layout. 
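+      // For example, with minor_to_major = {2,0,1} this loop visits logical
+      // dimensions 1, 0, and 2, i.e. the most-major dimension first.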
+ int64 major_to_minor_dim_idx = operand_shape.rank() - i - 1; + int64 logical_dim = operand_layout.minor_to_major(major_to_minor_dim_idx); + int64 dim_size = operand_shape.dimensions(logical_dim); + VLOG(5) << "Processing logical dimension " << logical_dim << " of size " + << dim_size; + new_operand_shape_data.push_back(dim_size); + + if (absl::c_linear_search(reduce->dimensions(), logical_dim)) { + new_reduce_dimensions.push_back(i); + } else { + new_reduce_shape_data.push_back(dim_size); + int64 logical_reduce_dim = to_reduce_logical_dim(logical_dim); + int64 physical_reduce_dim = + reduce_shape_logical_to_physical[logical_reduce_dim]; + VLOG(5) << "logical_reduce_dim = " << logical_reduce_dim << ", " + << "physical_reduce_dim = " << physical_reduce_dim; + new_reduce_shape_layout[reduce_shape.rank() - physical_reduce_dim - 1] = + new_reduce_shape_data.size() - 1; + } + } + + Shape new_operand_shape = ShapeUtil::MakeShape(operand_shape.element_type(), + new_operand_shape_data); + if (new_operand_shape == operand_shape) { + return Status::OK(); + } + + Shape new_reduce_shape = ShapeUtil::MakeShapeWithLayout( + reduce_shape.element_type(), new_reduce_shape_data, + new_reduce_shape_layout); + HloInstruction *canonical_reduce_input = reduce->parent()->AddInstruction( + HloInstruction::CreateBitcast(new_operand_shape, operand)); + + VLOG(5) << "Reduction input: " << canonical_reduce_input->ToString(); + std::unique_ptr new_reduce = HloInstruction::CreateReduce( + new_reduce_shape, canonical_reduce_input, reduce->mutable_operand(1), + new_reduce_dimensions, reduce->to_apply()); + VLOG(5) << "Generated new reduction: " << new_reduce->ToString(); + + if (new_reduce_shape != reduce_shape) { + HloInstruction *wrapped_reduce = + reduce->parent()->AddInstruction(std::move(new_reduce)); + new_reduce = HloInstruction::CreateBitcast(reduce_shape, wrapped_reduce); + } + + VLOG(5) << "Generated output: " << new_reduce->ToString(); + return ReplaceWithNewInstruction(reduce, std::move(new_reduce)); + } +}; + +StatusOr ReductionLayoutNormalizer::Run(HloModule *module) { + TF_ASSIGN_OR_RETURN(bool changed, + EnforceMinorToMajorReduceOpVisitor().RunOnModule(module)); + return changed; +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.h b/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.h new file mode 100644 index 00000000000..d27c847f8ea --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/reduction_layout_normalizer.h @@ -0,0 +1,50 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_LAYOUT_NORMALIZER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_LAYOUT_NORMALIZER_H_ + +#include "absl/types/optional.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { +namespace gpu { + +// Enforces default (minor-to-major) layout on all reduction inputs. +// Note that since reduction output can request a custom layout, +// this pass only guarantees standard layout for the input. +// +// For example, +// +// f[20,30]{0,1} out = reduce(f[10,20,30]{2,0,1} input, dimensions={0}) +// +// becomes: +// +// f[20,10,30] tmp = f[20,10,30] bitcast(f[10,20,30]{2,0,1} input) +// f[20,30]{0,1} out = reduce(f[20,10,30]{2,1,0} tmp, dimensions={1}) +class ReductionLayoutNormalizer : public HloModulePass { + public: + absl::string_view name() const override { + return "reduction-layout-normalizer"; + } + + StatusOr Run(HloModule* module) override; +}; + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_REDUCTION_LAYOUT_NORMALIZER_H_ diff --git a/tensorflow/compiler/xla/service/gpu/tests/BUILD b/tensorflow/compiler/xla/service/gpu/tests/BUILD index a472bfc19d2..51a12e1f2fe 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/BUILD +++ b/tensorflow/compiler/xla/service/gpu/tests/BUILD @@ -108,6 +108,60 @@ tf_cc_test( ], ) +tf_cc_test( + name = "reduction_layout_normalizer_test", + srcs = [ + "reduction_layout_normalizer_test.cc", + ], + tags = tf_cuda_tests_tags(), + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gemm_rewriter", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/memory", + ], +) + +tf_cc_test( + name = "reduction_dimension_grouper_test", + srcs = [ + "reduction_dimension_grouper_test.cc", + ], + tags = tf_cuda_tests_tags(), + deps = [ + ":gpu_codegen_test", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/gpu:gemm_rewriter", + "//tensorflow/compiler/xla/service/gpu:gpu_executable", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/stream_executor/lib", + "@com_google_absl//absl/memory", + ], +) + tf_cc_test( name = "gpu_copy_test", srcs = ["gpu_copy_test.cc"], diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc 
b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc index 92bb84065a2..ae10fb161d6 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_kernel_tiling_test.cc @@ -461,7 +461,7 @@ TEST_F(GpuKernelTilingTest, ColumnReductionWithLayoutChangeTiled) { .ValueOrDie(); CompileAndVerifyIr(std::move(hlo_module), R"( -; CHECK-LABEL: define void @reduce +; CHECK-LABEL: define void @ ; CHECK: atomicrmw fadd float ; CHECK: } )", diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc index 9dd8a6fc664..686092706f7 100644 --- a/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_degenerate_dim_remover_test.cc @@ -32,7 +32,14 @@ namespace gpu { namespace { -class ReductionDegenerateDimRemoverTest : public GpuCodegenTest {}; +class ReductionDegenerateDimRemoverTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + debug_options.add_xla_disable_hlo_passes("reduction-layout-normalizer"); + debug_options.add_xla_disable_hlo_passes("reduction-dimension-grouper"); + return debug_options; + } +}; TEST_F(ReductionDegenerateDimRemoverTest, ReductionWithDegenerateDimensions) { const char* hlo_text = R"( diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_dimension_grouper_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_dimension_grouper_test.cc new file mode 100644 index 00000000000..a9e0b9b5c5f --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_dimension_grouper_test.cc @@ -0,0 +1,70 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { + +namespace { + +class ReductionDimensionGrouperTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + debug_options.add_xla_disable_hlo_passes("reduction-layout-normalizer"); + debug_options.add_xla_disable_hlo_passes("layout-assignment"); + return debug_options; + } +}; + +TEST_F(ReductionDimensionGrouperTest, ReductionWithGrouping) { + const char* hlo_text = R"( +HloModule ReductionWithGrouping + +add { + accum = f32[] parameter(0) + op = f32[] parameter(1) + ROOT out = f32[] add(accum, op) +} + +ENTRY main { + input = f32[100,10,32,3]{3,2,1,0} parameter(0) + zero = f32[] constant(0) + + ROOT out = f32[100,10]{0,1} reduce(input, zero), dimensions={2,3}, to_apply=add +} + + +)"; + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); + MatchOptimizedHloWithShapes(hlo_text, + R"( +// CHECK: f32[100,10]{0,1} reduce(f32[100,10,96]{2,1,0} {{.+}}, f32[] {{.+}}), dimensions={2}, to_apply=%add + )"); +} + +} // namespace +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc b/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc new file mode 100644 index 00000000000..49b8bbf1d6b --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/tests/reduction_layout_normalizer_test.cc @@ -0,0 +1,69 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include "tensorflow/compiler/xla/service/gpu/gpu_executable.h" +#include "tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace gpu { + +namespace { + +class ReductionLayoutNormalizerTest : public GpuCodegenTest { + DebugOptions GetDebugOptionsForTest() override { + DebugOptions debug_options = GpuCodegenTest::GetDebugOptionsForTest(); + debug_options.add_xla_disable_hlo_passes("reduction-dimension-grouper"); + debug_options.add_xla_disable_hlo_passes("layout-assignment"); + return debug_options; + } +}; + +TEST_F(ReductionLayoutNormalizerTest, LayoutCanonicalizerTest) { + const char* hlo_text = R"( +HloModule ReduceWithLayoutChange + +add { + x0 = f32[] parameter(0) + y0 = f32[] parameter(1) + ROOT add0 = f32[] add(x0, y0) +} + +ENTRY main { + arg0 = f32[4,5,5,16,12,12,3,3]{2,3,5,4,0,7,6,1} parameter(0) + constant0 = f32[] constant(0) + ROOT reduce0 = f32[4,5,16,12,12]{4,3,2,1,0} reduce(arg0, constant0), + dimensions={1,6,7}, to_apply=add +} + +)"; + + EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{1e-5, 1e-5})); + MatchOptimizedHloWithShapes(hlo_text, + R"( +// CHECK: f32[4,12,12,16,5]{2,1,3,4,0} reduce(f32[5,3,3,4,12,12,16,5]{7,6,5,4,3,2,1,0} {{.+}}, f32[] {{.+}}), dimensions={0,1,2}, to_apply=%add + )"); +} + +} // namespace +} // namespace gpu +} // namespace xla From 7e628ed777a39976a7cf5f5d93b63f6a15d5aea8 Mon Sep 17 00:00:00 2001 From: Haoliang Zhang Date: Thu, 5 Dec 2019 16:48:42 -0800 Subject: [PATCH 224/383] [fix] display the correct log message in benchmark_model.cc. PiperOrigin-RevId: 284083017 Change-Id: I5f921f066f518ab7043438740822f2b2509ae6bf --- tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index dc4a43ee6cb..4d37a868aaf 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -590,8 +590,8 @@ TfLiteStatus BenchmarkTfLiteModel::Init() { if (!inputs_.empty()) { TFLITE_BENCHMARK_CHECK_EQ(inputs_.size(), interpreter_inputs.size()) - << "Inputs mismatch: Model inputs #:" << interpreter_inputs.size() - << " expected: " << inputs_.size(); + << "Inputs mismatch: Model inputs #:" << inputs_.size() + << " expected: " << interpreter_inputs.size(); } // Check if the tensor names match, and log a warning if it doesn't. 
From f409daa583fa3c52e76f75da341807ea5edd3bf1 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Thu, 5 Dec 2019 16:52:04 -0800 Subject: [PATCH 225/383] Fixes docstring for tf.math.abs PiperOrigin-RevId: 284083667 Change-Id: I2d2b7152b552b3fd48cac1ddbb3099c8bfc6893c --- tensorflow/python/ops/math_ops.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index b29d413e194..ee21d3f3c31 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -254,13 +254,15 @@ def abs(x, name=None): # pylint: disable=redefined-builtin corresponding element in the input. Given a tensor `x` of complex numbers, this operation returns a tensor of type - `float32` or `float64` that is the absolute value of each element in `x`. All - elements in `x` must be complex numbers of the form \\(a + bj\\). The - absolute value is computed as \\( \sqrt{a^2 + b^2}\\). For example: - ```python - x = tf.constant([[-2.25 + 4.75j], [-3.25 + 5.75j]]) - tf.abs(x) # [5.25594902, 6.60492229] - ``` + `float32` or `float64` that is the absolute value of each element in `x`. For + a complex number \\(a + bj\\), its absolute value is computed as \\(\sqrt{a^2 + + b^2}\\). For example: + + >>> x = tf.constant([[-2.25 + 4.75j], [-3.25 + 5.75j]]) + >>> tf.abs(x) + Args: x: A `Tensor` or `SparseTensor` of type `float16`, `float32`, `float64`, @@ -268,10 +270,9 @@ def abs(x, name=None): # pylint: disable=redefined-builtin name: A name for the operation (optional). Returns: - A `Tensor` or `SparseTensor` the same size, type, and sparsity as `x` with - absolute values. - Note, for `complex64` or `complex128` input, the returned `Tensor` will be - of type `float32` or `float64`, respectively. + A `Tensor` or `SparseTensor` of the same size, type and sparsity as `x`, + with absolute values. Note, for `complex64` or `complex128` input, the + returned `Tensor` will be of type `float32` or `float64`, respectively. """ with ops.name_scope(name, "Abs", [x]) as name: x = ops.convert_to_tensor(x, name="x") From 539fb4f81051c76651ad94f41135bceb5de05a1a Mon Sep 17 00:00:00 2001 From: Anirudh Sriram Date: Thu, 5 Dec 2019 16:56:39 -0800 Subject: [PATCH 226/383] Update doc strings for tf.split PiperOrigin-RevId: 284084404 Change-Id: I8ca99bff055586a5d99a0e1e3210b9eb2d349663 --- tensorflow/python/ops/array_ops.py | 174 +++++++++++++---------------- 1 file changed, 80 insertions(+), 94 deletions(-) diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py index c7e7eda462e..6bab9dcb196 100644 --- a/tensorflow/python/ops/array_ops.py +++ b/tensorflow/python/ops/array_ops.py @@ -210,34 +210,32 @@ def fill(dims, value, name=None): For example: - # Output tensor with shape [2, 3]. - >>> tf.fill([2, 3], 9) - + ``` + # Output tensor has shape [2, 3]. + fill([2, 3], 9) ==> [[9, 9, 9] + [9, 9, 9]] + ``` - `tf.fill` evaluates at graph runtime and supports dynamic shapes based on - other runtime `tf.Tensors`, unlike `tf.constant(value, shape=dims)`, which - embeds the value as a `Const` node. + `tf.fill` differs from `tf.constant` in a few ways: + + * `tf.fill` only supports scalar contents, whereas `tf.constant` supports + Tensor values. + * `tf.fill` creates an Op in the computation graph that constructs the + actual + Tensor value at runtime. This is in contrast to `tf.constant` which embeds + the entire Tensor into the graph with a `Const` node. 
+ * Because `tf.fill` evaluates at graph runtime, it supports dynamic shapes + based on other runtime Tensors, unlike `tf.constant`. Args: - dims: A 1-D sequence of non-negative numbers. Represents the shape of the - output `tf.Tensor`. Entries should be of type: `int32`, `int64`. - value: A value to fill the returned `tf.Tensor`. - name: Optional string. The name of the output `tf.Tensor`. + dims: A `Tensor`. Must be one of the following types: `int32`, `int64`. 1-D. + Represents the shape of the output tensor. + value: A `Tensor`. 0-D (scalar). Value to fill the returned tensor. + @compatibility(numpy) Equivalent to np.full @end_compatibility + name: A name for the operation (optional). Returns: - A `tf.Tensor` with shape `dims` and the same dtype as `value`. - - Raises: - InvalidArgumentError: `dims` contains negative entries. - NotFoundError: `dims` contains non-integer entries. - - @compatibility(numpy) - Similar to `np.full`. In `numpy`, more parameters are supported. Passing a - number argument as the shape (`np.full(5, value)`) is valid in `numpy` for - specifying a 1-D shaped result, while TensorFlow does not support this syntax. - @end_compatibility + A `Tensor`. Has the same type as `value`. """ result = gen_array_ops.fill(dims, value, name=name) tensor_util.maybe_set_static_shape(result, dims) @@ -544,8 +542,6 @@ def shape_v2(input, out_type=dtypes.int32, name=None): """Returns the shape of a tensor. This operation returns a 1-D integer tensor representing the shape of `input`. - This represents the minimal set of known information at definition time. - For example: @@ -573,10 +569,6 @@ def shape_v2(input, out_type=dtypes.int32, name=None): `int64`). Defaults to `tf.int32`. name: A name for the operation (optional). - `tf.shape` and `Tensor.shape` should be identical in eager mode. Within - `tf.function` or within a `compat.v1` context, not all dimensions may be - known until execution time. - Returns: A `Tensor` of type `out_type`. """ @@ -1889,11 +1881,11 @@ unique_with_counts.__doc__ = gen_array_ops.unique_with_counts.__doc__ @tf_export("split") def split(value, num_or_size_splits, axis=0, num=None, name="split"): - """Splits a tensor into sub tensors. + """Splits a tensor `value` into a list of sub tensors. - If `num_or_size_splits` is an integer, then `value` is split along dimension - `axis` into `num_split` smaller tensors. This requires that `num_split` evenly - divides `value.shape[axis]`. + If `num_or_size_splits` is an integer, then `value` is split along the + dimension `axis` into `num_split` smaller tensors. This requires that + `value.shape[axis]` is divisible by `num_split`. If `num_or_size_splits` is a 1-D Tensor (or list), we call it `size_splits` and `value` is split into `len(size_splits)` elements. 
The shape of the `i`-th @@ -1902,17 +1894,21 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): For example: - ```python - # 'value' is a tensor with shape [5, 30] - # Split 'value' into 3 tensors with sizes [4, 15, 11] along dimension 1 - split0, split1, split2 = tf.split(value, [4, 15, 11], 1) - tf.shape(split0) # [5, 4] - tf.shape(split1) # [5, 15] - tf.shape(split2) # [5, 11] - # Split 'value' into 3 tensors along dimension 1 - split0, split1, split2 = tf.split(value, num_or_size_splits=3, axis=1) - tf.shape(split0) # [5, 10] - ``` + >>> x = tf.Variable(tf.random.uniform([5, 30], -1, 1)) + + Split `x` into 3 tensors along dimension 1 + >>> s0, s1, s2 = tf.split(x, num_or_size_splits=3, axis=1) + >>> tf.shape(s0).numpy() + array([ 5, 10], dtype=int32) + + Split `x` into 3 tensors with sizes [4, 15, 11] along dimension 1 + >>> split0, split1, split2 = tf.split(x, [4, 15, 11], 1) + >>> tf.shape(split0).numpy() + array([5, 4], dtype=int32) + >>> tf.shape(split1).numpy() + array([ 5, 15], dtype=int32) + >>> tf.shape(split2).numpy() + array([ 5, 11], dtype=int32) Args: value: The `Tensor` to split. @@ -1928,8 +1924,8 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): name: A name for the operation (optional). Returns: - if `num_or_size_splits` is a scalar returns `num_or_size_splits` `Tensor` - objects; if `num_or_size_splits` is a 1-D Tensor returns + if `num_or_size_splits` is a scalar returns a list of `num_or_size_splits` + `Tensor` objects; if `num_or_size_splits` is a 1-D Tensor returns `num_or_size_splits.get_shape[0]` `Tensor` objects resulting from splitting `value`. @@ -1960,17 +1956,16 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"): @tf_export("transpose", v1=[]) def transpose_v2(a, perm=None, conjugate=False, name="transpose"): - """Transposes `a`, where `a` is a Tensor. + """Transposes `a`. - Permutes the dimensions according to the value of `perm`. + Permutes the dimensions according to `perm`. - The returned tensor's dimension `i` will correspond to the input dimension - `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is the rank - of the input tensor. Hence by default, this operation performs a regular - matrix transpose on 2-D input Tensors. - - If conjugate is `True` and `a.dtype` is either `complex64` or `complex128` - then the values of `a` are conjugated and transposed. + The returned tensor's dimension i will correspond to the input dimension + `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is + the rank of the input tensor. Hence by default, this operation performs a + regular matrix transpose on 2-D input Tensors. If conjugate is True and + `a.dtype` is either `complex64` or `complex128` then the values of `a` + are conjugated and transposed. @compatibility(numpy) In `numpy` transposes are memory-efficient constant time operations as they @@ -1982,52 +1977,43 @@ def transpose_v2(a, perm=None, conjugate=False, name="transpose"): For example: - >>> x = tf.constant([[1, 2, 3], [4, 5, 6]]) - >>> tf.transpose(x) - + ```python + x = tf.constant([[1, 2, 3], [4, 5, 6]]) + tf.transpose(x) # [[1, 4] + # [2, 5] + # [3, 6]] - Equivalently, you could call `tf.transpose(x, perm=[1, 0])`. 
+ # Equivalently + tf.transpose(x, perm=[1, 0]) # [[1, 4] + # [2, 5] + # [3, 6]] - If `x` is complex, setting conjugate=True gives the conjugate transpose: + # If x is complex, setting conjugate=True gives the conjugate transpose + x = tf.constant([[1 + 1j, 2 + 2j, 3 + 3j], + [4 + 4j, 5 + 5j, 6 + 6j]]) + tf.transpose(x, conjugate=True) # [[1 - 1j, 4 - 4j], + # [2 - 2j, 5 - 5j], + # [3 - 3j, 6 - 6j]] - >>> x = tf.constant([[1 + 1j, 2 + 2j, 3 + 3j], - ... [4 + 4j, 5 + 5j, 6 + 6j]]) - >>> tf.transpose(x, conjugate=True) - + # 'perm' is more useful for n-dimensional tensors, for n > 2 + x = tf.constant([[[ 1, 2, 3], + [ 4, 5, 6]], + [[ 7, 8, 9], + [10, 11, 12]]]) - 'perm' is more useful for n-dimensional tensors where n > 2: - - >>> x = tf.constant([[[ 1, 2, 3], - ... [ 4, 5, 6]], - ... [[ 7, 8, 9], - ... [10, 11, 12]]]) - - As above, simply calling `tf.transpose` will default to `perm=[2,1,0]`. - - To take the transpose of the matrices in dimension-0 (such as when you are - transposing matrices where 0 is the batch dimesnion), you would set - `perm=[0,2,1]`. - - >>> tf.transpose(x, perm=[0, 2, 1]) - - - Note: This has a shorthand `linalg.matrix_transpose`): + # Take the transpose of the matrices in dimension-0 + # (this common operation has a shorthand `linalg.matrix_transpose`) + tf.transpose(x, perm=[0, 2, 1]) # [[[1, 4], + # [2, 5], + # [3, 6]], + # [[7, 10], + # [8, 11], + # [9, 12]]] + ``` Args: a: A `Tensor`. - perm: A permutation of the dimensions of `a`. This should be a vector. + perm: A permutation of the dimensions of `a`. conjugate: Optional bool. Setting it to `True` is mathematically equivalent to tf.math.conj(tf.transpose(input)). name: A name for the operation (optional). From 27d2a54f2bf299d8ecc29b096e35a30fb1b78243 Mon Sep 17 00:00:00 2001 From: Lei Zhang Date: Thu, 5 Dec 2019 16:57:35 -0800 Subject: [PATCH 227/383] Add tf.All/tf.Any to XLA HLO legalization Also tightened tf.All/tf.Any verification PiperOrigin-RevId: 284084536 Change-Id: I900a6fb499e5b7710c58c0f8cd9dafb109cf35f0 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 4 ++ .../compiler/mlir/tensorflow/ir/tf_ops.cc | 43 +++++++++++++ .../mlir/tensorflow/tests/tf-ops.mlir | 42 +++++++++++++ .../compiler/mlir/xla/tests/legalize-tf.mlir | 60 +++++++++++++++++++ .../mlir/xla/transforms/legalize_tf.cc | 37 ++++++++++-- 5 files changed, 182 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index ef25e27db12..4b0670256c1 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -143,6 +143,8 @@ retained with length 1. ); TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ return Verify(*this); }]; } def TF_AnyOp : TF_Op<"Any", [NoSideEffect]> { @@ -169,6 +171,8 @@ retained with length 1. 
); TF_DerivedOperandTypeAttr Tidx = TF_DerivedOperandTypeAttr<1>; + + let verifier = [{ return Verify(*this); }]; } def TF_ArgMaxOp : TF_Op<"ArgMax", [NoSideEffect]> { diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index 8d37ef85527..d7dee85188f 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -310,6 +310,49 @@ void AddV2Op::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +//===----------------------------------------------------------------------===// +// AllOp +//===----------------------------------------------------------------------===// + +// Verifies an reduction op's `input` and reduction `dims`. +static LogicalResult VerifyReductionInputAndDims(Value *input, Value *dims, + Location loc) { + auto dims_type = dims->getType().dyn_cast(); + if (!dims_type) return success(); + if (dims_type.getRank() > 1) + return emitError(loc, "dimensions can only be 0D or 1D tensor"); + + auto input_type = input->getType().dyn_cast(); + if (!input_type) return success(); + int64_t rank = input_type.getRank(); + + DenseIntElementsAttr dims_attr; + if (!matchPattern(dims, m_Constant(&dims_attr))) return success(); + for (const auto &dim_pair : llvm::enumerate(dims_attr)) { + int64_t cur_dim = dim_pair.value().getSExtValue(); + if (cur_dim < -rank || cur_dim >= rank) + return emitError(loc) + << dim_pair.index() << "-th dimension should be in the range of [-" + << rank << ", " << rank << ")"; + } + + return success(); +} + +static LogicalResult Verify(AllOp op) { + return VerifyReductionInputAndDims(op.input(), op.reduction_indices(), + op.getLoc()); +} + +//===----------------------------------------------------------------------===// +// AnyOp +//===----------------------------------------------------------------------===// + +static LogicalResult Verify(AnyOp op) { + return VerifyReductionInputAndDims(op.input(), op.reduction_indices(), + op.getLoc()); +} + //===----------------------------------------------------------------------===// // AssertOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index cc16b545c93..3e745efdd9f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -1753,3 +1753,45 @@ func @testSplitV2(%input: tensor<4x4xf32>) { %0:2 = "tf.SplitV"(%input, %split_sizes, %split_dim) : (tensor<4x4xf32>, tensor<2xi32>, tensor) -> (tensor<*xf32>, tensor<*xf32>) return } + +// ----- + +//===--------------------------------------------------------------------===// +// tf.All +//===--------------------------------------------------------------------===// + +func @testAllDimWrongRank(%input: tensor<4x6xi1>, %dims: tensor<2x2xi32>) { + // expected-error @+1 {{dimensions can only be 0D or 1D tensor}} + %0 = "tf.All"(%input, %dims) : (tensor<4x6xi1>, tensor<2x2xi32>) -> (tensor<*xi1>) + return +} + +// ----- + +func @testAllDimOutOfRange(%input: tensor<4x6xi1>) { + %dims = "tf.Const"() {value = dense<[-1, 5]> : tensor<2xi32>} : () -> (tensor<2xi32>) + // expected-error @+1 {{1-th dimension should be in the range of [-2, 2)}} + %0 = "tf.All"(%input, %dims) : (tensor<4x6xi1>, tensor<2xi32>) -> (tensor<*xi1>) + return +} + +// ----- + 
+//===--------------------------------------------------------------------===// +// tf.Any +//===--------------------------------------------------------------------===// + +func @testAnyDimWrongRank(%input: tensor<4x6xi1>, %dims: tensor<2x2xi32>) { + // expected-error @+1 {{dimensions can only be 0D or 1D tensor}} + %0 = "tf.Any"(%input, %dims) : (tensor<4x6xi1>, tensor<2x2xi32>) -> (tensor<*xi1>) + return +} + +// ----- + +func @testAnyDimOutOfRange(%input: tensor<4x6xi1>) { + %dims = "tf.Const"() {value = dense<[-1, 5]> : tensor<2xi32>} : () -> (tensor<2xi32>) + // expected-error @+1 {{1-th dimension should be in the range of [-2, 2)}} + %0 = "tf.Any"(%input, %dims) : (tensor<4x6xi1>, tensor<2xi32>) -> (tensor<*xi1>) + return +} diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 5620b9012a7..10567d8143c 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1631,6 +1631,66 @@ func @max_dynamic(%arg0: tensor<4x?xf16>) -> tensor<4x1xf16> { return %0 : tensor<4x1xf16> } +// CHECK-LABEL: @all +func @all(%input: tensor<4x8xi1>) -> tensor<4xi1> { + %dims = "tf.Const"() { value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[INIT:.*]] = xla_hlo.constant dense : tensor + // CHECK: "xla_hlo.reduce"(%{{.*}}, %[[INIT]]) ( { + // CHECK: ^{{.*}}(%[[ARGA:.*]]: tensor, %[[ARGB:.*]]: tensor): + // CHECK: %[[AND:.*]] = xla_hlo.and %[[ARGA]], %[[ARGB]] : tensor + // CHECK: "xla_hlo.return"(%[[AND]]) : (tensor) -> () + // CHECK: }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<4x8xi1>, tensor) -> tensor<4xi1> + %0 = "tf.All"(%input, %dims) : (tensor<4x8xi1>, tensor<1xi32>) -> tensor<4xi1> + return %0 : tensor<4xi1> +} + +// CHECK-LABEL: @all_keep_dim +func @all_keep_dim(%input: tensor<4x8xi1>) -> tensor<4x1xi1> { + // CHECK: "xla_hlo.reshape"(%{{.*}}) : (tensor<4xi1>) -> tensor<4x1xi1> + %dims = "tf.Const"() { value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %0 = "tf.All"(%input, %dims) {keep_dims = true} : (tensor<4x8xi1>, tensor<1xi32>) -> tensor<4x1xi1> + return %0 : tensor<4x1xi1> +} + +// CHECk-LABEL: @all_dynamic +func @all_dynamic(%input: tensor<4x?xi1>) -> tensor<4x1xi1> { + %dims = "tf.Const"() { value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ARG:.*]] = "xla_hlo.convert"(%{{.*}}) : (tensor<4x?xi1>) -> tensor<4x?xi1> + // CHECK: "xla_hlo.reduce"(%[[ARG]] + %0 = "tf.All"(%input, %dims) {keep_dims = true} : (tensor<4x?xi1>, tensor<1xi32>) -> tensor<4x1xi1> + return %0 : tensor<4x1xi1> +} + +// CHECK-LABEL: @any +func @any(%input: tensor<4x8xi1>) -> tensor<4xi1> { + %dims = "tf.Const"() { value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[INIT:.*]] = xla_hlo.constant dense : tensor + // CHECK: "xla_hlo.reduce"(%{{.*}}, %[[INIT]]) ( { + // CHECK: ^{{.*}}(%[[ARGA:.*]]: tensor, %[[ARGB:.*]]: tensor): + // CHECK: %[[AND:.*]] = xla_hlo.or %[[ARGA]], %[[ARGB]] : tensor + // CHECK: "xla_hlo.return"(%[[AND]]) : (tensor) -> () + // CHECK: }) {dimensions = dense<1> : tensor<1xi64>} : (tensor<4x8xi1>, tensor) -> tensor<4xi1> + %0 = "tf.Any"(%input, %dims) : (tensor<4x8xi1>, tensor<1xi32>) -> tensor<4xi1> + return %0 : tensor<4xi1> +} + +// CHECK-LABEL: @any_keep_dim +func @any_keep_dim(%input: tensor<4x8xi1>) -> tensor<4x1xi1> { + // CHECK: "xla_hlo.reshape"(%{{.*}}) : (tensor<4xi1>) -> tensor<4x1xi1> + %dims = "tf.Const"() { value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %0 = 
"tf.Any"(%input, %dims) {keep_dims = true} : (tensor<4x8xi1>, tensor<1xi32>) -> tensor<4x1xi1> + return %0 : tensor<4x1xi1> +} + +// CHECk-LABEL: @any_dynamic +func @any_dynamic(%input: tensor<4x?xi1>) -> tensor<4x1xi1> { + %dims = "tf.Const"() { value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %[[ARG:.*]] = "xla_hlo.convert"(%{{.*}}) : (tensor<4x?xi1>) -> tensor<4x?xi1> + // CHECK: "xla_hlo.reduce"(%[[ARG]] + %0 = "tf.Any"(%input, %dims) {keep_dims = true} : (tensor<4x?xi1>, tensor<1xi32>) -> tensor<4x1xi1> + return %0 : tensor<4x1xi1> +} + //===----------------------------------------------------------------------===// // Tile op legalizations. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 57299525019..c6dc2c01570 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -1377,7 +1377,6 @@ class ConvertMeanOp : public GenericConvertReductionOp { public: using GenericConvertReductionOp::GenericConvertReductionOp; - static Value *GetInitialValue(Type reduce_element_type, Location loc, PatternRewriter &rewriter) { return GetScalarConstOfType(reduce_element_type, loc, 0, &rewriter); @@ -1417,6 +1416,36 @@ class ConvertMaxOp } }; +// Converts All op to HLO Reduce op. +// +// %init = constant dense<...> : tensor +// %max = "xla_hlo.reduce"(%inp, %init) ["xla_hlo.and"] +// {dimensions = ...} +class ConvertAllOp + : public GenericConvertReductionOp { + public: + using GenericConvertReductionOp::GenericConvertReductionOp; + static Value *GetInitialValue(Type reduce_element_type, Location loc, + PatternRewriter &rewriter) { + return GetScalarConstOfType(reduce_element_type, loc, 1, &rewriter); + } +}; + +// Converts Any op to HLO Reduce op. +// +// %init = constant dense<...> : tensor +// %max = "xla_hlo.reduce"(%inp, %init) ["xla_hlo.or"] +// {dimensions = ...} +class ConvertAnyOp + : public GenericConvertReductionOp { + public: + using GenericConvertReductionOp::GenericConvertReductionOp; + static Value *GetInitialValue(Type reduce_element_type, Location loc, + PatternRewriter &rewriter) { + return GetScalarConstOfType(reduce_element_type, loc, 0, &rewriter); + } +}; + // Converts tensorflow ArgMin or ArgMax op to xla_hlo operations that perform // a reduction on the original input and the corresponding index. The reduction // sub-computation selects the max (or min) value and the index for the value. 
@@ -2137,9 +2166,9 @@ LogicalResult legalizeTF(Operation *op, bool allow_partial_conversion) { ConvertSoftmaxOp, ConvertSoftmaxOp, ConvertSplitOp, ConvertSplitVOp, ConvertStridedSliceOp, ConvertTopKV2Op, ConvertMeanOp, ConvertSumOp, - ConvertMaxOp, ConvertTileOp, ConvertMaxPoolGradOp, ConvertOneHotOp, - ConvertConv2DBackpropInputOp, ConvertConv2DBackpropFilterOp>( - op->getContext()); + ConvertMaxOp, ConvertAllOp, ConvertAnyOp, ConvertTileOp, + ConvertMaxPoolGradOp, ConvertOneHotOp, ConvertConv2DBackpropInputOp, + ConvertConv2DBackpropFilterOp>(op->getContext()); ConversionTarget target(*context); target.addLegalDialect(); From 24cab684f517e8261ee9fed81240bb40320a3e00 Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Thu, 5 Dec 2019 16:57:42 -0800 Subject: [PATCH 228/383] Add test to ensure truvediv conversion occurs PiperOrigin-RevId: 284084562 Change-Id: Ia6ae884c31ecc0ac4701211ca087345f5a5504c1 --- tensorflow/python/ops/math_ops_test.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py index 87ab39b97fd..54df055b5f7 100644 --- a/tensorflow/python/ops/math_ops_test.py +++ b/tensorflow/python/ops/math_ops_test.py @@ -443,6 +443,16 @@ class DivAndModTest(test_util.TensorFlowTestCase): np_result = np.divide(nums, divs) self.assertAllClose(tf_result, np_result) + def testDivideType(self): + a = array_ops.constant([2], dtype=dtypes.int32) + # Since __future__.division is effect, we should always upgrade to float64 + b = math_ops.divide(a, 1) + self.assertEqual(b.dtype, dtypes.float64) + self.assertEqual(2.0, self.evaluate(b)) + c = math_ops.divide(a, 4) + self.assertEqual(c.dtype, dtypes.float64) + self.assertEqual(0.5, self.evaluate(c)) + def testComplexDiv(self): foo = array_ops.constant([1. + 3.j]) _ = math_ops.divide(foo, 1.) From 0438fe668efd9c3e3c3d4692f0c4af4f1dff6395 Mon Sep 17 00:00:00 2001 From: YoungSeok Yoon Date: Thu, 5 Dec 2019 17:08:49 -0800 Subject: [PATCH 229/383] Fix the eigen archive download path PiperOrigin-RevId: 284086540 Change-Id: Id0b6a9d71119fc6487bc94defbf4e8f4ccbda94b --- tensorflow/lite/tools/make/download_dependencies.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/tools/make/download_dependencies.sh b/tensorflow/lite/tools/make/download_dependencies.sh index cea13f8d9dd..25e7d6b7894 100755 --- a/tensorflow/lite/tools/make/download_dependencies.sh +++ b/tensorflow/lite/tools/make/download_dependencies.sh @@ -29,7 +29,7 @@ if [ ! 
-f $BZL_FILE_PATH ]; then exit 1; fi -EIGEN_URL="$(grep -o 'http.*github.com/eigenteam/eigen-git-mirror/archive/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.tensorflow | head -n1)" +EIGEN_URL="$(grep -o 'https.*gitlab.com/libeigen/eigen/-/archive/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.tensorflow | head -n1)" GEMMLOWP_URL="$(grep -o 'https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" From 8d16240b346fd0dccad59213ed0c2d3e94bb221e Mon Sep 17 00:00:00 2001 From: Daniel Situnayake Date: Thu, 5 Dec 2019 17:09:12 -0800 Subject: [PATCH 230/383] Fixes for magic_wand training scripts PiperOrigin-RevId: 284086588 Change-Id: Ie571601516848f4c4fc9f3658ef7aea9d9842232 --- .../experimental/micro/examples/magic_wand/README.md | 6 ++++++ .../magic_wand/train/train_magic_wand_model.ipynb | 9 ++++----- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/README.md b/tensorflow/lite/experimental/micro/examples/magic_wand/README.md index bf095fd4018..3f97b9d85ae 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/README.md +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/README.md @@ -15,6 +15,7 @@ then outputs the gesture to the serial port. - [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge) - [Deploy to Adafruit devices](#deploy-to-adafruit) - [Run the tests on a development machine](#run-the-tests-on-a-development-machine) +- [Train your own model](#train-your-own-model) ## Deploy to Arduino @@ -360,3 +361,8 @@ To understand how TensorFlow Lite does this, you can look at the source in It's a fairly small amount of code that creates an interpreter, gets a handle to a model that's been compiled into the program, and then invokes the interpreter with the model and sample inputs. + +## Train your own model + +To train your own model, or create a new model for a new set of gestures, +follow the instructions in [magic_wand/train/README.md](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/experimental/micro/examples/magic_wand/train/README.md). diff --git a/tensorflow/lite/experimental/micro/examples/magic_wand/train/train_magic_wand_model.ipynb b/tensorflow/lite/experimental/micro/examples/magic_wand/train/train_magic_wand_model.ipynb index f501d474e3d..1995ef02dc3 100644 --- a/tensorflow/lite/experimental/micro/examples/magic_wand/train/train_magic_wand_model.ipynb +++ b/tensorflow/lite/experimental/micro/examples/magic_wand/train/train_magic_wand_model.ipynb @@ -63,9 +63,9 @@ "colab_type": "text" }, "source": [ - "## Install dependencies\n", + "## Configure dependencies\n", "\n", - "Run the following cell to ensure the required dependencies are installed." + "Run the following cell to ensure the correct version of TensorFlow is used." 
] }, { @@ -76,8 +76,7 @@ "colab": {} }, "source": [ - "!pip uninstall -y tensorflow\n", - "!pip install -q tensorflow-gpu==2.0.0-beta1" + "%tensorflow_version 2.x\n" ], "execution_count": 0, "outputs": [] @@ -103,7 +102,7 @@ "# Clone the repository from GitHub\n", "!git clone --depth 1 -q https://github.com/tensorflow/tensorflow\n", "# Copy the training scripts into our workspace\n", - "!cp -r tensorflow/tensorflow/lite/experimental/micro/magic_wand/train train" + "!cp -r tensorflow/tensorflow/lite/experimental/micro/examples/magic_wand/train train" ], "execution_count": 0, "outputs": [] From 6a1cbfd8d875f46b686ce7fada435851d3bf0550 Mon Sep 17 00:00:00 2001 From: Juhyun Lee Date: Thu, 5 Dec 2019 17:12:59 -0800 Subject: [PATCH 231/383] Remove StatusOr from error_util. It's currently unused and it's discouraged to use "using" directives in the header. PiperOrigin-RevId: 284087238 Change-Id: I28ee8091ae9ed95b37842c84b4338163a4663d86 --- tensorflow/compiler/mlir/tensorflow/BUILD | 1 - tensorflow/compiler/mlir/tensorflow/utils/error_util.h | 2 -- tensorflow/compiler/mlir/xla/BUILD | 1 + tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc | 2 ++ 4 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 28b94818567..7448ecc4df8 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -567,7 +567,6 @@ cc_library( hdrs = ["utils/error_util.h"], deps = [ "//tensorflow/core:lib", - "//tensorflow/stream_executor/lib", "@llvm//:support", "@local_config_mlir//:IR", ], diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util.h b/tensorflow/compiler/mlir/tensorflow/utils/error_util.h index 198d04e0486..a60d90cbfb7 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util.h @@ -22,13 +22,11 @@ limitations under the License. #include "mlir/IR/Location.h" // TF:local_config_mlir #include "mlir/IR/MLIRContext.h" // TF:local_config_mlir #include "tensorflow/core/lib/core/status.h" -#include "tensorflow/stream_executor/lib/statusor.h" // Error utilities for MLIR when interacting with code using Status returns. namespace mlir { // TensorFlow's Status is used for error reporting back to callers. -using stream_executor::port::StatusOr; using tensorflow::Status; // Diagnostic handler that collects all the diagnostics reported and can produce diff --git a/tensorflow/compiler/mlir/xla/BUILD b/tensorflow/compiler/mlir/xla/BUILD index 6a617206823..bf71bcda776 100644 --- a/tensorflow/compiler/mlir/xla/BUILD +++ b/tensorflow/compiler/mlir/xla/BUILD @@ -408,6 +408,7 @@ cc_library( "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client/lib:matrix", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/stream_executor/lib", "@llvm//:support", "@local_config_mlir//:Analysis", "@local_config_mlir//:IR", diff --git a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc index e9bf3bac44b..935ddac2b67 100644 --- a/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc +++ b/tensorflow/compiler/mlir/xla/mlir_hlo_to_hlo.cc @@ -40,7 +40,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/stream_executor/lib/statusor.h" +using ::stream_executor::port::StatusOr; using ::tensorflow::int16; using ::tensorflow::int32; using ::tensorflow::int64; From b9800f025bfac6d8a2a20b5a5401e61b4a5f66f4 Mon Sep 17 00:00:00 2001 From: Martin Wicke Date: Thu, 5 Dec 2019 17:17:13 -0800 Subject: [PATCH 232/383] Add test case for TensorArray concat. PiperOrigin-RevId: 284087911 Change-Id: I38b82b9d03d85709a89fe8c05ec72ba03023aa49 --- tensorflow/python/BUILD | 12 +++ .../python/ops/tensor_array_ops_test.py | 77 +++++++++++++++++++ 2 files changed, 89 insertions(+) create mode 100644 tensorflow/python/ops/tensor_array_ops_test.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 12a28007912..ea4f8970ab1 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -4811,6 +4811,18 @@ cuda_py_test( python_version = "PY3", ) +py_test( + name = "tensor_array_ops_test", + size = "small", + srcs = ["ops/tensor_array_ops_test.py"], + python_version = "PY3", + deps = [ + ":array_ops", + ":client", + ":client_testlib", + ], +) + cuda_py_test( name = "special_math_ops_test", size = "medium", diff --git a/tensorflow/python/ops/tensor_array_ops_test.py b/tensorflow/python/ops/tensor_array_ops_test.py new file mode 100644 index 00000000000..4f09ff5c22d --- /dev/null +++ b/tensorflow/python/ops/tensor_array_ops_test.py @@ -0,0 +1,77 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for tensor_array_ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.eager import def_function +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import test_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.platform import test + + +class TensorArrayOpsTest(test.TestCase): + + @test_util.run_v1_only('Testing placeholders specifically.') + def test_concat_graph(self): + values = tensor_array_ops.TensorArray( + size=4, dtype=dtypes.string, element_shape=[None], infer_shape=False) + a = array_ops.placeholder(dtypes.string, [ + None, + ]) + b = array_ops.placeholder(dtypes.string, [ + None, + ]) + values = (values.write(0, a).write( + 1, constant_op.constant([], dtypes.string))).write(2, b).write( + 3, constant_op.constant([], dtypes.string)) + + with self.session() as s: + result = s.run(values.concat(), {a: ['a', 'b', 'c'], b: ['c', 'd', 'e']}) + self.assertAllEqual(result, [b'a', b'b', b'c', b'c', b'd', b'e']) + + @test_util.run_v2_only + def test_concat(self): + values = tensor_array_ops.TensorArray( + size=4, dtype=dtypes.string, element_shape=[None], infer_shape=False) + a = constant_op.constant(['a', 'b', 'c'], dtypes.string) + b = constant_op.constant(['c', 'd', 'e'], dtypes.string) + values = (values.write(0, a).write( + 1, constant_op.constant([], dtypes.string))).write(2, b).write( + 3, constant_op.constant([], dtypes.string)) + self.assertAllEqual(values.concat(), [b'a', b'b', b'c', b'c', b'd', b'e']) + + @test_util.run_v2_only + def test_concat_in_function(self): + @def_function.function + def fn(a, b): + values = tensor_array_ops.TensorArray( + size=4, dtype=dtypes.string, element_shape=[None], infer_shape=False) + values = (values.write(0, a).write( + 1, constant_op.constant([], dtypes.string))).write(2, b).write( + 3, constant_op.constant([], dtypes.string)) + return values.concat() + + self.assertAllEqual(fn(['a', 'b', 'c'], ['c', 'd', 'e']), + [b'a', b'b', b'c', b'c', b'd', b'e']) + + +if __name__ == '__main__': + test.main() From b25da3c6f9edffeac834a0e83d0a6e6c69598f79 Mon Sep 17 00:00:00 2001 From: Karim Nosir Date: Thu, 5 Dec 2019 17:36:57 -0800 Subject: [PATCH 233/383] Drop TF::Assert during TFLite legalization. 
PiperOrigin-RevId: 284090848 Change-Id: Ib3d69446afc674eb329a4520c7262d86ffbb758b --- tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir | 10 ++++++++++ .../compiler/mlir/lite/transforms/legalize_tf.cc | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 27eff39c397..ec618ffa276 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1280,3 +1280,13 @@ func @conv2d_backprop_unsupported_data_format(%arg0: tensor<4xi32>, %arg1: tenso // CHECK-LABEL: conv2d_backprop_unsupported_data_format // CHECK: tf.Conv2DBackpropInput } + +func @assert_remove(%arg0: tensor<1xi32>, %arg1: tensor<1xi32>) -> tensor<1xi1> { + %0 = "tf.LessEqual"(%arg0, %arg1) : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi1> + "tf.Assert"(%0, %arg1) {summarize = 3} : (tensor<1xi1>, tensor<1xi32>) -> () + return %0 : tensor<1xi1> + // CHECK-LABEL: assert_remove + // CHECK: tfl.less_equal + // CHECK-NOT: Assert + // CHECK: return +} diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index bc6ff5e3b47..5002c555905 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -68,6 +68,7 @@ struct LegalizeTF : public FunctionPass { // TODO(antiagainst): Define this pattern in a table-driven manner once variadic // operands are properly supported in declarative rewrite rule specification. +DECL_CONVERT_OP(Assert); DECL_CONVERT_OP(Concat); DECL_CONVERT_OP(ConcatV2); DECL_CONVERT_OP(MatMul); @@ -374,6 +375,14 @@ PatternMatchResult ConvertTFMatrixDiagV3Op::matchAndRewrite( return matchFailure(); } +// TF Lite doesn't support Assert, we just drop the assert from the graph. +PatternMatchResult ConvertTFAssertOp::matchAndRewrite( + Operation* op, PatternRewriter& rewriter) const { + op->dropAllReferences(); + op->erase(); + return matchSuccess(); +} + void LegalizeTF::runOnFunction() { OwningRewritePatternList patterns; auto* ctx = &getContext(); @@ -385,7 +394,8 @@ void LegalizeTF::runOnFunction() { .insert(ctx); + ConvertTFStridedSliceOp, ConvertTFUnpackOp, ConvertTFAssertOp>( + ctx); applyPatternsGreedily(func, patterns); } From b090c3be4f45d957b96d0c59318778e65ba22f30 Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Thu, 5 Dec 2019 17:44:05 -0800 Subject: [PATCH 234/383] Fixes docstring for argmax_v2. PiperOrigin-RevId: 284091994 Change-Id: I9363fe428ce1d8abf0e9bf142322b12293c62e86 --- tensorflow/python/ops/math_ops.py | 43 +++++++++++-------------------- 1 file changed, 15 insertions(+), 28 deletions(-) diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index ee21d3f3c31..65aed1cf076 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -147,39 +147,26 @@ def argmax_v2(input, axis=None, output_type=dtypes.int64, name=None): Note that in case of ties the identity of the return value is not guaranteed. 
For example: - ```python - A=tf.constant([2,20,30,3,6]) # Constant 1-D Tensor - tf.math.argmax(A) # output 2 as index 2 (A[2]) is maximum in tensor A - B=tf.constant([[2,20,30,3,6],[3,11,16,1,8],[14,45,23,5,27]]) - tf.math.argmax(B,0) # [2, 2, 0, 2, 2] - tf.math.argmax(B,1) # [2, 2, 1] - ``` + + >>> A = tf.constant([2, 20, 30, 3, 6]) + >>> tf.math.argmax(A) # A[2] is maximum in tensor A + + >>> B = tf.constant([[2, 20, 30, 3, 6], [3, 11, 16, 1, 8], + ... [14, 45, 23, 5, 27]]) + >>> tf.math.argmax(B, 0) + + >>> tf.math.argmax(B, 1) + Args: - input: A `Tensor`. Must be one of the following types: `float32`, `float64`, - `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`, `qint8`, - `quint8`, `qint32`, `bfloat16`, `uint16`, `complex128`, `half`, `uint32`, - `uint64`. - axis: A `Tensor`. Must be one of the following types: `int32`, `int64`. - int32 or int64, must be in the range `-rank(input), rank(input))`. - Describes which axis of the input Tensor to reduce across. For vectors, - use axis = 0. - output_type: An optional `tf.DType` from: `tf.int32, tf.int64`. Defaults to - `tf.int64`. - name: A name for the operation (optional). + input: A `Tensor`. + axis: An integer, the axis to reduce across. Default to 0. + output_type: An optional output dtype (`tf.int32` or `tf.int64`). Defaults + to `tf.int64`. + name: An optional name for the operation. Returns: A `Tensor` of type `output_type`. - - Usage: - ```python - import tensorflow as tf - a = [1, 10, 26.9, 2.8, 166.32, 62.3] - b = tf.math.argmax(input = a) - c = tf.keras.backend.eval(b) - # c = 4 - # here a[4] = 166.32 which is the largest element of a across axis 0 - ``` """ if axis is None: axis = 0 From c1dec428a2b1fbb0d156e5c365f4875d9cdbf104 Mon Sep 17 00:00:00 2001 From: River Riddle Date: Thu, 5 Dec 2019 17:46:37 -0800 Subject: [PATCH 235/383] NFC: Add documentation for `-mlir-print-op-on-diagnostic` and `-mlir-print-stacktrace-on-diagnostic`. This change adds proper documentation in Diagnostics.md, allowing for users to more easily find them. PiperOrigin-RevId: 284092336 Change-Id: Id46f9ba6b0c78aeaba47880ae380915bbd38b633 --- third_party/mlir/g3doc/Diagnostics.md | 49 +++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/third_party/mlir/g3doc/Diagnostics.md b/third_party/mlir/g3doc/Diagnostics.md index 457cab85932..0c6ef7a24fa 100644 --- a/third_party/mlir/g3doc/Diagnostics.md +++ b/third_party/mlir/g3doc/Diagnostics.md @@ -197,6 +197,55 @@ destroyed. } // The diagnostic is automatically reported here. ``` +## Diagnostic Configuration Options + +Several options are provided to help control and enhance the behavior of +diagnostics. These options are listed below: + +### Print Operation On Diagnostic + +Command Line Flag: `-mlir-print-op-on-diagnostic` + +When a diagnostic is emitted on an operation, via `Operation::emitError/...`, +the textual form of that operation is printed and attached as a note to the +diagnostic. This option is useful for understanding the current form of an +operation that may be invalid, especially when debugging verifier failures. 
An +example output is shown below: + +```shell +test.mlir:3:3: error: 'module_terminator' op expects parent op 'module' + "module_terminator"() : () -> () + ^ +test.mlir:3:3: note: see current operation: "module_terminator"() : () -> () + "module_terminator"() : () -> () + ^ +``` + +### Print StackTrace On Diagnostic + +Command Line Flag: `-mlir-print-stacktrace-on-diagnostic` + +When a diagnostic is emitted, attach the current stack trace as a note to the +diagnostic. This option is useful for understanding which part of the compiler +generated certain diagnostics. An example output is shown below: + +```shell +test.mlir:3:3: error: 'module_terminator' op expects parent op 'module' + "module_terminator"() : () -> () + ^ +test.mlir:3:3: note: diagnostic emitted with trace: + #0 0x000055dd40543805 llvm::sys::PrintStackTrace(llvm::raw_ostream&) llvm/lib/Support/Unix/Signals.inc:553:11 + #1 0x000055dd3f8ac162 emitDiag(mlir::Location, mlir::DiagnosticSeverity, llvm::Twine const&) /lib/IR/Diagnostics.cpp:292:7 + #2 0x000055dd3f8abe8e mlir::emitError(mlir::Location, llvm::Twine const&) /lib/IR/Diagnostics.cpp:304:10 + #3 0x000055dd3f998e87 mlir::Operation::emitError(llvm::Twine const&) /lib/IR/Operation.cpp:324:29 + #4 0x000055dd3f99d21c mlir::Operation::emitOpError(llvm::Twine const&) /lib/IR/Operation.cpp:652:10 + #5 0x000055dd3f96b01c mlir::OpTrait::HasParent::Impl::verifyTrait(mlir::Operation*) /mlir/IR/OpDefinition.h:897:18 + #6 0x000055dd3f96ab38 mlir::Op::Impl, mlir::OpTrait::IsTerminator>::BaseVerifier::Impl, mlir::OpTrait::IsTerminator >::verifyTrait(mlir::Operation*) /mlir/IR/OpDefinition.h:1052:29 + # ... + "module_terminator"() : () -> () + ^ +``` + ## Common Diagnostic Handlers To interface with the diagnostics infrastructure, users will need to register a From ada269be4bb24f4599774a209131ad5a7ec239be Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Thu, 5 Dec 2019 17:48:39 -0800 Subject: [PATCH 236/383] [tf.distribute] Disable XLA on collective_all_reduce_strategy_test PiperOrigin-RevId: 284092600 Change-Id: I6801d8860907d47f770b992256c2a6ff54bf514b --- tensorflow/python/distribute/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index ee820d0ead2..7ce283749f3 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -1280,6 +1280,7 @@ cuda_py_test( tags = [ "multi_and_single_gpu", ], + xla_enable_strict_auto_jit = False, ) cuda_py_test( From 27efd9fd37802a4968694e031c007a403312e8ad Mon Sep 17 00:00:00 2001 From: Yifei Feng Date: Thu, 5 Dec 2019 18:12:47 -0800 Subject: [PATCH 237/383] Install python3.7, pip3.5 in custom op Docker images. 
PiperOrigin-RevId: 284096018 Change-Id: Ia26ec2bf0eaeab98c4f7bb1020905bf140e7829b --- .../ci_build/Dockerfile.custom_op_ubuntu_16 | 6 +++++ .../Dockerfile.custom_op_ubuntu_16_gpu | 6 +++++ .../ci_build/install/install_python37.sh | 23 +++++++++++++++++++ 3 files changed, 35 insertions(+) create mode 100644 tensorflow/tools/ci_build/install/install_python37.sh diff --git a/tensorflow/tools/ci_build/Dockerfile.custom_op_ubuntu_16 b/tensorflow/tools/ci_build/Dockerfile.custom_op_ubuntu_16 index 6645ad7c88b..830b073a7da 100644 --- a/tensorflow/tools/ci_build/Dockerfile.custom_op_ubuntu_16 +++ b/tensorflow/tools/ci_build/Dockerfile.custom_op_ubuntu_16 @@ -60,6 +60,12 @@ RUN add-apt-repository ppa:jonathonf/python-3.6 && \ python3.6 -m pip install pip --upgrade && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 0 +# Install python 3.7 +RUN /install/install_python37.sh + +# Install pip3.5 +RUN wget https://bootstrap.pypa.io/get-pip.py && python3.5 get-pip.py && rm get-pip.py + RUN /install/install_pip_packages.sh RUN /install/install_auditwheel.sh diff --git a/tensorflow/tools/ci_build/Dockerfile.custom_op_ubuntu_16_gpu b/tensorflow/tools/ci_build/Dockerfile.custom_op_ubuntu_16_gpu index 5db23056c89..0fee596cce2 100644 --- a/tensorflow/tools/ci_build/Dockerfile.custom_op_ubuntu_16_gpu +++ b/tensorflow/tools/ci_build/Dockerfile.custom_op_ubuntu_16_gpu @@ -60,6 +60,12 @@ RUN add-apt-repository ppa:jonathonf/python-3.6 && \ python3.6 -m pip install pip --upgrade && \ update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.6 0 +# Install python 3.7 +RUN /install/install_python37.sh + +# Install pip3.5 +RUN wget https://bootstrap.pypa.io/get-pip.py && python3.5 get-pip.py && rm get-pip.py + RUN /install/install_pip_packages.sh RUN /install/install_auditwheel.sh diff --git a/tensorflow/tools/ci_build/install/install_python37.sh b/tensorflow/tools/ci_build/install/install_python37.sh new file mode 100644 index 00000000000..26f663803f8 --- /dev/null +++ b/tensorflow/tools/ci_build/install/install_python37.sh @@ -0,0 +1,23 @@ +#!/usr/bin/env bash +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +cd /usr/src +wget https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz +tar xzf Python-3.7.0.tgz +cd Python-3.7.0 +./configure --enable-optimizations +make altinstall +rm /usr/src/Python-3.7.0.tgz From 5337757d860598e39fac01f0e8097c1b88e766a5 Mon Sep 17 00:00:00 2001 From: Brian Zhao Date: Thu, 5 Dec 2019 18:15:16 -0800 Subject: [PATCH 238/383] Fix rules_cc.bzl loading path. 
PiperOrigin-RevId: 284096316 Change-Id: I2422b80bfe61c1c876afe0c8f162a39a397e7a2d --- tensorflow/core/platform/rules_cc.bzl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/platform/rules_cc.bzl b/tensorflow/core/platform/rules_cc.bzl index e1331fab9fc..e664512aa4c 100644 --- a/tensorflow/core/platform/rules_cc.bzl +++ b/tensorflow/core/platform/rules_cc.bzl @@ -1,7 +1,7 @@ """Provides an indirection layer to bazel cc_rules""" load( - "//tensorflow/core/platform:google/rules_cc.bzl", + "//tensorflow/core/platform:default/rules_cc.bzl", _cc_binary = "cc_binary", _cc_import = "cc_import", _cc_library = "cc_library", From 6c66b3a1a43effa597fce76c39a4f9e5439e37ea Mon Sep 17 00:00:00 2001 From: Skye Wanderman-Milne Date: Thu, 5 Dec 2019 19:46:44 -0800 Subject: [PATCH 239/383] [XLA:Python] Add __repr__ to TpuDevice. It now prints 'TpuDevice(id=0)' instead of ''. PiperOrigin-RevId: 284106050 Change-Id: Ia6308d402694fa54d5f23d0248bc5634836c071a --- .../xla/python/tpu_driver/client/tpu_client_extension.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc index 09d3350b590..60886416a62 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc +++ b/tensorflow/compiler/xla/python/tpu_driver/client/tpu_client_extension.cc @@ -204,6 +204,11 @@ PYBIND11_MODULE(tpu_client_extension, m) { py::call_guard(), py::arg("arguments")) .def("ExecutePerReplica", &PyTpuExecutable::ExecutePerReplica, py::call_guard(), py::arg("arguments")); + + py::class_>(m, "TpuDevice") + .def("__repr__", [](const TpuDevice& device) { + return absl::StrFormat("TpuDevice(id=%i)", device.id()); + }); } // NOLINT(readability/fn_size) } // namespace xla From 5339810df374c44a3ea9503483eaf7cc57410be3 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 5 Dec 2019 20:00:38 -0800 Subject: [PATCH 240/383] Add more test cases for fully_quantized reduce. PiperOrigin-RevId: 284107384 Change-Id: I2a3edadcea2bb5917b6931e191178724fd394db3 --- tensorflow/lite/testing/op_tests/reduce.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/testing/op_tests/reduce.py b/tensorflow/lite/testing/op_tests/reduce.py index ae4513ef952..1437bdffbfe 100644 --- a/tensorflow/lite/testing/op_tests/reduce.py +++ b/tensorflow/lite/testing/op_tests/reduce.py @@ -105,7 +105,7 @@ def make_reduce_tests(reduce_op, "input_shape": [[1, 8, 8, 4], [1, 8, 8, 3]], "axis": [ 0, 1, 2, 3, [0], [1], [2], [3], [-1], [-2], [-3], [1, 2], - [0, 3], [1, 2, 3] + [0, 3], [1, 2, 3], [1, 3], [2, 3] ], "const_axis": [True], "keepdims": [True, False], From 2b6e43a707aa63f27f3a35e87794d66c6d79a336 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Thu, 5 Dec 2019 20:17:52 -0800 Subject: [PATCH 241/383] Fix the confusing profiling output by separately processing profiling events caught during the initialization phase and those caught during the regular run phase. 
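For reference, op profiling in this tool remains gated on the existing `enable_op_profiling` and `max_profiling_buffer_entries` parameters; with this change the output is split into a "Profiling Info for Benchmark Initialization:" section and an "Operator-wise Profiling Info for Regular Benchmark Runs:" section. A typical invocation is sketched below, assuming the usual `benchmark_model` Bazel target and a placeholder model path:

```shell
# Build and run the TFLite benchmark tool with per-op profiling enabled.
# The model path is a placeholder; substitute your own .tflite file.
bazel run -c opt //tensorflow/lite/tools/benchmark:benchmark_model -- \
  --graph=/tmp/model.tflite \
  --enable_op_profiling=true \
  --max_profiling_buffer_entries=1024
```
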
PiperOrigin-RevId: 284109495 Change-Id: I6d4ff3e9208f0807c20ca622c7f47fa21b7658e9 --- .../tools/benchmark/benchmark_tflite_model.cc | 59 ++++++++++++------- 1 file changed, 38 insertions(+), 21 deletions(-) diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index 4d37a868aaf..197907ec9e8 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -81,10 +81,16 @@ class ProfilingListener : public BenchmarkListener { : interpreter_(interpreter), profiler_(max_num_entries) { TFLITE_BENCHMARK_CHECK(interpreter); interpreter_->SetProfiler(&profiler_); + + // We start profiling here in order to catch events that are recorded during + // the benchmark run preparation stage where TFLite interpreter is + // initialized and model graph is prepared. profiler_.Reset(); profiler_.StartProfiling(); } + void OnBenchmarkStart(const BenchmarkParams& params) override; + void OnSingleRunStart(RunType run_type) override; void OnSingleRunEnd() override; @@ -94,7 +100,8 @@ class ProfilingListener : public BenchmarkListener { private: Interpreter* interpreter_; profiling::BufferedProfiler profiler_; - profiling::ProfileSummarizer summarizer_; + profiling::ProfileSummarizer run_summarizer_; + profiling::ProfileSummarizer init_summarizer_; }; // Dumps gemmlowp profiling events if gemmlowp profiling is enabled. @@ -105,29 +112,39 @@ class GemmlowpProfilingListener : public BenchmarkListener { void OnBenchmarkEnd(const BenchmarkResults& results) override; }; +void ProfilingListener::OnBenchmarkStart(const BenchmarkParams& params) { + // At this point, we have completed the prepration for benchmark runs + // including TFLite interpreter initialization etc. So we are going to process + // profiling events recorded during this stage. + profiler_.StopProfiling(); + auto profile_events = profiler_.GetProfileEvents(); + init_summarizer_.ProcessProfiles(profile_events, *interpreter_); + profiler_.Reset(); +} + void ProfilingListener::OnSingleRunStart(RunType run_type) { - // Note: we have started profiling when this listener is created. In order - // not to count events during the WARMUP phase, we need to stop profiling and - // process already-recorded profile events when the WARMUP run starts and - // restart profiling at the REGULAR run. 
- if (run_type == WARMUP) { - OnSingleRunEnd(); - } else if (run_type == REGULAR) { + if (run_type == REGULAR) { profiler_.Reset(); profiler_.StartProfiling(); } } void ProfilingListener::OnBenchmarkEnd(const BenchmarkResults& results) { - if (summarizer_.HasProfiles()) { - TFLITE_LOG(INFO) << summarizer_.GetOutputString(); + if (init_summarizer_.HasProfiles()) { + TFLITE_LOG(INFO) << "Profiling Info for Benchmark Initialization:"; + TFLITE_LOG(INFO) << init_summarizer_.GetOutputString(); + } + if (run_summarizer_.HasProfiles()) { + TFLITE_LOG(INFO) + << "Operator-wise Profiling Info for Regular Benchmark Runs:"; + TFLITE_LOG(INFO) << run_summarizer_.GetOutputString(); } } void ProfilingListener::OnSingleRunEnd() { profiler_.StopProfiling(); auto profile_events = profiler_.GetProfileEvents(); - summarizer_.ProcessProfiles(profile_events, *interpreter_); + run_summarizer_.ProcessProfiles(profile_events, *interpreter_); } void GemmlowpProfilingListener::OnBenchmarkStart( @@ -553,6 +570,16 @@ TfLiteStatus BenchmarkTfLiteModel::Init() { return kTfLiteError; } + // Install profilers if necessary right after interpreter is created so that + // any memory allocations inside the TFLite runtime could be recorded if the + // installed profiler profile memory usage information. + if (params_.Get("enable_op_profiling")) { + profiling_listener_.reset(new ProfilingListener( + interpreter_.get(), + params_.Get("max_profiling_buffer_entries"))); + AddListener(profiling_listener_.get()); + } + interpreter_->UseNNAPI(params_.Get("use_legacy_nnapi")); interpreter_->SetAllowFp16PrecisionForFp32(params_.Get("allow_fp16")); @@ -617,16 +644,6 @@ TfLiteStatus BenchmarkTfLiteModel::Init() { } } - // Install profilers if necessary but *before* any memory allocations inside - // the TFLite interpreter because the installed profiler might profile memory - // usage information. - if (params_.Get("enable_op_profiling")) { - profiling_listener_.reset(new ProfilingListener( - interpreter_.get(), - params_.Get("max_profiling_buffer_entries"))); - AddListener(profiling_listener_.get()); - } - if (interpreter_->AllocateTensors() != kTfLiteOk) { TFLITE_LOG(ERROR) << "Failed to allocate tensors!"; return kTfLiteError; From 54451cd071192e46936b45744c507afcdddc32a6 Mon Sep 17 00:00:00 2001 From: Reed Wanderman-Milne Date: Thu, 5 Dec 2019 21:29:15 -0800 Subject: [PATCH 242/383] Support DistributionStrategy in LossScaleGradientTape, take 2. I previous tried this in de0be0deae93ea4c4452ceb23c91dd24a88fe62e, but it was rolled back due to breaking Windows. Autograph was causing an ImportError on Windows, so I now explicitly use a tf.while_loop. 
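For context, a minimal sketch of the usage pattern the new tests below exercise (illustrative only, with placeholder values; it is not part of the patch itself):

```python
import tensorflow as tf
from tensorflow.python.distribute import mirrored_strategy
from tensorflow.python.training.experimental import loss_scale as loss_scale_module
from tensorflow.python.training.experimental import loss_scaling_gradient_tape as lsgt

strategy = mirrored_strategy.MirroredStrategy(['cpu:0'])
loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32)

def run_fn():
  x = tf.constant(3.0)
  with lsgt.LossScaleGradientTape(loss_scale) as g:
    g.watch(x)
    y = x * x
  # gradient() must be called in a replica context; the tape merges into a
  # cross-replica context and recomputes via tf.while_loop until the
  # unscaled gradients are finite.
  return g.gradient(y, x)

# Returns one gradient per replica; with a single CPU replica this is 6.0.
grads = strategy.experimental_run_v2(run_fn)
```
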
PiperOrigin-RevId: 284116353 Change-Id: Ia5ef17ae8ddf36af3244c157ebc0ecbd807eccb0 --- tensorflow/python/BUILD | 14 +- .../loss_scaling_gradient_tape.py | 107 +++++- .../loss_scaling_gradient_tape_test.py | 361 +++++++++++++----- 3 files changed, 360 insertions(+), 122 deletions(-) diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index ea4f8970ab1..2ba923f8b29 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -3657,24 +3657,26 @@ py_library( ":loss_scale", ":unconnected_gradients", ":util", + "//tensorflow/python/distribute:distribute_lib", "//tensorflow/python/eager:backprop", ], ) -py_test( +cuda_py_test( name = "loss_scaling_gradient_tape_test", size = "medium", srcs = ["training/experimental/loss_scaling_gradient_tape_test.py"], - python_version = "PY3", - deps = [ + additional_deps = [ ":client_testlib", ":constant_op", + ":framework_test_combinations_lib", ":loss_scale", ":loss_scaling_gradient_tape", - "//tensorflow/python/compat:v2_compat", - "//tensorflow/python/eager:def_function", - "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", + "//third_party/py/numpy", + "//tensorflow/python/compat:v2_compat", + "//tensorflow/python/distribute:mirrored_strategy", + "//tensorflow/python/eager:def_function", ], ) diff --git a/tensorflow/python/training/experimental/loss_scaling_gradient_tape.py b/tensorflow/python/training/experimental/loss_scaling_gradient_tape.py index 4b75a74bc3b..caae7052b84 100644 --- a/tensorflow/python/training/experimental/loss_scaling_gradient_tape.py +++ b/tensorflow/python/training/experimental/loss_scaling_gradient_tape.py @@ -18,8 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.eager import backprop -from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients from tensorflow.python.training.experimental import loss_scale as loss_scale_module from tensorflow.python.util import nest @@ -60,6 +62,13 @@ class LossScaleGradientTape(backprop.GradientTape): grads = tape.gradient(loss, vars) opt.apply_gradients(zip(grads, vars)) ``` + + WARNING: Computing second-order (or higher) gradients with a + `LossScaleGradientTape` does not yet work properly when a + `tf.distribute.Strategy` is used. Computing second-order gradients will return + None instead of the gradient tensors. This only occurs when you nest multiple + gradient tapes under each other; if you do not nest them, this issue will not + occur. 
""" def __init__(self, @@ -133,22 +142,90 @@ class LossScaleGradientTape(backprop.GradientTape): if self._tape is None: # pylint: disable=access-member-before-definition raise RuntimeError("GradientTape.gradient can only be called once on " "non-persistent tapes.") + if distribution_strategy_context.in_cross_replica_context(): + raise ValueError("LossScaleGradientTape.gradient() must be called in a " + "replica context.") - ready_to_update = False - grads = nest.map_structure(array_ops.zeros_like, sources) - - while not ready_to_update and self._loss_scale() > 1: - with self: # re-enter the gradient tape so it sees the loss scaling - loss_scale = self._loss_scale() - scaled_target = nest.map_structure(lambda t: t * loss_scale, target) - - old_grads = super(LossScaleGradientTape, self).gradient( - scaled_target, sources, output_gradients, unconnected_gradients) - inv_loss_scale = 1.0 / self._loss_scale() - grads = nest.map_structure(lambda g: inv_loss_scale * g, old_grads) - # Check for non-finite gradients possibly resulting from scaling - _, ready_to_update = self._loss_scale.update(grads) + # Note: DistributionStrategy does not support running a while loop in a + # replica context. So, we call `_compute_gradients_until_finite` in a cross- + # replica context. + replica_context = distribution_strategy_context.get_replica_context() + grads = replica_context.merge_call( + _compute_gradients_until_finite, + args=(self, self._loss_scale, target, sources, output_gradients, + unconnected_gradients)) if not self._outer_persistent: self._tape = None # free up resources if a persistent tape was not needed return grads + + +def _compute_gradients_until_finite( + distribution, loss_scale_gradient_tapes, loss_scale, target, sources, + output_gradients, unconnected_gradients): + """Compute gradients and update the loss scale until the gradients are finite. + + This must be called in a cross-replica context. + + This is a function instead of a method of LossScaleGradientTape, as the `self` + parameter would be meaningless. There is one LossScaleGradientTape per + replica, but this function is called once total (not per replica), so there + cannot be a singular `self` parameter. + + Args: + distribution: The distribution strategy in effect. + loss_scale_gradient_tapes: A PerReplica value of LossScaleGradientTapes. + Contains the LossScaleGradientTape of each replica. + loss_scale: The loss scale to use to scale the loss and unscale the + gradient. + target: a list or nested structure of Tensors or Variables to be + differentiated. + sources: a list or nested structure of Tensors or Variables. `target` will + be differentiated against elements in `sources`. + output_gradients: Passed to GradientTape.gradient + unconnected_gradients: Pass to GradientTape.gradient. + + Returns: + The gradients of `target` with respect to `sources`. + """ + # Autograph cannot convert this function, so we must use an explicit + # tf.while_loop. + # TODO(b/143572314): Fix Autograph so that it can convert this function, then + # replace the tf.while_loop with a Python while loop. 
+ + def cond(grads, ready_to_update): + """The condition of the while loop.""" + del grads + # Equivalent to: `not ready_to_update and loss_scale() > 1` + return math_ops.logical_and(math_ops.logical_not(ready_to_update), + math_ops.greater(loss_scale(), 1)) + + def body(grads, ready_to_update): + """The body of the while loop.""" + del grads, ready_to_update + def replica_fn(gradient_tape, target, sources, output_gradients): + """Scales the loss, computes the gradients, and unscales the gradients.""" + loss_scale_val = loss_scale() + with gradient_tape: # re-enter gradient tape so it sees the loss scaling + scaled_target = nest.map_structure(lambda t: t * loss_scale_val, target) + old_grads = super(LossScaleGradientTape, gradient_tape).gradient( + scaled_target, sources, output_gradients, unconnected_gradients) + inv_loss_scale = 1.0 / loss_scale_val + grads = nest.map_structure(lambda g: inv_loss_scale * g, old_grads) + return grads + + # Switch to a replica-context to compute gradients once per replica. + grads = distribution.experimental_run_v2( + replica_fn, args=(loss_scale_gradient_tapes, target, sources, + output_gradients)) + # Check for non-finite gradients possibly resulting from scaling + _, ready_to_update = loss_scale.update(grads) + return grads, ready_to_update + + # Dummy value for initial_grads. The first iteration of the loop will + # overwrite `grads` to the actual gradients. + initial_grads = sources + initial_ready_to_update = False + grads, _ = control_flow_ops.while_loop( + cond, body, [initial_grads, initial_ready_to_update]) + return grads diff --git a/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py b/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py index b8c85a929da..36d7d18a93b 100644 --- a/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py +++ b/tensorflow/python/training/experimental/loss_scaling_gradient_tape_test.py @@ -20,58 +20,137 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np from tensorflow.python.compat import v2_compat +from tensorflow.python.distribute import distribution_strategy_context +from tensorflow.python.distribute import mirrored_strategy +from tensorflow.python.distribute import values +from tensorflow.python.eager import context from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op +from tensorflow.python.framework import test_combinations from tensorflow.python.platform import test from tensorflow.python.training.experimental import loss_scale as loss_scale_module from tensorflow.python.training.experimental import loss_scaling_gradient_tape as lsgt +from tensorflow.python.util import nest + + +# If called outside any strategy.scope() calls, this will return the default +# strategy. 
+default_strategy_fn = distribution_strategy_context.get_strategy + + +def create_mirrored_strategy(): + if context.num_gpus() >= 1: + return mirrored_strategy.MirroredStrategy(['cpu:0', 'gpu:0']) + else: + return mirrored_strategy.MirroredStrategy(['cpu:0']) class LossScaleGradientTapeTest(test.TestCase, parameterized.TestCase): - @parameterized.parameters(loss_scale_module.FixedLossScale, - loss_scale_module.DynamicLossScale) - def test_basic_tapes_eager_mode(self, loss_scale): - x = constant_op.constant(3.0) - with lsgt.LossScaleGradientTape(loss_scale(32)) as g: - g.watch(x) - y = x * x - dy_dx = g.gradient(y, x) - self.assertEqual(self.evaluate(dy_dx), 6.0) + def _run_with_strategy(self, run_fn, strategy, use_tf_function=False): + """Runs `run_fn` under the DistributionStrategy `strategy`. - @parameterized.parameters(loss_scale_module.FixedLossScale, - loss_scale_module.DynamicLossScale) - def test_basic_tapes_graph_mode(self, loss_scale): + Runs `run_fn` with `strategy.experimental_run_v2`. Returns a list of the + return values of `run_fn`, one per replica. + + Args: + run_fn: The function to run. + strategy: The DistributionStrategy to run `run_fn` with. + use_tf_function: If True, call `run_fn` under a tf.function. + + Returns: + A list of tensors, each being the return value of `run_fn` from one + replica. If a nested structure is returned from `run_fn`, returns a + nested structure, where each element is a list of tensors. + """ + strategy_fn = lambda: strategy.experimental_run_v2(run_fn) + if use_tf_function: + strategy_fn = def_function.function(strategy_fn) + + results = strategy_fn() + + def convert_tensor_to_list(tensor): + if isinstance(tensor, values.DistributedValues): + return tensor.values + else: + return [tensor] + return nest.map_structure(convert_tensor_to_list, results) + + @test_combinations.generate(test_combinations.combine( + loss_scale=[loss_scale_module.FixedLossScale, + loss_scale_module.DynamicLossScale], + strategy_fn=[default_strategy_fn, create_mirrored_strategy], + use_tf_function=[True, False] + )) + def test_basic_tapes(self, loss_scale, strategy_fn, use_tf_function): loss_scale = loss_scale(32) - - @def_function.function - def _inner_test(): + def run_fn(): x = constant_op.constant(3.0) with lsgt.LossScaleGradientTape(loss_scale) as g: g.watch(x) y = x * x return g.gradient(y, x) - self.assertEqual(self.evaluate(_inner_test()), 6.0) + dy_dx_list = self._run_with_strategy(run_fn, strategy_fn(), use_tf_function) + self.assertEqual(loss_scale(), 32) + for dy_dx in dy_dx_list: + self.assertEqual(dy_dx, 6.0) - @parameterized.parameters(loss_scale_module.FixedLossScale, - loss_scale_module.DynamicLossScale) - def test_nested_tapes(self, loss_scale): - x = constant_op.constant(3.0) - with lsgt.LossScaleGradientTape(loss_scale(32)) as g: - g.watch(x) - with lsgt.LossScaleGradientTape(loss_scale(32)) as gg: - gg.watch(x) + @test_combinations.generate(test_combinations.combine( + loss_scale=[loss_scale_module.FixedLossScale, + loss_scale_module.DynamicLossScale], + strategy_fn=[default_strategy_fn, create_mirrored_strategy], + use_tf_function=[True, False] + )) + def test_output_gradients(self, loss_scale, strategy_fn, use_tf_function): + loss_scale = loss_scale(32) + def run_fn(): + x = constant_op.constant(3.0) + with lsgt.LossScaleGradientTape(loss_scale) as g: + g.watch(x) y = x * x - dy_dx = gg.gradient(y, x) - self.assertEqual(self.evaluate(dy_dx), 6.0) - d2y_dx2 = g.gradient(dy_dx, x) - self.assertEqual(self.evaluate(d2y_dx2), 2.0) + return 
g.gradient(y, x, output_gradients=constant_op.constant(2.0)) + dy_dx_list = self._run_with_strategy(run_fn, strategy_fn(), use_tf_function) + self.assertEqual(loss_scale(), 32) + for dy_dx in dy_dx_list: + self.assertEqual(dy_dx, 12.0) - @parameterized.parameters(loss_scale_module.FixedLossScale, - loss_scale_module.DynamicLossScale) - def test_non_persistent_tapes_error(self, loss_scale): + @test_combinations.generate(test_combinations.combine( + loss_scale=[loss_scale_module.FixedLossScale, + loss_scale_module.DynamicLossScale], + strategy_fn=[default_strategy_fn], + use_tf_function=[True, False] + )) + def test_nested_tapes(self, loss_scale, strategy_fn, use_tf_function): + # TODO(reedwm): Support nested tapes with mirrored strategy. Currently this + # does not work, as the set of active gradient tapes is a thread-local + # variable. Mirrored strategy spawns new threads, making the outer gradient + # tape non-active when using the inner gradient tape. + outer_loss_scale = loss_scale(32) + inner_loss_scale = loss_scale(32) + def run_fn(): + x = constant_op.constant(3.0) + with lsgt.LossScaleGradientTape(outer_loss_scale) as g: + g.watch(x) + with lsgt.LossScaleGradientTape(inner_loss_scale) as gg: + gg.watch(x) + y = x * x + dy_dx = gg.gradient(y, x) + d2y_dx2 = g.gradient(dy_dx, x) + return dy_dx, d2y_dx2 + + dy_dx_list, d2y_dx2_list = self._run_with_strategy(run_fn, strategy_fn(), + use_tf_function) + self.assertEqual(outer_loss_scale(), 32) + self.assertEqual(inner_loss_scale(), 32) + for dy_dx in dy_dx_list: + self.assertEqual(dy_dx, 6.0) + for d2y_dx2 in d2y_dx2_list: + self.assertEqual(d2y_dx2, 2.0) + + def test_non_persistent_tapes_error(self): x = constant_op.constant(3.0) - with lsgt.LossScaleGradientTape(loss_scale(32), persistent=False) as g: + with lsgt.LossScaleGradientTape(loss_scale_module.FixedLossScale(32), + persistent=False) as g: g.watch(x) y = x * x z = y * y @@ -79,21 +158,36 @@ class LossScaleGradientTapeTest(test.TestCase, parameterized.TestCase): with self.assertRaisesRegexp(RuntimeError, 'persistent'): g.gradient(y, x) - @parameterized.parameters(loss_scale_module.FixedLossScale, - loss_scale_module.DynamicLossScale) - def test_persistent_tapes(self, loss_scale): - x = constant_op.constant(3.0) - with lsgt.LossScaleGradientTape(loss_scale(32), persistent=True) as g: - g.watch(x) - y = x * x - z = y * y - dz_dx = g.gradient(z, x) - self.assertEqual(self.evaluate(dz_dx), 108.0) - dy_dx = g.gradient(y, x) - self.assertEqual(self.evaluate(dy_dx), 6.0) + @test_combinations.generate(test_combinations.combine( + loss_scale=[loss_scale_module.FixedLossScale, + loss_scale_module.DynamicLossScale], + strategy_fn=[default_strategy_fn, create_mirrored_strategy], + use_tf_function=[True, False] + )) + def test_persistent_tapes(self, loss_scale, strategy_fn, use_tf_function): - @parameterized.parameters(loss_scale_module.FixedLossScale, - loss_scale_module.DynamicLossScale) + ls = loss_scale(32) + def run_fn(): + x = constant_op.constant(3.0) + with lsgt.LossScaleGradientTape(ls, persistent=True) as g: + g.watch(x) + y = x * x + z = y * y + dz_dx = g.gradient(z, x) + dy_dx = g.gradient(y, x) + return dz_dx, dy_dx + + dz_dx_list, dy_dx_list = self._run_with_strategy(run_fn, strategy_fn(), + use_tf_function) + for dz_dx in dz_dx_list: + self.assertEqual(dz_dx, 108.0) + for dy_dx in dy_dx_list: + self.assertEqual(dy_dx, 6.0) + + @test_combinations.generate(test_combinations.combine( + loss_scale=[loss_scale_module.FixedLossScale, + loss_scale_module.DynamicLossScale], + )) def 
test_nested_sources(self, loss_scale): x = (constant_op.constant(19.0), (constant_op.constant(8.), constant_op.constant(9.))) @@ -103,8 +197,10 @@ class LossScaleGradientTapeTest(test.TestCase, parameterized.TestCase): dy_dx = g.gradient(y, x) self.assertEqual(self.evaluate(dy_dx), (13., (13., 13.))) - @parameterized.parameters(loss_scale_module.FixedLossScale, - loss_scale_module.DynamicLossScale) + @test_combinations.generate(test_combinations.combine( + loss_scale=[loss_scale_module.FixedLossScale, + loss_scale_module.DynamicLossScale], + )) def test_nested_targets(self, loss_scale): w = constant_op.constant(3.0) with lsgt.LossScaleGradientTape(loss_scale(32)) as g: @@ -115,68 +211,131 @@ class LossScaleGradientTapeTest(test.TestCase, parameterized.TestCase): grad = g.gradient([x, (y, z)], w) self.assertEqual(self.evaluate(grad), 23) - @parameterized.parameters(loss_scale_module.FixedLossScale, - loss_scale_module.DynamicLossScale) - def test_scaling_inf_gradient(self, loss_scale): - x = constant_op.constant(1.0) - with lsgt.LossScaleGradientTape(loss_scale(32)) as g: - g.watch(x) - y = x * np.inf - dy_dx = g.gradient(y, x) - self.assertEqual(self.evaluate(dy_dx), np.inf) + @test_combinations.generate(test_combinations.combine( + loss_scale=[loss_scale_module.FixedLossScale, + loss_scale_module.DynamicLossScale], + strategy_fn=[default_strategy_fn, create_mirrored_strategy], + non_finite_term=[np.inf, np.nan], + )) + def test_scaling_non_finite_gradient(self, loss_scale, strategy_fn, + non_finite_term): + loss_scale = loss_scale(32) + def run_fn(): + x = constant_op.constant(1.0) + with lsgt.LossScaleGradientTape(loss_scale) as g: + g.watch(x) + y = x * non_finite_term + return g.gradient(y, x) - @parameterized.parameters(loss_scale_module.FixedLossScale, - loss_scale_module.DynamicLossScale) - def test_scaling_nan_gradient(self, loss_scale): - x = constant_op.constant(1.0) - with lsgt.LossScaleGradientTape(loss_scale(32)) as g: - g.watch(x) - y = x * np.nan - dy_dx = g.gradient(y, x) - self.assertTrue(np.isnan(self.evaluate(dy_dx))) + dy_dx_list = self._run_with_strategy(run_fn, strategy_fn()) + check_fn = np.isposinf if non_finite_term == np.inf else np.isnan + for dy_dx in dy_dx_list: + self.assertTrue(check_fn(dy_dx)) - @parameterized.parameters(np.inf, np.nan) - def test_dynamic_scale_to_one_on_non_finite_gradient(self, non_finite_term): + @test_combinations.generate(test_combinations.combine( + strategy_fn=[default_strategy_fn, create_mirrored_strategy], + non_finite_term=[np.inf, np.nan], + use_tf_function=[True, False], + )) + def test_dynamic_scale_to_one_on_non_finite_gradient( + self, strategy_fn, non_finite_term, use_tf_function): loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32) - x = constant_op.constant(1.0) - with lsgt.LossScaleGradientTape(loss_scale) as g: - g.watch(x) - y = x * non_finite_term - g.gradient(y, x) + def run_fn(): + x = constant_op.constant(1.0) + with lsgt.LossScaleGradientTape(loss_scale) as g: + g.watch(x) + y = x * non_finite_term + g.gradient(y, x) + + self._run_with_strategy(run_fn, strategy_fn(), use_tf_function) self.assertEqual(self.evaluate(loss_scale()), 1.0) - @parameterized.parameters([np.inf, np.isposinf], [np.nan, np.isnan]) - def test_fixed_scaling_no_change_non_finite_gradient(self, non_finite_term, - is_non_finite): - loss_scale = loss_scale_module.FixedLossScale(32) - x = constant_op.constant(1.0) - with lsgt.LossScaleGradientTape(loss_scale) as g: - g.watch(x) - y = x * non_finite_term - dy_dx = g.gradient(y, x) - 
self.assertTrue(is_non_finite(self.evaluate(dy_dx))) - self.assertEqual(self.evaluate(loss_scale()), 32.0) - - def test_dynamic_loss_scaling_down_loop(self): + @test_combinations.generate(test_combinations.combine( + use_tf_function=[True, False], + )) + def test_dynamic_scale_to_one_on_non_finite_gradient_on_last_replica( + self, use_tf_function): + if context.num_gpus() < 1: + # Requires the mirrored strategy to have two replicas: one on the CPU and + # one on the GPU + self.skipTest('Test requires at least 1 GPU') loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32) - x = constant_op.constant(1.0) - with lsgt.LossScaleGradientTape(loss_scale) as g: - g.watch(x) - y = x * (3.0 * (10**37)) # grad will be inf after scaling - dy_dx = g.gradient(y, x) - self.assertEqual(self.evaluate(loss_scale()), 8.0) - self.assertAllClose(self.evaluate(dy_dx), (3.0 * (10**37)), atol=1e-06) + def run_fn(): + x = constant_op.constant(1.0) + with lsgt.LossScaleGradientTape(loss_scale) as g: + g.watch(x) + # The gradient will be finite on the first replica, and infinite on the + # second + rep_ctx = distribution_strategy_context.get_replica_context() + if rep_ctx.replica_id_in_sync_group == rep_ctx.num_replicas_in_sync - 1: + y = x * np.inf + else: + y = x * 2 + return g.gradient(y, x) - def test_dynamic_loss_scaling_inf_target_post_scale(self): - loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32.0) - x = constant_op.constant(3.0 * (10**37)) - with lsgt.LossScaleGradientTape(loss_scale) as g: - g.watch(x) - y = x * 3.0 # target will be inf after scaling - dy_dx = g.gradient(y, x) - self.assertAllClose(self.evaluate(dy_dx), 3.0) + replica0_grad, replica1_grad = self._run_with_strategy( + run_fn, create_mirrored_strategy(), use_tf_function) + self.assertEqual(self.evaluate(loss_scale()), 1.0) + self.assertEqual(replica0_grad, 2.0) + self.assertEqual(replica1_grad, np.inf) + + @test_combinations.generate(test_combinations.combine( + strategy_fn=[default_strategy_fn, create_mirrored_strategy], + non_finite_term=[np.inf, np.nan], + )) + def test_fixed_scaling_no_change_non_finite_gradient(self, strategy_fn, + non_finite_term): + loss_scale = loss_scale_module.FixedLossScale(32) + def run_fn(): + x = constant_op.constant(1.0) + with lsgt.LossScaleGradientTape(loss_scale) as g: + g.watch(x) + y = x * non_finite_term + return g.gradient(y, x) + + dy_dx_list = self._run_with_strategy(run_fn, strategy_fn()) + check_fn = np.isposinf if non_finite_term == np.inf else np.isnan + for dy_dx in dy_dx_list: + self.assertTrue(check_fn(self.evaluate(dy_dx))) self.assertEqual(self.evaluate(loss_scale()), 32.0) + @test_combinations.generate(test_combinations.combine( + strategy_fn=[default_strategy_fn, create_mirrored_strategy], + use_tf_function=[True, False] + )) + def test_dynamic_loss_scaling_down_loop(self, strategy_fn, use_tf_function): + loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32) + def run_fn(): + x = constant_op.constant(1.0) + with lsgt.LossScaleGradientTape(loss_scale) as g: + g.watch(x) + y = x * (3.0 * (10**37)) # grad will be inf after scaling + return g.gradient(y, x) + + dy_dx_list = self._run_with_strategy(run_fn, strategy_fn(), use_tf_function) + self.assertEqual(self.evaluate(loss_scale()), 8.0) + for dy_dx in dy_dx_list: + self.assertAllClose(self.evaluate(dy_dx), (3.0 * (10**37)), atol=1e-06) + + @test_combinations.generate(test_combinations.combine( + strategy_fn=[default_strategy_fn, create_mirrored_strategy], + use_tf_function=[True, False] + 
)) + def test_dynamic_loss_scaling_inf_target_post_scale(self, strategy_fn, + use_tf_function): + loss_scale = loss_scale_module.DynamicLossScale(initial_loss_scale=32.0) + def run_fn(): + x = constant_op.constant(3.0 * (10**37)) + with lsgt.LossScaleGradientTape(loss_scale) as g: + g.watch(x) + y = x * 3.0 # target will be inf after scaling + return g.gradient(y, x) + + dy_dx_list = self._run_with_strategy(run_fn, strategy_fn(), use_tf_function) + self.assertEqual(self.evaluate(loss_scale()), 32.0) + for dy_dx in dy_dx_list: + self.assertAllClose(self.evaluate(dy_dx), 3.0) + if __name__ == '__main__': v2_compat.enable_v2_behavior() From 86aacf06c9d74678920eefd3d18ff3415e55d70b Mon Sep 17 00:00:00 2001 From: Gaurav Jain Date: Thu, 5 Dec 2019 21:48:27 -0800 Subject: [PATCH 243/383] Improvement documentation of tf.math.exp PiperOrigin-RevId: 284118355 Change-Id: Icc79eada9ae2973aaade4135141cfc5a069213e9 --- .../core/api_def/python_api/api_def_Exp.pbtxt | 7 +-- tensorflow/python/ops/math_ops.py | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt index 38a9078d9f6..4c89cd7afcc 100644 --- a/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt +++ b/tensorflow/core/api_def/python_api/api_def_Exp.pbtxt @@ -1,9 +1,4 @@ op { graph_op_name: "Exp" - endpoint { - name: "math.exp" - } - endpoint { - name: "exp" - } + visibility: HIDDEN } diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py index 65aed1cf076..68be7493962 100644 --- a/tensorflow/python/ops/math_ops.py +++ b/tensorflow/python/ops/math_ops.py @@ -4385,3 +4385,52 @@ def sqrt(x, name=None): # pylint: disable=redefined-builtin A `tf.Tensor` of same size, type and sparsity as `x`. """ return gen_math_ops.sqrt(x, name) + + +# pylint: disable=g-docstring-has-escape +@tf_export("math.exp", "exp") +@dispatch.add_dispatch_support +def exp(x, name=None): + """Computes exponential of x element-wise. \\(y = e^x\\). + + This function computes the exponential of the input tensor element-wise. + i.e. `math.exp(x)` or \\(e^x\\), where `x` is the input tensor. + \\(e\\) denotes Euler's number and is approximately equal to 2.718281. + Output is positive for any real input. + + >>> x = tf.constant(2.0) + >>> tf.math.exp(x) + + + >>> x = tf.constant([2.0, 8.0]) + >>> tf.math.exp(x) + + + For complex numbers, the exponential value is calculated as + \\(e^{x+iy}={e^x}{e^{iy}}={e^x}{\\cos(y)+i\\sin(y)}\\) + + For `1+1j` the value would be computed as: + \\(e^1{\\cos(1)+i\\sin(1)} = 2.7182817 \\times (0.5403023+0.84147096j)\\) + + >>> x = tf.constant(1 + 1j) + >>> tf.math.exp(x) + + + Args: + x: A `tf.Tensor`. Must be one of the following types: `bfloat16`, `half`, + `float32`, `float64`, `complex64`, `complex128`. + name: A name for the operation (optional). + + Returns: + A `tf.Tensor`. Has the same type as `x`. 
+ + @compatibility(numpy) + Equivalent to np.exp + @end_compatibility + """ + return gen_math_ops.exp(x, name) + + +# pylint: enable=g-docstring-has-escape From 8e25bb77707a4ee09a3d8287906be65f6bb51ab6 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 5 Dec 2019 21:53:41 -0800 Subject: [PATCH 244/383] Lower TensorFlow reduction ops with scalar dimensions attribute to HLO PiperOrigin-RevId: 284118982 Change-Id: I35d9c2a43e0adfdaa5df861433f8e75593f7aca0 --- tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir | 10 ++++++++++ tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc | 3 +-- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index 10567d8143c..b111d41d954 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1555,6 +1555,16 @@ func @mean(%arg0: tensor<4x8xf16>) -> tensor<4x1xf16> { return %0 : tensor<4x1xf16> } +// CHECK-LABEL: func @mean_scalar_dim +func @mean_scalar_dim(%arg0: tensor<4x8xf16>) -> tensor<4x1xf16> { + // Verify that tf.Mean op with scalar attributes are lowered successfully. + + // CHECK-NOT: tf.Mean + %dimension = "tf.Const"() { value = dense<1> : tensor } : () -> tensor + %0 = "tf.Mean"(%arg0, %dimension) { keep_dims = true }: (tensor<4x8xf16>, tensor) -> tensor<4x1xf16> + return %0 : tensor<4x1xf16> +} + // CHECK-LABEL: func @mean_dynamic func @mean_dynamic(%arg0: tensor<4x?xf16>) -> tensor<4x1xf16> { %dimension = "tf.Const"() { value = dense<1> : tensor<1xi64> } : () -> tensor<1xi64> diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index c6dc2c01570..2d82b8c78bd 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -1299,8 +1299,7 @@ class GenericConvertReductionOp : public OpRewritePattern { ArrayRef input_shape = input_ty.getShape(); DenseIntElementsAttr dimensions; - if (!matchPattern(op.reduction_indices(), m_Constant(&dimensions)) || - dimensions.getType().getRank() != 1) + if (!matchPattern(op.reduction_indices(), m_Constant(&dimensions))) return this->matchFailure(); // Build the final shape from input_shape and dimensions using a bitmap From 1c4c9b85c5afda8ddb587528930a3c67c1724b64 Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Thu, 5 Dec 2019 22:22:59 -0800 Subject: [PATCH 245/383] Handle 64-bit integer operands in TF SliceOp lowering to HLO PiperOrigin-RevId: 284122095 Change-Id: Ia1baec690f7d7b58cef709cd82ce21c52f25b130 --- .../compiler/mlir/xla/tests/legalize-tf.mlir | 23 +++++++++++++++---- .../mlir/xla/transforms/legalize_tf.cc | 7 +++--- .../xla/transforms/legalize_tf_patterns.td | 13 ++++++----- 3 files changed, 30 insertions(+), 13 deletions(-) diff --git a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir index b111d41d954..5e8e093bebd 100644 --- a/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/xla/tests/legalize-tf.mlir @@ -1410,7 +1410,8 @@ func @expand_dims(%arg0: tensor<2xf32>, %axis: tensor) -> tensor<1x2xf32> { // CHECK-LABEL: slice_constant_start func @slice_constant_start(%arg0: tensor<4xi32>) -> tensor<2xi32> { // CHECK: %[[START:.*]] = xla_hlo.constant dense<1> : tensor<1xi64> - // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START]]) {slice_sizes = dense<2> : tensor<1xi64>} : 
(tensor<4xi32>, tensor<1xi64>) -> tensor<2xi32> + // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START]]) : (tensor<1xi64>) -> tensor<1xi64> + // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START_I64]]) {slice_sizes = dense<2> : tensor<1xi64>} : (tensor<4xi32>, tensor<1xi64>) -> tensor<2xi32> // CHECK: return %[[RESULT]] : tensor<2xi32> %starts = "tf.Const"() {value = dense<[1]> : tensor<1xi64>} : () -> (tensor<1xi64>) %sizes = "tf.Const"() {value = dense<[2]> : tensor<1xi64>} : () -> (tensor<1xi64>) @@ -1418,10 +1419,22 @@ func @slice_constant_start(%arg0: tensor<4xi32>) -> tensor<2xi32> { return %0 : tensor<2xi32> } +// CHECK-LABEL: slice_i32_consts +func @slice_i32_consts(%arg0: tensor<4xi32>) -> tensor<2xi32> { + // CHECK: %[[START:.*]] = xla_hlo.constant dense<1> : tensor<1xi32> + // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START]]) : (tensor<1xi32>) -> tensor<1xi64> + // CHECK: slice_sizes = dense<2> : tensor<1xi64> + %starts = "tf.Const"() {value = dense<[1]> : tensor<1xi32>} : () -> (tensor<1xi32>) + %sizes = "tf.Const"() {value = dense<[2]> : tensor<1xi32>} : () -> (tensor<1xi32>) + %0 = "tf.Slice"(%arg0, %starts, %sizes) : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + return %0 : tensor<2xi32> +} + // CHECK-LABEL: slice_constant_start_negative_one_size func @slice_constant_start_negative_one_size(%arg0: tensor<4xi32>) -> tensor<3xi32> { // CHECK: %[[START:.*]] = xla_hlo.constant dense<1> : tensor<1xi64> - // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START]]) {slice_sizes = dense<3> : tensor<1xi64>} : (tensor<4xi32>, tensor<1xi64>) -> tensor<3xi32> + // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START]]) : (tensor<1xi64>) -> tensor<1xi64> + // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START_I64]]) {slice_sizes = dense<3> : tensor<1xi64>} : (tensor<4xi32>, tensor<1xi64>) -> tensor<3xi32> // CHECK: return %[[RESULT]] : tensor<3xi32> %starts = "tf.Const"() {value = dense<[1]> : tensor<1xi64>} : () -> (tensor<1xi64>) %sizes = "tf.Const"() {value = dense<[-1]> : tensor<1xi64>} : () -> (tensor<1xi64>) @@ -1432,7 +1445,8 @@ func @slice_constant_start_negative_one_size(%arg0: tensor<4xi32>) -> tensor<3xi // CHECK-LABEL: slice_constant_start_dynamic_shape func @slice_constant_start_dynamic_shape(%arg0: tensor, %arg1: tensor<2xi64>) -> tensor<1x4xi32> { // CHECK: %[[START:.*]] = xla_hlo.constant dense<[1, 0]> : tensor<2xi64> - // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START]]) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor, tensor<2xi64>) -> tensor<1x4xi32> + // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%[[START]]) : (tensor<2xi64>) -> tensor<2xi64> + // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START_I64]]) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor, tensor<2xi64>) -> tensor<1x4xi32> // CHECK: return %[[RESULT]] : tensor<1x4xi32> %starts = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> (tensor<2xi64>) %sizes = "tf.Const"() {value = dense<[1, 4]> : tensor<2xi64>} : () -> (tensor<2xi64>) @@ -1442,7 +1456,8 @@ func @slice_constant_start_dynamic_shape(%arg0: tensor, %arg1: tensor<2 // CHECK-LABEL: slice_variable_start func @slice_variable_start(%arg0: tensor<3x4xi32>, %arg1: tensor<2xi64>) -> tensor<1x4xi32> { - // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %arg1) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor<2xi64>) -> tensor<1x4xi32> + // CHECK: %[[START_I64:.*]] = "xla_hlo.convert"(%arg1) : 
(tensor<2xi64>) -> tensor<2xi64> + // CHECK: %[[RESULT:.*]] = "xla_hlo.dynamic-slice"(%arg0, %[[START_I64]]) {slice_sizes = dense<[1, 4]> : tensor<2xi64>} : (tensor<3x4xi32>, tensor<2xi64>) -> tensor<1x4xi32> // CHECK: return %[[RESULT]] : tensor<1x4xi32> %sizes = "tf.Const"() {value = dense<[1, 4]> : tensor<2xi64>} : () -> (tensor<2xi64>) %0 = "tf.Slice"(%arg0, %arg1, %sizes) : (tensor<3x4xi32>, tensor<2xi64>, tensor<2xi64>) -> tensor<1x4xi32> diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc index 2d82b8c78bd..37341b82b23 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf.cc @@ -127,8 +127,8 @@ static llvm::Optional GetIntegerHLOAxisFromTFAxis(Value *value, /// Returns a `ConvertOp` that casts the elements to a i64 type while retaining /// the shape of the input value. -static ConvertOp CastElementsToI64(Location loc, Value *value, - PatternRewriter *rewriter) { +static ConvertOp CastValueToI64(Location loc, Value *value, + PatternRewriter *rewriter) { return rewriter->create(loc, value, rewriter->getIntegerType(64)); } @@ -419,7 +419,8 @@ static DenseIntElementsAttr TFSliceSizes2HLOSliceSizes( Builder *builder) { DenseIntElementsAttr constant_start_indices; if (!matchPattern(start_indices, m_Constant(&constant_start_indices))) { - return slice_sizes; + return xla::ConvertElementsAttr(slice_sizes, builder->getIntegerType(64)) + .cast(); } auto input_ty = input->getType().dyn_cast(); diff --git a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td index bbe0f43fdba..71821bb4578 100644 --- a/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/xla/transforms/legalize_tf_patterns.td @@ -29,6 +29,9 @@ def FeatureDimension : NativeCodeCall< def FalseBoolAttr : AttrConstraint>; def TrueBoolAttr : AttrConstraint>; +def CastValueToI64: NativeCodeCall< + "CastValueToI64($0->getLoc(), $1, &$_builder)">; + def : Pattern< (TF_FusedBatchNormOp:$root $x, $scale, $offset, $mean, $variance, $epsilon, $data_format, FalseBoolAttr:$is_training), @@ -345,9 +348,9 @@ def TFSliceSizes2HLOSliceSizes : NativeCodeCall< "TFSliceSizes2HLOSliceSizes($0, $1, $2.cast()," "&$_builder)">; -def : Pat<(TF_SliceOp HLO_Tensor:$input, HLO_Tensor:$starting_indices, - (TF_ConstOp I64ElementsAttr:$slice_sizes)), - (HLO_DynamicSliceOp $input, $starting_indices, +def : Pat<(TF_SliceOp:$op HLO_Tensor:$input, HLO_Tensor:$starting_indices, + (TF_ConstOp $slice_sizes)), + (HLO_DynamicSliceOp $input, (CastValueToI64 $op, $starting_indices), (TFSliceSizes2HLOSliceSizes $input, $starting_indices, $slice_sizes)), [(CanBeTranslatedToDynamicSlice $input, $starting_indices, $slice_sizes)]>; @@ -410,8 +413,6 @@ foreach TfOp = [TF_ExpandDimsOp, TF_ReshapeOp, TF_SqueezeOp, ] in { //===----------------------------------------------------------------------===// // RngUniform. //===----------------------------------------------------------------------===// -def CastElementsToI64: NativeCodeCall< - "CastElementsToI64($0->getLoc(), $1, &$_builder)">; // TODO(misard,phawkins): handle random number generator seeds/states correctly. 
def : Pat<(TF_RandomUniformOp:$old $shape, $seed, $seed2), @@ -420,5 +421,5 @@ def : Pat<(TF_RandomUniformOp:$old $shape, $seed, $seed2), (NativeCodeCall<"$_builder.getFloatAttr(old.dtype(), 0.0)">)), (HLO_ConstOp (NativeCodeCall<"$_builder.getFloatAttr(old.dtype(), 1.0)">)), - (CastElementsToI64 $old, $shape)), + (CastValueToI64 $old, $shape)), [(IsShapedTensor $shape)]>; From 60b8dcfee1482b6620773fa18e3e57182af0d7a0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 01:03:09 -0800 Subject: [PATCH 246/383] compat: Update forward compatibility horizon to 2019-12-06 PiperOrigin-RevId: 284137671 Change-Id: Ic9582381dd0526ceceae193969114f38c618e53d --- tensorflow/python/compat/compat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py index 1152256b463..ed796ef1a76 100644 --- a/tensorflow/python/compat/compat.py +++ b/tensorflow/python/compat/compat.py @@ -31,7 +31,7 @@ from tensorflow.python.util.tf_export import tf_export # This value changes every day with an automatic CL. It can be modified in code # via `forward_compatibility_horizon()` or with the environment variable # TF_FORWARD_COMPATIBILITY_DELTA_DAYS, which is added to the compatibility date. -_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 12, 5) +_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2019, 12, 6) _FORWARD_COMPATIBILITY_DELTA_DAYS_VAR_NAME = "TF_FORWARD_COMPATIBILITY_DELTA_DAYS" _FORWARD_COMPATIBILITY_DATE_NUMBER = None From f62a214c2202573fb13b0b601f721c6d757fad68 Mon Sep 17 00:00:00 2001 From: Dero Gharibian Date: Fri, 6 Dec 2019 01:08:33 -0800 Subject: [PATCH 247/383] Migrate from std::string to tensorflow::tstring. Note that during the transition period tstring is typedef'ed to std::string. This is a part of a larger migration effort for tensorflow::tstring. See: https://github.com/tensorflow/community/pull/91 PiperOrigin-RevId: 284138845 Change-Id: Ia478fde1a34804b54aca402e336e60cd75744852 --- tensorflow/compiler/xla/tools/replay_computation.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index 67a2c26201a..095655085e5 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -346,10 +346,10 @@ StatusOr> ParseRecordIoFile(absl::string_view filename, std::vector snapshots; uint64 offset = 0; - string record; + tensorflow::tstring record; while (reader.ReadRecord(&offset, &record).ok()) { HloSnapshot snapshot; - if (snapshot.mutable_hlo()->ParseFromString(record)) { + if (snapshot.mutable_hlo()->ParseFromStringPiece(record)) { snapshots.push_back(std::move(snapshot)); } else { LOG(ERROR) << "Encountered bad proto"; From acb8b516ee9662f3ab78f1401ccb563d54aecc43 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 01:08:40 -0800 Subject: [PATCH 248/383] LLVM::AddressOfOp: properly take into account the address space The AddressOf operation in the LLVM dialect return a pointer to a global variable. The latter may be in a non-default address space as indicated by the "addr_space" attribute. Check that the address space of the pointer returned by AddressOfOp matches that of the referenced GlobalOp. Update the AddressOfOp builder to respect this constraint. 
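As a rough illustration of the invariant being enforced (this sketch is not part of the change; the helper name and include paths are assumptions against the MLIR tree of this period), building an AddressOfOp from a GlobalOp placed in a non-default address space now yields a pointer type carrying that same address space, which is exactly what the tightened verifier checks:

#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Builders.h"

// Returns true iff the llvm.mlir.addressof result type carries the
// referenced global's address space, mirroring the verifier change below.
static bool AddressMatchesGlobalAddrSpace(mlir::OpBuilder &builder,
                                          mlir::Location loc,
                                          mlir::LLVM::GlobalOp global) {
  auto addr = builder.create<mlir::LLVM::AddressOfOp>(loc, global);
  return addr.getResult()->getType() ==
         global.getType().getPointerTo(global.addr_space().getZExtValue());
}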
PiperOrigin-RevId: 284138860 Change-Id: I2ada419c22b4ba7ac0788cefff199123e4634344 --- third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 5 +++-- third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 573542ba838..66d9ba3f750 100644 --- a/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -509,8 +509,9 @@ def LLVM_AddressOfOp OpBuilder<"Builder *builder, OperationState &result, GlobalOp global, " "ArrayRef attrs = {}", [{ - build(builder, result, global.getType().getPointerTo(), global.sym_name(), - attrs);}]> + build(builder, result, + global.getType().getPointerTo(global.addr_space().getZExtValue()), + global.sym_name(), attrs);}]> ]; let extraClassDeclaration = [{ diff --git a/third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 00911012c1d..fb4555674eb 100644 --- a/third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -825,7 +825,8 @@ static LogicalResult verify(AddressOfOp op) { return op.emitOpError( "must reference a global defined by 'llvm.mlir.global'"); - if (global.getType().getPointerTo() != op.getResult()->getType()) + if (global.getType().getPointerTo(global.addr_space().getZExtValue()) != + op.getResult()->getType()) return op.emitOpError( "the type must be a pointer to the type of the referred global"); From 9b3930304b001c6a4ed3edfbd7a12d381caec633 Mon Sep 17 00:00:00 2001 From: Chao Mei Date: Fri, 6 Dec 2019 01:44:34 -0800 Subject: [PATCH 249/383] Extract out the profiling listener creation as a virtual method so that subclasses could overwrite w/ its own profiling listener. PiperOrigin-RevId: 284142645 Change-Id: Ie66ca638c9e797f5aaf7493b8b5c5180a89eea1f --- .../tools/benchmark/benchmark_tflite_model.cc | 16 ++++++++++------ .../tools/benchmark/benchmark_tflite_model.h | 8 ++++++-- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc index 197907ec9e8..2edbbd06ec4 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc @@ -573,12 +573,8 @@ TfLiteStatus BenchmarkTfLiteModel::Init() { // Install profilers if necessary right after interpreter is created so that // any memory allocations inside the TFLite runtime could be recorded if the // installed profiler profile memory usage information. 
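// Illustrative sketch only, not part of this patch: with the listener creation
// hook made virtual, a subclass of BenchmarkTfLiteModel can supply its own
// listener. "MyListener" and "MyBenchmark" are hypothetical names; the rest
// follows the declarations in benchmark_tflite_model.h and assumes params_ is
// accessible to subclasses as it is to the default implementation.
#include <memory>
#include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"

// BenchmarkListener's hooks have empty default implementations, so a subclass
// may override only what it needs (here: nothing, for brevity).
class MyListener : public tflite::benchmark::BenchmarkListener {};

class MyBenchmark : public tflite::benchmark::BenchmarkTfLiteModel {
 protected:
  std::unique_ptr<tflite::benchmark::BenchmarkListener>
  MayCreateProfilingListener() const override {
    // Honor the same flag as the default implementation; returning nullptr
    // disables profiling entirely.
    if (!params_.Get<bool>("enable_op_profiling")) return nullptr;
    return std::unique_ptr<tflite::benchmark::BenchmarkListener>(
        new MyListener());
  }
};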
- if (params_.Get("enable_op_profiling")) { - profiling_listener_.reset(new ProfilingListener( - interpreter_.get(), - params_.Get("max_profiling_buffer_entries"))); - AddListener(profiling_listener_.get()); - } + profiling_listener_ = MayCreateProfilingListener(); + if (profiling_listener_) AddListener(profiling_listener_.get()); interpreter_->UseNNAPI(params_.Get("use_legacy_nnapi")); interpreter_->SetAllowFp16PrecisionForFp32(params_.Get("allow_fp16")); @@ -771,6 +767,14 @@ std::unique_ptr BenchmarkTfLiteModel::GetOpResolver() return std::unique_ptr(resolver); } +std::unique_ptr +BenchmarkTfLiteModel::MayCreateProfilingListener() const { + if (!params_.Get("enable_op_profiling")) return nullptr; + return std::unique_ptr(new ProfilingListener( + interpreter_.get(), + params_.Get("max_profiling_buffer_entries"))); +} + TfLiteStatus BenchmarkTfLiteModel::RunImpl() { return interpreter_->Invoke(); } } // namespace benchmark diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h index a6fc38a6180..3778cc968bd 100644 --- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h +++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h @@ -71,6 +71,10 @@ class BenchmarkTfLiteModel : public BenchmarkModel { // Allow subclasses to create a customized Op resolver during init. virtual std::unique_ptr GetOpResolver() const; + // Create a BenchmarkListener that's specifically for TFLite profiling if + // necessary. + virtual std::unique_ptr MayCreateProfilingListener() const; + void CleanUp(); std::unique_ptr model_; @@ -103,8 +107,8 @@ class BenchmarkTfLiteModel : public BenchmarkModel { std::vector inputs_; std::vector inputs_data_; - std::unique_ptr profiling_listener_; - std::unique_ptr gemmlowp_profiling_listener_; + std::unique_ptr profiling_listener_ = nullptr; + std::unique_ptr gemmlowp_profiling_listener_ = nullptr; TfLiteDelegatePtrMap delegates_; std::mt19937 random_engine_; From d0e55241d5a26df7876fdb6eeb652fc9678950a5 Mon Sep 17 00:00:00 2001 From: Stefano Galarraga Date: Fri, 6 Dec 2019 02:47:23 -0800 Subject: [PATCH 250/383] Add details about failed NNAPI operation when logging NNAPI errors. It also Converts the error code into the associated NNAPI constant name. PiperOrigin-RevId: 284149818 Change-Id: I3b6be19b93b40ada764c2edcd661c86619b9b830 --- .../lite/delegates/nnapi/nnapi_delegate.cc | 122 ++++++++++++------ tensorflow/lite/nnapi/NeuralNetworksTypes.h | 4 + 2 files changed, 86 insertions(+), 40 deletions(-) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc index cc73f3020e5..1bb27baf7d4 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc @@ -57,14 +57,47 @@ limitations under the License. namespace tflite { namespace { -// TODO(b/80621585): Consider printing error string, but don't for now to -// minimize binary size. -#define RETURN_TFLITE_ERROR_IF_NN_ERROR(context, code, p_errno) \ +// Returns the enum name corresponding to the given error code if the given +// value corresponds to an of the error codes in the enumeration above or +// an message with the unknown code. 
+// LINT.IfChange(NnApiErrorDescription) +std::string NnApiErrorDescription(int error_code) { + switch (error_code) { + case ANEURALNETWORKS_NO_ERROR: + return "ANEURALNETWORKS_NO_ERROR"; + case ANEURALNETWORKS_OUT_OF_MEMORY: + return "ANEURALNETWORKS_OUT_OF_MEMORY"; + case ANEURALNETWORKS_INCOMPLETE: + return "ANEURALNETWORKS_INCOMPLETE"; + case ANEURALNETWORKS_UNEXPECTED_NULL: + return "ANEURALNETWORKS_UNEXPECTED_NULL"; + case ANEURALNETWORKS_BAD_DATA: + return "ANEURALNETWORKS_BAD_DATA"; + case ANEURALNETWORKS_OP_FAILED: + return "ANEURALNETWORKS_OP_FAILED"; + case ANEURALNETWORKS_BAD_STATE: + return "ANEURALNETWORKS_BAD_STATE"; + case ANEURALNETWORKS_UNMAPPABLE: + return "ANEURALNETWORKS_UNMAPPABLE"; + case ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE: + return "ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE"; + case ANEURALNETWORKS_UNAVAILABLE_DEVICE: + return "ANEURALNETWORKS_UNAVAILABLE_DEVICE"; + default: + return "Unknown NNAPI error code: " + std::to_string(error_code); + } +} +// LINT.ThenChange() + +#define RETURN_TFLITE_ERROR_IF_NN_ERROR(context, code, call_desc, p_errno) \ do { \ const auto _code = (code); \ + const auto _call_desc = (call_desc); \ if (_code != ANEURALNETWORKS_NO_ERROR) { \ - context->ReportError(context, "NN API returned error (%d, line %d).\n", \ - _code, __LINE__); \ + const auto error_desc = NnApiErrorDescription(_code); \ + context->ReportError(context, \ + "NN API returned error %s at line %d while %s.\n", \ + error_desc.c_str(), __LINE__, _call_desc); \ *p_errno = _code; \ return kTfLiteError; \ } \ @@ -611,7 +644,7 @@ class NNAPIOpBuilder { RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type), - nnapi_errno_); + "adding operand", nnapi_errno_); dequantized_ann_index = operand_mapping_->add_new_non_tensor_operand(); // Add Dequantize operation. 
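// Hedged illustration, not part of the patch: the helper above maps raw NNAPI
// status codes to their enum names, e.g. NnApiErrorDescription(
// ANEURALNETWORKS_BAD_DATA) yields "ANEURALNETWORKS_BAD_DATA", and an
// unrecognized value such as 1234 yields "Unknown NNAPI error code: 1234".
// With the description argument threaded through
// RETURN_TFLITE_ERROR_IF_NN_ERROR, a failure is therefore reported as, say,
//   "NN API returned error ANEURALNETWORKS_BAD_DATA at line 123 while adding operand."
// (the line number is illustrative) instead of the previous bare
//   "NN API returned error (4, line 123)."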
@@ -623,7 +656,7 @@ class NNAPIOpBuilder { nnapi_->ANeuralNetworksModel_addOperation( nn_model_, ANEURALNETWORKS_DEQUANTIZE, 1, dequantize_input, 1, dequantize_output), - nnapi_errno_); + "adding operation", nnapi_errno_); dequantize_mapping_->Add(ann_index, dequantized_type, dequantized_ann_index); } @@ -645,7 +678,7 @@ class NNAPIOpBuilder { augmented_inputs_.data(), static_cast(augmented_outputs_.size()), augmented_outputs_.data()), - nnapi_errno_); + "adding operation", nnapi_errno_); augmented_inputs_.clear(); augmented_outputs_.clear(); return kTfLiteOk; @@ -660,7 +693,7 @@ class NNAPIOpBuilder { RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type), - nnapi_errno_); + "adding operand", nnapi_errno_); int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index); if (ann_tensor_index != -1) { augmented_inputs_.push_back(ann_tensor_index); @@ -718,7 +751,7 @@ class NNAPIOpBuilder { RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type), - nnapi_errno_); + "adding operand", nnapi_errno_); augmented_inputs_.push_back(ann_tensor_index); @@ -727,7 +760,7 @@ class NNAPIOpBuilder { nnapi_->ANeuralNetworksModel_setOperandValue( nn_model_, ann_tensor_index, new_tensor->data.raw, new_tensor->bytes), - nnapi_errno_); + "setting new operand value", nnapi_errno_); return kTfLiteOk; } @@ -774,13 +807,13 @@ class NNAPIOpBuilder { RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type), - nnapi_errno_); + "adding operand", nnapi_errno_); const int ann_index = operand_mapping_->add_new_non_tensor_operand(); RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_setOperandValue(nn_model_, ann_index, &value, sizeof(T)), - nnapi_errno_); + "setting new operand value", nnapi_errno_); augmented_inputs_.push_back(ann_index); return kTfLiteOk; } @@ -798,14 +831,14 @@ class NNAPIOpBuilder { RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type), - nnapi_errno_); + "adding operand", nnapi_errno_); const int ann_index = operand_mapping_->add_new_non_tensor_operand(); RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_setOperandValue( nn_model_, ann_index, values, sizeof(T) * num_values), - nnapi_errno_); + "settings new operand value", nnapi_errno_); augmented_inputs_.push_back(ann_index); return kTfLiteOk; } @@ -840,7 +873,7 @@ class NNAPIOpBuilder { RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type), - nnapi_errno_); + "adding operand", nnapi_errno_); const int ann_index = operand_mapping_->add_new_non_tensor_operand(); augmented_outputs_.push_back(ann_index); if (ann_index_out) *ann_index_out = ann_index; @@ -960,14 +993,14 @@ class NNAPIOpBuilder { RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_addOperand(nn_model_, &operand_type), - nnapi_errno_); + "adding operand", nnapi_errno_); if (nn_type == ANEURALNETWORKS_TENSOR_QUANT8_SYMM_PER_CHANNEL) { RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_setOperandSymmPerChannelQuantParams( nn_model_, ann_tensor_index, &ann_perchannel_params), - nnapi_errno_); + "setting new operand per channel quantization params", nnapi_errno_); } if (tensor->allocation_type == kTfLiteMmapRo) { if (IsQuantized(tensor_type) && need_int8_conversion) { @@ -1000,7 +1033,7 @@ class NNAPIOpBuilder { 
nnapi_->ANeuralNetworksModel_setOperandValue( nn_model_, ann_tensor_index, new_tensor->data.raw, new_tensor->bytes), - nnapi_errno_); + "setting new operand value", nnapi_errno_); #ifdef TFLITE_NNAPI_ALLOW_MMAP_SHARING } else if (tensor->allocation && static_cast(tensor->allocation)->type() == @@ -1025,14 +1058,14 @@ class NNAPIOpBuilder { nnapi_->ANeuralNetworksModel_setOperandValueFromMemory( nn_model_, ann_tensor_index, ann_memory_handle, offset, tensor->bytes), - nnapi_errno_); + "setting new operand value from memory", nnapi_errno_); #endif } else { RETURN_TFLITE_ERROR_IF_NN_ERROR( context_, nnapi_->ANeuralNetworksModel_setOperandValue( nn_model_, ann_tensor_index, tensor->data.raw, tensor->bytes), - nnapi_errno_); + "setting new operand value", nnapi_errno_); } } @@ -2910,8 +2943,9 @@ TfLiteStatus NNAPIDelegateKernel::Init(TfLiteContext* context, if (!nn_model_) { ANeuralNetworksModel* model = nullptr; - RETURN_TFLITE_ERROR_IF_NN_ERROR( - context, nnapi_->ANeuralNetworksModel_create(&model), nnapi_errno); + RETURN_TFLITE_ERROR_IF_NN_ERROR(context, + nnapi_->ANeuralNetworksModel_create(&model), + "creating NNAPI model", nnapi_errno); nn_model_.reset(model); TF_LITE_ENSURE_STATUS(BuildGraph(context, params->input_tensors, @@ -2927,11 +2961,12 @@ TfLiteStatus NNAPIDelegateKernel::Init(TfLiteContext* context, nnapi_->ANeuralNetworksCompilation_createForDevices( nn_model_.get(), nnapi_devices_.data(), nnapi_devices_.size(), &compilation), - nnapi_errno); + "creating NNAPI model for given devices", nnapi_errno); } else { RETURN_TFLITE_ERROR_IF_NN_ERROR(context, nnapi_->ANeuralNetworksCompilation_create( nn_model_.get(), &compilation), + "creating NNAPI compilation", nnapi_errno); } @@ -2945,7 +2980,9 @@ TfLiteStatus NNAPIDelegateKernel::Init(TfLiteContext* context, nnapi_->ANeuralNetworksCompilation_free(compilation); compilation = nullptr; } - RETURN_TFLITE_ERROR_IF_NN_ERROR(context, preference_result, nnapi_errno); + RETURN_TFLITE_ERROR_IF_NN_ERROR(context, preference_result, + "setting compilation preferences", + nnapi_errno); } const char* cache_dir = delegate_options.cache_dir; @@ -2978,7 +3015,8 @@ TfLiteStatus NNAPIDelegateKernel::Init(TfLiteContext* context, nnapi_->ANeuralNetworksCompilation_free(compilation); compilation = nullptr; } - RETURN_TFLITE_ERROR_IF_NN_ERROR(context, set_caching_result, nnapi_errno); + RETURN_TFLITE_ERROR_IF_NN_ERROR(context, set_caching_result, + "configuring NNAPI caching", nnapi_errno); } const int finish_result = nnapi_->ANeuralNetworksCompilation_finish(compilation); @@ -2986,7 +3024,8 @@ TfLiteStatus NNAPIDelegateKernel::Init(TfLiteContext* context, nnapi_->ANeuralNetworksCompilation_free(compilation); compilation = nullptr; } - RETURN_TFLITE_ERROR_IF_NN_ERROR(context, finish_result, nnapi_errno); + RETURN_TFLITE_ERROR_IF_NN_ERROR( + context, finish_result, "completing NNAPI compilation", nnapi_errno); nn_compilation_.reset(compilation); } return kTfLiteOk; @@ -3007,7 +3046,7 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, RETURN_TFLITE_ERROR_IF_NN_ERROR(context, nnapi_->ANeuralNetworksExecution_create( nn_compilation_.get(), &execution), - nnapi_errno); + "creating NNAPI execution", nnapi_errno); std::unique_ptr execution_unique_ptr(execution); @@ -3030,6 +3069,7 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, execution, relative_input_index, nullptr, tensor_memory_map_->at(tensor->buffer_handle).memory, 0, tensor->bytes), + "associating NNAPI execution input with a memory object", nnapi_errno); 
relative_input_index++; continue; @@ -3077,6 +3117,7 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, nnapi_->ANeuralNetworksExecution_setInputFromMemory( execution, relative_input_index, nullptr, nn_input_memory_->get_handle(), input_offset, tensor_size), + "associating NNAPI execution input with a memory object", nnapi_errno); } else { // copy data to pre-allocated shared memory. @@ -3087,6 +3128,7 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, nnapi_->ANeuralNetworksExecution_setInputFromMemory( execution, relative_input_index, nullptr, nn_input_memory_->get_handle(), input_offset, tensor->bytes), + "associating NNAPI execution input with a memory object", nnapi_errno); tensor_size = tensor->bytes; } @@ -3114,7 +3156,7 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, execution, relative_output_index, nullptr, tensor_memory_map_->at(tensor->buffer_handle).memory, 0, tensor->bytes), - nnapi_errno); + "associating NNAPI execution output to a memory object", nnapi_errno); } else { RETURN_TFLITE_ERROR_IF_NN_ERROR( @@ -3122,7 +3164,7 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, nnapi_->ANeuralNetworksExecution_setOutputFromMemory( execution, relative_output_index, nullptr, nn_output_memory_->get_handle(), output_offset, tensor->bytes), - nnapi_errno); + "associating NNAPI execution output to a memory object", nnapi_errno); output_offset += tensor->bytes; output_offset += getNumPaddingBytes(tensor->bytes); } @@ -3142,7 +3184,7 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, nnapi_->ANeuralNetworksExecution_setOutput( execution, relative_output_index, nullptr, tensor->data.raw, tensor->bytes), - nnapi_errno); + "associating NNAPI execution output to a buffer", nnapi_errno); relative_output_index++; } // Invoke ANN in blocking fashion. @@ -3151,15 +3193,17 @@ TfLiteStatus NNAPIDelegateKernel::Invoke(TfLiteContext* context, RETURN_TFLITE_ERROR_IF_NN_ERROR( context, nnapi_->ANeuralNetworksExecution_startCompute(execution, &event), - nnapi_errno); + "starting async computation", nnapi_errno); const int wait_result = nnapi_->ANeuralNetworksEvent_wait(event); nnapi_->ANeuralNetworksEvent_free(event); - RETURN_TFLITE_ERROR_IF_NN_ERROR(context, wait_result, nnapi_errno); + RETURN_TFLITE_ERROR_IF_NN_ERROR(context, wait_result, + "waiting for async computation completion", + nnapi_errno); } else { // Use synchronous execution for NNAPI 1.2+. RETURN_TFLITE_ERROR_IF_NN_ERROR( context, nnapi_->ANeuralNetworksExecution_compute(execution), - nnapi_errno); + "running computation", nnapi_errno); } // copy results from shared memory to the destination. @@ -3567,21 +3611,19 @@ TfLiteStatus NNAPIDelegateKernel::BuildGraph( nnapi_->ANeuralNetworksModel_identifyInputsAndOutputs( nn_model_.get(), inputs.size(), inputs.data(), outputs.size(), outputs.data()), - nnapi_errno); + "identifying model inputs and outputs", nnapi_errno); - // Set relaxed computation mode for fp32 if possible. if (nnapi_->android_sdk_version >= kMinSdkVersionForNNAPI11) { RETURN_TFLITE_ERROR_IF_NN_ERROR( context, nnapi_->ANeuralNetworksModel_relaxComputationFloat32toFloat16( nn_model_.get(), context->allow_fp32_relax_to_fp16), - nnapi_errno); + "set relaxed computation mode for fp32 if possible", nnapi_errno); } - // Finalize the model RETURN_TFLITE_ERROR_IF_NN_ERROR( context, nnapi_->ANeuralNetworksModel_finish(nn_model_.get()), - nnapi_errno); + "finalizing the model", nnapi_errno); // Create shared memory pool for inputs and outputs. 
nn_input_memory_.reset( @@ -3740,7 +3782,7 @@ TfLiteStatus StatefulNnApiDelegate::DoPrepare(TfLiteContext* context, uint32_t device_count = 0; RETURN_TFLITE_ERROR_IF_NN_ERROR( context, nnapi->ANeuralNetworks_getDeviceCount(&device_count), - nnapi_errno); + "getting number of NNAPI devices", nnapi_errno); if (device_count <= 1) { return kTfLiteOk; } diff --git a/tensorflow/lite/nnapi/NeuralNetworksTypes.h b/tensorflow/lite/nnapi/NeuralNetworksTypes.h index b4ec12ee14d..8c99f6f25bb 100644 --- a/tensorflow/lite/nnapi/NeuralNetworksTypes.h +++ b/tensorflow/lite/nnapi/NeuralNetworksTypes.h @@ -18,6 +18,8 @@ limitations under the License. #include #include +#include + typedef struct AHardwareBuffer AHardwareBuffer; // NN api types based on NNAPI header file @@ -159,6 +161,7 @@ enum { /** * Result codes. */ +// LINT.IfChange enum { ANEURALNETWORKS_NO_ERROR = 0, ANEURALNETWORKS_OUT_OF_MEMORY = 1, @@ -171,6 +174,7 @@ enum { ANEURALNETWORKS_OUTPUT_INSUFFICIENT_SIZE = 8, ANEURALNETWORKS_UNAVAILABLE_DEVICE = 9, }; +// LINT.ThenChange(//tensorflow/lite/delegates/nnapi/nnapi_delegate.cc:NnApiErrorDescription) /** * Implicit padding algorithms. From dbef8861c9310d5c2c05f4df876ccc100c097dc2 Mon Sep 17 00:00:00 2001 From: Stefano Galarraga Date: Fri, 6 Dec 2019 03:05:48 -0800 Subject: [PATCH 251/383] Add change guards to constants copied in linter.proto PiperOrigin-RevId: 284151950 Change-Id: I62dca705314590a3030c27ed9c20e4959610dd5b --- tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h | 2 ++ tensorflow/lite/schema/schema.fbs | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h index 6a9493f9f4d..db263a195f4 100644 --- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h +++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_kernel.h @@ -169,6 +169,7 @@ class NNMemory { ANeuralNetworksMemory* nn_memory_handle_ = nullptr; }; + enum class NNAPIValidationFailureType : int { // The operator is not supported by either NNAPI or the NNAPI Delegate. kUnsupportedOperator = 0, @@ -226,6 +227,7 @@ enum class NNAPIValidationFailureType : int { kUnsupportedQuantizationParameters = 15, }; + struct NNAPIValidationFailure { NNAPIValidationFailureType type; std::string message; diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs index 63fd3bbc4d6..7e70f986998 100644 --- a/tensorflow/lite/schema/schema.fbs +++ b/tensorflow/lite/schema/schema.fbs @@ -185,6 +185,7 @@ table Tensor { // ones, but not by much. Moreover, while custom operators accept an opaque // object containing configuration parameters, builtins have a predetermined // set of acceptable options. + enum BuiltinOperator : byte { ADD = 0, AVERAGE_POOL_2D = 1, @@ -318,6 +319,7 @@ enum BuiltinOperator : byte { SCATTER_ND = 122 } + // Options for the builtin operators. union BuiltinOptions { Conv2DOptions, From ba509fda808016f37c641fb73feef0a2d94b7a68 Mon Sep 17 00:00:00 2001 From: Dan Moldovan Date: Fri, 6 Dec 2019 04:44:23 -0800 Subject: [PATCH 252/383] Enable AutoGraph in MirroredStrategy.merge_call, Strategy.extended.update and Strategy.extended.update_non_slot. 
PiperOrigin-RevId: 284162125 Change-Id: Ie85a981c43e980800aa912b1b54c8464f6d47a09 --- tensorflow/python/distribute/BUILD | 1 + .../python/distribute/distribute_lib.py | 6 +++ .../python/distribute/distribute_lib_test.py | 44 +++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD index 7ce283749f3..a04e0b2b3ea 100644 --- a/tensorflow/python/distribute/BUILD +++ b/tensorflow/python/distribute/BUILD @@ -187,6 +187,7 @@ py_test( "//tensorflow/python:util", "//tensorflow/python:variable_scope", "//tensorflow/python:variables", + "//tensorflow/python/autograph/core:test_lib", "//tensorflow/python/data/ops:dataset_ops", "//third_party/py/numpy", ], diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index df9eccc8038..e988499292e 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -1538,6 +1538,8 @@ class StrategyExtendedV2(object): _require_cross_replica_or_default_context_extended(self) if kwargs is None: kwargs = {} + fn = autograph.tf_convert( + fn, ag_ctx.control_status_ctx(), convert_by_default=False) with self._container_strategy().scope(): return self._update(var, fn, args, kwargs, group) @@ -1562,6 +1564,8 @@ class StrategyExtendedV2(object): _require_cross_replica_or_default_context_extended(self) if kwargs is None: kwargs = {} + fn = autograph.tf_convert( + fn, ag_ctx.control_status_ctx(), convert_by_default=False) with self._container_strategy().scope(): return self._update_non_slot(colocate_with, fn, args, kwargs, group) @@ -1945,6 +1949,8 @@ class ReplicaContext(object): require_replica_context(self) if kwargs is None: kwargs = {} + merge_fn = autograph.tf_convert(merge_fn, ag_ctx.control_status_ctx(), + convert_by_default=False) return self._merge_call(merge_fn, args, kwargs) def _merge_call(self, merge_fn, args, kwargs): diff --git a/tensorflow/python/distribute/distribute_lib_test.py b/tensorflow/python/distribute/distribute_lib_test.py index fb8116d4ab2..588605864e2 100644 --- a/tensorflow/python/distribute/distribute_lib_test.py +++ b/tensorflow/python/distribute/distribute_lib_test.py @@ -21,6 +21,7 @@ from __future__ import print_function from absl.testing import parameterized import numpy as np +from tensorflow.python.autograph.core import converter_testing from tensorflow.python.data.ops import dataset_ops from tensorflow.python.distribute import combinations from tensorflow.python.distribute import distribute_lib @@ -29,6 +30,7 @@ from tensorflow.python.distribute import input_lib from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import values from tensorflow.python.eager import context +from tensorflow.python.eager import def_function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops @@ -387,6 +389,21 @@ class TestStrategyTest(test.TestCase): self.assertIs(tt, t) dist.extended.update(v, assign_fn, (t,)) + @_run_in_and_out_of_scope + def testUpdateAutoGraph(self, dist): + with dist.scope(): + v = variables.Variable(1.) + t = constant_op.constant(2.) 
+ + def assign_fn(unused_vv, unused_tt): + self.assertTrue(converter_testing.is_inside_generated_code()) + + @def_function.function # AutoGraph is default-on only within tf.function + def test_fn(): + dist.extended.update(v, assign_fn, (t,)) + + test_fn() + @_run_in_and_out_of_scope def testUpdateNonSlot(self, dist): t = constant_op.constant(2.) @@ -394,6 +411,19 @@ class TestStrategyTest(test.TestCase): dist.extended.update_non_slot(t, lambda: update_calls.append(1)) self.assertEqual(len(update_calls), 1) + @_run_in_and_out_of_scope + def testUpdateNonSlotAutoGraph(self, dist): + t = constant_op.constant(2.) + + def update_fn(): + self.assertTrue(converter_testing.is_inside_generated_code()) + + @def_function.function # AutoGraph is default-on only within tf.function + def test_fn(): + dist.extended.update_non_slot(t, update_fn) + + test_fn() + # _TestStrategy2 is like _TestStrategy, except it doesn't change variable # creation. @@ -428,6 +458,20 @@ class DefaultDistributionStrategyTest(test.TestCase, parameterized.TestCase): self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, args=("bar",))) _assert_in_default_state(self) + def testMergeCallAutoGraph(self): + _assert_in_default_state(self) + + def merge_fn(_, s): + self.assertTrue(converter_testing.is_inside_generated_code()) + return s + + @def_function.function # AutoGraph is default-on only within tf.function + def test_fn(): + replica_ctx = ds_context.get_replica_context() + replica_ctx.merge_call(merge_fn, args=("bar",)) + + test_fn() + def testScopeMostlyNoOp(self): _assert_in_default_state(self) From 3d21ad0e169a6aac1622a31e65ba8c621b2b027b Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Fri, 6 Dec 2019 05:30:25 -0800 Subject: [PATCH 253/383] Open source run_hlo_module. PiperOrigin-RevId: 284166663 Change-Id: I395f6a0a8efeb60784bdcca4e5227f0ef470f6f7 --- tensorflow/compiler/xla/tools/BUILD | 47 +++++ .../xla/tools/prepare_reference_module.cc | 9 +- .../xla/tools/prepare_reference_module.h | 6 +- .../compiler/xla/tools/run_hlo_module.cc | 145 ++++++++++++++ .../compiler/xla/tools/run_hlo_module.h | 76 ++++++++ .../compiler/xla/tools/run_hlo_module_main.cc | 184 ++++++++++++++++++ 6 files changed, 464 insertions(+), 3 deletions(-) create mode 100644 tensorflow/compiler/xla/tools/run_hlo_module.cc create mode 100644 tensorflow/compiler/xla/tools/run_hlo_module.h create mode 100644 tensorflow/compiler/xla/tools/run_hlo_module_main.cc diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index 77274980698..02d5a1f8ad0 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -292,6 +292,53 @@ cc_library( "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_module_config", "//tensorflow/core/platform:errors", + "//tensorflow/stream_executor:platform", "//tensorflow/stream_executor/lib", ], ) + +cc_library( + name = "run_hlo_module_lib", + testonly = True, + srcs = ["run_hlo_module.cc"], + hdrs = ["run_hlo_module.h"], + deps = [ + ":hlo_module_loader", + ":prepare_reference_module", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla:error_spec", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/client/lib:testing", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_runner", + "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/service:platform_util", + "//tensorflow/compiler/xla/tests:literal_test_util", + 
"//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:status", + "//tensorflow/core/platform:test", + "//tensorflow/stream_executor:platform", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +tf_cc_binary( + name = "run_hlo_module", + testonly = True, + srcs = ["run_hlo_module_main.cc"], + deps = [ + ":run_hlo_module_lib", + "//tensorflow/compiler/xla:debug_options_flags", + "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/service:gpu_plugin", + "//tensorflow/compiler/xla/service:interpreter_plugin", + "//tensorflow/core:framework_internal", + "//tensorflow/core/platform:logging", + "//tensorflow/core/platform:platform_port", + "//tensorflow/core/platform:status", + "//tensorflow/core/platform:test", + "@com_google_absl//absl/strings", + ], +) diff --git a/tensorflow/compiler/xla/tools/prepare_reference_module.cc b/tensorflow/compiler/xla/tools/prepare_reference_module.cc index 1f4cc67205c..65489c2d5db 100644 --- a/tensorflow/compiler/xla/tools/prepare_reference_module.cc +++ b/tensorflow/compiler/xla/tools/prepare_reference_module.cc @@ -26,13 +26,17 @@ limitations under the License. #include "tensorflow/compiler/xla/xla.pb.h" #include "tensorflow/core/platform/errors.h" #include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform.h" namespace xla { StatusOr> PrepareReferenceModule( const HloModule& test_module, + const ::stream_executor::Platform::Id& test_platform_id, const std::function& config_modifier_hook, - const std::function& module_modifier_hook) { + const std::function& module_modifier_hook) { DebugOptions debug_options = GetDebugOptionsFromFlags(); // The combination of fast math and optimizations leads to unsound code // transformations (see third_party/tensorflow/compiler/xla/xla.proto for @@ -47,7 +51,8 @@ StatusOr> PrepareReferenceModule( std::unique_ptr reference_module = test_module.Clone(reference_config, "reference"); if (module_modifier_hook) { - TF_RETURN_IF_ERROR(module_modifier_hook(reference_module.get())); + TF_RETURN_IF_ERROR(module_modifier_hook(test_module, test_platform_id, + reference_module.get())); } else { TF_RETURN_IF_ERROR(Despecializer().Run(reference_module.get()).status()); } diff --git a/tensorflow/compiler/xla/tools/prepare_reference_module.h b/tensorflow/compiler/xla/tools/prepare_reference_module.h index 45341c08637..f98e50fc1e8 100644 --- a/tensorflow/compiler/xla/tools/prepare_reference_module.h +++ b/tensorflow/compiler/xla/tools/prepare_reference_module.h @@ -23,6 +23,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/stream_executor/lib/status.h" +#include "tensorflow/stream_executor/platform.h" namespace xla { @@ -33,8 +34,11 @@ namespace xla { // platforms. StatusOr> PrepareReferenceModule( const HloModule& test_module, + const ::stream_executor::Platform::Id& test_platform_id, const std::function& config_modifier_hook = {}, - const std::function& module_modifier_hook = {}); + const std::function& module_modifier_hook = {}); } // namespace xla diff --git a/tensorflow/compiler/xla/tools/run_hlo_module.cc b/tensorflow/compiler/xla/tools/run_hlo_module.cc new file mode 100644 index 00000000000..39b545af393 --- /dev/null +++ b/tensorflow/compiler/xla/tools/run_hlo_module.cc @@ -0,0 +1,145 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/tools/run_hlo_module.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "absl/types/span.h" +#include "tensorflow/compiler/xla/client/lib/testing.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/error_spec.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_runner.h" +#include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/service/platform_util.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/tools/hlo_module_loader.h" +#include "tensorflow/compiler/xla/tools/prepare_reference_module.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/test.h" + +namespace se = ::stream_executor; + +namespace xla { +namespace { + +Literal ExecuteOnPlatform(std::unique_ptr module, + absl::Span args, + se::Platform* platform, bool run_hlo_passes) { + HloRunner runner(platform); + + TF_QCHECK_OK(VerifyHloModule(module.get(), /*layout_sensitive=*/false, + /*allow_mixed_precision=*/true)) + << " (on " << platform->Name() << ")"; + + std::cerr << "Running HLO module on platform " << platform->Name() << "...\n"; + XLA_VLOG_LINES(1, module->ToString()); + const auto start = std::chrono::high_resolution_clock::now(); + auto result_status = runner.Execute(std::move(module), args, run_hlo_passes); + const auto end = std::chrono::high_resolution_clock::now(); + std::chrono::duration diff = end - start; + std::cerr << "... compiled and ran in " << diff.count() << "s.\n"; + + TF_QCHECK_OK(result_status.status()) + << "Failed to execute on " << platform->Name() << "\n"; + + return result_status.ConsumeValueOrDie(); +} +} // namespace + +::testing::AssertionResult RunAndCompare( + const std::string& hlo_filename, const std::string& test_platform_name, + const std::string& reference_platform_name, std::minstd_rand0* engine, + const RunHloModuleOptions& options, + std::function + reference_module_modifier_hook) { + se::Platform* test_platform = + xla::PlatformUtil::GetPlatform(test_platform_name).ValueOrDie(); + se::Platform* reference_platform = + reference_platform_name.empty() + ? 
nullptr + : xla::PlatformUtil::GetPlatform(reference_platform_name) + .ValueOrDie(); + auto config_modifier = [](HloModuleConfig* config) { config->set_seed(42); }; + + std::unique_ptr test_module = + LoadModuleFromFile(hlo_filename, hlo_module_loader_details::Config(), + options.input_format, config_modifier) + .ValueOrDie(); + const HloModuleProto test_module_proto = test_module->ToProto(); + + std::vector args = MakeFakeArguments(test_module.get(), engine, + options.use_large_float_range) + .ConsumeValueOrDie(); + + if (options.print_literals) { + for (int i = 0; i < args.size(); ++i) { + std::cout << "\n** Argument " << i << " **\n" + << args[i].ToString() << "\n"; + } + } + + std::unique_ptr reference_module; + if (reference_platform != nullptr) { + // PrepareReferenceModule needs to know the *test* platform, in order to + // properly match the test platform's numerics. + reference_module = + PrepareReferenceModule(*test_module, test_platform->id(), + config_modifier, reference_module_modifier_hook) + .ConsumeValueOrDie(); + } + + Literal test_result = ExecuteOnPlatform( + std::move(test_module), args, test_platform, options.run_test_hlo_passes); + if (options.print_literals) { + std::cout << "\n** Result on test platform " << test_platform->Name() + << " **\n" + << test_result.ToString() << "\n"; + } + + if (reference_module == nullptr) { + std::cerr << "Skipping reference platform\n"; + return ::testing::AssertionSuccess(); + } + + Literal reference_result = + ExecuteOnPlatform(std::move(reference_module), args, reference_platform, + options.run_reference_hlo_passes); + + if (options.print_literals) { + std::cout << "\n** Result on reference platform " + << reference_platform->Name() << " **\n" + << reference_result.ToString() << "\n"; + } + ErrorSpec error_spec(static_cast(options.abs_error_bound), + static_cast(options.rel_error_bound)); + return LiteralTestUtil::Near(/*expected=*/reference_result, + /*actual=*/test_result, + /*error_spec=*/error_spec, + /*detailed_message=*/true); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/tools/run_hlo_module.h b/tensorflow/compiler/xla/tools/run_hlo_module.h new file mode 100644 index 00000000000..932cc22f4dd --- /dev/null +++ b/tensorflow/compiler/xla/tools/run_hlo_module.h @@ -0,0 +1,76 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_RUN_HLO_MODULE_H_ +#define TENSORFLOW_COMPILER_XLA_TOOLS_RUN_HLO_MODULE_H_ + +#include +#include +#include + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/stream_executor/platform.h" + +namespace xla { + +// Command-line options to this tool. See main() in run_hlo_module_main.cc for +// descriptions of these fields. 
+struct RunHloModuleOptions { + RunHloModuleOptions() + : platform(""), + reference_platform("default"), + print_literals(false), + run_test_hlo_passes(true), + run_reference_hlo_passes(true), + use_large_float_range(true), + // TODO(b/68721786): These tolerances are set to match the values in the + // isolation test. The goal is to lower these to 0.001. + abs_error_bound(0.1), + rel_error_bound(0.1), + input_format("hlo"), + input_module(""), + iterations(1) {} + std::string platform; + std::string reference_platform; + bool print_literals; + bool run_test_hlo_passes; + bool run_reference_hlo_passes; + bool use_large_float_range; + float abs_error_bound; + float rel_error_bound; + std::string input_format; + std::string input_module; + int iterations; +}; + +// Reads a HloModule from 'hlo_filename', runs it on the platform with the name +// 'test_platform_name', and if 'reference_platform_name' is non-empty, it also +// runs it on the platform with the name 'reference_platform_name' and compares +// the results. 'reference_module_modifier_hook' can be used to transform the +// HloModule before it is run on the reference platform. This may be necessary +// to match the numerics of the test platform. +::testing::AssertionResult RunAndCompare( + const std::string& hlo_filename, const std::string& test_platform_name, + const std::string& reference_platform_name, std::minstd_rand0* engine, + const RunHloModuleOptions& options, + std::function + reference_module_modifier_hook = {}); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_TOOLS_RUN_HLO_MODULE_H_ diff --git a/tensorflow/compiler/xla/tools/run_hlo_module_main.cc b/tensorflow/compiler/xla/tools/run_hlo_module_main.cc new file mode 100644 index 00000000000..7079f413eeb --- /dev/null +++ b/tensorflow/compiler/xla/tools/run_hlo_module_main.cc @@ -0,0 +1,184 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// A tool for reading a HloModule from a HloProto file and execute the module on +// given platform(s). See kUsage for details. + +#include +#include +#include +#include + +#include "absl/strings/str_cat.h" +#include "tensorflow/compiler/xla/debug_options_flags.h" +#include "tensorflow/compiler/xla/tools/run_hlo_module.h" +#include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/util/command_line_flags.h" + +namespace { +const char* const kUsage = R"( +This tool lets you read a HloModule from a file and execute the module on given +platform. + +The file can be one of the followings: +1) a binary or text proto file, the proto should be in xla.HloProto type. +2) a hlo text dump, the string should be in HloModule::ToString() format. 
+
+By default, the module is run on a reference platform such as the interpreter
+and the reference result is compared against the test result.
+
+You can also pass in debug option flags for the HloModule.
+
+Usage:
+
+  bazel run run_hlo_module -- \
+    --input_format=[hlo|pb|pbtxt] \
+    --platform=[CPU|CUDA|Interpreter] \
+    path/to/hlo_module
+)";
+const char kInterpreterPlatformName[] = "Interpreter";
+
+// Returns the name of the test platform.
+std::string GetTestPlatformName(std::string name) {
+  QCHECK(!name.empty()) << "Must pass --platform flag.";
+  return name;
+}
+
+// Returns the name of the reference platform.
+std::string GetReferencePlatformName(std::string reference_platform) {
+  if (reference_platform == "default") {
+    return kInterpreterPlatformName;
+  }
+  return reference_platform;
+}
+}  // namespace
+
+int main(int argc, char** argv) {
+  xla::RunHloModuleOptions opts;
+  std::vector<tensorflow::Flag> flag_list = {
+      tensorflow::Flag(
+          "platform", &opts.platform,
+          "The test platform that the HLO module will be executed on "
+          "(gpu, cpu, etc)."),
+      tensorflow::Flag(
+          "reference_platform", &opts.reference_platform,
+          "The reference platform that the HLO module will be "
+          "executed on. The result produced on the reference platform will "
+          "be compared against the result produced on the test platform. A "
+          "value of 'default' will use the TPU_Interpreter as a reference if "
+          "the test platform is a TPU, and 'interpreter' otherwise. If the "
+          "flag value is the empty string, then the module will not be run "
+          "on a reference platform at all."),
+      tensorflow::Flag("print_literals", &opts.print_literals,
+                       "Print the input and result literals to stdout."),
+      tensorflow::Flag(
+          "run_test_hlo_passes", &opts.run_test_hlo_passes,
+          "Run HLO pass pipeline for the test platform on the HLO module "
+          "before running the module on the test platform. This should be "
+          "set to true if the HLO module is unoptimized and set to false if "
+          "the HLO module already has been optimized."),
+      tensorflow::Flag(
+          "run_reference_hlo_passes", &opts.run_reference_hlo_passes,
+          "Run HLO pass pipeline for the reference platform on the HLO module "
+          "before running the module on the reference platform. "
+          "In general, if the given HLO module was optimized for a platform "
+          "other "
+          "than the reference this is necessary because some HLO passes are "
+          "legalization passes which must be run prior to code generation."),
+
+      tensorflow::Flag(
+          "use_large_float_range", &opts.use_large_float_range,
+          "Generate floating point values using a large uniform-log "
+          "distribution as opposed to a small uniform distribution."),
+      tensorflow::Flag(
+          "abs_error_bound", &opts.abs_error_bound,
+          "The absolute error bound used when comparing the test and "
+          "reference results."),
+      tensorflow::Flag(
+          "rel_error_bound", &opts.rel_error_bound,
+          "The relative error bound used when comparing the test and "
+          "reference results."),
+      tensorflow::Flag("input_format", &opts.input_format,
+                       "The format of the input file. Valid values:\n"
+                       "  hlo : HLO textual format\n"
+                       "  pb : xla::HloProto in binary proto format\n"
+                       "  pbtxt : xla::HloProto in text proto format"),
+      tensorflow::Flag(
+          "input_module", &opts.input_module,
+          "A path to a file containing the HLO module. Can also pass "
+          "this as argv[1], but this flag is more explicit."),
+      tensorflow::Flag(
+          "iterations", &opts.iterations,
+          "The number of times to run the module. 
Each iteration will be run " + "with different input data.")}; + xla::AppendDebugOptionsFlags(&flag_list); + // The usage string includes the message at the top of the file, the + // DebugOptions flags and the flags defined above. + const std::string kUsageString = absl::StrCat( + kUsage, "\n\n", tensorflow::Flags::Usage(argv[0], flag_list)); + bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); + tensorflow::port::InitMain(kUsageString.c_str(), &argc, &argv); + if (!parse_ok) { + LOG(QFATAL) << kUsageString; + } + + const std::string test_platform_name = GetTestPlatformName(opts.platform); + const std::string reference_platform_name = + GetReferencePlatformName(opts.reference_platform); + + std::string hlo_filename; + if (!opts.input_module.empty()) { + hlo_filename = opts.input_module; + } else { + QCHECK(argc == 2) << "Must specify a single input file"; + hlo_filename = argv[1]; + } + + std::minstd_rand0 engine; + int failure_count = 0; + const int iteration_count = opts.iterations; + for (int i = 1; i <= iteration_count; ++i) { + if (iteration_count != 1) { + std::cerr << "\n=== Iteration " << i << "\n"; + } + ::testing::AssertionResult matched = + xla::RunAndCompare(hlo_filename, test_platform_name, + reference_platform_name, &engine, opts); + + // The AssertionResult is only meaningful when the reference is + // used. Without a reference, the test just verifies that nothing blew up + // when running the module. + if (!reference_platform_name.empty()) { + if (matched) { + // Success. + std::cerr << "\n** Results on " << test_platform_name << " and " + << reference_platform_name << " are close enough. **\n"; + } else { + failure_count++; + std::cerr << matched.message() << "\n"; + } + } + } + + if (!reference_platform_name.empty()) { + std::cerr << failure_count << "/" << iteration_count + << " runs miscompared.\n"; + } + + return failure_count == 0 ? 
0 : -1; +} From 421fbae7753c717337d96bad5fd6cb5d58834e0a Mon Sep 17 00:00:00 2001 From: Kazuaki Ishizaki Date: Fri, 6 Dec 2019 05:58:59 -0800 Subject: [PATCH 254/383] minor spelling tweaks Closes #290 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/290 from kiszk:spelling_tweaks_201912 9d9afd16a723dd65754a04698b3976f150a6054a PiperOrigin-RevId: 284169681 Change-Id: I125030cd5081c2e0862f12c9cf8254dc2ad19dd4 --- third_party/mlir/g3doc/DeclarativeRewrites.md | 13 +- third_party/mlir/g3doc/Dialects/GPU.md | 2 +- third_party/mlir/g3doc/OpDefinitions.md | 130 +++++++++--------- third_party/mlir/g3doc/WritingAPass.md | 2 +- .../StandardToLLVM/ConvertStandardToLLVM.cpp | 2 +- .../mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp | 6 +- .../SPIRV/Serialization/Deserializer.cpp | 3 +- third_party/mlir/lib/IR/AsmPrinter.cpp | 2 +- third_party/mlir/lib/IR/Diagnostics.cpp | 2 +- third_party/mlir/lib/IR/Operation.cpp | 2 +- third_party/mlir/lib/IR/SymbolTable.cpp | 2 +- third_party/mlir/lib/Pass/PassTiming.cpp | 2 +- .../mlir/lib/Transforms/LoopFusion.cpp | 6 +- .../tools/mlir-tblgen/OpDefinitionsGen.cpp | 2 +- .../mlir/tools/mlir-tblgen/RewriterGen.cpp | 6 +- .../mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp | 2 +- 16 files changed, 93 insertions(+), 91 deletions(-) diff --git a/third_party/mlir/g3doc/DeclarativeRewrites.md b/third_party/mlir/g3doc/DeclarativeRewrites.md index e319b7d7a83..5adcb320983 100644 --- a/third_party/mlir/g3doc/DeclarativeRewrites.md +++ b/third_party/mlir/g3doc/DeclarativeRewrites.md @@ -259,9 +259,9 @@ def : Pat<(AOp $input, $attr), (COp (AOp $input, $attr) $attr)>; `AOp` is generated via a nested result pattern; DRR won't be able to deduce the result type for it. A custom builder for `AOp` should be defined and it should -deduce the result type by itself. The builder should have the a separate -parameter for each operand and attribute and deduce the result type internally -by itself. For example, for the above `AOp`, a possible builder is: +deduce the result type by itself. The builder should have the separate parameter +for each operand and attribute and deduce the result type internally by itself. +For example, for the above `AOp`, a possible builder is: ```c++ @@ -311,9 +311,10 @@ def DOp : Op<"d_op"> { def : Pat<(AOp $input, $ignored_attr), (DOp (BOp:$b_result) $b_result)>; ``` -In this pattern, a `AOp` is matched and replaced with a `DOp` whose two operands -are from the result of a single `BOp`. This is only possible by binding the -result of the `BOp` to a name and reuse it for the second operand of the `DOp` +In this pattern, an `AOp` is matched and replaced with a `DOp` whose two +operands are from the result of a single `BOp`. 
This is only possible by binding +the result of the `BOp` to a name and reuse it for the second operand of the +`DOp` #### `NativeCodeCall`: transforming the generated op diff --git a/third_party/mlir/g3doc/Dialects/GPU.md b/third_party/mlir/g3doc/Dialects/GPU.md index b1cc30e510f..faa07219e03 100644 --- a/third_party/mlir/g3doc/Dialects/GPU.md +++ b/third_party/mlir/g3doc/Dialects/GPU.md @@ -87,7 +87,7 @@ memory buffers at the module level, we chose to do it at the function level to provide some structuring for the lifetime of those buffers; this avoids the incentive to use the buffers for communicating between different kernels or launches of the same kernel, which should be done through function arguments -intead; we chose not to use `alloca`-style approach that would require more +instead; we chose not to use `alloca`-style approach that would require more complex lifetime analysis following the principles of MLIR that promote structure and representing analysis results in the IR. diff --git a/third_party/mlir/g3doc/OpDefinitions.md b/third_party/mlir/g3doc/OpDefinitions.md index b72b9937ebb..7fb0e53ea17 100644 --- a/third_party/mlir/g3doc/OpDefinitions.md +++ b/third_party/mlir/g3doc/OpDefinitions.md @@ -60,16 +60,17 @@ allowed in a TableGen file (typically with filename suffix `.td`) can be found [here][TableGenIntro]. The formal language specification can be found [here][TableGenRef]. _Roughly_ speaking, -* TableGen `class` is similar to C++ class; it can be templated and subclassed. -* TableGen `def` is similar to C++ object; it can be declared by specializing - a TableGen `class` (e.g., `def MyDef : MyClass<...>;`) or completely - independently (e.g., `def MyDef;`). It cannot be further templated or - subclassed. -* TableGen `dag` is a dedicated type for directed graph of elements. A `dag` - has one operator and zero or more arguments. Its syntax is `(operator arg0, - arg1, argN)`. The operator can be any TableGen `def`; an argument can be - anything, including `dag` itself. We can have names attached to both the - operator and the arguments like `(MyOp:$op_name MyArg:$arg_name)`. +* TableGen `class` is similar to C++ class; it can be templated and + subclassed. +* TableGen `def` is similar to C++ object; it can be declared by specializing + a TableGen `class` (e.g., `def MyDef : MyClass<...>;`) or completely + independently (e.g., `def MyDef;`). It cannot be further templated or + subclassed. +* TableGen `dag` is a dedicated type for directed acyclic graph of elements. A + `dag` has one operator and zero or more arguments. Its syntax is `(operator + arg0, arg1, argN)`. The operator can be any TableGen `def`; an argument can + be anything, including `dag` itself. We can have names attached to both the + operator and the arguments like `(MyOp:$op_name MyArg:$arg_name)`. Please see the [language introduction][TableGenIntro] to learn about all the types and expressions supported by TableGen. @@ -214,13 +215,13 @@ places like constraints. To declare a variadic operand, wrap the `TypeConstraint` for the operand with `Variadic<...>`. -Normally operations have no variadic operands or just one variadic operand. -For the latter case, it is easily deduce which dynamic operands are for the -static variadic operand definition. But if an operation has more than one -variadic operands, it would be impossible to attribute dynamic operands to the +Normally operations have no variadic operands or just one variadic operand. 
For +the latter case, it is easy to deduce which dynamic operands are for the static +variadic operand definition. But if an operation has more than one variadic +operands, it would be impossible to attribute dynamic operands to the corresponding static variadic operand definitions without further information -from the operation. Therefore, the `SameVariadicOperandSize` trait is needed -to indicate that all variadic operands have the same number of dynamic values. +from the operation. Therefore, the `SameVariadicOperandSize` trait is needed to +indicate that all variadic operands have the same number of dynamic values. #### Optional attributes @@ -776,7 +777,7 @@ duplication, which is being worked on right now. ### Enum attributes Some attributes can only take values from an predefined enum, e.g., the -comparsion kind of a comparsion op. To define such attributes, ODS provides +comparison kind of a comparison op. To define such attributes, ODS provides several mechanisms: `StrEnumAttr`, `IntEnumAttr`, and `BitEnumAttr`. * `StrEnumAttr`: each enum case is a string, the attribute is stored as a @@ -1042,53 +1043,54 @@ possible). We considered the approaches of several contemporary systems and focused on requirements that were desirable: -* Ops registered using a registry separate from C++ code. - * Unknown ops are allowed in MLIR, so ops need not be registered. The - ability of the compiler to optimize those ops or graphs containing those - ops is constrained but correct. - * The current proposal does not include a runtime op description, but it - does not preclude such description, it can be added later. - * The op registry is essential for generating C++ classes that make - manipulating ops, verifying correct construction etc. in C++ easier by - providing a typed representation and accessors. -* The op registry will be defined in - [TableGen](https://llvm.org/docs/TableGen/index.html) and be used to - generate C++ classes and utility functions - (builder/verifier/parser/printer). - * TableGen is a modelling specification language used by LLVM's backends - and fits in well with trait based modelling. This is an implementation - decision and there are alternative ways of doing this. But the - specification language is good for the requirements of modelling the - traits (as seen from usage in LLVM processor backend modelling) and easy - to extend, so a practical choice. If another good option comes up, we - will consider it. -* MLIR allows both defined and undefined ops. - * Defined ops should have fixed semantics and could have a corresponding - reference implementation defined using, for example, EDSC. - * Dialects are under full control of the dialect owner and normally live - with the framework of the dialect. -* The op's traits (e.g., commutative) are modelled along with the op in - the registry. -* The op's operand/return type constraints are modelled along with the op in - the registry (see [Shape inference](#shape-inference) discussion below), - this allows (e.g.) optimized concise syntax in textual dumps. -* Behavior of the op is documented along with the op with a summary and a - description. The description is written in markdown and extracted for - inclusion in the generated LangRef section of the dialect. -* The generic assembly form of printing and parsing is available as normal, - but a custom parser and printer can either be specified or automatically - generated from an optional string representation showing the mapping of the - "assembly" string to operands/type. 
- * Parser-level remappings (e.g., `eq` to enum) will be supported as part - of the parser generation. -* Matching patterns are specified separately from the op description. - * Contrasted with LLVM there is no "base" set of ops that every backend - needs to be aware of. Instead there are many different dialects and the - transformations/legalizations between these dialects form a graph of - transformations. -* Reference implementation may be provided along with the op definition. - * The reference implementation may be in terms of either standard ops or - other reference implementations. +* Ops registered using a registry separate from C++ code. + * Unknown ops are allowed in MLIR, so ops need not be registered. The + ability of the compiler to optimize those ops or graphs containing those + ops is constrained but correct. + * The current proposal does not include a runtime op description, but it + does not preclude such description, it can be added later. + * The op registry is essential for generating C++ classes that make + manipulating ops, verifying correct construction etc. in C++ easier by + providing a typed representation and accessors. +* The op registry will be defined in + [TableGen](https://llvm.org/docs/TableGen/index.html) and be used to + generate C++ classes and utility functions + (builder/verifier/parser/printer). + * TableGen is a modelling specification language used by LLVM's backends + and fits in well with trait-based modelling. This is an implementation + decision and there are alternative ways of doing this. But the + specification language is good for the requirements of modelling the + traits (as seen from usage in LLVM processor backend modelling) and easy + to extend, so a practical choice. If another good option comes up, we + will consider it. +* MLIR allows both defined and undefined ops. + * Defined ops should have fixed semantics and could have a corresponding + reference implementation defined using, for example, EDSC. + * Dialects are under full control of the dialect owner and normally live + with the framework of the dialect. +* The op's traits (e.g., commutative) are modelled along with the op in the + registry. +* The op's operand/return type constraints are modelled along with the op in + the registry (see [Shape inference](#shape-inference) discussion below), + this allows (e.g.) optimized concise syntax in textual dumps. +* Behavior of the op is documented along with the op with a summary and a + description. The description is written in markdown and extracted for + inclusion in the generated LangRef section of the dialect. +* The generic assembly form of printing and parsing is available as normal, + but a custom parser and printer can either be specified or automatically + generated from an optional string representation showing the mapping of the + "assembly" string to operands/type. + * Parser-level remappings (e.g., `eq` to enum) will be supported as part + of the parser generation. +* Matching patterns are specified separately from the op description. + * Contrasted with LLVM there is no "base" set of ops that every backend + needs to be aware of. Instead there are many different dialects and the + transformations/legalizations between these dialects form a graph of + transformations. +* Reference implementation may be provided along with the op definition. + + * The reference implementation may be in terms of either standard ops or + other reference implementations. TODO: document expectation if the dependent op's definition changes. 
diff --git a/third_party/mlir/g3doc/WritingAPass.md b/third_party/mlir/g3doc/WritingAPass.md index fc73b7e9ef3..f72d41bea40 100644 --- a/third_party/mlir/g3doc/WritingAPass.md +++ b/third_party/mlir/g3doc/WritingAPass.md @@ -122,7 +122,7 @@ An analysis may provide additional hooks to control various behavior: Given a preserved analysis set, the analysis returns true if it should truly be invalidated. This allows for more fine-tuned invalidation in cases where an -analysis wasn't explicitly marked preserved, but may be preserved(or +analysis wasn't explicitly marked preserved, but may be preserved (or invalidated) based upon other properties such as analyses sets. ### Querying Analyses diff --git a/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp b/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp index 5a6282e8d4d..7b15b758968 100644 --- a/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp +++ b/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp @@ -510,7 +510,7 @@ struct FuncOpConversion : public LLVMLegalizationPattern { attributes.push_back(attr); } - // Create an LLVM funcion, use external linkage by default until MLIR + // Create an LLVM function, use external linkage by default until MLIR // functions have linkage. auto newFuncOp = rewriter.create( op->getLoc(), funcOp.getName(), llvmType, LLVM::Linkage::External, diff --git a/third_party/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp b/third_party/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp index e3b550223e5..694a98fd075 100644 --- a/third_party/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp +++ b/third_party/mlir/lib/Dialect/SPIRV/SPIRVLowering.cpp @@ -71,7 +71,7 @@ mlir::spirv::getEntryPointABIAttr(ArrayRef localSize, Type SPIRVTypeConverter::getIndexType(MLIRContext *context) { // Convert to 32-bit integers for now. Might need a way to control this in // future. - // TODO(ravishankarm): It is porbably better to make it 64-bit integers. To + // TODO(ravishankarm): It is probably better to make it 64-bit integers. To // this some support is needed in SPIR-V dialect for Conversion // instructions. The Vulkan spec requires the builtins like // GlobalInvocationID, etc. to be 32-bit (unsigned) integers which should be @@ -189,7 +189,7 @@ static spirv::GlobalVariableOp getBuiltinVariable(spirv::ModuleOp &moduleOp, return nullptr; } -/// Gets name of global variable for a buitlin. +/// Gets name of global variable for a builtin. static std::string getBuiltinVarName(spirv::BuiltIn builtin) { return std::string("__builtin_var_") + stringifyBuiltIn(builtin).str() + "__"; } @@ -230,7 +230,7 @@ getOrInsertBuiltinVariable(spirv::ModuleOp &moduleOp, Location loc, } /// Gets the global variable associated with a builtin and add -/// it if it doesnt exist. +/// it if it doesn't exist. Value *mlir::spirv::getBuiltinVariableValue(Operation *op, spirv::BuiltIn builtin, OpBuilder &builder) { diff --git a/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp b/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp index 2011c750d83..72d11a19380 100644 --- a/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp +++ b/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp @@ -270,7 +270,6 @@ private: // block and redirect all branches to the old header block to the old // merge block (which contains the spv.selection/spv.loop op now). - /// For OpPhi instructions, we use block arguments to represent them. 
OpPhi /// encodes a list of (value, predecessor) pairs. At the time of handling the /// block containing an OpPhi instruction, the predecessor block might not be @@ -278,7 +277,7 @@ private: /// the block argument from the predecessors. We use the following approach: /// /// 1. For each OpPhi instruction, add a block argument to the current block - /// in construction. Record the block argment in `valueMap` so its uses + /// in construction. Record the block argument in `valueMap` so its uses /// can be resolved. For the list of (value, predecessor) pairs, update /// `blockPhiInfo` for bookkeeping. /// 2. After processing all blocks, loop over `blockPhiInfo` to fix up each diff --git a/third_party/mlir/lib/IR/AsmPrinter.cpp b/third_party/mlir/lib/IR/AsmPrinter.cpp index a3a15dac533..ed97b8b5940 100644 --- a/third_party/mlir/lib/IR/AsmPrinter.cpp +++ b/third_party/mlir/lib/IR/AsmPrinter.cpp @@ -1116,7 +1116,7 @@ void ModulePrinter::printType(Type type) { //===----------------------------------------------------------------------===// namespace { -/// This class provides the main specialication of the DialectAsmPrinter that is +/// This class provides the main specialization of the DialectAsmPrinter that is /// used to provide support for print attributes and types. This hooks allows /// for dialects to hook into the main ModulePrinter. struct CustomDialectAsmPrinter : public DialectAsmPrinter { diff --git a/third_party/mlir/lib/IR/Diagnostics.cpp b/third_party/mlir/lib/IR/Diagnostics.cpp index f2f2f83b3a8..70a802cd856 100644 --- a/third_party/mlir/lib/IR/Diagnostics.cpp +++ b/third_party/mlir/lib/IR/Diagnostics.cpp @@ -689,7 +689,7 @@ SourceMgrDiagnosticVerifierHandler::SourceMgrDiagnosticVerifierHandler( for (unsigned i = 0, e = mgr.getNumBuffers(); i != e; ++i) (void)impl->computeExpectedDiags(mgr.getMemoryBuffer(i + 1)); - // Register a handler to verfy the diagnostics. + // Register a handler to verify the diagnostics. setHandler([&](Diagnostic &diag) { // Process the main diagnostics. process(diag); diff --git a/third_party/mlir/lib/IR/Operation.cpp b/third_party/mlir/lib/IR/Operation.cpp index 69b8d056cd5..1d213f45dd5 100644 --- a/third_party/mlir/lib/IR/Operation.cpp +++ b/third_party/mlir/lib/IR/Operation.cpp @@ -286,7 +286,7 @@ void Operation::destroy() { /// Return the context this operation is associated with. MLIRContext *Operation::getContext() { return location->getContext(); } -/// Return the dialact this operation is associated with, or nullptr if the +/// Return the dialect this operation is associated with, or nullptr if the /// associated dialect is not registered. Dialect *Operation::getDialect() { if (auto *abstractOp = getAbstractOperation()) diff --git a/third_party/mlir/lib/IR/SymbolTable.cpp b/third_party/mlir/lib/IR/SymbolTable.cpp index b61308b74af..bd8cb59cea7 100644 --- a/third_party/mlir/lib/IR/SymbolTable.cpp +++ b/third_party/mlir/lib/IR/SymbolTable.cpp @@ -283,7 +283,7 @@ static Optional walkSymbolUses( if (walkSymbolRefs(&op, callback).wasInterrupted()) return WalkResult::interrupt(); - // If this operation has regions, and it as well as its dialect arent't + // If this operation has regions, and it as well as its dialect aren't // registered then conservatively fail. The operation may define a // symbol table, so we can't opaquely know if we should traverse to find // nested uses. 
diff --git a/third_party/mlir/lib/Pass/PassTiming.cpp b/third_party/mlir/lib/Pass/PassTiming.cpp index 4747249690f..dd193a4d9a9 100644 --- a/third_party/mlir/lib/Pass/PassTiming.cpp +++ b/third_party/mlir/lib/Pass/PassTiming.cpp @@ -323,7 +323,7 @@ void PassTiming::runAfterPass(Pass *pass, Operation *) { return; } - // Adapator passes aren't timed directly, so we don't need to stop their + // Adaptor passes aren't timed directly, so we don't need to stop their // timers. if (!isAdaptorPass(pass)) timer->stop(); diff --git a/third_party/mlir/lib/Transforms/LoopFusion.cpp b/third_party/mlir/lib/Transforms/LoopFusion.cpp index cda35297abc..6627e73056a 100644 --- a/third_party/mlir/lib/Transforms/LoopFusion.cpp +++ b/third_party/mlir/lib/Transforms/LoopFusion.cpp @@ -1561,10 +1561,10 @@ public: !canFuseSrcWhichWritesToLiveOut(srcId, dstId, srcStoreOp, mdg)) continue; - // Dont create a private memref if 'writesToLiveInOrOut'. + // Don't create a private memref if 'writesToLiveInOrOut'. bool createPrivateMemref = !writesToLiveInOrOut; - // Dont create a private memref if 'srcNode' has in edges on 'memref', - // or if 'dstNode' has out edges on 'memref'. + // Don't create a private memref if 'srcNode' has in edges on + // 'memref', or if 'dstNode' has out edges on 'memref'. if (mdg->getIncomingMemRefAccesses(srcNode->id, memref) > 0 || mdg->getOutEdgeCount(dstNode->id, memref) > 0) { createPrivateMemref = false; diff --git a/third_party/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/third_party/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index 16894ad4cb3..b5fd0862b45 100644 --- a/third_party/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/third_party/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -1439,7 +1439,7 @@ void OpEmitter::genVerifier() { auto sizeAttr = getAttrOfType("{0}"); auto numElements = sizeAttr.getType().cast().getNumElements(); if (numElements != {1}) {{ - return emitOpError("'{0}' attribute for specifiying {2} segments " + return emitOpError("'{0}' attribute for specifying {2} segments " "must have {1} elements"); } )"; diff --git a/third_party/mlir/tools/mlir-tblgen/RewriterGen.cpp b/third_party/mlir/tools/mlir-tblgen/RewriterGen.cpp index d321b204f4e..f229a349d27 100644 --- a/third_party/mlir/tools/mlir-tblgen/RewriterGen.cpp +++ b/third_party/mlir/tools/mlir-tblgen/RewriterGen.cpp @@ -685,7 +685,7 @@ std::string PatternEmitter::handleReplaceWithNativeCodeCall(DagNode tree) { } for (int i = 0, e = tree.getNumArgs(); i != e; ++i) { attrs[i] = handleOpArgument(tree.getArgAsLeaf(i), tree.getArgName(i)); - LLVM_DEBUG(llvm::dbgs() << "NativeCodeCall argment #" << i + LLVM_DEBUG(llvm::dbgs() << "NativeCodeCall argument #" << i << " replacement: " << attrs[i] << "\n"); } return tgfmt(fmt, &fmtCtx, attrs[0], attrs[1], attrs[2], attrs[3], attrs[4], @@ -769,7 +769,7 @@ std::string PatternEmitter::handleOpCreation(DagNode tree, int resultIndex, if (isSameOperandsAndResultType || useFirstAttr) { // We know how to deduce the result type for ops with these traits and we've - // generated builders taking aggregrate parameters. Use those builders to + // generated builders taking aggregate parameters. Use those builders to // create the ops. // First prepare local variables for op arguments used in builder call. @@ -891,7 +891,7 @@ void PatternEmitter::supplyValuesForOpArgs( Operator &resultOp = node.getDialectOp(opMap); for (int argIndex = 0, numOpArgs = resultOp.getNumArgs(); argIndex != numOpArgs; ++argIndex) { - // Start each argment on its own line. 
+ // Start each argument on its own line. (os << ",\n").indent(8); Argument opArg = resultOp.getArg(argIndex); diff --git a/third_party/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp b/third_party/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp index f39295a22c8..422183ed948 100644 --- a/third_party/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp +++ b/third_party/mlir/tools/mlir-tblgen/SPIRVUtilsGen.cpp @@ -687,7 +687,7 @@ static void emitEnumGetSymbolizeFnDefn(const EnumAttr &enumAttr, } static bool emitOpUtils(const RecordKeeper &recordKeeper, raw_ostream &os) { - llvm::emitSourceFileHeader("SPIR-V Op Utilites", os); + llvm::emitSourceFileHeader("SPIR-V Op Utilities", os); auto defs = recordKeeper.getAllDerivedDefinitions("EnumAttrInfo"); os << "#ifndef SPIRV_OP_UTILS_H_\n"; From 1629894b12112dee3523dd7afe4a55394c4a7d3a Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Fri, 6 Dec 2019 05:59:06 -0800 Subject: [PATCH 255/383] DimOp folding for alloc/view dynamic dimensions Signed-off-by: Uday Bondhugula Closes #253 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/253 from bondhugula:dimop a4b464f24ae63fd259114558d87e11b8ee4dae86 PiperOrigin-RevId: 284169689 Change-Id: I4b0aaf508668ebd35e3abc12e0e2d661f996995d --- .../mlir/lib/Dialect/StandardOps/Ops.cpp | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/third_party/mlir/lib/Dialect/StandardOps/Ops.cpp b/third_party/mlir/lib/Dialect/StandardOps/Ops.cpp index 0e2bee063a8..a9e9364aa17 100644 --- a/third_party/mlir/lib/Dialect/StandardOps/Ops.cpp +++ b/third_party/mlir/lib/Dialect/StandardOps/Ops.cpp @@ -1364,11 +1364,26 @@ OpFoldResult DimOp::fold(ArrayRef operands) { else if (auto memrefType = opType.dyn_cast()) indexSize = memrefType.getShape()[getIndex()]; - if (indexSize >= 0) + if (!ShapedType::isDynamic(indexSize)) return IntegerAttr::get(IndexType::get(getContext()), indexSize); - // Fold dim to the size argument of a SubViewOp. + // Fold dim to the size argument for an AllocOp/ViewOp/SubViewOp. + auto memrefType = opType.dyn_cast(); + if (!memrefType) + return {}; + + // The size at getIndex() is now a dynamic size of a memref. + auto memref = memrefOrTensor()->getDefiningOp(); + if (auto alloc = dyn_cast_or_null(memref)) + return *(alloc.getDynamicSizes().begin() + + memrefType.getDynamicDimIndex(getIndex())); + + if (auto view = dyn_cast_or_null(memref)) + return *(view.getDynamicSizes().begin() + + memrefType.getDynamicDimIndex(getIndex())); + + // The subview op here is expected to have rank dynamic sizes now. if (auto subview = dyn_cast_or_null(memref)) { auto sizes = subview.sizes(); if (!sizes.empty()) From 707f8f2c187428449e9989e77f06e7d5fc43ca95 Mon Sep 17 00:00:00 2001 From: Denis Khalikov Date: Fri, 6 Dec 2019 06:26:24 -0800 Subject: [PATCH 256/383] [spirv] Reorder `erase` and `emplace` to avoid "invalid iterator access". The iterator should be erased before adding a new entry into blockMergeInfo to avoid iterator invalidation. 
Closes #299 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/299 from denis0x0D:sandbox/reoder_erase 983be565809aa0aadfc7e92962e4d4b282f63c66 PiperOrigin-RevId: 284173235 Change-Id: I7ce585acc0507f796726ddefa70d3ce7dfcbd443 --- .../mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp b/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp index 72d11a19380..e60805aca1b 100644 --- a/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp +++ b/third_party/mlir/lib/Dialect/SPIRV/Serialization/Deserializer.cpp @@ -1874,8 +1874,10 @@ LogicalResult ControlFlowStructurizer::structurizeImpl() { if (Block *mappedTo = mapper.lookupOrNull(newMerge)) newMerge = mappedTo; - blockMergeInfo.try_emplace(newHeader, newMerge, newContinue); + // The iterator should be erased before adding a new entry into + // blockMergeInfo to avoid iterator invalidation. blockMergeInfo.erase(it); + blockMergeInfo.try_emplace(newHeader, newMerge, newContinue); } // The structured selection/loop's entry block does not have arguments. From a153c76754ed52954d21640cce6a00bcef01a710 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Fri, 6 Dec 2019 06:39:34 -0800 Subject: [PATCH 257/383] Test clip-ops docstring PiperOrigin-RevId: 284174716 Change-Id: Ib453c0db34ef631d728d216d1cd85910f95274f1 --- tensorflow/python/ops/clip_ops.py | 55 +++++++++++++++++++++++-------- 1 file changed, 42 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py index 30317092a29..d316d6df6d7 100644 --- a/tensorflow/python/ops/clip_ops.py +++ b/tensorflow/python/ops/clip_ops.py @@ -49,29 +49,58 @@ def clip_by_value(t, clip_value_min, clip_value_max, For example: - ```python - A = tf.constant([[1, 20, 13], [3, 21, 13]]) - B = tf.clip_by_value(A, clip_value_min=0, clip_value_max=3) # [[1, 3, 3],[3, 3, 3]] - C = tf.clip_by_value(A, clip_value_min=0., clip_value_max=3.) # throws `TypeError` - as input and clip_values are of different dtype - ``` + Basic usage passes a scalar as the min and max value. + + >>> t = tf.constant([[-10., -1., 0.], [0., 2., 10.]]) + >>> t2 = tf.clip_by_value(t, clip_value_min=-1, clip_value_max=1) + >>> t2.numpy() + array([[-1., -1., 0.], + [ 0., 1., 1.]], dtype=float32) + + The min and max can be the same size as `t`, or broadcastable to that size. + + >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]]) + >>> clip_min = [[2],[1]] + >>> t3 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100) + >>> t3.numpy() + array([[ 2., 2., 10.], + [ 1., 1., 10.]], dtype=float32) + + Broadcasting fails, intentionally, if you would expand the dimensions of `t` + + >>> t = tf.constant([[-1, 0., 10.], [-1, 0, 10]]) + >>> clip_min = [[[2, 1]]] # Has a third axis + >>> t4 = tf.clip_by_value(t, clip_value_min=clip_min, clip_value_max=100) + Traceback (most recent call last): + ... + InvalidArgumentError: Incompatible shapes: [2,3] vs. [1,1,2] + + It throws a `TypeError` if you try to clip an `int` to a `float` value + (`tf.cast` the input to `float` first). + + >>> t = tf.constant([[1, 2], [3, 4]], dtype=tf.int32) + >>> t5 = tf.clip_by_value(t, clip_value_min=-3.1, clip_value_max=3.1) + Traceback (most recent call last): + ... + TypeError: Cannot convert ... + Args: t: A `Tensor` or `IndexedSlices`. - clip_value_min: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape - as `t`. 
The minimum value to clip by. - clip_value_max: A 0-D (scalar) `Tensor`, or a `Tensor` with the same shape - as `t`. The maximum value to clip by. + clip_value_min: The minimum value to clip to. A scalar `Tensor` or one that + is broadcastable to the shape of `t`. + clip_value_max: The minimum value to clip to. A scalar `Tensor` or one that + is broadcastable to the shape of `t`. name: A name for the operation (optional). Returns: A clipped `Tensor` or `IndexedSlices`. Raises: - ValueError: If the clip tensors would trigger array broadcasting - that would make the returned tensor larger than the input. + `tf.errors.InvalidArgumentError`: If the clip tensors would trigger array + broadcasting that would make the returned tensor larger than the input. TypeError: If dtype of the input is `int32` and dtype of - the `clip_value_min` or `clip_value_max` is `float32` + the `clip_value_min` or `clip_value_max` is `float32` """ with ops.name_scope(name, "clip_by_value", [t, clip_value_min, clip_value_max]) as name: From a35679e5c63f7a7e5e95ae1ddea365fff544ec15 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 07:36:55 -0800 Subject: [PATCH 258/383] Unroll vector masks along with their associated vector arguments. Updates vector ContractionOp to use proper vector masks (produced by CreateMaskOp/ConstantMaskOp). Leverages the following canonicalizations in unrolling unit test: CreateMaskOp -> ConstantMaskOp, StridedSliceOp(ConstantMaskOp) -> ConstantMaskOp Removes IndexTupleOp (no longer needed now that we have vector mask ops). Updates all unit tests. PiperOrigin-RevId: 284182168 Change-Id: I8cb1c341b1ee114848be7992b4ad2cc37a036a08 --- .../mlir/Dialect/VectorOps/VectorOps.h | 6 +++ .../mlir/Dialect/VectorOps/VectorOps.td | 41 +++++--------- .../mlir/lib/Dialect/VectorOps/VectorOps.cpp | 53 ++++--------------- .../lib/Dialect/VectorOps/VectorToVector.cpp | 38 +++++++------ 4 files changed, 50 insertions(+), 88 deletions(-) diff --git a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.h b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.h index 668eaa5c9d5..8cb0d8516b4 100644 --- a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.h +++ b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.h @@ -28,6 +28,8 @@ #include "mlir/IR/StandardTypes.h" namespace mlir { +class MLIRContext; +class OwningRewritePatternList; namespace vector { /// Dialect for Ops on higher-dimensional vector types. @@ -37,6 +39,10 @@ public: static StringRef getDialectNamespace() { return "vector"; } }; +/// Collect a set of vector-to-vector canonicalization patterns. 
+void populateVectorToVectorCanonicalizationPatterns( + OwningRewritePatternList &patterns, MLIRContext *context); + #define GET_OP_CLASSES #include "mlir/Dialect/VectorOps/VectorOps.h.inc" diff --git a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td index f4bfeb73dd7..ebeecfbb715 100644 --- a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td +++ b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td @@ -49,7 +49,7 @@ class Vector_Op traits = []> : def Vector_ContractionOp : Vector_Op<"contract", [NoSideEffect]>, Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyVector:$acc, - Variadic>:$masks, + Variadic>:$masks, AffineMapArrayAttr:$indexing_maps, ArrayAttr:$iterator_types)>, Results<(outs AnyVector)> { let summary = "vector contraction operation"; @@ -60,8 +60,9 @@ def Vector_ContractionOp : vector result of rank K (where K = num_lhs_free_dims + num_rhs_free_dims + num_batch_dims (see dimension type descriptions below)). - Optional vector mask arguments specify the dynamic dimension sizes of - valid data within the lhs/rhs vector arguments. + Optional vector mask arguments (produced by CreateMaskOp or ConstantMaskOp) + specify the dynamic dimension sizes of valid data within the lhs/rhs vector + arguments. An iterator type attribute list must be specified, where each element of the list represents an iterator with one of the following types: @@ -120,10 +121,8 @@ def Vector_ContractionOp : // 4D vector contraction with two contracting dimensions and optional // vector mask arguments. - %lhs_mask = vector.make_tuple %size0, %size1, %size2, %size3 - : tuple - %rhs_mask = vector.make_tuple %size4, %size5, %size6, %size7 - : tuple + %lhs_mask = vector.constant_mask [7, 8, 16, 15] : vector<7x8x16x15xi1> + %rhs_mask = vector.constant_mask [8, 16, 7, 5] : vector<8x16x7x5xi1> %5 = vector.contract #contraction_trait %0, %1, %2, %lhs_mask, %rhs_mask : vector<7x8x16x15xf32>, vector<8x16x7x5xf32> into vector<8x15x8x5xf32> @@ -138,13 +137,13 @@ def Vector_ContractionOp : VectorType getAccType() { return acc()->getType().cast(); } - TupleType getLHSVectorMaskType() { - if (llvm::size(masks()) != 2) return TupleType(); - return getOperand(3)->getType().cast(); + VectorType getLHSVectorMaskType() { + if (llvm::size(masks()) != 2) return VectorType(); + return getOperand(3)->getType().cast(); } - TupleType getRHSVectorMaskType() { - if (llvm::size(masks()) != 2) return TupleType(); - return getOperand(4)->getType().cast(); + VectorType getRHSVectorMaskType() { + if (llvm::size(masks()) != 2) return VectorType(); + return getOperand(4)->getType().cast(); } VectorType getResultType() { return getResult()->getType().cast(); @@ -706,20 +705,4 @@ def Vector_CreateMaskOp : let hasCanonicalizer = 1; } -// TODO(andydavis) Delete this op once ContractOp is converted to use VectorMask -def Vector_IndexTupleOp : - Vector_Op<"make_index_tuple", [NoSideEffect]>, - Arguments<(ins Variadic:$operands)>, - Results<(outs TupleOf<[Index]>)> { - let summary = "creates a tuple of operand values"; - let description = [{ - Creates and returns a tuple of its operands which must be of index type. 
- - Example: - - %1 = vector.make_index_tuple %size0, %size1, %size2 - : tuple - - }]; -} #endif // VECTOR_OPS diff --git a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp index f96d3bacacf..5d596f388ed 100644 --- a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp +++ b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp @@ -82,16 +82,12 @@ static ParseResult parseContractionOp(OpAsmParser &parser, if (masksInfo.size() != 2) return parser.emitError(parser.getNameLoc(), "expected zero or exactly 2 vector mask operands"); - auto indexType = parser.getBuilder().getIndexType(); auto lhsType = types[0].cast(); auto rhsType = types[1].cast(); + auto maskElementType = parser.getBuilder().getI1Type(); SmallVector maskTypes; - SmallVector lhsMaskElementTypes(lhsType.getRank(), indexType); - maskTypes.push_back( - TupleType::get(lhsMaskElementTypes, parser.getBuilder().getContext())); - SmallVector rhsMaskElementTypes(rhsType.getRank(), indexType); - maskTypes.push_back( - TupleType::get(rhsMaskElementTypes, parser.getBuilder().getContext())); + maskTypes.push_back(VectorType::get(lhsType.getShape(), maskElementType)); + maskTypes.push_back(VectorType::get(rhsType.getShape(), maskElementType)); if (parser.resolveOperands(masksInfo, maskTypes, loc, result.operands)) return failure(); return success(); @@ -231,15 +227,10 @@ static LogicalResult verify(ContractionOp op) { if ((lhsMaskType && !rhsMaskType) || (!lhsMaskType && rhsMaskType)) return op.emitOpError("invalid number of vector masks specified"); if (lhsMaskType && rhsMaskType) { - // Verify tuple element size is != rank. - if (lhsMaskType.getTypes().size() != lhsType.getShape().size() || - rhsMaskType.getTypes().size() != rhsType.getShape().size()) - return op.emitOpError("invalid number of vector mask elements"); - // Verify all tuple elements are index type. - for (auto eltType : lhsMaskType.getTypes()) { - if (!eltType.isa()) - return op.emitOpError("vector mask element must have index type"); - } + // Verify mask rank == argument rank. 
+ if (lhsMaskType.getShape().size() != lhsType.getShape().size() || + rhsMaskType.getShape().size() != rhsType.getShape().size()) + return op.emitOpError("invalid vector mask rank"); } return success(); } @@ -1218,33 +1209,9 @@ void CreateMaskOp::getCanonicalizationPatterns( results.insert(context); } -//===----------------------------------------------------------------------===// -// IndexTupleOp -//===----------------------------------------------------------------------===// - -ParseResult parseIndexTupleOp(OpAsmParser &parser, OperationState &result) { - auto indexType = parser.getBuilder().getIndexType(); - Type resultType; - SmallVector operandInfo; - return failure( - parser.parseOperandList(operandInfo) || - parser.parseOptionalAttrDict(result.attributes) || - parser.parseColonType(resultType) || - parser.resolveOperands(operandInfo, indexType, result.operands) || - parser.addTypeToList(resultType, result.types)); -} - -static void print(OpAsmPrinter &p, IndexTupleOp &op) { - p << op.getOperationName() << ' '; - p.printOperands(op.operands()); - p << " : " << op.getResult()->getType(); -} - -static LogicalResult verify(IndexTupleOp &op) { - for (auto operand : op.getOperands()) - if (!operand->getType().isa()) - return op.emitOpError("all operands must be of index type"); - return success(); +void mlir::vector::populateVectorToVectorCanonicalizationPatterns( + OwningRewritePatternList &patterns, MLIRContext *context) { + patterns.insert(context); } namespace mlir { diff --git a/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp b/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp index 4654aff4582..c2726edd9bf 100644 --- a/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp +++ b/third_party/mlir/lib/Dialect/VectorOps/VectorToVector.cpp @@ -278,9 +278,8 @@ static Value *getOrCreateUnrolledOperandSlice( // with iteration bounds 'iterationBounds' unrolled to 'targetShape'. // An iteration space index map argument 'iterationIndexMapList' must be // specified, with a map for each structured op input and a single map for the -// single result. The last map in the list must be the single result map. -// Extra operands can be passed to unrolled instances of 'op' using the -// 'extraOperands' argument. +// single result. The map at index 'indexMapListResultIndex' in the list must +// be the single result map. // // Example: // @@ -310,7 +309,7 @@ static Value *getOrCreateUnrolledOperandSlice( static Value *unrollSingleResultStructuredOp( Operation *op, ArrayRef iterationBounds, std::vector> &iterationIndexMapList, - ArrayRef targetShape, ArrayRef extraOperands, + unsigned indexMapListResultIndex, ArrayRef targetShape, PatternRewriter &builder) { auto shapedType = op->getResult(0)->getType().dyn_cast_or_null(); if (!shapedType || !shapedType.hasStaticShape()) @@ -334,7 +333,7 @@ static Value *unrollSingleResultStructuredOp( auto numUnrolledInstances = computeMaxLinearIndex(unrollFactors); auto basis = computeStrides(unrollFactors); - auto &resultOperandState = unrolledOperandState[numMaps - 1]; + auto &resultOperandState = unrolledOperandState[indexMapListResultIndex]; auto unrolledResultType = VectorType::get(resultOperandState.unrolledShape, shapedType.getElementType()); @@ -360,7 +359,6 @@ static Value *unrollSingleResultStructuredOp( iterationIndexMapList[i], caches[i], builder)); } // Create op on sliced vector arguments. 
- operands.append(extraOperands.begin(), extraOperands.end()); auto resultVector = cloneOpWithOperandsAndTypes(builder, op->getLoc(), op, operands, unrolledResultType) @@ -368,12 +366,14 @@ static Value *unrollSingleResultStructuredOp( // Compute linear result index. int64_t resultIndex = getUnrolledOperandLinearIndex( - resultOperandState, vectorOffsets, iterationIndexMapList[numMaps - 1]); + resultOperandState, vectorOffsets, + iterationIndexMapList[indexMapListResultIndex]); // Update result cache at 'resultIndex'. - caches[numMaps - 1][resultIndex] = resultVector; + caches[indexMapListResultIndex][resultIndex] = resultVector; } - // Make zero splat into which we will insert results from 'cache[numMaps - 1]' + // Make zero splat into which we will insert results from + // 'cache[indexMapListResultIndex]' auto resultVectorType = op->getResult(0)->getType().cast(); auto *res = makeSplatZero(op->getLoc(), builder, resultVectorType); SmallVector strides(resultOperandState.unrollFactors.size(), 1); @@ -384,7 +384,8 @@ static Value *unrollSingleResultStructuredOp( auto offsets = zipMap([](int64_t v1, int64_t v2) { return v1 * v2; }, vectorOffsets, resultOperandState.unrolledShape); res = builder.create( - op->getLoc(), caches[numMaps - 1][i], res, offsets, strides); + op->getLoc(), caches[indexMapListResultIndex][i], res, offsets, + strides); } return res; @@ -434,13 +435,17 @@ Value * mlir::vector::unrollSingleResultOpMatchingType(PatternRewriter &builder, // Get map from iteration space index to lhs/rhs/result shape index. std::vector> iterationIndexMapList; contractionOp.getIterationIndexMap(iterationIndexMapList); - // TODO(andydavis) Support unrollable vector masks. - SmallVector masks(contractionOp.masks().begin(), - contractionOp.masks().end()); + if (llvm::size(contractionOp.masks()) == 2) { + // Add maps for lhs/rhs vector mask arguments (same lhs/rhs vector shape) + iterationIndexMapList.push_back(iterationIndexMapList[0]); + iterationIndexMapList.push_back(iterationIndexMapList[1]); + } // Unroll 'op' 'iterationBounds' to 'targetShape'. - return unrollSingleResultStructuredOp(op, iterationBounds, - iterationIndexMapList, targetShape, - masks, builder); + // TODO(andydavis) Use linalg style 'args_in'/'args_out' to partition + // 'iterationIndexMapList' instead of 'indexMapListResultIndex'. + return unrollSingleResultStructuredOp( + op, iterationBounds, iterationIndexMapList, + /*indexMapListResultIndex=*/2, targetShape, builder); } // TODO(andydavis) Create trivial iteration bounds and index map for // elementwise operations and call 'unrollSingleResultStructuredOp'. 
Remove @@ -680,6 +685,7 @@ void mlir::populateVectorToVectorConversionPatterns( MLIRContext *context, OwningRewritePatternList &patterns, ArrayRef coarseVectorShape, ArrayRef fineVectorShape) { vector::populateWithGenerated(context, &patterns); + vector::populateVectorToVectorCanonicalizationPatterns(patterns, context); patterns .insert Date: Fri, 6 Dec 2019 08:38:41 -0800 Subject: [PATCH 259/383] Use std::rint in scalar_round_op_google PiperOrigin-RevId: 284191834 Change-Id: Ifd9945b1de4fe3c586a06d7e894d57a24feadd9b --- tensorflow/core/kernels/cwise_ops.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h index fdcc1e1e49c..a2b8748dd9a 100644 --- a/tensorflow/core/kernels/cwise_ops.h +++ b/tensorflow/core/kernels/cwise_ops.h @@ -568,6 +568,20 @@ struct scalar_round_op_google { } }; +template <> +struct scalar_round_op_google { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const { + return std::rint(x); + } +}; +template <> +struct scalar_round_op_google { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double operator()( + const double& x) const { + return std::rint(x); + } +}; + template struct scalar_round_op_google { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar From 04574413d2211949ec3387df3b4cb06525978dfd Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 6 Dec 2019 09:12:56 -0800 Subject: [PATCH 260/383] Extract :profiler_factory library PiperOrigin-RevId: 284197650 Change-Id: I34c290ce2a488e6bd0c03b2ef0f6defa2e39127c --- tensorflow/core/profiler/internal/BUILD | 16 +++++++-- tensorflow/core/profiler/internal/cpu/BUILD | 1 + .../core/profiler/internal/cpu/host_tracer.cc | 1 + tensorflow/core/profiler/internal/gpu/BUILD | 1 + .../profiler/internal/gpu/device_tracer.cc | 4 +-- .../internal/gpu/device_tracer_test.cc | 10 +++--- ...filer_interface.cc => profiler_factory.cc} | 7 ++-- .../core/profiler/internal/profiler_factory.h | 34 +++++++++++++++++++ .../profiler/internal/profiler_interface.h | 10 ------ tensorflow/core/profiler/lib/BUILD | 34 +++++++------------ .../core/profiler/lib/profiler_session.cc | 11 +++--- .../core/profiler/lib/profiler_session.h | 5 ++- 12 files changed, 86 insertions(+), 48 deletions(-) rename tensorflow/core/profiler/internal/{profiler_interface.cc => profiler_factory.cc} (89%) create mode 100644 tensorflow/core/profiler/internal/profiler_factory.h diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD index 25abf6d82cf..2b59dab4a75 100644 --- a/tensorflow/core/profiler/internal/BUILD +++ b/tensorflow/core/profiler/internal/BUILD @@ -405,7 +405,6 @@ tf_cc_test( cc_library( name = "profiler_interface", - srcs = ["profiler_interface.cc"], hdrs = ["profiler_interface.h"], deps = [ "//tensorflow/core:lib", @@ -414,12 +413,25 @@ cc_library( ], ) +cc_library( + name = "profiler_factory", + srcs = ["profiler_factory.cc"], + hdrs = ["profiler_factory.h"], + deps = [ + ":profiler_interface", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + ], +) + filegroup( name = "mobile_srcs", srcs = [ "annotation_stack.cc", "annotation_stack.h", - "profiler_interface.cc", + "profiler_factory.cc", + "profiler_factory.h", "profiler_interface.h", "traceme_recorder.cc", "traceme_recorder.h", diff --git a/tensorflow/core/profiler/internal/cpu/BUILD b/tensorflow/core/profiler/internal/cpu/BUILD index 93fc8e05813..a3048dfb85b 100644 --- 
a/tensorflow/core/profiler/internal/cpu/BUILD +++ b/tensorflow/core/profiler/internal/cpu/BUILD @@ -29,6 +29,7 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", + "//tensorflow/core/profiler/internal:profiler_factory", "//tensorflow/core/profiler/internal:profiler_interface", "//tensorflow/core/profiler/internal:traceme_recorder", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", diff --git a/tensorflow/core/profiler/internal/cpu/host_tracer.cc b/tensorflow/core/profiler/internal/cpu/host_tracer.cc index 9b32ce80084..79fe7cf3729 100644 --- a/tensorflow/core/profiler/internal/cpu/host_tracer.cc +++ b/tensorflow/core/profiler/internal/cpu/host_tracer.cc @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/profiler/internal/cpu/host_tracer_utils.h" +#include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/profiler/internal/traceme_recorder.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index 2d752f363d9..620f92e5709 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -36,6 +36,7 @@ tf_cuda_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/profiler/internal:annotation_stack", "//tensorflow/core/profiler/internal:parse_annotation", + "//tensorflow/core/profiler/internal:profiler_factory", "//tensorflow/core/profiler/internal:profiler_interface", "//tensorflow/core/profiler/lib:traceme", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index 1388fab75a3..58d414413f9 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -31,6 +31,7 @@ limitations under the License. #include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h" #include "tensorflow/core/profiler/internal/gpu/cupti_wrapper.h" #include "tensorflow/core/profiler/internal/parse_annotation.h" +#include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" #include "tensorflow/core/util/env_var.h" @@ -372,8 +373,6 @@ Status GpuTracer::CollectData(XSpace* space) { return errors::Unimplemented("Collect data into XSpace not yet implemented"); } -} // namespace profiler - // Not in anonymous namespace for testing purposes. std::unique_ptr CreateGpuTracer( const profiler::ProfilerOptions& options) { @@ -394,6 +393,7 @@ auto register_gpu_tracer_factory = [] { return 0; }(); +} // namespace profiler } // namespace tensorflow #endif // GOOGLE_CUDA diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc b/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc index 9ed8896f16c..c123c59772b 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer_test.cc @@ -39,14 +39,15 @@ limitations under the License. 
#include "tensorflow/core/util/device_name_utils.h" namespace tensorflow { +namespace profiler { #if GOOGLE_CUDA -std::unique_ptr CreateGpuTracer( - const profiler::ProfilerOptions& options); +std::unique_ptr CreateGpuTracer( + const ProfilerOptions& options); #else // We don't have device tracer for non-cuda case. -std::unique_ptr CreateGpuTracer( - const profiler::ProfilerOptions& options) { +std::unique_ptr CreateGpuTracer( + const ProfilerOptions& options) { return nullptr; } #endif @@ -243,4 +244,5 @@ TEST_F(DeviceTracerTest, RunWithTraceOption) { } } // namespace +} // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/profiler_interface.cc b/tensorflow/core/profiler/internal/profiler_factory.cc similarity index 89% rename from tensorflow/core/profiler/internal/profiler_interface.cc rename to tensorflow/core/profiler/internal/profiler_factory.cc index bd3163fce24..bf1dedc3c4b 100644 --- a/tensorflow/core/profiler/internal/profiler_interface.cc +++ b/tensorflow/core/profiler/internal/profiler_factory.cc @@ -1,4 +1,4 @@ -/* Copyright 2016 The TensorFlow Authors All Rights Reserved. +/* Copyright 2019 The TensorFlow Authors All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,12 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" namespace tensorflow { +namespace profiler { namespace { mutex mu(LINKER_INITIALIZED); @@ -44,4 +45,6 @@ void CreateProfilers( } } } + +} // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/profiler_factory.h b/tensorflow/core/profiler/internal/profiler_factory.h new file mode 100644 index 00000000000..c2d0aa70671 --- /dev/null +++ b/tensorflow/core/profiler/internal/profiler_factory.h @@ -0,0 +1,34 @@ +/* Copyright 2019 The TensorFlow Authors All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_FACTORY_H_ +#define TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_FACTORY_H_ + +#include "tensorflow/core/profiler/internal/profiler_interface.h" + +namespace tensorflow { +namespace profiler { + +using ProfilerFactory = + std::unique_ptr (*)(const ProfilerOptions&); + +void RegisterProfilerFactory(ProfilerFactory factory); + +void CreateProfilers(const ProfilerOptions& options, + std::vector>* result); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_FACTORY_H_ diff --git a/tensorflow/core/profiler/internal/profiler_interface.h b/tensorflow/core/profiler/internal/profiler_interface.h index 6d7d456f95e..eeb6d82b75b 100644 --- a/tensorflow/core/profiler/internal/profiler_interface.h +++ b/tensorflow/core/profiler/internal/profiler_interface.h @@ -76,16 +76,6 @@ class ProfilerInterface { }; } // namespace profiler - -using ProfilerFactory = std::unique_ptr (*)( - const profiler::ProfilerOptions&); - -void RegisterProfilerFactory(ProfilerFactory factory); - -void CreateProfilers( - const profiler::ProfilerOptions& options, - std::vector>* result); - } // namespace tensorflow #endif // TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_ diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index 3ee9cd78c22..4a0c0107e21 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -1,8 +1,4 @@ -load( - "//tensorflow:tensorflow.bzl", - "tf_cuda_library", -) -load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") +load("//tensorflow:tensorflow.bzl", "tf_cuda_library") package( default_visibility = [ @@ -12,42 +8,37 @@ package( licenses = ["notice"], # Apache 2.0 ) -tf_cuda_library( +cc_library( name = "profiler_session", - srcs = [ - "profiler_session.cc", - ], - hdrs = [ - "profiler_session.h", - ], + srcs = ["profiler_session.cc"], + hdrs = ["profiler_session.h"], visibility = ["//tensorflow:internal"], deps = [ + ":profiler_utils", "//tensorflow/core/profiler/internal:profiler_interface", - "//tensorflow/core/profiler/lib:profiler_utils", + "//tensorflow/core/profiler/internal:profiler_factory", "@com_google_absl//absl/strings", ] + select({ "//tensorflow:android": [], "//conditions:default": [ "//tensorflow/core:core_cpu_lib", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", "//tensorflow/core:protos_all_cc", - "//tensorflow/core:session_options", ], }), ) tf_cuda_library( name = "profiler_lib", + cuda_deps = [ + "//tensorflow/core/profiler/internal/gpu:device_tracer", + ], visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/core/profiler/internal/cpu:host_tracer", - ] + if_cuda([ - "//tensorflow/core/profiler/internal/gpu:device_tracer", - ]), - alwayslink = 1, + ], + alwayslink = True, ) cc_library( @@ -76,8 +67,7 @@ cc_library( name = "profiler_utils", srcs = ["profiler_utils.cc"], hdrs = ["profiler_utils.h"], - visibility = ["//tensorflow:internal"], - alwayslink = 1, + visibility = ["//tensorflow/core/profiler:internal"], ) filegroup( diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc index 4e0a4d5ad6f..340cd639c5f 100644 --- a/tensorflow/core/profiler/lib/profiler_session.cc +++ b/tensorflow/core/profiler/lib/profiler_session.cc @@ -15,8 +15,8 @@ limitations under the 
License. #include "tensorflow/core/profiler/lib/profiler_session.h" -#include -#include +#include + #include #include "absl/strings/str_split.h" @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/lib/profiler_utils.h" #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/error_codes.pb.h" @@ -38,7 +39,7 @@ namespace { // If the "op_type" is missing, returns the node_name. // This is done so all ops with the same type appear in the same color in trace // viewer. -inline std::string EventName(absl::string_view node_name) { +inline string EventName(absl::string_view node_name) { // NOTE: open source device tracer now append cupti kernel name after // annotation as node_name, @@ is used as separator. kernel name is // demangled and possibly contains "::" patterns. @@ -52,10 +53,10 @@ inline std::string EventName(absl::string_view node_name) { absl::StrSplit(annotation_stack.back(), '#'); std::vector parts = absl::StrSplit(annotation_parts.front(), ':'); - return std::string(parts.back()); + return string(parts.back()); } else { std::vector parts = absl::StrSplit(node_name, ':'); - return std::string(parts.back()); + return string(parts.back()); } } diff --git a/tensorflow/core/profiler/lib/profiler_session.h b/tensorflow/core/profiler/lib/profiler_session.h index e47a85594ed..e91d7ad2bee 100644 --- a/tensorflow/core/profiler/lib/profiler_session.h +++ b/tensorflow/core/profiler/lib/profiler_session.h @@ -19,6 +19,7 @@ limitations under the License. #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/core/profiler/internal/profiler_interface.h" +#include "tensorflow/core/protobuf/config.pb.h" namespace tensorflow { @@ -41,7 +42,9 @@ class ProfilerSession { tensorflow::Status Status() LOCKS_EXCLUDED(mutex_); - tensorflow::Status CollectData(RunMetadata* run_metadata); + tensorflow::Status CollectData(RunMetadata* run_metadata) + LOCKS_EXCLUDED(mutex_); + tensorflow::Status SerializeToString(string* content) LOCKS_EXCLUDED(mutex_); private: From c28ca27b96b3a141922523c005c71af51cc61906 Mon Sep 17 00:00:00 2001 From: Jiri Simsa Date: Fri, 6 Dec 2019 09:14:40 -0800 Subject: [PATCH 261/383] Fixing a TensorFlow control flow bug. Calling `nest.map_structure` with a lambda that does not return (i.e. only for its side-effect) will fail on structures that contain composite tensors because the `map_structure` implementation will try to reconstruct the composite tensors from the return values of the lambda, which will be None. 
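For illustration, a minimal sketch of the failure mode and of the fix applied below; the RaggedTensor structure, the `shapes` list, and the function name `fn` are invented for this example and are not part of the change:

```python
import tensorflow as tf
from tensorflow.python.util import nest  # same helper used by control_flow_ops

structure = {"t": tf.constant([1, 2]),
             "r": tf.ragged.constant([[1], [2, 3]])}  # contains a composite tensor
shapes = []

# Problematic pattern: the lambda is called only for its side effect and
# returns None, so map_structure cannot rebuild the RaggedTensor from the
# mapped component values when expand_composites=True.
# nest.map_structure(lambda x: shapes.append(x.shape), structure,
#                    expand_composites=True)

# Pattern used by this fix: perform the side effect, then return the leaf
# unchanged so the composite tensor can be reconstructed.
def fn(x):
  shapes.append(x.shape)
  return x

nest.map_structure(fn, structure, expand_composites=True)
```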
PiperOrigin-RevId: 284197904 Change-Id: I9b3e43bbd28712281839eaf77b2e4280db7c585c --- tensorflow/python/ops/control_flow_ops.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py index bf0df397c85..5d0ca7c90de 100644 --- a/tensorflow/python/ops/control_flow_ops.py +++ b/tensorflow/python/ops/control_flow_ops.py @@ -749,10 +749,10 @@ class ControlFlowContext(object): def ExitResult(self, result): """Make a list of tensors available in the outer context.""" if self._outer_context: - nest.map_structure( - lambda x: self._outer_context.AddName(x.name), - result, - expand_composites=True) + def fn(x): + self._outer_context.AddName(x.name) + return x + nest.map_structure(fn, result, expand_composites=True) def GetWhileContext(self): """Return the while context containing this context.""" From c94797780e2a0730687a21356a2a9ec7c97fdea9 Mon Sep 17 00:00:00 2001 From: HyoukJoong Lee Date: Fri, 6 Dec 2019 09:16:38 -0800 Subject: [PATCH 262/383] Add optional layout constraints for AllReduce PiperOrigin-RevId: 284198168 Change-Id: I4ef59638851ca1cef689f7db622bd06ca41bccad --- tensorflow/compiler/xla/client/xla_builder.cc | 38 +++++++++++++++---- tensorflow/compiler/xla/client/xla_builder.h | 18 ++++++--- tensorflow/compiler/xla/python/xla.cc | 10 +++-- tensorflow/compiler/xla/python/xla_client.py | 2 +- tensorflow/compiler/xla/service/BUILD | 1 + .../bfloat16_conversion_folding_test.cc | 1 + .../service/bfloat16_normalization_test.cc | 1 + .../xla/service/bfloat16_propagation_test.cc | 3 +- .../compiler/xla/service/hlo_instruction.cc | 9 +++-- .../compiler/xla/service/hlo_instruction.h | 2 +- .../compiler/xla/service/hlo_instructions.cc | 25 ++++++++++-- .../compiler/xla/service/hlo_instructions.h | 24 +++++++++++- tensorflow/compiler/xla/service/hlo_parser.cc | 6 ++- .../compiler/xla/service/hlo_parser_test.cc | 18 +++++++++ tensorflow/compiler/xla/service/hlo_query.cc | 14 +++++++ tensorflow/compiler/xla/service/hlo_query.h | 5 +++ .../compiler/xla/service/hlo_verifier.cc | 24 ++++++++++++ .../compiler/xla/service/hlo_verifier_test.cc | 25 ++++++++++++ .../compiler/xla/service/layout_assignment.cc | 12 +++++- 19 files changed, 209 insertions(+), 29 deletions(-) diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 8f480c0dec3..290b9c0f647 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -2112,7 +2112,8 @@ XlaOp XlaBuilder::CrossReplicaSum( XlaOp XlaBuilder::AllReduce(XlaOp operand, const XlaComputation& computation, absl::Span replica_groups, - const absl::optional& channel_id) { + const absl::optional& channel_id, + const absl::optional& shape_with_layout) { return ReportErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape* operand_shape, GetShapePtr(operand)); @@ -2136,9 +2137,31 @@ XlaOp XlaBuilder::AllReduce(XlaOp operand, const XlaComputation& computation, operand_shapes.push_back(operand_shape); operands.push_back(operand); } - TF_ASSIGN_OR_RETURN(Shape shape, + + TF_ASSIGN_OR_RETURN(Shape inferred_shape, ShapeInference::InferAllReduceShape(operand_shapes)); - *instr.mutable_shape() = shape.ToProto(); + if (shape_with_layout) { + if (!LayoutUtil::HasLayout(*shape_with_layout)) { + return InvalidArgument("shape_with_layout must have the layout set: %s", + shape_with_layout->ToString()); + } + if 
(!ShapeUtil::Compatible(*shape_with_layout, *operand_shape)) { + return InvalidArgument( + "Provided shape_with_layout must be compatible with the " + "operand shape: %s vs %s", + shape_with_layout->ToString(), operand_shape->ToString()); + } + instr.set_constrain_layout(true); + if (operand_shape->IsTuple() && !inferred_shape.IsTuple()) { + // For a single-element tuple, take the tuple element shape. + TF_RET_CHECK(shape_with_layout->tuple_shapes_size() == 1); + *instr.mutable_shape() = shape_with_layout->tuple_shapes(0).ToProto(); + } else { + *instr.mutable_shape() = shape_with_layout->ToProto(); + } + } else { + *instr.mutable_shape() = inferred_shape.ToProto(); + } for (const ReplicaGroup& group : replica_groups) { *instr.add_replica_groups() = group; @@ -2153,10 +2176,10 @@ XlaOp XlaBuilder::AllReduce(XlaOp operand, const XlaComputation& computation, TF_ASSIGN_OR_RETURN( auto all_reduce, AddInstruction(std::move(instr), HloOpcode::kAllReduce, operands)); - if (operand_shape->IsTuple() && !shape.IsTuple()) { + if (operand_shape->IsTuple() && !inferred_shape.IsTuple()) { // For a single-element tuple, wrap the result into a tuple. TF_RET_CHECK(operand_shapes.size() == 1); - TF_RET_CHECK(ShapeUtil::Compatible(*operand_shapes[0], shape)); + TF_RET_CHECK(ShapeUtil::Compatible(*operand_shapes[0], inferred_shape)); return Tuple({all_reduce}); } return all_reduce; @@ -3282,9 +3305,10 @@ XlaOp CrossReplicaSum(const XlaOp operand, XlaOp AllReduce(const XlaOp operand, const XlaComputation& computation, absl::Span replica_groups, - const absl::optional& channel_id) { + const absl::optional& channel_id, + const absl::optional& shape_with_layout) { return operand.builder()->AllReduce(operand, computation, replica_groups, - channel_id); + channel_id, shape_with_layout); } XlaOp AllToAll(const XlaOp operand, int64 split_dimension, diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index 3822e907203..5e93bb2b3ba 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -514,7 +514,8 @@ class XlaBuilder { XlaOp AllReduce( XlaOp operand, const XlaComputation& computation, absl::Span replica_groups = {}, - const absl::optional& channel_id = absl::nullopt); + const absl::optional& channel_id = absl::nullopt, + const absl::optional& shape_with_layout = absl::nullopt); XlaOp AllToAll(XlaOp operand, int64 split_dimension, int64 concat_dimension, int64 split_count, @@ -922,7 +923,8 @@ class XlaBuilder { absl::Span replica_groups); friend XlaOp AllReduce(XlaOp operand, const XlaComputation& computation, absl::Span replica_groups, - const absl::optional& channel_id); + const absl::optional& channel_id, + const absl::optional& shape_with_layout); friend XlaOp AllToAll(XlaOp operand, int64 split_dimension, int64 concat_dimension, int64 split_count, const std::vector& replica_groups); @@ -1666,10 +1668,14 @@ XlaOp CrossReplicaSum(XlaOp operand, // - `channel_id`: for Allreduce nodes from different modules, if they have the // same channel_id, they will be 'AllReduce'd. If empty, AllReduce will not be // applied cross modules. -XlaOp AllReduce( - XlaOp operand, const XlaComputation& computation, - absl::Span replica_groups = {}, - const absl::optional& channel_id = absl::nullopt); +// +// - `shape_with_layout`: forces the layout of the AllReduce to the given +// layout. This is used to guarantee the same layout for a group of AllReduce +// ops compiled separately. 
+XlaOp AllReduce(XlaOp operand, const XlaComputation& computation, + absl::Span replica_groups = {}, + const absl::optional& channel_id = absl::nullopt, + const absl::optional& shape_with_layout = absl::nullopt); // Enqueues an operation that do an Alltoall of the operand cross cores. XlaOp AllToAll(XlaOp operand, int64 split_dimension, int64 concat_dimension, diff --git a/tensorflow/compiler/xla/python/xla.cc b/tensorflow/compiler/xla/python/xla.cc index 0e594982202..13968154188 100644 --- a/tensorflow/compiler/xla/python/xla.cc +++ b/tensorflow/compiler/xla/python/xla.cc @@ -639,10 +639,12 @@ PYBIND11_MODULE(xla_extension, m) { py::module ops = m.def_submodule("ops", "XLA operations"); ops.def("AfterAll", &AfterAll); - ops.def("AllReduce", - static_cast, - const absl::optional&)>(&AllReduce)); + ops.def( + "AllReduce", + static_cast, + const absl::optional&, const absl::optional&)>( + &AllReduce)); ops.def("AllToAll", &AllToAll); ops.def("CollectivePermute", &CollectivePermute); ops.def("CreateToken", &CreateToken); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index 9477b3c2b1d..a7e35a8a81f 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -1034,7 +1034,7 @@ class ComputationBuilder(object): """ replica_groups_protos = _get_replica_groups_protos(replica_groups) return ops.AllReduce(operand, computation.computation, - replica_groups_protos, None) + replica_groups_protos, None, None) def AllToAll(self, operand, diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index a6300d2dc73..14e6f66741e 100755 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -1505,6 +1505,7 @@ cc_library( hdrs = ["hlo_query.h"], deps = [ ":hlo", + ":hlo_casting_utils", "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", "@com_google_absl//absl/container:flat_hash_set", diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc index eb6692ade5b..ac5edd82bee 100644 --- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc @@ -239,6 +239,7 @@ TEST_F(BFloat16ConversionFoldingTest, FoldAllReduceTupleOutput) { HloInstruction* crs = builder.AddInstruction(HloInstruction::CreateAllReduce( ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b}, sum, /*replica_groups=*/{}, + /*constrain_layout=*/false, /*channel_id=*/absl::nullopt)); HloInstruction* gte_a = builder.AddInstruction( HloInstruction::CreateGetTupleElement(f32_shape, crs, 0)); diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc index f7a5ee691f3..ec93a868022 100644 --- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc @@ -259,6 +259,7 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleAllReduce) { HloInstruction* crs = builder.AddInstruction(HloInstruction::CreateAllReduce( ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b}, reduction, /*replica_groups=*/{}, + /*constrain_layout=*/false, /*channel_id=*/absl::nullopt)); builder.AddInstruction( HloInstruction::CreateGetTupleElement(bf16_shape, crs, 1)); diff --git 
a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index d716e62d467..aee1f652abd 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -211,7 +211,8 @@ TEST_F(BFloat16PropagationTest, DoNotChangeAllReduce) { HloInstruction* all_reduce = builder.AddInstruction(HloInstruction::CreateAllReduce( ShapeUtil::MakeTupleShape({shape, shape}), {a, b}, reduction, - /*replica_groups=*/{}, /*channel_id=*/1)); + /*replica_groups=*/{}, /*constrain_layout=*/false, + /*channel_id=*/1)); HloInstruction* gte0 = builder.AddInstruction( HloInstruction::CreateGetTupleElement(shape, all_reduce, 0)); HloInstruction* gte1 = builder.AddInstruction( diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index 368a3876f8c..bc099371d08 100755 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -400,6 +400,7 @@ StatusOr> HloInstruction::CreateFromProto( /*replica_groups=*/ std::vector(proto.replica_groups().begin(), proto.replica_groups().end()), + /*constrain_layout=*/proto.constrain_layout(), /*channel_id=*/channel_id); break; } @@ -900,10 +901,11 @@ HloInstruction::CreateReducePrecision(const Shape& shape, /* static */ std::unique_ptr HloInstruction::CreateAllReduce( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, - const std::vector& replica_groups, + const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id) { return absl::make_unique( - shape, operands, reduce_computation, replica_groups, channel_id); + shape, operands, reduce_computation, replica_groups, constrain_layout, + channel_id); } /* static */ std::unique_ptr HloInstruction::CreateAllToAll( @@ -1341,7 +1343,8 @@ bool HloInstruction::HasSideEffectNoRecurse() const { case HloOpcode::kTrace: return true; case HloOpcode::kAllReduce: - return channel_id().has_value(); + return channel_id().has_value() || + Cast(this)->constrain_layout(); case HloOpcode::kCustomCall: return Cast(this) ->custom_call_has_side_effect(); diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index 5855911650d..238a96e52a0 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -607,7 +607,7 @@ class HloInstruction { static std::unique_ptr CreateAllReduce( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, - const std::vector& replica_groups, + const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id); // An all-to-all op takes N array operands of the same shape and scatters them diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc index 9448feb7d8a..a150efd8c83 100644 --- a/tensorflow/compiler/xla/service/hlo_instructions.cc +++ b/tensorflow/compiler/xla/service/hlo_instructions.cc @@ -553,10 +553,11 @@ bool HloCollectiveInstruction::IdenticalSlowPath( HloAllReduceInstruction::HloAllReduceInstruction( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, - const std::vector& replica_groups, + const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id) : HloCollectiveInstruction(HloOpcode::kAllReduce, shape, operands, - 
replica_groups, channel_id) { + replica_groups, channel_id), + constrain_layout_(constrain_layout) { AppendComputation(reduce_computation); } @@ -569,12 +570,29 @@ bool HloAllReduceInstruction::IsNoop() const { return !channel_id(); } +HloInstructionProto HloAllReduceInstruction::ToProto() const { + HloInstructionProto proto = HloCollectiveInstruction::ToProto(); + proto.set_constrain_layout(constrain_layout_); + return proto; +} + +std::vector HloAllReduceInstruction::ExtraAttributesToStringImpl( + const HloPrintOptions& options) const { + std::vector result = + HloCollectiveInstruction::ExtraAttributesToStringImpl(options); + if (constrain_layout_) { + result.push_back("constrain_layout=true"); + } + return result; +} + bool HloAllReduceInstruction::IdenticalSlowPath( const HloInstruction& other, const std::function& eq_computations) const { const auto& casted_other = static_cast(other); return HloCollectiveInstruction::IdenticalSlowPath(other, eq_computations) && + constrain_layout() == casted_other.constrain_layout() && eq_computations(to_apply(), casted_other.to_apply()); } @@ -583,7 +601,8 @@ HloAllReduceInstruction::CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* /*context*/) const { return absl::make_unique( - shape, new_operands, to_apply(), replica_groups(), channel_id()); + shape, new_operands, to_apply(), replica_groups(), constrain_layout(), + channel_id()); } HloAllToAllInstruction::HloAllToAllInstruction( diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h index 8950e6218e3..1863c78e7e1 100755 --- a/tensorflow/compiler/xla/service/hlo_instructions.h +++ b/tensorflow/compiler/xla/service/hlo_instructions.h @@ -336,13 +336,33 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { explicit HloAllReduceInstruction( const Shape& shape, absl::Span operands, HloComputation* reduce_computation, - const std::vector& replica_groups, + const std::vector& replica_groups, bool constrain_layout, const absl::optional& channel_id); // Returns true if the AllReduce does no communication, so it's equivalent // to a mem copy. bool IsNoop() const; + // Returns true if the layout of the AllReduce is enforced by XLA client (as + // the layout set in the shape). The only reason for the client to set the + // layout is to separately compile computations that communicate with + // AllReduce. Since this field is only set `true` by the client, the compiler + // only needs to propagate existing values (e.g., Clone, X64Rewriter) or set + // `false` for all other cases. + // + // When this is `true`, there may be communication endpoints outside the + // current compilation unit, so the compiler considers this AllReduce as + // side-effecting to disable compiler transformations. The compiler is free to + // transform unconstrained AllReduces differently across compilation units. + // It is an error for an HloModule to have a mix of constrained and + // unconstrained AllReduce instructions (checked by HloVerifier). 
+ bool constrain_layout() const { return constrain_layout_; } + + protected: + std::vector ExtraAttributesToStringImpl( + const HloPrintOptions& options) const override; + HloInstructionProto ToProto() const override; + private: bool IdenticalSlowPath( const HloInstruction& other, @@ -353,6 +373,8 @@ class HloAllReduceInstruction : public HloCollectiveInstruction { std::unique_ptr CloneWithNewOperandsImpl( const Shape& shape, absl::Span new_operands, HloCloneContext* context) const override; + + bool constrain_layout_; }; class HloAllToAllInstruction : public HloCollectiveInstruction { diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc index ef58b37b469..3ecd0af3480 100644 --- a/tensorflow/compiler/xla/service/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -857,11 +857,14 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, optional to_apply; optional> replica_group_ids; optional channel_id; + optional constrain_layout; attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, &to_apply}; attrs["replica_groups"] = {/*required=*/false, AttrTy::kBracedInt64ListList, &tmp_groups}; attrs["channel_id"] = {/*required=*/false, AttrTy::kInt64, &channel_id}; + attrs["constrain_layout"] = {/*required=*/false, AttrTy::kBool, + &constrain_layout}; if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { return false; } @@ -870,7 +873,8 @@ bool HloParserImpl::ParseInstructionRhs(HloComputation::Builder* builder, replica_groups = CreateReplicaGroups(*tmp_groups); } instruction = builder->AddInstruction(HloInstruction::CreateAllReduce( - shape, operands, *to_apply, replica_groups, channel_id)); + shape, operands, *to_apply, replica_groups, + constrain_layout ? *constrain_layout : false, channel_id)); break; } case HloOpcode::kAllToAll: { diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc index a522b1ddbfe..29a6a5e4297 100644 --- a/tensorflow/compiler/xla/service/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -1472,6 +1472,24 @@ ENTRY AllReduceWithSubgroups { ROOT all-reduce = f32[128,32]{0,1} all-reduce(input), replica_groups={{0,1},{2,3}}, to_apply=add } +)" +}, +// all-reduce with constrained layout +{ +"AllReduceWithLayout", +R"(HloModule CRS + +add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY CRS { + input = f32[8]{0} parameter(0) + ROOT crs = f32[8]{0} all-reduce(input), replica_groups={}, constrain_layout=true, to_apply=add +} + )" }, // all-reduce with all-reduce-id diff --git a/tensorflow/compiler/xla/service/hlo_query.cc b/tensorflow/compiler/xla/service/hlo_query.cc index f968a4a9445..defd6abd8f6 100644 --- a/tensorflow/compiler/xla/service/hlo_query.cc +++ b/tensorflow/compiler/xla/service/hlo_query.cc @@ -16,6 +16,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/literal.h" +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" +#include "tensorflow/compiler/xla/service/hlo_instructions.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -119,5 +121,17 @@ bool ContainsInstrWithOpcode(const HloComputation* comp, return false; } +bool ContainsLayoutConstrainedAllReduce(const HloModule& module) { + for (auto computation : module.computations()) { + for (auto hlo : computation->instructions()) { + if (hlo->opcode() == HloOpcode::kAllReduce && + DynCast(hlo)->constrain_layout()) { + return true; + } + } + } + return false; +} + } // namespace hlo_query } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_query.h b/tensorflow/compiler/xla/service/hlo_query.h index 215051f8834..0ea36ae83f8 100644 --- a/tensorflow/compiler/xla/service/hlo_query.h +++ b/tensorflow/compiler/xla/service/hlo_query.h @@ -19,6 +19,7 @@ limitations under the License. #include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" namespace xla { @@ -72,6 +73,10 @@ bool MatchBinaryInstructionOperandOpcode(HloOpcode opcode, HloInstruction** matching_operand, HloInstruction** other_operand); +// Returns whether the module contains all-reduce instructions with constrained +// layout. +bool ContainsLayoutConstrainedAllReduce(const HloModule& module); + } // namespace hlo_query } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 4d460ee30ca..1218f7dfc6f 100755 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -1310,6 +1310,29 @@ Status VerifyAsynchronousCopies(const HloModule& module) { return Status::OK(); } +// Checks that AllReduce instructions in the module are either all layout +// constrained or all unconstrained. +Status VerifyLayoutConstrainedAllReduce(const HloModule& module) { + const HloAllReduceInstruction* reference = nullptr; + for (const HloComputation* computation : module.computations()) { + for (const HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() != HloOpcode::kAllReduce) { + continue; + } + auto all_reduce = DynCast(instruction); + if (!reference) { + reference = all_reduce; + } + if (reference->constrain_layout() != all_reduce->constrain_layout()) { + return FailedPrecondition( + "HloModule has a mix of layout constrained and unconstrained " + "AllReduce instructions."); + } + } + } + return Status::OK(); +} + // Checks various invariants of send and recv instructions. 
Status VerifySendsAndRecvs(const HloModule& module) { absl::flat_hash_map host_channels; @@ -1697,6 +1720,7 @@ StatusOr HloVerifier::Run(HloModule* module) { })); TF_RETURN_IF_ERROR(module->dynamic_parameter_binding().Verify(*module)); + TF_RETURN_IF_ERROR(VerifyLayoutConstrainedAllReduce(*module)); return false; } diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc index df603102157..1b273909991 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc @@ -988,5 +988,30 @@ TEST_F(HloVerifierTest, FusionShapeVerifier) { HasSubstr("Fused computation shape")); } +TEST_F(HloVerifierTest, AllReduceVerifier) { + const char* const kModuleStr = R"( + HloModule test + + add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) + } + + ENTRY entry { + input = f32[8,12]{0,1} parameter(0) + crs0 = f32[8,12]{0,1} all-reduce(input), replica_groups={}, to_apply=add + crs1 = f32[8,12]{0,1} all-reduce(input), replica_groups={}, to_apply=add, + constrain_layout=true + ROOT result = (f32[8,12]{0,1}, f32[8,12]{0,1}) tuple(crs0, crs1) + } + )"; + TF_ASSERT_OK_AND_ASSIGN(auto module, + ParseAndReturnUnverifiedModule(kModuleStr)); + EXPECT_THAT( + verifier().Run(module.get()).status().error_message(), + HasSubstr("mix of layout constrained and unconstrained AllReduce")); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 81a42de6816..defaf4cd7ab 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -432,6 +432,12 @@ bool IsLayoutConstrainedCustomCall(HloInstruction* instruction) { return custom_call != nullptr && custom_call->layout_constrained(); } +bool IsLayoutConstrainedAllReduce(HloInstruction* instruction) { + const HloAllReduceInstruction* all_reduce = + DynCast(instruction); + return all_reduce != nullptr && all_reduce->constrain_layout(); +} + } // namespace Status LayoutAssignment::AddMandatoryConstraints( @@ -516,6 +522,9 @@ Status LayoutAssignment::AddMandatoryConstraints( TF_RETURN_IF_ERROR( constraints->SetBufferLayout(new_shape.layout(), *buffer)); } + } else if (IsLayoutConstrainedAllReduce(instruction)) { + TF_RETURN_IF_ERROR( + constraints->SetInstructionLayout(instruction->shape(), instruction)); } else if (instruction->IsCrossModuleAllReduce()) { CHECK(get_channel_constraints(instruction)) << "Multi-module layout assignment requires ChannelLayoutConstraints"; @@ -1765,7 +1774,8 @@ Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { } // Some instructions carry mandatory layouts in their shape. if (instruction->opcode() != HloOpcode::kInfeed && - !IsLayoutConstrainedCustomCall(instruction)) { + !IsLayoutConstrainedCustomCall(instruction) && + !IsLayoutConstrainedAllReduce(instruction)) { LayoutUtil::ClearLayout(instruction->mutable_shape()); } } From a7a97eaf9731017aaaf90347956776fcc0eb7992 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 09:26:48 -0800 Subject: [PATCH 263/383] Add op stack class for op processing. 
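A minimal usage sketch of the OpStack class introduced below; the `KernelInfo` payload type, the op ids, and the `OpStackExample` function are hypothetical and only illustrate the intended pairing of start and end events:

```c++
#include <memory>
#include <string>
#include <utility>

#include "tensorflow/core/profiler/convert/op_stack.h"

namespace tensorflow {
namespace profiler {

// Hypothetical per-op payload tracked while the op is still "open".
struct KernelInfo {
  std::string name;
};

void OpStackExample() {
  OpStack<KernelInfo> stack;

  // Push an entry when an op's start event is seen.
  auto matmul = std::make_unique<KernelInfo>();
  matmul->name = "MatMul";
  stack.Push(/*op_id=*/1, std::move(matmul));

  auto relu = std::make_unique<KernelInfo>();
  relu->name = "Relu";
  stack.Push(/*op_id=*/2, std::move(relu));

  // Top() peeks at the most recently pushed, still-open op.
  KernelInfo* open_op = stack.Top();  // points at the "Relu" entry

  // Pop when the matching end event arrives; entries pushed after the
  // requested id (none in this case) are discarded while searching.
  std::unique_ptr<KernelInfo> finished = stack.Pop(/*op_id=*/2);

  (void)open_op;
  (void)finished;
}

}  // namespace profiler
}  // namespace tensorflow
```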
PiperOrigin-RevId: 284199882 Change-Id: I6c86c024a03d0677c5b77e0a749f85898fb6f4c1 --- tensorflow/core/profiler/convert/BUILD | 8 +++ tensorflow/core/profiler/convert/op_stack.h | 69 +++++++++++++++++++++ 2 files changed, 77 insertions(+) create mode 100644 tensorflow/core/profiler/convert/op_stack.h diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 51af083ca5b..cfb50111c1b 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -15,6 +15,14 @@ cc_library( ], ) +cc_library( + name = "op_stack", + hdrs = ["op_stack.h"], + deps = [ + "//tensorflow/core:lib", + ], +) + cc_library( name = "op_stats_to_tf_stats", srcs = ["op_stats_to_tf_stats.cc"], diff --git a/tensorflow/core/profiler/convert/op_stack.h b/tensorflow/core/profiler/convert/op_stack.h new file mode 100644 index 00000000000..6bfa4d77643 --- /dev/null +++ b/tensorflow/core/profiler/convert/op_stack.h @@ -0,0 +1,69 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STACK_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STACK_H_ + +#include +#include +#include + +#include "tensorflow/core/platform/types.h" + +namespace tensorflow { +namespace profiler { + +template +class OpStack { + public: + // Pushes an Op onto the stack. + void Push(uint32 op_id, std::unique_ptr op_info) { + stack_.emplace_back(op_id, std::move(op_info)); + } + + // Pops the Op with the given op_id from the stack. + std::unique_ptr Pop(uint32 op_id) { + // Pop until match or stack_ is empty. + std::unique_ptr result; + while (!stack_.empty()) { + auto back = std::move(stack_.back()); + stack_.pop_back(); + if (op_id == back.first) { + result = std::move(back.second); + break; + } + } + return result; + } + + // Returns the Op at the top of the stack. + OpInfo* Top() const { + return stack_.empty() ? nullptr : stack_.back().second.get(); + } + + // Returns true if the stack is empty. + bool Empty() const { return stack_.empty(); } + + // Clears the stack. + void Clear() { stack_.clear(); } + + private: + std::vector>> stack_; +}; + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STACK_H_ From 68fb5ac01c1724b1f117a02114c63bd406a56cc8 Mon Sep 17 00:00:00 2001 From: "Alexandre E. 
Eichenberger" Date: Fri, 6 Dec 2019 09:40:12 -0800 Subject: [PATCH 264/383] fix examples in comments Closes #301 COPYBARA_INTEGRATE_REVIEW=https://github.com/tensorflow/mlir/pull/301 from AlexandreEichenberger:vect-doc-update 7e5418a9101a4bdad2357882fe660b02bba8bd01 PiperOrigin-RevId: 284202462 Change-Id: Icc3f89105534fa06821433caae97f38f74a8a205 --- third_party/mlir/lib/Transforms/Vectorize.cpp | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/third_party/mlir/lib/Transforms/Vectorize.cpp b/third_party/mlir/lib/Transforms/Vectorize.cpp index c1e0a9c0e13..16b43e3f136 100644 --- a/third_party/mlir/lib/Transforms/Vectorize.cpp +++ b/third_party/mlir/lib/Transforms/Vectorize.cpp @@ -306,10 +306,10 @@ using namespace mlir; /// terminal processing out of the use-def chains starting from loads. In the /// following snippet, there is simply no load:: /// ```mlir -/// mlfunc @fill(%A : memref<128xf32>) -> () { +/// func @fill(%A : memref<128xf32>) -> () { /// %f1 = constant 1.0 : f32 /// affine.for %i0 = 0 to 32 { -/// store %f1, %A[%i0] : memref<128xf32, 0> +/// affine.store %f1, %A[%i0] : memref<128xf32, 0> /// } /// return /// } @@ -322,7 +322,7 @@ using namespace mlir; /// vectorize by a factor 128, we want to transform the following input: /// ```mlir /// affine.for %i = %M to %N { -/// %a = load A[%i] : memref +/// %a = affine.load %A[%i] : memref /// } /// ``` /// @@ -332,7 +332,7 @@ using namespace mlir; /// ```mlir /// affine.for %i = floor(%M, 128) to ceil(%N, 128) { /// affine.for %ii = max(%M, 128 * %i) to min(%N, 128*%i + 127) { -/// %a = load A[%ii] : memref +/// %a = affine.load %A[%ii] : memref /// } /// } /// ``` @@ -341,7 +341,7 @@ using namespace mlir; /// scheduling, so we want to generate a pattern that resembles: /// ```mlir /// affine.for %i = ? to ? step ? { -/// %v_a = vector.transfer_read A[%i] : memref, vector<128xf32> +/// %v_a = vector.transfer_read %A[%i] : memref, vector<128xf32> /// } /// ``` /// @@ -361,7 +361,7 @@ using namespace mlir; /// abstraction of size 128 returns code similar to: /// ```mlir /// affine.for %i = %M to %N step 128 { -/// %v_a = vector.transfer_read A[%i] : memref, vector<128xf32> +/// %v_a = vector.transfer_read %A[%i] : memref, vector<128xf32> /// } /// ``` /// @@ -382,7 +382,7 @@ using namespace mlir; /// ========= /// Consider the following Function: /// ```mlir -/// mlfunc @vector_add_2d(%M : index, %N : index) -> f32 { +/// func @vector_add_2d(%M : index, %N : index) -> f32 { /// %A = alloc (%M, %N) : memref /// %B = alloc (%M, %N) : memref /// %C = alloc (%M, %N) : memref @@ -391,19 +391,19 @@ using namespace mlir; /// affine.for %i0 = 0 to %M { /// affine.for %i1 = 0 to %N { /// // non-scoped %f1 -/// store %f1, %A[%i0, %i1] : memref +/// affine.store %f1, %A[%i0, %i1] : memref /// } /// } /// affine.for %i2 = 0 to %M { /// affine.for %i3 = 0 to %N { /// // non-scoped %f2 -/// store %f2, %B[%i2, %i3] : memref +/// affine.store %f2, %B[%i2, %i3] : memref /// } /// } /// affine.for %i4 = 0 to %M { /// affine.for %i5 = 0 to %N { -/// %a5 = load %A[%i4, %i5] : memref -/// %b5 = load %B[%i4, %i5] : memref +/// %a5 = affine.load %A[%i4, %i5] : memref +/// %b5 = affine.load %B[%i4, %i5] : memref /// %s5 = addf %a5, %b5 : f32 /// // non-scoped %f1 /// %s6 = addf %s5, %f1 : f32 @@ -411,7 +411,7 @@ using namespace mlir; /// %s7 = addf %s5, %f2 : f32 /// // diamond dependency. 
/// %s8 = addf %s7, %s6 : f32 -/// store %s8, %C[%i4, %i5] : memref +/// affine.store %s8, %C[%i4, %i5] : memref /// } /// } /// %c7 = constant 7 : index @@ -421,15 +421,14 @@ using namespace mlir; /// } /// ``` /// -/// TODO(ntv): update post b/119731251. -/// The -vectorize pass with the following arguments: +/// The -affine-vectorize pass with the following arguments: /// ``` -/// -vectorize -virtual-vector-size 256 --test-fastest-varying=0 +/// -affine-vectorize -virtual-vector-size 256 --test-fastest-varying=0 /// ``` /// /// produces this standard innermost-loop vectorized code: /// ```mlir -/// mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { +/// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { /// %0 = alloc(%arg0, %arg1) : memref /// %1 = alloc(%arg0, %arg1) : memref /// %2 = alloc(%arg0, %arg1) : memref @@ -476,16 +475,15 @@ using namespace mlir; /// } /// ``` /// -/// TODO(ntv): update post b/119731251. -/// The -vectorize pass with the following arguments: +/// The -affine-vectorize pass with the following arguments: /// ``` -/// -vectorize -virtual-vector-size 32 -virtual-vector-size 256 +/// -affine-vectorize -virtual-vector-size 32 -virtual-vector-size 256 /// --test-fastest-varying=1 --test-fastest-varying=0 /// ``` /// /// produces this more interesting mixed outer-innermost-loop vectorized code: /// ```mlir -/// mlfunc @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { +/// func @vector_add_2d(%arg0 : index, %arg1 : index) -> f32 { /// %0 = alloc(%arg0, %arg1) : memref /// %1 = alloc(%arg0, %arg1) : memref /// %2 = alloc(%arg0, %arg1) : memref From e3b2203323c578cc9a3e1a5bca51d00c050cb18e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 09:44:21 -0800 Subject: [PATCH 265/383] Use std::rint in scalar_round_op_google PiperOrigin-RevId: 284203337 Change-Id: I9dcdd1f9df1fa8eab89eced0498b8387d3871f73 --- tensorflow/core/kernels/cwise_ops.h | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h index a2b8748dd9a..fdcc1e1e49c 100644 --- a/tensorflow/core/kernels/cwise_ops.h +++ b/tensorflow/core/kernels/cwise_ops.h @@ -568,20 +568,6 @@ struct scalar_round_op_google { } }; -template <> -struct scalar_round_op_google { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE float operator()(const float& x) const { - return std::rint(x); - } -}; -template <> -struct scalar_round_op_google { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double operator()( - const double& x) const { - return std::rint(x); - } -}; - template struct scalar_round_op_google { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar From 8ea11fcde73a4d8cdebcd223be7052c2141b0240 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 6 Dec 2019 10:04:34 -0800 Subject: [PATCH 266/383] Make profiler classes do nothing if IS_MOBILE_PLATFORM PiperOrigin-RevId: 284207580 Change-Id: I7faf2df1fe24b34d449bcb69d66ff946942f1a30 --- tensorflow/core/profiler/internal/BUILD | 6 ---- tensorflow/core/profiler/lib/BUILD | 5 +-- .../core/profiler/lib/profiler_session.cc | 13 +++++++ .../core/profiler/lib/scoped_annotation.h | 21 ++++++++++- tensorflow/core/profiler/lib/traceme.h | 35 +++++++++++++------ 5 files changed, 60 insertions(+), 20 deletions(-) diff --git a/tensorflow/core/profiler/internal/BUILD b/tensorflow/core/profiler/internal/BUILD index 2b59dab4a75..a69806ef639 100644 --- a/tensorflow/core/profiler/internal/BUILD +++ b/tensorflow/core/profiler/internal/BUILD @@ -428,13 +428,7 @@ cc_library( filegroup( 
name = "mobile_srcs", srcs = [ - "annotation_stack.cc", - "annotation_stack.h", - "profiler_factory.cc", - "profiler_factory.h", "profiler_interface.h", - "traceme_recorder.cc", - "traceme_recorder.h", ], visibility = ["//visibility:public"], ) diff --git a/tensorflow/core/profiler/lib/BUILD b/tensorflow/core/profiler/lib/BUILD index 4a0c0107e21..54b85b03045 100644 --- a/tensorflow/core/profiler/lib/BUILD +++ b/tensorflow/core/profiler/lib/BUILD @@ -21,6 +21,7 @@ cc_library( ] + select({ "//tensorflow:android": [], "//conditions:default": [ + "//tensorflow/core/platform", "//tensorflow/core:core_cpu_lib", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", @@ -47,6 +48,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/core:lib", + "//tensorflow/core/platform", "//tensorflow/core/profiler/internal:traceme_recorder", "@com_google_absl//absl/strings", ], @@ -58,6 +60,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/core:lib", + "//tensorflow/core/platform", "//tensorflow/core/profiler/internal:annotation_stack", "@com_google_absl//absl/strings", ], @@ -75,8 +78,6 @@ filegroup( srcs = [ "profiler_session.cc", "profiler_session.h", - "profiler_utils.cc", - "profiler_utils.h", "scoped_annotation.h", "traceme.h", ], diff --git a/tensorflow/core/profiler/lib/profiler_session.cc b/tensorflow/core/profiler/lib/profiler_session.cc index 340cd639c5f..3f69e5ad624 100644 --- a/tensorflow/core/profiler/lib/profiler_session.cc +++ b/tensorflow/core/profiler/lib/profiler_session.cc @@ -24,9 +24,12 @@ limitations under the License. #include "tensorflow/core/common_runtime/step_stats_collector.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/platform.h" #include "tensorflow/core/platform/types.h" +#if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/profiler/internal/profiler_factory.h" #include "tensorflow/core/profiler/lib/profiler_utils.h" +#endif #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/protobuf/trace_events.pb.h" @@ -194,7 +197,9 @@ Status ProfilerSession::CollectData(RunMetadata* run_metadata) { if (active_) { // Allow another session to start. +#if !defined(IS_MOBILE_PLATFORM) profiler::ReleaseProfilerLock(); +#endif active_ = false; } @@ -215,7 +220,11 @@ Status ProfilerSession::SerializeToString(string* content) { } ProfilerSession::ProfilerSession(const profiler::ProfilerOptions& options) +#if !defined(IS_MOBILE_PLATFORM) : active_(profiler::AcquireProfilerLock()), +#else + : active_(false), +#endif start_time_micros_(Env::Default()->NowNanos() / EnvTime::kMicrosToNanos) { if (!active_) { status_ = tensorflow::Status(error::UNAVAILABLE, @@ -225,7 +234,9 @@ ProfilerSession::ProfilerSession(const profiler::ProfilerOptions& options) LOG(INFO) << "Profiler session started."; +#if !defined(IS_MOBILE_PLATFORM) CreateProfilers(options, &profilers_); +#endif status_ = Status::OK(); for (auto& profiler : profilers_) { @@ -244,7 +255,9 @@ ProfilerSession::~ProfilerSession() { if (active_) { // Allow another session to start. 
+#if !defined(IS_MOBILE_PLATFORM) profiler::ReleaseProfilerLock(); +#endif } } } // namespace tensorflow diff --git a/tensorflow/core/profiler/lib/scoped_annotation.h b/tensorflow/core/profiler/lib/scoped_annotation.h index 1ba6982b8f1..61b0cf42dd6 100644 --- a/tensorflow/core/profiler/lib/scoped_annotation.h +++ b/tensorflow/core/profiler/lib/scoped_annotation.h @@ -21,8 +21,11 @@ limitations under the License. #include "absl/strings/string_view.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/platform.h" #include "tensorflow/core/platform/types.h" +#if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/profiler/internal/annotation_stack.h" +#endif namespace tensorflow { namespace profiler { @@ -39,31 +42,39 @@ namespace profiler { class ScopedAnnotation { public: explicit ScopedAnnotation(absl::string_view name) { +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { old_length_ = AnnotationStack::PushAnnotation(name); } +#endif } explicit ScopedAnnotation(const char* name) : ScopedAnnotation(absl::string_view(name)) {} explicit ScopedAnnotation(const string& name) { +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { old_length_ = AnnotationStack::PushAnnotation(name); } +#endif } explicit ScopedAnnotation(string&& name) { +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { old_length_ = AnnotationStack::PushAnnotation(std::move(name)); } +#endif } template explicit ScopedAnnotation(NameGeneratorT name_generator) { +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(AnnotationStack::IsEnabled())) { old_length_ = AnnotationStack::PushAnnotation(name_generator()); } +#endif } // Pops the name passed in the constructor from the current annotation. @@ -71,12 +82,20 @@ class ScopedAnnotation { // TODO(b/137971921): without this memory fence, two presubmit tests will // fail probably due to compiler in that presubmit config. std::atomic_thread_fence(std::memory_order_acquire); +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(old_length_ != kInvalidLength)) { AnnotationStack::PopAnnotation(old_length_); } +#endif } - static bool IsEnabled() { return AnnotationStack::IsEnabled(); } + static bool IsEnabled() { +#if !defined(IS_MOBILE_PLATFORM) + return AnnotationStack::IsEnabled(); +#else + return false; +#endif + } private: // signals that annotation is disabled at the constructor. diff --git a/tensorflow/core/profiler/lib/traceme.h b/tensorflow/core/profiler/lib/traceme.h index 821c5eaf9d2..2937a3483ac 100644 --- a/tensorflow/core/profiler/lib/traceme.h +++ b/tensorflow/core/profiler/lib/traceme.h @@ -15,14 +15,15 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ #define TENSORFLOW_CORE_PROFILER_LIB_TRACEME_H_ -#include - #include "absl/strings/string_view.h" #include "tensorflow/core/platform/env_time.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/platform.h" #include "tensorflow/core/platform/types.h" +#if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/profiler/internal/traceme_recorder.h" +#endif namespace tensorflow { namespace profiler { @@ -80,12 +81,12 @@ class TraceMe { // out their host traces based on verbosity. 
explicit TraceMe(absl::string_view activity_name, int level = 1) { DCHECK_GE(level, 1); +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { new (&no_init_.name) string(activity_name); start_time_ = EnvTime::NowNanos(); - } else { - start_time_ = kUntracedActivity; } +#endif } // string&& constructor to prevent an unnecessary string copy, e.g. when a @@ -95,12 +96,12 @@ class TraceMe { // constructor so we avoid copying them when tracing is disabled. explicit TraceMe(string &&activity_name, int level = 1) { DCHECK_GE(level, 1); +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { new (&no_init_.name) string(std::move(activity_name)); start_time_ = EnvTime::NowNanos(); - } else { - start_time_ = kUntracedActivity; } +#endif } // Do not allow passing strings by reference or value since the caller @@ -125,12 +126,12 @@ class TraceMe { template explicit TraceMe(NameGeneratorT name_generator, int level = 1) { DCHECK_GE(level, 1); +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { new (&no_init_.name) string(name_generator()); start_time_ = EnvTime::NowNanos(); - } else { - start_time_ = kUntracedActivity; } +#endif } // Stop tracing the activity. Called by the destructor, but exposed to allow @@ -145,6 +146,7 @@ class TraceMe { // spuriously record the event. This is extremely rare, and acceptable as // event will be discarded when its start timestamp fall outside of the // start/stop session timestamp. +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(start_time_ != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { TraceMeRecorder::Record({kCompleteActivity, std::move(no_init_.name), @@ -153,6 +155,7 @@ class TraceMe { no_init_.name.~string(); start_time_ = kUntracedActivity; } +#endif } ~TraceMe() { Stop(); } @@ -162,6 +165,7 @@ class TraceMe { // Record the start time of an activity. // Returns the activity ID, which is used to stop the activity. static uint64 ActivityStart(absl::string_view name, int level = 1) { +#if !defined(IS_MOBILE_PLATFORM) if (TF_PREDICT_FALSE(TraceMeRecorder::Active(level))) { uint64 activity_id = TraceMeRecorder::NewActivityId(); TraceMeRecorder::Record({activity_id, string(name), @@ -169,21 +173,30 @@ class TraceMe { /*end_time=*/0}); return activity_id; } +#endif return kUntracedActivity; } // Record the end time of an activity started by ActivityStart(). static void ActivityEnd(uint64 activity_id) { - // We don't check the level again (see ~TraceMe()). +#if !defined(IS_MOBILE_PLATFORM) + // We don't check the level again (see TraceMe::Stop()). if (TF_PREDICT_FALSE(activity_id != kUntracedActivity)) { if (TF_PREDICT_TRUE(TraceMeRecorder::Active())) { TraceMeRecorder::Record({activity_id, /*name=*/"", /*start_time=*/0, /*end_time=*/EnvTime::NowNanos()}); } } +#endif } - static bool Active(int level = 1) { return TraceMeRecorder::Active(level); } + static bool Active(int level = 1) { +#if !defined(IS_MOBILE_PLATFORM) + return TraceMeRecorder::Active(level); +#else + return false; +#endif + } private: // Activity ID or start time used when tracing is disabled. 
@@ -201,7 +214,7 @@ class TraceMe { string name; } no_init_; - uint64 start_time_; + uint64 start_time_ = kUntracedActivity; }; } // namespace profiler From eb95904f1e41f827ea971c9e2c12a9e780b08b9e Mon Sep 17 00:00:00 2001 From: George Karpenkov Date: Fri, 6 Dec 2019 10:05:55 -0800 Subject: [PATCH 267/383] [XLA/GPU] [NFC] Simplify code for GetNumberOfPartialResults PiperOrigin-RevId: 284207886 Change-Id: Ibf97d55454bf6b97fea2f271aa7469c1af5d9898 --- .../compiler/xla/service/gpu/ir_emitter_unnested.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 0e62e27bd99..2f8fd5e01cf 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1988,10 +1988,11 @@ static int GetNumberOfPartialResults( if (reduction_info.IsRowReduction()) { return 1; } - int64 num_thread = mapping_scheme.GetNumberOfThreadsForDimensionX(); - int64 tile_size = mapping_scheme.GetTileSizeForDimensionX(); - CHECK_EQ(tile_size % num_thread, 0); - return tile_size / num_thread; + int64 num_partial_results = mapping_scheme.DilatedX() ? 1 : 2; + CHECK_EQ(num_partial_results, + (mapping_scheme.GetTileSizeForDimensionX() / + mapping_scheme.GetNumberOfThreadsForDimensionX())); + return num_partial_results; } void IrEmitterUnnested::EmitPrologueForOneReduction( From a034a3ad800056d8838309c84bc1ea8d9a58cd3e Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 10:08:15 -0800 Subject: [PATCH 268/383] Add conversions of GPU func with memory attributions to LLVM/NVVM GPU functions use memory attributions, a combination of Op attributes and region arguments, to specify function-wide buffers placed in workgroup or private memory spaces. Introduce a lowering pattern for GPU functions to be converted to LLVM functions taking into account memory attributions. Workgroup attributions get transformed into module-level globals with unique names derived from function names. Private attributions get converted into llvm.allocas inside the function body. In both cases, we inject at the beginning of the function the IR that obtains the raw pointer to the data and populates a MemRef descriptor based on the MemRef type of buffer, making attributions compose with the rest of the MemRef lowering and transparent for use with std.load and std.store. While using raw pointers instead of descriptors might have been more efficient, it is better implemented as a canonicalization or a separate transformation so that non-attribution memrefs could also benefit from it. 
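As a rough sketch of the descriptor-population step described above (not the actual implementation), the following shows how a statically shaped memref attribution could be wrapped in a MemRef descriptor once a raw pointer to its storage is available. The helper name `describeStaticBuffer` is invented, identity (row-major) strides are assumed, and the real pattern relies on the `MemRefDescriptor::fromStaticShape` helper added in this change:

```c++
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"

using namespace mlir;

// Sketch only: populate a descriptor from a raw pointer using the setters
// declared in ConvertStandardToLLVM.h, assuming a static, row-major shape.
static MemRefDescriptor describeStaticBuffer(OpBuilder &builder, Location loc,
                                             LLVMTypeConverter &typeConverter,
                                             MemRefType type, Value *buffer) {
  auto descriptor =
      MemRefDescriptor::undef(builder, loc, typeConverter.convertType(type));
  // Both the allocated and the aligned pointer refer to the attribution's
  // storage (a global for workgroup memory, an alloca for private memory).
  descriptor.setAllocatedPtr(builder, loc, buffer);
  descriptor.setAlignedPtr(builder, loc, buffer);
  descriptor.setConstantOffset(builder, loc, 0);
  // Static sizes and the row-major strides they imply.
  int64_t runningStride = 1;
  for (int64_t i = type.getRank() - 1; i >= 0; --i) {
    descriptor.setConstantSize(builder, loc, i, type.getDimSize(i));
    descriptor.setConstantStride(builder, loc, i, runningStride);
    runningStride *= type.getDimSize(i);
  }
  return descriptor;
}
```

Building a full descriptor, rather than handing out the raw pointer, keeps the attributions compatible with the existing std.load/std.store lowering, at the cost of a few extra registers for the statically known sizes and strides.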
PiperOrigin-RevId: 284208396 Change-Id: Ie330774f90df0c459325fd7146d81cb46da98b39 --- third_party/mlir/BUILD | 1 + .../StandardToLLVM/ConvertStandardToLLVM.h | 12 ++ .../include/mlir/Dialect/GPU/GPUDialect.h | 16 +- .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 145 +++++++++++++++++- .../StandardToLLVM/ConvertStandardToLLVM.cpp | 62 +++++++- 5 files changed, 227 insertions(+), 9 deletions(-) diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index 0854e6be4c3..ebebde633b2 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -700,6 +700,7 @@ cc_library( ":NVVMDialect", ":Pass", ":Transforms", + "@llvm//:support", ], alwayslink = 1, ) diff --git a/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h b/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h index 6b02edaa389..cef80cf0b23 100644 --- a/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h +++ b/third_party/mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h @@ -168,6 +168,13 @@ public: /// Builds IR creating an `undef` value of the descriptor type. static MemRefDescriptor undef(OpBuilder &builder, Location loc, Type descriptorType); + /// Builds IR creating a MemRef descriptor that represents `type` and + /// populates it with static shape and stride information extracted from the + /// type. + static MemRefDescriptor fromStaticShape(OpBuilder &builder, Location loc, + LLVMTypeConverter &typeConverter, + MemRefType type, Value *memory); + /// Builds IR extracting the allocated pointer from the descriptor. Value *allocatedPtr(OpBuilder &builder, Location loc); /// Builds IR inserting the allocated pointer into the descriptor. @@ -184,18 +191,23 @@ public: /// Builds IR inserting the offset into the descriptor. void setOffset(OpBuilder &builder, Location loc, Value *offset); + void setConstantOffset(OpBuilder &builder, Location loc, uint64_t offset); /// Builds IR extracting the pos-th size from the descriptor. Value *size(OpBuilder &builder, Location loc, unsigned pos); /// Builds IR inserting the pos-th size into the descriptor void setSize(OpBuilder &builder, Location loc, unsigned pos, Value *size); + void setConstantSize(OpBuilder &builder, Location loc, unsigned pos, + uint64_t size); /// Builds IR extracting the pos-th size from the descriptor. Value *stride(OpBuilder &builder, Location loc, unsigned pos); /// Builds IR inserting the pos-th stride into the descriptor void setStride(OpBuilder &builder, Location loc, unsigned pos, Value *stride); + void setConstantStride(OpBuilder &builder, Location loc, unsigned pos, + uint64_t stride); /// Returns the (LLVM) type this descriptor points to. LLVM::LLVMType getElementType(); diff --git a/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h index fb906b2ace5..619f76937bc 100644 --- a/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -61,6 +61,10 @@ public: /// 'gpu.kernel' attribute. static bool isKernel(Operation *op); + /// Returns the numeric value used to identify the workgroup memory address + /// space. + static int getWorkgroupAddressSpace() { return 3; } + LogicalResult verifyOperationAttribute(Operation *op, NamedAttribute attr) override; }; @@ -249,6 +253,12 @@ public: return {begin, getBody().front().args_end()}; } + /// Returns the name of the attribute containing the number of buffers located + /// in the workgroup memory. 
+ static StringRef getNumWorkgroupAttributionsAttrName() { + return "workgroup_attibutions"; + } + private: // FunctionLike trait needs access to the functions below. friend class OpTrait::FunctionLike; @@ -257,12 +267,6 @@ private: unsigned getNumFuncArguments() { return getType().getNumInputs(); } unsigned getNumFuncResults() { return getType().getNumResults(); } - /// Returns the name of the attribute containing the number of buffers located - /// in the workgroup memory. - static StringRef getNumWorkgroupAttributionsAttrName() { - return "workgroup_attibutions"; - } - /// Returns the keywords used in the custom syntax for this Op. static StringRef getWorkgroupKeyword() { return "workgroup"; } static StringRef getPrivateKeyword() { return "private"; } diff --git a/third_party/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/third_party/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 54dd18e7492..50a2e2efd2c 100644 --- a/third_party/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/third_party/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -29,6 +29,8 @@ #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/FormatVariadic.h" + #include "../GPUCommon/IndexIntrinsicsOpLowering.h" #include "../GPUCommon/OpToFuncCallLowering.h" @@ -451,6 +453,146 @@ private: static constexpr int kWarpSize = 32; }; +namespace { + +struct FuncOpLowering : LLVMOpLowering { + explicit FuncOpLowering(LLVMTypeConverter &typeConverter) + : LLVMOpLowering(gpu::GPUFuncOp::getOperationName(), + typeConverter.getDialect()->getContext(), + typeConverter) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + assert(operands.empty() && "func op is not expected to have operands"); + auto gpuFuncOp = cast(op); + Location loc = gpuFuncOp.getLoc(); + + SmallVector workgroupBuffers; + workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); + for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { + Value *attribution = en.value(); + + auto type = attribution->getType().dyn_cast(); + assert(type && type.hasStaticShape() && "unexpected type in attribution"); + + uint64_t numElements = type.getNumElements(); + + auto elementType = + lowering.convertType(type.getElementType()).cast(); + auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); + auto addSpaceAttr = rewriter.getNamedAttr( + "addr_space", rewriter.getI32IntegerAttr( + gpu::GPUDialect::getWorkgroupAddressSpace())); + std::string name = + llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()); + auto globalOp = rewriter.create( + gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, + LLVM::Linkage::Internal, name, /*value=*/Attribute(), + llvm::makeArrayRef(addSpaceAttr)); + workgroupBuffers.push_back(globalOp); + } + + // Rewrite the original GPU function to an LLVM function. + // TODO(zinenko): there is a hack in the std->llvm lowering that promotes + // structs to pointers that probably needs to be replicated here. + auto funcType = lowering.convertType(gpuFuncOp.getType()) + .cast() + .getPointerElementTy(); + + // Remap proper input types. + TypeConverter::SignatureConversion signatureConversion( + gpuFuncOp.front().getNumArguments()); + for (unsigned i = 0, e = funcType.getFunctionNumParams(); i < e; ++i) + signatureConversion.addInputs(i, funcType.getFunctionParamType(i)); + + // Create the new function operation. 
Only copy those attributes that are + // not specific to function modeling. + SmallVector attributes; + for (const auto &attr : gpuFuncOp.getAttrs()) { + if (attr.first.is(SymbolTable::getSymbolAttrName()) || + attr.first.is(impl::getTypeAttrName()) || + attr.first.is(gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())) + continue; + attributes.push_back(attr); + } + auto llvmFuncOp = rewriter.create( + gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType, + LLVM::Linkage::External, attributes); + + { + // Insert operations that correspond to converted workgroup and private + // memory attributions to the body of the function. This must operate on + // the original function, before the body region is inlined in the new + // function to maintain the relation between block arguments and the + // parent operation that assigns their semantics. + OpBuilder::InsertionGuard guard(rewriter); + + // Rewrite workgroup memory attributions to addresses of global buffers. + rewriter.setInsertionPointToStart(&gpuFuncOp.front()); + unsigned numProperArguments = gpuFuncOp.getNumArguments(); + auto i32Type = LLVM::LLVMType::getInt32Ty(lowering.getDialect()); + + Value *zero = nullptr; + if (!workgroupBuffers.empty()) + zero = rewriter.create(loc, i32Type, + rewriter.getI32IntegerAttr(0)); + for (auto en : llvm::enumerate(workgroupBuffers)) { + LLVM::GlobalOp global = en.value(); + Value *address = rewriter.create(loc, global); + auto elementType = global.getType().getArrayElementType(); + Value *memory = rewriter.create( + loc, elementType.getPointerTo(global.addr_space().getZExtValue()), + address, ArrayRef{zero, zero}); + + // Build a memref descriptor pointing to the buffer to plug with the + // existing memref infrastructure. This may use more registers than + // otherwise necessary given that memref sizes are fixed, but we can try + // and canonicalize that away later. + Value *attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()]; + auto type = attribution->getType().cast(); + auto descr = MemRefDescriptor::fromStaticShape(rewriter, loc, lowering, + type, memory); + signatureConversion.remapInput(numProperArguments + en.index(), descr); + } + + // Rewrite private memory attributions to alloca'ed buffers. + unsigned numWorkgroupAttributions = + gpuFuncOp.getNumWorkgroupAttributions(); + auto int64Ty = LLVM::LLVMType::getInt64Ty(lowering.getDialect()); + for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) { + Value *attribution = en.value(); + auto type = attribution->getType().cast(); + assert(type && type.hasStaticShape() && + "unexpected type in attribution"); + + auto ptrType = lowering.convertType(type.getElementType()) + .cast() + .getPointerTo(type.getMemorySpace()); + Value *numElements = rewriter.create( + gpuFuncOp.getLoc(), int64Ty, + rewriter.getI64IntegerAttr(type.getNumElements())); + Value *allocated = rewriter.create( + gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0); + auto descr = MemRefDescriptor::fromStaticShape(rewriter, loc, lowering, + type, allocated); + signatureConversion.remapInput( + numProperArguments + numWorkgroupAttributions + en.index(), descr); + } + } + + rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(), + llvmFuncOp.end()); + rewriter.applySignatureConversion(&llvmFuncOp.getBody(), + signatureConversion); + + rewriter.eraseOp(gpuFuncOp); + return matchSuccess(); + } +}; + +} // end namespace + /// Import the GPU Ops to NVVM Patterns. 
#include "GPUToNVVM.cpp.inc" @@ -479,12 +621,13 @@ public: NVVM::BlockIdYOp, NVVM::BlockIdZOp>, GPUIndexIntrinsicOpLowering, - GPUAllReduceOpLowering>(converter); + GPUAllReduceOpLowering, FuncOpLowering>(converter); patterns.insert>(converter, "__nv_expf", "__nv_exp"); ConversionTarget target(getContext()); target.addIllegalDialect(); target.addIllegalOp(); + target.addIllegalOp(); target.addLegalDialect(); target.addLegalDialect(); // TODO(csigg): Remove once we support replacing non-root ops. diff --git a/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp b/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp index 7b15b758968..c1a7a336401 100644 --- a/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp +++ b/third_party/mlir/lib/Conversion/StandardToLLVM/ConvertStandardToLLVM.cpp @@ -304,6 +304,36 @@ MemRefDescriptor MemRefDescriptor::undef(OpBuilder &builder, Location loc, return MemRefDescriptor(descriptor); } +/// Builds IR creating a MemRef descriptor that represents `type` and +/// populates it with static shape and stride information extracted from the +/// type. +MemRefDescriptor +MemRefDescriptor::fromStaticShape(OpBuilder &builder, Location loc, + LLVMTypeConverter &typeConverter, + MemRefType type, Value *memory) { + assert(type.hasStaticShape() && "unexpected dynamic shape"); + assert(type.getAffineMaps().empty() && "unexpected layout map"); + + auto convertedType = typeConverter.convertType(type); + assert(convertedType && "unexpected failure in memref type conversion"); + + auto descr = MemRefDescriptor::undef(builder, loc, convertedType); + descr.setAllocatedPtr(builder, loc, memory); + descr.setAlignedPtr(builder, loc, memory); + descr.setConstantOffset(builder, loc, 0); + + // Fill in sizes and strides, in reverse order to simplify stride + // calculation. + uint64_t runningStride = 1; + for (unsigned i = type.getRank(); i > 0; --i) { + unsigned dim = i - 1; + descr.setConstantSize(builder, loc, dim, type.getDimSize(dim)); + descr.setConstantStride(builder, loc, dim, runningStride); + runningStride *= type.getDimSize(dim); + } + return descr; +} + /// Builds IR extracting the allocated pointer from the descriptor. Value *MemRefDescriptor::allocatedPtr(OpBuilder &builder, Location loc) { return extractPtr(builder, loc, kAllocatedPtrPosInMemRefDescriptor); @@ -326,6 +356,14 @@ void MemRefDescriptor::setAlignedPtr(OpBuilder &builder, Location loc, setPtr(builder, loc, kAlignedPtrPosInMemRefDescriptor, ptr); } +// Creates a constant Op producing a value of `resultType` from an index-typed +// integer attribute. +static Value *createIndexAttrConstant(OpBuilder &builder, Location loc, + Type resultType, int64_t value) { + return builder.create( + loc, resultType, builder.getIntegerAttr(builder.getIndexType(), value)); +} + /// Builds IR extracting the offset from the descriptor. Value *MemRefDescriptor::offset(OpBuilder &builder, Location loc) { return builder.create( @@ -341,6 +379,13 @@ void MemRefDescriptor::setOffset(OpBuilder &builder, Location loc, builder.getI64ArrayAttr(kOffsetPosInMemRefDescriptor)); } +/// Builds IR inserting the offset into the descriptor. +void MemRefDescriptor::setConstantOffset(OpBuilder &builder, Location loc, + uint64_t offset) { + setOffset(builder, loc, + createIndexAttrConstant(builder, loc, indexType, offset)); +} + /// Builds IR extracting the pos-th size from the descriptor. 
Value *MemRefDescriptor::size(OpBuilder &builder, Location loc, unsigned pos) { return builder.create( @@ -356,6 +401,13 @@ void MemRefDescriptor::setSize(OpBuilder &builder, Location loc, unsigned pos, builder.getI64ArrayAttr({kSizePosInMemRefDescriptor, pos})); } +/// Builds IR inserting the pos-th size into the descriptor +void MemRefDescriptor::setConstantSize(OpBuilder &builder, Location loc, + unsigned pos, uint64_t size) { + setSize(builder, loc, pos, + createIndexAttrConstant(builder, loc, indexType, size)); +} + /// Builds IR extracting the pos-th size from the descriptor. Value *MemRefDescriptor::stride(OpBuilder &builder, Location loc, unsigned pos) { @@ -372,6 +424,13 @@ void MemRefDescriptor::setStride(OpBuilder &builder, Location loc, unsigned pos, builder.getI64ArrayAttr({kStridePosInMemRefDescriptor, pos})); } +/// Builds IR inserting the pos-th stride into the descriptor +void MemRefDescriptor::setConstantStride(OpBuilder &builder, Location loc, + unsigned pos, uint64_t stride) { + setStride(builder, loc, pos, + createIndexAttrConstant(builder, loc, indexType, stride)); +} + LLVM::LLVMType MemRefDescriptor::getElementType() { return value->getType().cast().getStructElementType( kAlignedPtrPosInMemRefDescriptor); @@ -448,8 +507,7 @@ public: // Create an LLVM IR pseudo-operation defining the given index constant. Value *createIndexConstant(ConversionPatternRewriter &builder, Location loc, uint64_t value) const { - auto attr = builder.getIntegerAttr(builder.getIndexType(), value); - return builder.create(loc, getIndexType(), attr); + return createIndexAttrConstant(builder, loc, getIndexType(), value); } protected: From e2f5c41688d8af36756a6d9a97eca6785b30dae9 Mon Sep 17 00:00:00 2001 From: Sean Silva Date: Fri, 6 Dec 2019 10:14:23 -0800 Subject: [PATCH 269/383] Expand casting abilities when lowering to CFG The previous code was relying on the tensor_cast op, which isn't powerful enough for the casts involved when lowering variant types. Use tf.Cast instead, which is powerful enough. 
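Concretely, every place in the lowering that has to reconcile a value's type with a callee signature or block-argument type now emits a tf.Cast with Truncate = false rather than a std.tensor_cast. A minimal sketch of that substitution (it mirrors the repeated pattern in functional_control_flow_to_cfg.cc in the diff below) looks like:

  // Sketch of the cast insertion used while lowering tf.If / tf.While.
  if (val->getType() != expected) {
    // tf.Cast can convert values whose element type is DT_VARIANT, which the
    // previous std.tensor_cast could not express.
    val = builder->create<TF::CastOp>(loc, expected, val,
                                      /*Truncate=*/builder->getBoolAttr(false));
  }
  operands.push_back(val);

This also removes the need for the old has_variant_operand guard, since variant-typed operands no longer have to be rejected up front.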
PiperOrigin-RevId: 284209653 Change-Id: I0dad82a0032ad19710e01f0be46d6153762494df --- .../tests/functional-control-flow-to-cfg.mlir | 104 +++++++++--------- .../functional_control_flow_to_cfg.cc | 37 +++---- 2 files changed, 69 insertions(+), 72 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir index 2a0434b69e0..a0390ec8738 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/functional-control-flow-to-cfg.mlir @@ -49,40 +49,33 @@ func @testIf3Result(tensor, tensor<*xf32>) -> (tensor<*xf32>, tensor<*xi8>, // ----- -func @testIf1Then(tensor<2x?xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> -func @testIf1Else(tensor<*xf32>, tensor<2x?xf32>) -> tensor<*xf32> +func @testIfThen(%arg0: tensor) -> tensor { + return %arg0 : tensor +} +func @testIfElse(%arg0: tensor) -> tensor { + return %arg0 : tensor +} -// CHECK-LABEL: func @testIf1Casts(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<*xf32>) -func @testIf1Casts(tensor, tensor<2x2xf32>, tensor<*xf32>) -> tensor<2x?xf32> { -^bb0(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<*xf32>): - - %1 = "tf.If"(%arg0, %arg1, %arg2) { - then_branch = @testIf1Then, else_branch = @testIf1Else, is_stateless = false - } : (tensor, tensor<2x2xf32>, tensor<*xf32>) -> tensor<2x?xf32> - -// CHECK: %0 = extract_element %arg0[] : tensor -// CHECK: cond_br %0, ^bb1, ^bb2 -// CHECK:^bb1: // pred: ^bb0 -// CHECK: %1 = tensor_cast %arg1 : tensor<2x2xf32> to tensor<2x?xf32> -// CHECK: %2 = tensor_cast %arg2 : tensor<*xf32> to tensor<2x2xf32> -// CHECK: %3 = call @testIf1Then(%1, %2) : (tensor<2x?xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> -// CHECK: %4 = tensor_cast %3 : tensor<2x2xf32> to tensor<2x?xf32> -// CHECK: br ^bb3(%4 : tensor<2x?xf32>) - -// CHECK:^bb2: // pred: ^bb0 -// CHECK: %5 = tensor_cast %arg1 : tensor<2x2xf32> to tensor<*xf32> -// CHECK: %6 = tensor_cast %arg2 : tensor<*xf32> to tensor<2x?xf32> -// CHECK: %7 = call @testIf1Else(%5, %6) : (tensor<*xf32>, tensor<2x?xf32>) -> tensor<*xf32> -// CHECK: %8 = tensor_cast %7 : tensor<*xf32> to tensor<2x?xf32> -// CHECK: br ^bb3(%8 : tensor<2x?xf32>) - -// CHECK:^bb3(%9: tensor<2x?xf32>): // 2 preds: ^bb1, ^bb2 - - %2 = "tf.Add"(%1, %1) : (tensor<2x?xf32>, tensor<2x?xf32>) -> tensor<2x?xf32> -// CHECK: %10 = "tf.Add"(%9, %9) : (tensor<2x?xf32>, tensor<2x?xf32>) -> tensor<2x?xf32> - - return %2 : tensor<2x?xf32> -// CHECK: return %10 : tensor<2x?xf32> +// CHECK-LABEL: func @testIfCasts(%arg0: tensor, %arg1: tensor>>) -> tensor>> +func @testIfCasts(%arg0: tensor, %arg1: tensor>>) -> tensor>> { + %0 = "tf.If"(%arg0, %arg1) { + then_branch = @testIfThen, else_branch = @testIfElse, is_stateless = false + } : (tensor, tensor>>) -> tensor>> + return %0: tensor>> +// CHECK: %0 = extract_element %arg0[] : tensor +// CHECK: cond_br %0, ^bb1, ^bb2 +// CHECK: ^bb1: +// CHECK: %1 = "tf.Cast"(%arg1) {Truncate = false} : (tensor>>) -> tensor +// CHECK: %2 = call @testIfThen(%1) : (tensor) -> tensor +// CHECK: %3 = "tf.Cast"(%2) {Truncate = false} : (tensor) -> tensor>> +// CHECK: br ^bb3(%3 : tensor>>) +// CHECK: ^bb2: +// CHECK: %4 = "tf.Cast"(%arg1) {Truncate = false} : (tensor>>) -> tensor +// CHECK: %5 = call @testIfElse(%4) : (tensor) -> tensor +// CHECK: %6 = "tf.Cast"(%5) {Truncate = false} : (tensor) -> tensor>> +// CHECK: br ^bb3(%6 : tensor>>) +// CHECK: ^bb3(%7: tensor>>): +// CHECK: return %7 : 
tensor>> } // ----- @@ -188,31 +181,36 @@ func @testComplexWhile1Result(tensor<*xf32>) -> (tensor<*xf32>) { // ----- -func @testWhileCond(tensor) -> (tensor) -func @testWhileBody(tensor<*xf32>) -> (tensor) +func @testWhileCond(%arg0: tensor) -> (tensor) { + %true = "tf.Const"() { value = dense : tensor } : () -> (tensor) + return %true : tensor +} +func @testWhileBody(%arg0: tensor>>) -> (tensor>>) { + %0 = "tf.Cast"(%arg0) : (tensor>>) -> tensor>> + return %0 : tensor>> +} -// CHECK-LABEL: func @testWhileCasts(%arg0: tensor<1x3xf32>) -func @testWhileCasts(%arg0: tensor<1x3xf32>) -> (tensor) { +// CHECK-LABEL: func @testWhileCasts(%arg0: tensor>>) -> tensor>> +func @testWhileCasts(%arg0: tensor>>) -> (tensor>>) { %0 = "tf.While"(%arg0) { cond = @testWhileCond, body = @testWhileBody, is_stateless = false - } : (tensor<1x3xf32>) -> (tensor) - -// CHECK: %0 = tensor_cast %arg0 : tensor<1x3xf32> to tensor -// CHECK: br ^bb1(%0 : tensor) -// CHECK: ^bb1(%1: tensor): -// CHECK: %2 = call @testWhileCond(%1) : (tensor) -> tensor + } : (tensor>>) -> (tensor>>) + return %0 : tensor>> +// CHECK: %0 = "tf.Cast"(%arg0) {Truncate = false} : (tensor>>) -> tensor +// CHECK: br ^bb1(%0 : tensor) +// CHECK: ^bb1(%1: tensor): // 2 preds: ^bb0, ^bb2 +// CHECK: %2 = call @testWhileCond(%1) : (tensor) -> tensor // CHECK: %3 = extract_element %2[] : tensor -// CHECK: %4 = tensor_cast %1 : tensor to tensor<*xf32> -// CHECK: cond_br %3, ^bb2(%4 : tensor<*xf32>), ^bb3(%4 : tensor<*xf32>) -// CHECK: ^bb2(%5: tensor<*xf32>): -// CHECK: %6 = call @testWhileBody(%5) : (tensor<*xf32>) -> tensor -// CHECK: %7 = tensor_cast %6 : tensor to tensor -// CHECK: br ^bb1(%7 : tensor) -// CHECK: ^bb3(%8: tensor<*xf32>): -// CHECK: %9 = tensor_cast %8 : tensor<*xf32> to tensor +// CHECK: %4 = "tf.Cast"(%1) {Truncate = false} : (tensor) -> tensor>> +// CHECK: cond_br %3, ^bb2(%4 : tensor>>), ^bb3(%4 : tensor>>) +// CHECK: ^bb2(%5: tensor>>): // pred: ^bb1 +// CHECK: %6 = call @testWhileBody(%5) : (tensor>>) -> tensor>> +// CHECK: %7 = "tf.Cast"(%6) {Truncate = false} : (tensor>>) -> tensor +// CHECK: br ^bb1(%7 : tensor) +// CHECK: ^bb3(%8: tensor>>): // pred: ^bb1 +// CHECK: %9 = "tf.Cast"(%8) {Truncate = false} : (tensor>>) -> tensor>> +// CHECK: return %9 : tensor>> - return %0 : tensor -// CHECK: return %9 : tensor } // ----- diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc index 880b4c4210b..e9b3879c025 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_cfg.cc @@ -17,6 +17,7 @@ limitations under the License. // standard TensorFlow dialect to MLIR Control Flow Graph (CFG) form. 
#include "mlir/Dialect/StandardOps/Ops.h" // TF:local_config_mlir +#include "mlir/IR/Attributes.h" // TF:local_config_mlir #include "mlir/IR/Builders.h" // TF:local_config_mlir #include "mlir/IR/Operation.h" // TF:local_config_mlir #include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir @@ -79,8 +80,11 @@ static Operation* CallFn(Location loc, for (int i = 0; i < num_operands; ++i) { Value* val = get_arg(i); Type expected = fn_type.getInput(i); - if (val->getType() != expected) - val = builder->create(loc, val, expected); + if (val->getType() != expected) { + val = + builder->create(loc, expected, val, + /*Truncate=*/builder->getBoolAttr(false)); + } operands.push_back(val); } return builder->create(loc, fn, operands).getOperation(); @@ -100,8 +104,11 @@ static llvm::SmallVector PrepareValsForJump( for (int i = 0; i < num_vals; ++i) { Value* val = get_val(i); Type expected = block->getArgument(i)->getType(); - if (val->getType() != expected) - val = builder->create(loc, val, expected); + if (val->getType() != expected) { + val = + builder->create(loc, expected, val, + /*Truncate=*/builder->getBoolAttr(false)); + } result.push_back(val); } return result; @@ -131,8 +138,11 @@ static void ReplaceOpResultWithBlockArgs(Location loc, Operation* op, for (unsigned i = 0, e = op->getNumResults(); i != e; ++i) { Value* arg = block->getArgument(i); Value* result = op->getResult(i); - if (arg->getType() != result->getType()) - arg = builder->create(loc, arg, result->getType()); + if (arg->getType() != result->getType()) { + arg = + builder->create(loc, result->getType(), arg, + /*Truncate=*/builder->getBoolAttr(false)); + } result->replaceAllUsesWith(arg); } } @@ -301,26 +311,15 @@ void FunctionalControlFlowToCFG::runOnFunction() { // subsequent blocks. // // TODO: Use PatternRewriter to eliminate these function control flow ops. - auto has_variant_operand = [](Operation* op) { - auto is_variant = [](Type ty) { - return getElementTypeOrSelf(ty).getKind() == TensorFlowTypes::VARIANT; - }; - - if (llvm::none_of(op->getOperandTypes(), is_variant)) return false; - - op->emitOpError() << "does not yet support operands of type variant " - "for conversion to CFG"; - return true; - }; if (IfOp if_op = llvm::dyn_cast(op)) { - if (has_variant_operand(&op) || failed(LowerIfOp(if_op))) { + if (failed(LowerIfOp(if_op))) { return signalPassFailure(); } break; } if (WhileOp while_op = llvm::dyn_cast(op)) { - if (has_variant_operand(&op) || failed(LowerWhileOp(while_op))) { + if (failed(LowerWhileOp(while_op))) { return signalPassFailure(); } break; From b2cf8773f710cf7d154b346ecf9aefa338c1a9a7 Mon Sep 17 00:00:00 2001 From: Reed Wanderman-Milne Date: Fri, 6 Dec 2019 10:44:26 -0800 Subject: [PATCH 270/383] Raise better error if loss_scale=None is passed to LossScaleOptimizer. This is an easy mistake to make, as a dtype policy's loss scale can be None. 
I might later change this so that passing loss_scale=None means "do not do loss scaling" PiperOrigin-RevId: 284216044 Change-Id: I59fbf97025fbea01c1f79cc9692506bdcae5effe --- .../mixed_precision/experimental/loss_scale_optimizer.py | 2 ++ .../experimental/loss_scale_optimizer_test.py | 5 +++++ .../python/training/experimental/loss_scale_optimizer.py | 2 ++ .../training/experimental/loss_scale_optimizer_test.py | 5 +++++ 4 files changed, 14 insertions(+) diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py index 8d14ec1ceca..1dcf4a7f248 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py +++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer.py @@ -129,6 +129,8 @@ class LossScaleOptimizer(optimizer_v2.OptimizerV2): self._optimizer = optimizer self._loss_scale = keras_loss_scale_module.get(loss_scale) + if self._loss_scale is None: + raise ValueError('loss_scale cannot be None.') for weight in loss_scale_module.get_loss_scale_weights(self._loss_scale): # We cannot call `track_variable` in the LossScale class itself, because a # file outside of Keras cannot depend on a Keras file. Calling it here diff --git a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py index 05035c50dab..58107e7a3a5 100644 --- a/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py +++ b/tensorflow/python/keras/mixed_precision/experimental/loss_scale_optimizer_test.py @@ -310,6 +310,11 @@ class LossScaleOptimizerTest(test.TestCase, parameterized.TestCase): 'will be removed in the future.'): opt.add_slot(None, None) + def testPassingNoneToLossScale(self): + opt = gradient_descent.SGD() + with self.assertRaisesRegexp(ValueError, r'loss_scale cannot be None'): + loss_scale_optimizer.LossScaleOptimizer(opt, None) + @parameterized.named_parameters(*TESTCASES) @test_util.run_in_graph_and_eager_modes def testGettingAndSettingLearningRate(self, strategy_fn): diff --git a/tensorflow/python/training/experimental/loss_scale_optimizer.py b/tensorflow/python/training/experimental/loss_scale_optimizer.py index a7f772ce547..ae8d1e8c788 100644 --- a/tensorflow/python/training/experimental/loss_scale_optimizer.py +++ b/tensorflow/python/training/experimental/loss_scale_optimizer.py @@ -68,6 +68,8 @@ class MixedPrecisionLossScaleOptimizer(optimizer.Optimizer): super(MixedPrecisionLossScaleOptimizer, self).__init__(use_locking, name) self._loss_scale = loss_scale_module.get(loss_scale) + if self._loss_scale is None: + raise ValueError('loss_scale cannot be None') self._track_trackable(self._optimizer, 'base_optimizer') self._track_trackable(self._loss_scale, 'loss_scale') diff --git a/tensorflow/python/training/experimental/loss_scale_optimizer_test.py b/tensorflow/python/training/experimental/loss_scale_optimizer_test.py index c2259cd7ed2..ef82696c997 100644 --- a/tensorflow/python/training/experimental/loss_scale_optimizer_test.py +++ b/tensorflow/python/training/experimental/loss_scale_optimizer_test.py @@ -261,6 +261,11 @@ class MixedPrecisionLossScaleOptimizerTest(test.TestCase, self.assertEqual(self.evaluate(loss_scale()), 1.) 
self.assertEqual(self.evaluate(loss_scale._num_good_steps), 1) + def testPassingNoneToLossScale(self): + opt = gradient_descent.GradientDescentOptimizer(1.0) + with self.assertRaisesRegexp(ValueError, r'loss_scale cannot be None'): + loss_scale_optimizer.MixedPrecisionLossScaleOptimizer(opt, None) + if __name__ == '__main__': test.main() From fcef316223eada70a2f3f0bd5525e1e355226615 Mon Sep 17 00:00:00 2001 From: Peter Hawkins Date: Fri, 6 Dec 2019 10:46:03 -0800 Subject: [PATCH 271/383] [XLA] Implement a 64-bit approximation for ErfInv. Enable 64-bit special value tests. Disable SqrtPow equivalence test on GPU due to wrong output. PiperOrigin-RevId: 284216386 Change-Id: Ifeaa2b10f236eb37eea695f973bf93c5100a4dfa --- tensorflow/compiler/xla/client/lib/math.cc | 99 ++++++++++++++++++- .../compiler/xla/client/lib/math_test.cc | 40 +++++++- 2 files changed, 134 insertions(+), 5 deletions(-) diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 989968b5cbc..8c85482c8f8 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -319,6 +319,8 @@ XlaOp Erf(XlaOp x) { }); } +namespace { + // Approximation for the inverse error function from // Giles, M., "Approximating the erfinv function". // The approximation has the form: @@ -331,7 +333,7 @@ XlaOp Erf(XlaOp x) { // p = sum_{i=1}^n gq[i]*w^i // } // return p*x -XlaOp ErfInv(XlaOp x) { +XlaOp ErfInv32(XlaOp x) { constexpr int kDegree = 9; constexpr std::array w_less_than_5_constants = { 2.81022636e-08f, 3.43273939e-07f, -3.5233877e-06f, @@ -371,6 +373,101 @@ XlaOp ErfInv(XlaOp x) { }); } +XlaOp ErfInv64(XlaOp x) { + constexpr std::array w_less_than_6_25_constants = { + -3.6444120640178196996e-21, -1.685059138182016589e-19, + 1.2858480715256400167e-18, 1.115787767802518096e-17, + -1.333171662854620906e-16, 2.0972767875968561637e-17, + 6.6376381343583238325e-15, -4.0545662729752068639e-14, + -8.1519341976054721522e-14, 2.6335093153082322977e-12, + -1.2975133253453532498e-11, -5.4154120542946279317e-11, + 1.051212273321532285e-09, -4.1126339803469836976e-09, + -2.9070369957882005086e-08, 4.2347877827932403518e-07, + -1.3654692000834678645e-06, -1.3882523362786468719e-05, + 0.0001867342080340571352, -0.00074070253416626697512, + -0.0060336708714301490533, 0.24015818242558961693, + 1.6536545626831027356}; + constexpr std::array w_less_than_16_constants = { + 2.2137376921775787049e-09, 9.0756561938885390979e-08, + -2.7517406297064545428e-07, 1.8239629214389227755e-08, + 1.5027403968909827627e-06, -4.013867526981545969e-06, + 2.9234449089955446044e-06, 1.2475304481671778723e-05, + -4.7318229009055733981e-05, 6.8284851459573175448e-05, + 2.4031110387097893999e-05, -0.0003550375203628474796, + 0.00095328937973738049703, -0.0016882755560235047313, + 0.0024914420961078508066, -0.0037512085075692412107, + 0.005370914553590063617, 1.0052589676941592334, + 3.0838856104922207635, + }; + constexpr std::array w_greater_than_16_constants = { + -2.7109920616438573243e-11, -2.5556418169965252055e-10, + 1.5076572693500548083e-09, -3.7894654401267369937e-09, + 7.6157012080783393804e-09, -1.4960026627149240478e-08, + 2.9147953450901080826e-08, -6.7711997758452339498e-08, + 2.2900482228026654717e-07, -9.9298272942317002539e-07, + 4.5260625972231537039e-06, -1.9681778105531670567e-05, + 7.5995277030017761139e-05, -0.00021503011930044477347, + -0.00013871931833623122026, 1.0103004648645343977, + 4.8499064014085844221, + }; + // Compute logarithm of (1+arg) using log1p(arg) 
which is more precise than + // log(1+arg) when arg is close to zero. For more details, see + // https://en.cppreference.com/w/cpp/numeric/math/log1p + auto w = -Log1p(-x * x); + + auto lt_6_25 = Lt(w, ScalarLike(x, 6.25)); + auto lt_16 = Lt(w, ScalarLike(x, 16)); + auto coefficient = [&](int i) { + auto c = FullLike(x, w_less_than_6_25_constants[i]); + if (i < 19) { + c = Select(lt_6_25, c, FullLike(x, w_less_than_16_constants[i])); + } + if (i < 17) { + c = Select(lt_16, c, FullLike(x, w_greater_than_16_constants[i])); + } + return c; + }; + auto sqrt_w = Sqrt(w); + w = Select(lt_6_25, w - ScalarLike(x, 3.125), + sqrt_w - Select(lt_16, ScalarLike(x, 3.25), ScalarLike(x, 5.0))); + auto p = coefficient(0); + for (int i = 1; i < 17; ++i) { + p = coefficient(i) + p * w; + } + for (int i = 17; i < 19; ++i) { + p = Select(lt_16, coefficient(i) + p * w, p); + } + for (int i = 19; i < 23; ++i) { + p = Select(lt_6_25, coefficient(i) + p * w, p); + } + // Result modulo edge cases. + XlaOp result = p * x; + + // Handle edge cases, namely erfinv(+/-1) = +/-inf. (The above computation is + // indeterminate, and can give nan or -/+inf.) + auto& b = *x.builder(); + return b.ReportErrorOrReturn([&]() -> StatusOr { + TF_ASSIGN_OR_RETURN(Shape shape, b.GetShape(x)); + return Select(Eq(Abs(x), ScalarLike(x, 1)), + x * MaxValue(&b, shape.element_type()), result); + }); +} + +} // namespace + +XlaOp ErfInv(XlaOp x) { + auto& b = *x.builder(); + return b.ReportErrorOrReturn([&]() -> StatusOr { + TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("ErfInv", x)); + TF_ASSIGN_OR_RETURN(auto shape, b.GetShape(x)); + if (shape.element_type() == F64) { + return ErfInv64(x); + } + return DoWithUpcastToF32(x, {BF16, F16}, + [](XlaOp x) { return ErfInv32(x); }); + }); +} + namespace { // Coefficients for the Lanczos approximation of the gamma function. The // coefficients are uniquely determined by the choice of g and n (kLanczosGamma diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index 6415e9383b5..8d13922e0e3 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/client/lib/math.h" + #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" @@ -116,6 +117,10 @@ class MathTypedTest : public MathTest { // // For good measure, we also check pow with an exponent other than 0.5. void TestSqrtPowInequivalence() { + // TODO(b/145798892): test fails on GPU for double values. + if (std::is_same::value) { + return; + } SetFastMathDisabled(true); // Tests disable constant folding by default, but this test needs it @@ -151,11 +156,16 @@ class MathTypedTest : public MathTest { }; // TODO(b/123355973): Add bfloat16 to TestTypes once it's working. 
-#ifdef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16 -using TestTypes = ::testing::Types; -#else -using TestTypes = ::testing::Types; +using TestTypes = ::testing::Types; TYPED_TEST_CASE(MathTypedTest, TestTypes); @@ -224,6 +234,28 @@ XLA_TEST_F(MathTest, SqrtF32) { ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); } +#ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64 +XLA_TEST_F(MathTest, ErfInvF64) { + XlaBuilder builder(TestName()); + auto x = ConstantR1( + &builder, {-0.9, -0.8, -0.7, -0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0.0, 0.1, + 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9}); + ErfInv(x); + + std::vector expected = {-1.163087153676674, -0.9061938024368231, + -0.732869077959217, -0.5951160814499948, + -0.4769362762044698, -0.37080715859355795, + -0.27246271472675443, -0.1791434546212916, + -0.08885599049425767, 0., + 0.08885599049425777, 0.1791434546212916, + 0.27246271472675443, 0.37080715859355784, + 0.4769362762044698, 0.5951160814499948, + 0.732869077959217, 0.9061938024368231, + 1.1630871536766736}; + ComputeAndCompareR1(&builder, expected, {}, ErrorSpec{1e-15}); +} +#endif + XLA_TEST_F(MathTest, SquareTenValues) { XlaBuilder builder(TestName()); auto x = ConstantR1( From 32e823339b297af8fd778fdda7483121116f69b0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 10:46:43 -0800 Subject: [PATCH 272/383] Fix minor warnings. When compiling with -Wall/-Werror, several warnings related to signed/unsigned comparison and an incorrect format string kill the build. Additionally, when compiling under GCC 4.8.x, `max_align_t` is not a member of `std`. This change fixes these minor errors. PiperOrigin-RevId: 284216526 Change-Id: I05b6e76a626dc01da9400772ba0fbda2bb6b5b63 --- .../lite/experimental/micro/micro_allocator.cc | 17 +++++++++++++++-- .../experimental/micro/micro_interpreter.cc | 4 ++-- .../micro/micro_optional_debug_tools.cc | 9 ++++++++- .../lite/experimental/micro/test_helpers.cc | 2 +- 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/tensorflow/lite/experimental/micro/micro_allocator.cc b/tensorflow/lite/experimental/micro/micro_allocator.cc index 82b3b350c23..73c2bda1d20 100644 --- a/tensorflow/lite/experimental/micro/micro_allocator.cc +++ b/tensorflow/lite/experimental/micro/micro_allocator.cc @@ -42,6 +42,19 @@ struct TensorInfo { // requirement for SIMD extensions. constexpr int kBufferAlignment = 16; +// If building with GNU clib from GCC 4.8.x or lower, `max_align_t` is not a +// member of `std`. If using a newer version of clib, we import `max_align_t` +// into the local anonymous namespace to be able to use it like the global +// `max_align_t` from the older clib. +#if defined(__GNUC__) && defined(__GNUC_PREREQ) +#if __GNUC_PREREQ(4, 9) +using std::max_align_t; +#endif +#else +// We assume other compiler/clib configurations don't have this issue. +using std::max_align_t; +#endif + class MicroBuiltinDataAllocator : public BuiltinDataAllocator { public: explicit MicroBuiltinDataAllocator(SimpleMemoryAllocator* memory_allocator) @@ -51,7 +64,7 @@ class MicroBuiltinDataAllocator : public BuiltinDataAllocator { // Align to an address that is proper for all primitive types, but no more // than the size. 
return memory_allocator_->AllocateFromTail( - size, std::min(size, alignof(std::max_align_t))); + size, std::min(size, alignof(max_align_t))); } void Deallocate(void* data) override { // Do not deallocate, builtin data needs to be available for the life time @@ -412,7 +425,7 @@ TfLiteStatus MicroAllocator::InitializeRuntimeTensor( // If we've found a buffer, does it have any data? if (auto* array = buffer->data()) { // If it has any data, is the data size larger than zero? - if (size_t array_size = array->size()) { + if (array->size()) { // We've found a buffer with valid data, so update the runtime tensor // data structure to point to it. result->data.raw = diff --git a/tensorflow/lite/experimental/micro/micro_interpreter.cc b/tensorflow/lite/experimental/micro/micro_interpreter.cc index ba46cbfd95a..7185d643514 100644 --- a/tensorflow/lite/experimental/micro/micro_interpreter.cc +++ b/tensorflow/lite/experimental/micro/micro_interpreter.cc @@ -21,7 +21,7 @@ limitations under the License. namespace tflite { namespace { -const int kStackDataAllocatorSize = 128; +const size_t kStackDataAllocatorSize = 128; class StackDataAllocator : public BuiltinDataAllocator { public: void* Allocate(size_t size) override { @@ -91,7 +91,7 @@ MicroInterpreter::MicroInterpreter(const Model* model, // NOTE: This requires that the flatbuffer is held in memory which can be // modified by this process. if (!FLATBUFFERS_LITTLEENDIAN) { - for (int t = 0; t < tensors_size(); ++t) { + for (size_t t = 0; t < tensors_size(); ++t) { TfLiteTensor* thisTensor = &context_.tensors[t]; if (thisTensor->allocation_type == kTfLiteMmapRo) CorrectTensorEndianness(thisTensor); diff --git a/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc b/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc index e27317a5443..1f6ce531f05 100644 --- a/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc +++ b/tensorflow/lite/experimental/micro/micro_optional_debug_tools.cc @@ -14,6 +14,13 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/lite/experimental/micro/micro_optional_debug_tools.h" +// `cinttypes` requires `__STDC_FORMAT_MACROS` to be defined to expose `PRId32`. +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include + #include "tensorflow/lite/schema/schema_generated.h" namespace tflite { namespace { @@ -122,7 +129,7 @@ void PrintInterpreterState(MicroInterpreter* interpreter) { printf("Node %3zu Operator Custom Name %s\n", node_index, reg->custom_name); } else { - printf("Node %3zu Operator Builtin Code %3d %s\n", node_index, + printf("Node %3zu Operator Builtin Code %3" PRId32 " %s\n", node_index, reg->builtin_code, EnumNamesBuiltinOperator()[reg->builtin_code]); } printf(" Inputs:"); diff --git a/tensorflow/lite/experimental/micro/test_helpers.cc b/tensorflow/lite/experimental/micro/test_helpers.cc index 03e1d91fce0..a1b9801ffc9 100644 --- a/tensorflow/lite/experimental/micro/test_helpers.cc +++ b/tensorflow/lite/experimental/micro/test_helpers.cc @@ -47,7 +47,7 @@ class StackAllocator : public flatbuffers::Allocator { return *inst; } - static constexpr int kStackAllocatorSize = 4096; + static constexpr size_t kStackAllocatorSize = 4096; private: uint8_t data_backing_[kStackAllocatorSize]; From 722fc02b3ffa6b272469fe4719deeb60d45fe08c Mon Sep 17 00:00:00 2001 From: "A. 
Unique TensorFlower" Date: Fri, 6 Dec 2019 10:52:23 -0800 Subject: [PATCH 273/383] Minor change to depthwise convolution with dilation. PiperOrigin-RevId: 284217866 Change-Id: Icd5419e62cef5870745182f8120b937ed791212f --- tensorflow/python/BUILD | 12 ++++++ tensorflow/python/ops/nn_impl.py | 23 +--------- tensorflow/python/ops/nn_ops.py | 44 +++----------------- tensorflow/python/platform/device_context.py | 22 ++++++++++ 4 files changed, 42 insertions(+), 59 deletions(-) create mode 100644 tensorflow/python/platform/device_context.py diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD index 2ba923f8b29..a25c3162752 100644 --- a/tensorflow/python/BUILD +++ b/tensorflow/python/BUILD @@ -243,6 +243,7 @@ py_library( "**/*test.py", "**/benchmark.py", # In platform_benchmark. "**/analytics.py", # In platform_analytics. + "**/device_context.py", # In platform_device_context. ], ) + ["platform/build_info.py"], srcs_version = "PY2AND3", @@ -275,6 +276,16 @@ py_library( srcs_version = "PY2AND3", ) +py_library( + name = "platform_device_context", + srcs = ["platform/device_context.py"], + srcs_version = "PY2AND3", + deps = [ + ":control_flow_ops", + ":framework", + ], +) + py_library( name = "platform_test", srcs = ["platform/googletest.py"], @@ -3805,6 +3816,7 @@ py_library( ":nn_grad", ":nn_ops", ":nn_ops_gen", + ":platform_device_context", ":rnn", ":sparse_ops", ":util", diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py index 73d761debc7..2b091464154 100644 --- a/tensorflow/python/ops/nn_impl.py +++ b/tensorflow/python/ops/nn_impl.py @@ -38,6 +38,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops from tensorflow.python.ops import variables from tensorflow.python.ops.losses import util as losses_util +from tensorflow.python.platform import device_context from tensorflow.python.util.deprecation import deprecated_args from tensorflow.python.util.deprecation import deprecated_argument_lookup from tensorflow.python.util.tf_export import tf_export @@ -707,22 +708,6 @@ def zero_fraction(value, name=None): return array_ops.identity(zero_fraction_float32, "fraction") -# copybara:strip_begin -# TODO(b/138808492): Remove code inside copybara -# to make TPU code and CPU code consistent. -def _enclosing_tpu_context(): - # pylint: disable=protected-access - context = ops.get_default_graph()._get_control_flow_context() - # pylint: enable=protected-access - while context is not None and not isinstance( - context, control_flow_ops.XLAControlFlowContext): - context = context.outer_context - return context - - -# copybara:strip_end - - # pylint: disable=redefined-builtin @tf_export(v1=["nn.depthwise_conv2d"]) def depthwise_conv2d(input, @@ -782,11 +767,8 @@ def depthwise_conv2d(input, if rate is None: rate = [1, 1] - # copybara:strip_begin - # TODO(b/138808492): Remove code inside copybara - # to make TPU code and CPU code consistent. # Use depthwise_conv2d_native if executing on TPU. 
- if _enclosing_tpu_context() is not None: + if device_context.enclosing_tpu_context() is not None: if data_format == "NCHW": dilations = [1, 1, rate[0], rate[1]] else: @@ -799,7 +781,6 @@ def depthwise_conv2d(input, data_format=data_format, dilations=dilations, name=name) - # copybara:strip_end def op(input_converted, _, padding): return nn_ops.depthwise_conv2d_native( diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py index 5ed5bf87408..0058e9629ef 100644 --- a/tensorflow/python/ops/nn_ops.py +++ b/tensorflow/python/ops/nn_ops.py @@ -36,10 +36,6 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops -# copybara:strip_begin -# TODO(b/138808492): Remove code inside copybara -from tensorflow.python.ops import control_flow_ops -# copybara:strip_end from tensorflow.python.ops import gen_math_ops from tensorflow.python.ops import gen_nn_ops from tensorflow.python.ops import math_ops @@ -48,6 +44,7 @@ from tensorflow.python.ops import random_ops # pylint: disable=wildcard-import from tensorflow.python.ops.gen_nn_ops import * # pylint: enable=wildcard-import +from tensorflow.python.platform import device_context from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import deprecation from tensorflow.python.util.compat import collections_abc @@ -927,22 +924,6 @@ convolution_v2.__doc__ = deprecation.rewrite_argument_docstring( "filter", "filters") -# copybara:strip_begin -# TODO(b/138808492): Remove code inside copybara -# to make TPU code and CPU code consistent. -def _enclosing_tpu_context(): - # pylint: disable=protected-access - run_context = ops.get_default_graph()._get_control_flow_context() - # pylint: enable=protected-access - while run_context is not None and not isinstance( - run_context, control_flow_ops.XLAControlFlowContext): - run_context = run_context.outer_context - return run_context - - -# copybara:strip_end - - def convolution_internal( input, # pylint: disable=redefined-builtin filters, @@ -980,28 +961,20 @@ def convolution_internal( strides = _get_sequence(strides, n, channel_index, "strides") dilations = _get_sequence(dilations, n, channel_index, "dilations") - # copybara:strip_begin - # TODO(b/138808492): Remove code inside copybara - # to make TPU code and CPU code consistent. scopes = {1: "conv1d", 2: "Conv2D", 3: "Conv3D"} - if not call_from_convolution and _enclosing_tpu_context() is not None: + if not call_from_convolution and device_context.enclosing_tpu_context( + ) is not None: scope = scopes[n] else: scope = "convolution" - # copybara:strip_end - # copybara:insert scope = "convolution" with ops.name_scope(name, scope, [input, filters]) as name: conv_ops = {1: conv1d, 2: gen_nn_ops.conv2d, 3: gen_nn_ops.conv3d} - # copybara:strip_begin - # TODO(b/138808492): Remove code inside copybara - # to make TPU code and CPU code consistent. 
- if _enclosing_tpu_context() is not None or all(i == 1 for i in dilations): + if device_context.enclosing_tpu_context() is not None or all( + i == 1 for i in dilations): # fast path for TPU or if no dilation as gradient only supported on GPU # for dilations - # copybara:strip_end - # copybara:insert if all(i == 1 for i in dilations): op = conv_ops[n] return op( input, @@ -1120,11 +1093,8 @@ class Convolution(object): name=self.name) def __call__(self, inp, filter): # pylint: disable=redefined-builtin - # copybara:strip_begin - # TODO(b/138808492): Remove code inside copybara - # to make TPU code and CPU code consistent. # TPU convolution supports dilations greater than 1. - if _enclosing_tpu_context() is not None: + if device_context.enclosing_tpu_context() is not None: return convolution_internal( inp, filter, @@ -1136,8 +1106,6 @@ class Convolution(object): call_from_convolution=False) else: return self.conv_op(inp, filter) - # copybara:strip_end - # copybara:insert return self.conv_op(inp, filter) @tf_export(v1=["nn.pool"]) diff --git a/tensorflow/python/platform/device_context.py b/tensorflow/python/platform/device_context.py new file mode 100644 index 00000000000..7be2fdb31ee --- /dev/null +++ b/tensorflow/python/platform/device_context.py @@ -0,0 +1,22 @@ +# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Helpers to get device context.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + + +def enclosing_tpu_context(): + pass From 96e6fef91a67c862eb1b1b80b917572502be1b19 Mon Sep 17 00:00:00 2001 From: Jacques Pienaar Date: Fri, 6 Dec 2019 10:52:38 -0800 Subject: [PATCH 274/383] Generate builder for ops that use InferTypeOpInterface trait in ODS For ops with infer type op interface defined, generate version that calls the inferal method on build. This is intermediate step to removing special casing of SameOperandsAndResultType & FirstAttrDereivedResultType. After that would be generating the inference code, with the initial focus on shaped container types. In between I plan to refactor these a bit to reuse generated paths. The intention would not be to add the type inference trait in multiple places, but rather to take advantage of the current modelling in ODS where possible to emit it instead. Switch the `inferReturnTypes` method to be static. Skipping ops with regions here as I don't like the Region vs unique_ptr difference at the moment, and I want the infer return type trait to be useful for verification too. So instead, just skip it for now to avoid churn. 
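For reference, the builder emitted by the new code path has roughly the shape below. This is a hand-written approximation of the generated C++, not literal tblgen output, and "MyOp" stands in for whatever op class is being generated:

  // Approximate shape of the generated collective-parameter builder for an
  // op that implements InferTypeOpInterface and has no regions.
  static void build(Builder *builder, OperationState &state,
                    ArrayRef<Value *> operands,
                    ArrayRef<NamedAttribute> attributes) {
    state.addOperands(operands);
    state.addAttributes(attributes);
    // No result types are passed in; they are computed from the operands and
    // attributes through the (now static) interface method.
    state.addTypes(MyOp::inferReturnTypes(state.location, operands, attributes,
                                          /*regions=*/{}));
  }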
PiperOrigin-RevId: 284217913 Change-Id: I1f647a05ddbf57c0b19b826c0eecd150d085cb68 --- .../mlir/Analysis/InferTypeOpInterface.td | 2 +- .../test/lib/TestDialect/TestPatterns.cpp | 19 ++++++-- .../tools/mlir-tblgen/OpDefinitionsGen.cpp | 48 +++++++++++++++---- 3 files changed, 54 insertions(+), 15 deletions(-) diff --git a/third_party/mlir/include/mlir/Analysis/InferTypeOpInterface.td b/third_party/mlir/include/mlir/Analysis/InferTypeOpInterface.td index 56c7319181e..7ad3a3a54fa 100644 --- a/third_party/mlir/include/mlir/Analysis/InferTypeOpInterface.td +++ b/third_party/mlir/include/mlir/Analysis/InferTypeOpInterface.td @@ -37,7 +37,7 @@ def InferTypeOpInterface : OpInterface<"InferTypeOpInterface"> { }]; let methods = [ - InterfaceMethod< + StaticInterfaceMethod< /*desc=*/[{Returns the return types that an op would generate. The method takes an optional location which, if set, will be used to diff --git a/third_party/mlir/test/lib/TestDialect/TestPatterns.cpp b/third_party/mlir/test/lib/TestDialect/TestPatterns.cpp index 92f132613b1..7b835c5e61d 100644 --- a/third_party/mlir/test/lib/TestDialect/TestPatterns.cpp +++ b/third_party/mlir/test/lib/TestDialect/TestPatterns.cpp @@ -73,10 +73,7 @@ struct ReturnTypeOpMatch : public RewritePattern { PatternMatchResult matchAndRewrite(Operation *op, PatternRewriter &rewriter) const final { if (auto retTypeFn = dyn_cast(op)) { - SmallVector values; - values.reserve(op->getNumOperands()); - for (auto &operand : op->getOpOperands()) - values.push_back(operand.get()); + SmallVector values(op->getOperands()); auto res = retTypeFn.inferReturnTypes(op->getLoc(), values, op->getAttrs(), op->getRegions()); SmallVector result_types(op->getResultTypes()); @@ -84,6 +81,20 @@ struct ReturnTypeOpMatch : public RewritePattern { return op->emitOpError( "inferred type incompatible with return type of operation"), matchFailure(); + + // TODO(jpienaar): Split this out to make the test more focused. + // Create new op with unknown location to verify building with + // InferTypeOpInterface is triggered. + auto fop = op->getParentOfType(); + if (values[0] == fop.getArgument(0)) { + // Use the 2nd function argument if the first function argument is used + // when constructing the new op so that a new return type is inferred. + values[0] = fop.getArgument(1); + values[1] = fop.getArgument(1); + // TODO(jpienaar): Expand to regions. + rewriter.create( + UnknownLoc::get(op->getContext()), values, op->getAttrs()); + } } return matchFailure(); } diff --git a/third_party/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/third_party/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp index b5fd0862b45..004b93d5941 100644 --- a/third_party/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp +++ b/third_party/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp @@ -541,6 +541,11 @@ private: // operand's type as all results' types. void genUseOperandAsResultTypeCollectiveParamBuilder(); + // Generates the build() method that takes aggregate operands/attributes + // parameters. This build() method uses inferred types as result types. + // Requires: The type needs to be inferable via InferTypeOpInterface. + void genInferedTypeCollectiveParamBuilder(); + // Generates the build() method that takes each operand/attribute as a // stand-alone parameter. The generated build() method uses first attribute's // type as all result's types. 
@@ -968,11 +973,6 @@ void OpEmitter::genUseOperandAsResultTypeCollectiveParamBuilder() { auto &m = opClass.newMethod("void", "build", params, OpMethod::MP_Static); auto &body = m.body(); - // Result types - SmallVector resultTypes(numResults, "operands[0]->getType()"); - body << " " << builderOpState << ".addTypes({" - << llvm::join(resultTypes, ", ") << "});\n\n"; - // Operands body << " " << builderOpState << ".addOperands(operands);\n\n"; @@ -984,6 +984,27 @@ void OpEmitter::genUseOperandAsResultTypeCollectiveParamBuilder() { for (int i = 0; i < numRegions; ++i) m.body() << " (void)" << builderOpState << ".addRegion();\n"; } + + // Result types + SmallVector resultTypes(numResults, "operands[0]->getType()"); + body << " " << builderOpState << ".addTypes({" + << llvm::join(resultTypes, ", ") << "});\n\n"; +} + +void OpEmitter::genInferedTypeCollectiveParamBuilder() { + // TODO(jpienaar): Expand to support regions. + std::string params = + (Twine("Builder *, OperationState &") + builderOpState + + ", ArrayRef operands, ArrayRef attributes") + .str(); + auto &m = opClass.newMethod("void", "build", params, OpMethod::MP_Static); + auto &body = m.body(); + + body << " " << builderOpState << ".addOperands(operands);\n\n"; + body << " " << builderOpState << ".addAttributes(attributes);\n"; + body << " " << builderOpState << ".addTypes(" << opClass.getClassName() + << "::inferReturnTypes(" << builderOpState + << ".location, operands, attributes, /*regions=*/{}));\n"; } void OpEmitter::genUseOperandAsResultTypeSeparateParamBuilder() { @@ -1026,15 +1047,17 @@ void OpEmitter::genUseAttrAsResultTypeBuilder() { } else { resultType = "attr.second.getType()"; } - SmallVector resultTypes(op.getNumResults(), resultType); - body << " " << builderOpState << ".addTypes({" - << llvm::join(resultTypes, ", ") << "});\n"; - body << " }\n"; // Operands body << " " << builderOpState << ".addOperands(operands);\n\n"; // Attributes body << " " << builderOpState << ".addAttributes(attributes);\n"; + + // Result types + SmallVector resultTypes(op.getNumResults(), resultType); + body << " " << builderOpState << ".addTypes({" + << llvm::join(resultTypes, ", ") << "});\n"; + body << " }\n"; } void OpEmitter::genBuilder() { @@ -1082,7 +1105,7 @@ void OpEmitter::genBuilder() { genCollectiveParamBuilder(); // 4. one having a stand-alone parameter for each operand and attribute, // use the first operand or attribute's type as all result types - // to facilitate different call patterns. + // to facilitate different call patterns. if (op.getNumVariadicResults() == 0) { if (op.getTrait("OpTrait::SameOperandsAndResultType")) { genUseOperandAsResultTypeSeparateParamBuilder(); @@ -1091,6 +1114,11 @@ void OpEmitter::genBuilder() { if (op.getTrait("OpTrait::FirstAttrDerivedResultType")) genUseAttrAsResultTypeBuilder(); } + // TODO(jpienaar): Subsume this with general checking if type can be infered + // automatically. + // TODO(jpienaar): Expand to handle regions. + if (op.getTrait("InferTypeOpInterface::Trait") && op.getNumRegions() == 0) + genInferedTypeCollectiveParamBuilder(); } void OpEmitter::genCollectiveParamBuilder() { From 633e6e13ec5bf63a17fcc9f9d79124d63e8bf239 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 10:54:32 -0800 Subject: [PATCH 275/383] Remove tensorflow/core/framework/ protos from tensorflow/core:protos_all_go_proto. Users should now depend on the tensorflow/core/framework:*_go_proto targets directly. 
PiperOrigin-RevId: 284218292 Change-Id: I8ebdc3eebd33a0337d59fab1caea03795afe7e00 --- tensorflow/go/signature_test.go | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/tensorflow/go/signature_test.go b/tensorflow/go/signature_test.go index a13bbb15aab..7988347ed17 100644 --- a/tensorflow/go/signature_test.go +++ b/tensorflow/go/signature_test.go @@ -20,6 +20,8 @@ import ( "fmt" "testing" + tspb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework/tensor_shape_go_proto" + typb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core/framework/types_go_proto" tfpb "github.com/tensorflow/tensorflow/tensorflow/go/genop/internal/proto/github.com/tensorflow/tensorflow/tensorflow/go/core" ) @@ -30,9 +32,9 @@ func TestSignatureFromProto(t *testing.T) { Encoding: &tfpb.TensorInfo_Name{ Name: "tensor_1", }, - Dtype: tfpb.DataType_DT_INT8, - TensorShape: &tfpb.TensorShapeProto{ - Dim: []*tfpb.TensorShapeProto_Dim{ + Dtype: typb.DataType_DT_INT8, + TensorShape: &tspb.TensorShapeProto{ + Dim: []*tspb.TensorShapeProto_Dim{ {Size: 1}, {Size: 2}, {Size: 3}, @@ -43,9 +45,9 @@ func TestSignatureFromProto(t *testing.T) { Encoding: &tfpb.TensorInfo_Name{ Name: "tensor_2", }, - Dtype: tfpb.DataType_DT_FLOAT, - TensorShape: &tfpb.TensorShapeProto{ - Dim: []*tfpb.TensorShapeProto_Dim{ + Dtype: typb.DataType_DT_FLOAT, + TensorShape: &tspb.TensorShapeProto{ + Dim: []*tspb.TensorShapeProto_Dim{ {Size: 4}, {Size: 5}, {Size: 6}, @@ -58,9 +60,9 @@ func TestSignatureFromProto(t *testing.T) { Encoding: &tfpb.TensorInfo_Name{ Name: "tensor_3", }, - Dtype: tfpb.DataType_DT_STRING, - TensorShape: &tfpb.TensorShapeProto{ - Dim: []*tfpb.TensorShapeProto_Dim{ + Dtype: typb.DataType_DT_STRING, + TensorShape: &tspb.TensorShapeProto{ + Dim: []*tspb.TensorShapeProto_Dim{ {Size: 1}, {Size: 2}, {Size: 3}, @@ -71,9 +73,9 @@ func TestSignatureFromProto(t *testing.T) { Encoding: &tfpb.TensorInfo_Name{ Name: "tensor_4", }, - Dtype: tfpb.DataType_DT_BOOL, - TensorShape: &tfpb.TensorShapeProto{ - Dim: []*tfpb.TensorShapeProto_Dim{ + Dtype: typb.DataType_DT_BOOL, + TensorShape: &tspb.TensorShapeProto{ + Dim: []*tspb.TensorShapeProto_Dim{ {Size: 4}, {Size: 5}, {Size: 6}, @@ -142,9 +144,9 @@ func TestTensorInfoFromProto(t *testing.T) { Encoding: &tfpb.TensorInfo_Name{ Name: "tensor", }, - Dtype: tfpb.DataType_DT_INT8, - TensorShape: &tfpb.TensorShapeProto{ - Dim: []*tfpb.TensorShapeProto_Dim{ + Dtype: typb.DataType_DT_INT8, + TensorShape: &tspb.TensorShapeProto{ + Dim: []*tspb.TensorShapeProto_Dim{ {Size: 1}, {Size: 2}, {Size: 3}, From b79a7349758031ef0bb17dd70d4c62c8e71ea03a Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Fri, 6 Dec 2019 11:01:54 -0800 Subject: [PATCH 276/383] [VectorOps] Add lowering of vector.broadcast to LLVM IR For example, a scalar broadcast %0 = vector.broadcast %x : f32 to vector<2xf32> return %0 : vector<2xf32> which expands scalar x into vector [x,x] by lowering to the following LLVM IR dialect to implement the duplication over the leading dimension. 
%0 = llvm.mlir.undef : !llvm<"<2 x float>"> %1 = llvm.mlir.constant(0 : index) : !llvm.i64 %2 = llvm.insertelement %x, %0[%1 : !llvm.i64] : !llvm<"<2 x float>"> %3 = llvm.shufflevector %2, %0 [0 : i32, 0 : i32] : !llvm<"<2 x float>">, !llvm<"<2 x float>"> return %3 : vector<2xf32> In the trailing dimensions, the operand is simply "passed through", unless a more elaborate "stretch" is required. For example %0 = vector.broadcast %arg0 : vector<1xf32> to vector<4xf32> return %0 : vector<4xf32> becomes %0 = llvm.mlir.undef : !llvm<"<4 x float>"> %1 = llvm.mlir.constant(0 : index) : !llvm.i64 %2 = llvm.extractelement %arg0[%1 : !llvm.i64] : !llvm<"<1 x float>"> %3 = llvm.mlir.constant(0 : index) : !llvm.i64 %4 = llvm.insertelement %2, %0[%3 : !llvm.i64] : !llvm<"<4 x float>"> %5 = llvm.shufflevector %4, %0 [0 : i32, 0 : i32, 0 : i32, 0 : i32] : !llvm<"<4 x float>">, !llvm<"<4 x float>"> llvm.return %5 : !llvm<"<4 x float>"> PiperOrigin-RevId: 284219926 Change-Id: I203a3ef6aa64ddb37cf4487648429226f4ed7642 --- .../VectorToLLVM/ConvertVectorToLLVM.cpp | 187 +++++++++++++++++- .../mlir/lib/Dialect/VectorOps/VectorOps.cpp | 12 +- 2 files changed, 192 insertions(+), 7 deletions(-) diff --git a/third_party/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/third_party/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 7221998ce25..c40c7c5242a 100644 --- a/third_party/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/third_party/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -49,6 +49,191 @@ static LLVM::LLVMType getPtrToElementType(T containerType, .getPointerTo(); } +class VectorBroadcastOpConversion : public LLVMOpLowering { +public: + explicit VectorBroadcastOpConversion(MLIRContext *context, + LLVMTypeConverter &typeConverter) + : LLVMOpLowering(vector::BroadcastOp::getOperationName(), context, + typeConverter) {} + + PatternMatchResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + auto broadcastOp = cast(op); + VectorType dstVectorType = broadcastOp.getVectorType(); + if (lowering.convertType(dstVectorType) == nullptr) + return matchFailure(); + // Rewrite when the full vector type can be lowered (which + // implies all 'reduced' types can be lowered too). + VectorType srcVectorType = + broadcastOp.getSourceType().dyn_cast(); + rewriter.replaceOp( + op, expandRanks(operands[0], // source value to be expanded + op->getLoc(), // location of original broadcast + srcVectorType, dstVectorType, rewriter)); + return matchSuccess(); + } + +private: + // Expands the given source value over all the ranks, as defined + // by the source and destination type (a null source type denotes + // expansion from a scalar value into a vector). + // + // TODO(ajcbik): consider replacing this one-pattern lowering + // with a two-pattern lowering using other vector + // ops once all insert/extract/shuffle operations + // are available with lowering implemention. + // + Value *expandRanks(Value *value, Location loc, VectorType srcVectorType, + VectorType dstVectorType, + ConversionPatternRewriter &rewriter) const { + assert((dstVectorType != nullptr) && "invalid result type in broadcast"); + // Determine rank of source and destination. + int64_t srcRank = srcVectorType ? srcVectorType.getRank() : 0; + int64_t dstRank = dstVectorType.getRank(); + int64_t curDim = dstVectorType.getDimSize(0); + if (srcRank < dstRank) + // Duplicate this rank. 
+ return duplicateOneRank(value, loc, srcVectorType, dstVectorType, dstRank, + curDim, rewriter); + // If all trailing dimensions are the same, the broadcast consists of + // simply passing through the source value and we are done. Otherwise, + // any non-matching dimension forces a stretch along this rank. + assert((srcVectorType != nullptr) && (srcRank > 0) && + (srcRank == dstRank) && "invalid rank in broadcast"); + for (int64_t r = 0; r < dstRank; r++) { + if (srcVectorType.getDimSize(r) != dstVectorType.getDimSize(r)) { + return stretchOneRank(value, loc, srcVectorType, dstVectorType, dstRank, + curDim, rewriter); + } + } + return value; + } + + // Picks the best way to duplicate a single rank. For the 1-D case, a + // single insert-elt/shuffle is the most efficient expansion. For higher + // dimensions, however, we need dim x insert-values on a new broadcast + // with one less leading dimension, which will be lowered "recursively" + // to matching LLVM IR. + // For example: + // v = broadcast s : f32 to vector<4x2xf32> + // becomes: + // x = broadcast s : f32 to vector<2xf32> + // v = [x,x,x,x] + // becomes: + // x = [s,s] + // v = [x,x,x,x] + Value *duplicateOneRank(Value *value, Location loc, VectorType srcVectorType, + VectorType dstVectorType, int64_t rank, int64_t dim, + ConversionPatternRewriter &rewriter) const { + Type llvmType = lowering.convertType(dstVectorType); + assert((llvmType != nullptr) && "unlowerable vector type"); + if (rank == 1) { + Value *undef = rewriter.create(loc, llvmType); + Value *expand = insertOne(undef, value, loc, llvmType, rank, 0, rewriter); + SmallVector zeroValues(dim, 0); + return rewriter.create( + loc, expand, undef, rewriter.getI32ArrayAttr(zeroValues)); + } + Value *expand = expandRanks(value, loc, srcVectorType, + reducedVectorType(dstVectorType), rewriter); + Value *result = rewriter.create(loc, llvmType); + for (int64_t d = 0; d < dim; ++d) { + result = insertOne(result, expand, loc, llvmType, rank, d, rewriter); + } + return result; + } + + // Picks the best way to stretch a single rank. For the 1-D case, a + // single insert-elt/shuffle is the most efficient expansion when at + // a stretch. Otherwise, every dimension needs to be expanded + // individually and individually inserted in the resulting vector. + // For example: + // v = broadcast w : vector<4x1x2xf32> to vector<4x2x2xf32> + // becomes: + // a = broadcast w[0] : vector<1x2xf32> to vector<2x2xf32> + // b = broadcast w[1] : vector<1x2xf32> to vector<2x2xf32> + // c = broadcast w[2] : vector<1x2xf32> to vector<2x2xf32> + // d = broadcast w[3] : vector<1x2xf32> to vector<2x2xf32> + // v = [a,b,c,d] + // becomes: + // x = broadcast w[0][0] : vector<2xf32> to vector <2x2xf32> + // y = broadcast w[1][0] : vector<2xf32> to vector <2x2xf32> + // a = [x, y] + // etc. 
+ Value *stretchOneRank(Value *value, Location loc, VectorType srcVectorType, + VectorType dstVectorType, int64_t rank, int64_t dim, + ConversionPatternRewriter &rewriter) const { + Type llvmType = lowering.convertType(dstVectorType); + assert((llvmType != nullptr) && "unlowerable vector type"); + Value *result = rewriter.create(loc, llvmType); + bool atStretch = dim != srcVectorType.getDimSize(0); + if (rank == 1) { + Type redLlvmType = lowering.convertType(dstVectorType.getElementType()); + if (atStretch) { + Value *one = extractOne(value, loc, redLlvmType, rank, 0, rewriter); + Value *expand = + insertOne(result, one, loc, llvmType, rank, 0, rewriter); + SmallVector zeroValues(dim, 0); + return rewriter.create( + loc, expand, result, rewriter.getI32ArrayAttr(zeroValues)); + } + for (int64_t d = 0; d < dim; ++d) { + Value *one = extractOne(value, loc, redLlvmType, rank, d, rewriter); + result = insertOne(result, one, loc, llvmType, rank, d, rewriter); + } + } else { + VectorType redSrcType = reducedVectorType(srcVectorType); + VectorType redDstType = reducedVectorType(dstVectorType); + Type redLlvmType = lowering.convertType(redSrcType); + for (int64_t d = 0; d < dim; ++d) { + int64_t pos = atStretch ? 0 : d; + Value *one = extractOne(value, loc, redLlvmType, rank, pos, rewriter); + Value *expand = expandRanks(one, loc, redSrcType, redDstType, rewriter); + result = insertOne(result, expand, loc, llvmType, rank, d, rewriter); + } + } + return result; + } + + // Picks the proper sequence for inserting. + Value *insertOne(Value *val1, Value *val2, Location loc, Type llvmType, + int64_t rank, int64_t pos, + ConversionPatternRewriter &rewriter) const { + if (rank == 1) { + auto idxType = rewriter.getIndexType(); + auto constant = rewriter.create( + loc, lowering.convertType(idxType), + rewriter.getIntegerAttr(idxType, pos)); + return rewriter.create(loc, llvmType, val1, val2, + constant); + } + return rewriter.create(loc, llvmType, val1, val2, + rewriter.getI64ArrayAttr(pos)); + } + + // Picks the proper sequence for extracting. + Value *extractOne(Value *value, Location loc, Type llvmType, int64_t rank, + int64_t pos, ConversionPatternRewriter &rewriter) const { + if (rank == 1) { + auto idxType = rewriter.getIndexType(); + auto constant = rewriter.create( + loc, lowering.convertType(idxType), + rewriter.getIntegerAttr(idxType, pos)); + return rewriter.create(loc, llvmType, value, + constant); + } + return rewriter.create(loc, llvmType, value, + rewriter.getI64ArrayAttr(pos)); + } + + // Helper to reduce vector type by one rank. + static VectorType reducedVectorType(VectorType tp) { + assert((tp.getRank() > 1) && "unlowerable vector type"); + return VectorType::get(tp.getShape().drop_front(), tp.getElementType()); + } +}; + class VectorExtractElementOpConversion : public LLVMOpLowering { public: explicit VectorExtractElementOpConversion(MLIRContext *context, @@ -246,7 +431,7 @@ public: /// Populate the given list with patterns that convert from Vector to LLVM. 
void mlir::populateVectorToLLVMConversionPatterns( LLVMTypeConverter &converter, OwningRewritePatternList &patterns) { - patterns.insert( converter.getDialect()->getContext(), converter); } diff --git a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp index 5d596f388ed..65441674165 100644 --- a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp +++ b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp @@ -416,16 +416,16 @@ static LogicalResult verify(BroadcastOp op) { // Scalar to vector broadcast is always valid. A vector // to vector broadcast needs some additional checking. if (srcVectorType) { - const int64_t srcRank = srcVectorType.getRank(); - const int64_t dstRank = dstVectorType.getRank(); + int64_t srcRank = srcVectorType.getRank(); + int64_t dstRank = dstVectorType.getRank(); if (srcRank > dstRank) return op.emitOpError("source rank higher than destination rank"); // Source has an exact match or singleton value for all trailing dimensions // (all leading dimensions are simply duplicated). - const int64_t lead = dstRank - srcRank; - for (int64_t i = 0; i < srcRank; i++) { - const int64_t srcDim = srcVectorType.getDimSize(i); - const int64_t dstDim = dstVectorType.getDimSize(lead + i); + int64_t lead = dstRank - srcRank; + for (int64_t r = 0; r < srcRank; ++r) { + int64_t srcDim = srcVectorType.getDimSize(r); + int64_t dstDim = dstVectorType.getDimSize(lead + r); if (srcDim != 1 && srcDim != dstDim) return op.emitOpError("dimension mismatch (") << srcDim << " vs. " << dstDim << ")"; From 829ec36ea1d928f722850e46d4035463103e2cd5 Mon Sep 17 00:00:00 2001 From: Jose Baiocchi Date: Fri, 6 Dec 2019 11:03:21 -0800 Subject: [PATCH 277/383] Move #includes to correct headers PiperOrigin-RevId: 284220426 Change-Id: I50e10c67843d32c2a0faa77317a6a7ac4d8e6911 --- tensorflow/core/profiler/internal/profiler_factory.h | 3 +++ tensorflow/core/profiler/internal/profiler_interface.h | 3 --- tensorflow/core/profiler/lib/profiler_session.h | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/profiler/internal/profiler_factory.h b/tensorflow/core/profiler/internal/profiler_factory.h index c2d0aa70671..4473e21699e 100644 --- a/tensorflow/core/profiler/internal/profiler_factory.h +++ b/tensorflow/core/profiler/internal/profiler_factory.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_FACTORY_H_ #define TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_FACTORY_H_ +#include +#include + #include "tensorflow/core/profiler/internal/profiler_interface.h" namespace tensorflow { diff --git a/tensorflow/core/profiler/internal/profiler_interface.h b/tensorflow/core/profiler/internal/profiler_interface.h index eeb6d82b75b..dc8060082f6 100644 --- a/tensorflow/core/profiler/internal/profiler_interface.h +++ b/tensorflow/core/profiler/internal/profiler_interface.h @@ -15,9 +15,6 @@ limitations under the License. 
#ifndef TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_ #define TENSORFLOW_CORE_PROFILER_INTERNAL_PROFILER_INTERFACE_H_ -#include -#include - #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" #include "tensorflow/core/protobuf/config.pb.h" diff --git a/tensorflow/core/profiler/lib/profiler_session.h b/tensorflow/core/profiler/lib/profiler_session.h index e91d7ad2bee..85b9901d889 100644 --- a/tensorflow/core/profiler/lib/profiler_session.h +++ b/tensorflow/core/profiler/lib/profiler_session.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_ #define TENSORFLOW_CORE_PROFILER_LIB_PROFILER_SESSION_H_ +#include +#include + #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/thread_annotations.h" From 29021b9e6638e1f268fb5c6f563f07dc0fcf073d Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 11:06:38 -0800 Subject: [PATCH 278/383] Provide a way to get the type of a ValueHandle. PiperOrigin-RevId: 284221337 Change-Id: I8a4fde00eb67f712dce7866dc958d91a61b2b815 --- third_party/mlir/bindings/python/pybind.cpp | 9 ++++- .../mlir/bindings/python/test/test_py2and3.py | 38 ++++++++++++++----- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/third_party/mlir/bindings/python/pybind.cpp b/third_party/mlir/bindings/python/pybind.cpp index e3333b669be..7d3ac044391 100644 --- a/third_party/mlir/bindings/python/pybind.cpp +++ b/third_party/mlir/bindings/python/pybind.cpp @@ -110,6 +110,10 @@ struct PythonValueHandle { return ValueHandle::create(value, argValues); } + PythonType type() const { + return PythonType(value.getType().getAsOpaquePointer()); + } + mlir::edsc::ValueHandle value; }; @@ -951,7 +955,7 @@ PYBIND11_MODULE(pybind, m) { .def("affine_constant_map", &PythonMLIRModule::affineConstantMap, "Returns an affine map with single constant result.") .def("affine_map", &PythonMLIRModule::affineMap, "Returns an affine map.", - py::arg("dimCount"), py::arg("symbolCount"), py::arg("resuls")) + py::arg("dimCount"), py::arg("symbolCount"), py::arg("results")) .def("__str__", &PythonMLIRModule::getIR, "Get the string representation of the module"); @@ -1034,7 +1038,8 @@ PYBIND11_MODULE(pybind, m) { .def("__or__", [](PythonValueHandle lhs, PythonValueHandle rhs) -> PythonValueHandle { return lhs.value || rhs.value; }) - .def("__call__", &PythonValueHandle::call); + .def("__call__", &PythonValueHandle::call) + .def("type", &PythonValueHandle::type); } py::class_( diff --git a/third_party/mlir/bindings/python/test/test_py2and3.py b/third_party/mlir/bindings/python/test/test_py2and3.py index 678e5023173..7849b08f19b 100644 --- a/third_party/mlir/bindings/python/test/test_py2and3.py +++ b/third_party/mlir/bindings/python/test/test_py2and3.py @@ -19,6 +19,7 @@ import google_mlir.bindings.python.pybind as E import inspect + # Prints `str` prefixed by the current test function name so we can use it in # Filecheck label directives. # This is achieved by inspecting the stack and getting the parent name. 
@@ -26,6 +27,7 @@ def printWithCurrentFunctionName(str): print(inspect.stack()[1][3]) print(str) + class EdscTest: def setUp(self): @@ -104,8 +106,9 @@ class EdscTest: def testBooleanOps(self): self.setUp() - with self.module.function_context( - "booleans", [self.boolType for _ in range(4)], []) as fun: + with self.module.function_context("booleans", + [self.boolType for _ in range(4)], + []) as fun: i, j, k, l = (fun.arg(x) for x in range(4)) stmt1 = (i < j) & (j >= k) stmt2 = ~(stmt1 | (k == l)) @@ -471,15 +474,16 @@ class EdscTest: def testMatrixMultiply(self): self.setUp() memrefType = self.module.make_memref_type(self.f32Type, [32, 32]) - with self.module.function_context( - "matmul", [memrefType, memrefType, memrefType], []) as fun: + with self.module.function_context("matmul", + [memrefType, memrefType, memrefType], + []) as fun: A = E.IndexedValue(fun.arg(0)) B = E.IndexedValue(fun.arg(1)) C = E.IndexedValue(fun.arg(2)) c0 = E.constant_index(0) c32 = E.constant_index(32) - with E.LoopNestContext([c0, c0, c0], [c32, c32, c32], [1, 1, 1]) as (i, j, - k): + with E.LoopNestContext([c0, c0, c0], [c32, c32, c32], + [1, 1, 1]) as (i, j, k): C.store([i, j], A.load([i, k]) * B.load([k, j])) E.ret([]) printWithCurrentFunctionName(str(fun)) @@ -520,19 +524,33 @@ class EdscTest: # CHECK-LABEL: testSelectOp # CHECK: %{{.*}} = select %{{.*}}, %{{.*}}, %{{.*}} : i32 + def testType(self): + self.setUp() + printWithCurrentFunctionName("") + with self.module.function_context( + "foo", [self.module.make_memref_type(self.f32Type, [10])], []) as fun: + c42 = E.constant_int(42, 32) + print(str(c42.type())) + print(str(fun.arg(0).type())) + # CHECK-LABEL: testType + # CHECK: i32 + # CHECK: memref<10xf32> + # Until python 3.6 this cannot be used because the order in the dict is not the # order of method declaration. def runTests(): + def isTest(attr): return inspect.ismethod(attr) and "EdscTest.setUp " not in str(attr) edscTest = EdscTest() - tests = sorted(filter(isTest, - (getattr(edscTest, attr) for attr in dir(edscTest))), - key = lambda x : str(x)) + tests = sorted( + filter(isTest, (getattr(edscTest, attr) for attr in dir(edscTest))), + key=lambda x: str(x)) for test in tests: test() -if __name__ == '__main__': + +if __name__ == "__main__": runTests() From a5ac4c72dd305820017a73b82b704e080a0e00a5 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 11:21:35 -0800 Subject: [PATCH 279/383] Make callback string formatting failure error message more user-friendly. PiperOrigin-RevId: 284224536 Change-Id: I57b8c2ac506deaaa45365ddb80f564b787abb23f --- tensorflow/python/keras/callbacks.py | 9 ++++++++- tensorflow/python/keras/callbacks_test.py | 12 ++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index bc2f0461fbc..ca9507f0bdc 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -1052,7 +1052,14 @@ class ModelCheckpoint(Callback): # pylint: disable=protected-access if not self.model._in_multi_worker_mode( ) or multi_worker_util.should_save_checkpoint(): - return self.filepath.format(epoch=epoch + 1, **logs) + try: + # `filepath` may contain placeholders such as `{epoch:02d}` and + # `{mape:.2f}`. A mismatch between logged metrics and the path's + # placeholders can cause formatting to fail. + return self.filepath.format(epoch=epoch + 1, **logs) + except KeyError as e: + raise KeyError('Failed to format this callback filepath: "{}". 
' + 'Reason: {}'.format(self.filepath, e)) else: # If this is multi-worker training, and this worker should not # save checkpoint, we use a temp filepath to store a dummy checkpoint, so diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 46c11a14838..164d9ba01b8 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -828,6 +828,18 @@ class KerasCallbacksTest(keras_parameterized.TestCase): 'filepath for ModelCheckpoint.'): model.fit(train_ds, epochs=1, callbacks=[callback]) + def test_ModelCheckpoint_with_bad_path_placeholders(self): + (model, train_ds, callback, + filepath) = self._get_dummy_resource_for_model_checkpoint_testing() + + temp_dir = self.get_temp_dir() + filepath = os.path.join(temp_dir, 'chkpt_{epoch:02d}_{mape:.2f}.h5') + callback = keras.callbacks.ModelCheckpoint(filepath=filepath) + + with self.assertRaisesRegexp(KeyError, 'Failed to format this callback ' + 'filepath.*'): + model.fit(train_ds, epochs=1, callbacks=[callback]) + def test_EarlyStopping(self): with self.cached_session(): np.random.seed(123) From 2792dd7cf2be68500a64d18d8765bed1e5e0aaa7 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Fri, 6 Dec 2019 11:39:55 -0800 Subject: [PATCH 280/383] [tfdbg] Support linking FuncGraph building and execution - Append a _function_name property to FuncGraph, in order to allow establishment of connections between the FuncGraph object and _EagerDefinedFunctions based on it. - The dumping op callback extracts the graph_id value and saves it with the DebugEvent.execution proto. Also in this CL: - Add unit test for the recorded graph IDs for eager execution of FuncGraphs. - Replace the magic string prefixes for Function names (e.g., "__inference_" with constants. - Use the said string constants in dumping_callback.py and in function_deserialization.py PiperOrigin-RevId: 284228685 Change-Id: I8fc540d6d6de0ed58c77d8de5804b0e997297f68 --- tensorflow/python/debug/BUILD | 2 +- .../debug/lib/debug_events_writer_test.py | 8 +- .../debug/lib/distributed_callbacks_test.py | 2 +- .../python/debug/lib/dumping_callback.py | 81 +++++++++++++- .../python/debug/lib/dumping_callback_test.py | 101 ++++++++++++++++-- .../debug/lib/dumping_callback_test_lib.py | 11 +- tensorflow/python/eager/function.py | 60 +++++++++-- .../saved_model/function_deserialization.py | 6 +- 8 files changed, 241 insertions(+), 30 deletions(-) diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD index 2bc35ef52af..43592e63fa8 100644 --- a/tensorflow/python/debug/BUILD +++ b/tensorflow/python/debug/BUILD @@ -726,7 +726,7 @@ cuda_py_test( "//tensorflow/python/keras", ], python_version = "PY3", - shard_count = 8, + shard_count = 4, tags = [ "guitar", "multi_and_single_gpu", diff --git a/tensorflow/python/debug/lib/debug_events_writer_test.py b/tensorflow/python/debug/lib/debug_events_writer_test.py index 86e7fd26e1a..f6e973befed 100644 --- a/tensorflow/python/debug/lib/debug_events_writer_test.py +++ b/tensorflow/python/debug/lib/debug_events_writer_test.py @@ -202,11 +202,11 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase): # Before FlushExecutionFiles() is called. No data should have been written # to the file. 
- executed_op_types, _, _, _, _ = self._readAndCheckExecutionFile() + executed_op_types, _, _, _, _, _ = self._readAndCheckExecutionFile() self.assertFalse(executed_op_types) writer.FlushExecutionFiles() - executed_op_types, _, _, _, _ = self._readAndCheckExecutionFile() + executed_op_types, _, _, _, _, _ = self._readAndCheckExecutionFile() for i, executed_op_type in enumerate(executed_op_types): self.assertEqual( executed_op_type, @@ -222,7 +222,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase): writer.WriteExecution(execution) writer.FlushExecutionFiles() - executed_op_types, _, _, _, _ = self._readAndCheckExecutionFile() + executed_op_types, _, _, _, _, _ = self._readAndCheckExecutionFile() self.assertLen(executed_op_types, num_execution_events) for i, executed_op_type in enumerate(executed_op_types): self.assertEqual(executed_op_type, "OpType%d" % i) @@ -302,7 +302,7 @@ class DebugEventsWriterTest(dumping_callback_test_lib.DumpingCallbackTestBase): writer.FlushExecutionFiles() # Verify the content of the .execution file. - executed_op_types, _, _, _, _ = self._readAndCheckExecutionFile() + executed_op_types, _, _, _, _, _ = self._readAndCheckExecutionFile() self.assertLen(executed_op_types, circular_buffer_size) self.assertLen(executed_op_types, len(set(executed_op_types))) diff --git a/tensorflow/python/debug/lib/distributed_callbacks_test.py b/tensorflow/python/debug/lib/distributed_callbacks_test.py index e1ff0f823c3..7ae555c285a 100644 --- a/tensorflow/python/debug/lib/distributed_callbacks_test.py +++ b/tensorflow/python/debug/lib/distributed_callbacks_test.py @@ -266,7 +266,7 @@ class DistributedDumpingCallbackTest( tensor_values) = self._readAndCheckGraphExecutionTracesFile(context_ids) # Eager execution of tf.function should be recorded. - executed_op_types, _, _, _, _ = self._readAndCheckExecutionFile() + executed_op_types, _, _, _, _, _ = self._readAndCheckExecutionFile() fit_functions = [op_type for op_type in executed_op_types if "_distributed_function" in op_type] self.assertLen(fit_functions, epochs) diff --git a/tensorflow/python/debug/lib/dumping_callback.py b/tensorflow/python/debug/lib/dumping_callback.py index adb924aefaa..ab3fceca532 100644 --- a/tensorflow/python/debug/lib/dumping_callback.py +++ b/tensorflow/python/debug/lib/dumping_callback.py @@ -23,6 +23,7 @@ import re import socket import threading import uuid +import weakref from six.moves import xrange # pylint: disable=redefined-builtin @@ -31,6 +32,7 @@ from tensorflow.core.protobuf import graph_debug_info_pb2 from tensorflow.python.debug.lib import debug_events_writer from tensorflow.python.debug.lib import op_callbacks_common from tensorflow.python.debug.lib import source_utils +from tensorflow.python.eager import function as function_lib from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import op_callbacks @@ -81,14 +83,35 @@ class _DumpingCallback(object): self._stack_frame_to_id = dict() # Mapping op context to unique ID. self._context_to_id = dict() + self._function_weakref_to_graph_id = dict() + # pylint:disable=protected-access + self._function_prefixes = ( + compat.as_bytes(function_lib._FORWARD_PREFIX), + compat.as_bytes(function_lib._BACKWARD_PREFIX), + compat.as_bytes(function_lib._INFERENCE_PREFIX)) + # pylint:enable=protected-access + self._op_type_to_context_id = dict() # Keeps track of counter for symbolic tensors output by in-graph ops. 
self._symbolic_tensor_counter = 0 self._source_file_paths_lock = threading.Lock() self._stack_frame_to_id_lock = threading.Lock() - self._context_to_id_lock = threading.Lock() + self._context_lock = threading.Lock() self._symbolic_tensor_counter_lock = threading.Lock() self._writer = None + def function_callback(self, function): + """A callback to be called on creation of Functions. + + Used to establish a join between function name and graph (context) ID. + + Args: + function: The just-created Function. + """ + function_weakref = weakref.ref(function) + graph_id = self._get_context_id(function.graph) + with self._context_lock: + self._function_weakref_to_graph_id[function_weakref] = graph_id + @property def dump_root(self): return self._dump_root @@ -133,7 +156,7 @@ class _DumpingCallback(object): if context in self._context_to_id: # 1st check, without lock. return self._context_to_id[context] graph_is_new = False - with self._context_to_id_lock: + with self._context_lock: if context not in self._context_to_id: # 2nd check, with lock. graph_is_new = True context_id = _get_id() @@ -318,7 +341,11 @@ class _DumpingCallback(object): "Symbolic tensor instrumentation is not implemented for debug mode " "%s" % self._tensor_debug_mode) - def _dump_eager_tensors(self, tensors, op_type, input_tensor_ids): + def _dump_eager_tensors(self, + tensors, + op_type, + input_tensor_ids, + graph_id=None): """Dump the value of eager tensors. The destination of the dumping is determined by the dump_root of the @@ -332,6 +359,8 @@ class _DumpingCallback(object): value transform. op_type: Type of the op that generates the tensors, as a string. input_tensor_ids: IDs of the input EagerTensors to the op. + graph_id: ID of the executed graph, applicable only to eager execution of + a FuncGraph. Returns: A tfdbg Execution protocol buffer. @@ -342,6 +371,7 @@ class _DumpingCallback(object): if tensor_debug_mode == debug_event_pb2.TensorDebugMode.NO_TENSOR: return debug_event_pb2.Execution( op_type=op_type, + graph_id=graph_id, num_outputs=len(tensors), input_tensor_ids=input_tensor_ids, output_tensor_ids=output_tensor_ids, @@ -351,6 +381,7 @@ class _DumpingCallback(object): execution_proto = debug_event_pb2.Execution( op_type=op_type, num_outputs=len(tensors), + graph_id=graph_id, input_tensor_ids=input_tensor_ids, output_tensor_ids=output_tensor_ids, tensor_debug_mode=tensor_debug_mode, @@ -396,9 +427,45 @@ class _DumpingCallback(object): return self._instrument_symbolic_tensors( outputs, op_type, op_name, context_id, output_tensor_ids) else: + context_id = self._func_graph_id_from_func_name(op_type) input_ids = [t._id for t in inputs] # pylint:disable=protected-access - writer.WriteExecution( - self._dump_eager_tensors(outputs, op_type, input_ids)) + writer.WriteExecution(self._dump_eager_tensors( + outputs, op_type, input_ids, graph_id=context_id)) + + def _func_graph_id_from_func_name(self, op_type): + """Attempt to get the ID of a FuncGraph based on an op type name. + + Also caches the ID for faster access later. + + Args: + op_type: Op type string, which may be the name of a function. + + Returns: + If the op_type name does not fit the pattern of a function name (e.g., + one that starts with "__inference_"), `None` is returned immediately. + Else, if the FuncGraph is found, ID of the underlying FuncGraph is + returned as a string. + Else, `None` is returned. 
+ """ + op_type = compat.as_bytes(op_type) + if op_type.startswith(self._function_prefixes): + # op_type for eagerly-executed FuncGraphs have the prefixed and suffixed + # form such as "__inference_my_function_13579", wherein the middle part + # "my_function" is the name of the Python function from which the + # FuncGraph is compiled. Due to the suffix, the op_type is unique for + # - duplicate Python function names + # - multiple compilation of the same Python function + if op_type in self._op_type_to_context_id: + return self._op_type_to_context_id[op_type] + with self._context_lock: + for function_weakref in self._function_weakref_to_graph_id: + if function_weakref().name == op_type: + graph_id = self._function_weakref_to_graph_id[function_weakref] + self._op_type_to_context_id[op_type] = graph_id + return graph_id + return None + else: + return None def _get_symbolic_tensor_ids(self, num_tensors): tensor_ids = [] @@ -578,6 +645,8 @@ def enable_dump_debug_info(dump_root, op_regex, tensor_dtypes) op_callbacks.add_op_callback(_state.dumping_callback.callback) + function_lib.add_function_callback( + _state.dumping_callback.function_callback) if _state.dumping_callback.dump_root != dump_root: _state.dumping_callback.dump_root = dump_root @@ -605,6 +674,8 @@ def disable_dump_debug_info(): dump_root = _state.dumping_callback.dump_root debug_events_writer.DebugEventsWriter(dump_root).Close() op_callbacks.remove_op_callback(_state.dumping_callback.callback) + function_lib.remove_function_callback( + _state.dumping_callback.function_callback) delattr(_state, "dumping_callback") logging.info("Disabled dumping callback in thread %s (dump root: %s)", threading.current_thread().name, dump_root) diff --git a/tensorflow/python/debug/lib/dumping_callback_test.py b/tensorflow/python/debug/lib/dumping_callback_test.py index d32d543b382..a15fd2c20c1 100644 --- a/tensorflow/python/debug/lib/dumping_callback_test.py +++ b/tensorflow/python/debug/lib/dumping_callback_test.py @@ -129,6 +129,8 @@ class TracingCallbackTest( prev_wall_time = debug_event.wall_time execution = debug_event.execution executed_op_types.append(execution.op_type) + # No graph IDs should have been logged for eager op executions. + self.assertFalse(execution.graph_id) self.assertTrue(execution.input_tensor_ids) self.assertTrue(execution.output_tensor_ids) if tensor_debug_mode == "NO_TENSOR": @@ -218,17 +220,30 @@ class TracingCallbackTest( # NOTE(b/142486213): Execution of the TF function happens with # Session.run() in v1 graph mode, so doesn't get logged to the # .execution file. - executed_op_types, _, _, _, _ = self._readAndCheckExecutionFile() + (executed_op_types, executed_graph_ids, + _, _, _, _) = self._readAndCheckExecutionFile() executed_op_types = [op_type for op_type in executed_op_types if "sin1p_log_sum" in op_type] self.assertLen(executed_op_types, 1) stack_frame_by_id = self._readAndCheckSourceFilesAndStackFrames() - (context_ids, op_types, - op_name_to_op_type, _) = self._readAndCheckGraphsFile(stack_frame_by_id) + (context_ids, op_types, op_name_to_op_type, + op_name_to_context_id) = self._readAndCheckGraphsFile(stack_frame_by_id) self.assertIn("AddV2", op_types) self.assertIn("Log", op_types) self.assertIn("Sin", op_types) + if context.executing_eagerly(): + # Check the correctness of the ID of the executed graph ID. 
+ sin_op_name = [op_name for op_name in op_name_to_op_type + if op_name_to_op_type[op_name] == "Sin"] + self.assertLen(sin_op_name, 1) + sin_context_id = op_name_to_context_id[sin_op_name[0]] + # The executed "op" is a FuncGraph, and its graph ID should have been + # recorded properly and be the ID of the graph that the Sin op belongs to. + executed_graph_ids = [ + executed_graph_ids[i] for i, op_type + in enumerate(executed_op_types) if "sin1p_log_sum" in op_type] + self.assertEqual(executed_graph_ids[0], sin_context_id) (op_names, _, _, tensor_values) = self._readAndCheckGraphExecutionTracesFile(context_ids) @@ -248,6 +263,72 @@ class TracingCallbackTest( self.assertAllClose(tensor_values[3], np.sin(np.log(5.0) + 1.0)) # Sin op. + def testCapturingExecutedGraphIdsOfTwoCompilationsOfSameFunction(self): + """Test correct executed IDs of two FuncGraphs from the same Py function.""" + writer = dumping_callback.enable_dump_debug_info( + self.dump_root, tensor_debug_mode="NO_TENSOR") + + @def_function.function + def ceil_times_two(x): + return math_ops.ceil(x) * 2.0 + + x_float32 = np.array(3.5, dtype=np.float32) + x_float64 = np.array(4.5, dtype=np.float64) + # Four executions, with two different FuncGraphs, which should lead + # to two unique executed graph IDs (see assertion below). + self.assertAllClose(ceil_times_two(x_float32), 8.0) + self.assertAllClose(ceil_times_two(x_float64), 10.0) + self.assertAllClose(ceil_times_two(x_float32), 8.0) + self.assertAllClose(ceil_times_two(x_float64), 10.0) + writer.FlushNonExecutionFiles() + writer.FlushExecutionFiles() + + (executed_op_types, executed_graph_ids, + _, _, _, _) = self._readAndCheckExecutionFile() + self.assertLen(executed_op_types, 4) + for executed_op_type in executed_op_types: + self.assertStartsWith(executed_op_type, "__inference_ceil_times_two_") + self.assertLen(executed_graph_ids, 4) + self.assertEqual(executed_graph_ids[0], executed_graph_ids[2]) + self.assertEqual(executed_graph_ids[1], executed_graph_ids[3]) + self.assertLen(set(executed_graph_ids), 2) + + def testCapturingExecutedGraphIdsOfDuplicateFunctionNames(self): + """Two FuncGraphs compiled from Python functions with identical names.""" + writer = dumping_callback.enable_dump_debug_info( + self.dump_root, tensor_debug_mode="NO_TENSOR") + + class TestClass(object): + + @def_function.function + def ceil_times_two(self, x): + return math_ops.ceil(x) * 2.0 + + # The `ceil_times_two` method of the two objects will be compiled + # into separate FuncGraphs. + test_object_1 = TestClass() + test_object_2 = TestClass() + + x = np.array(3.5, dtype=np.float32) + # Four executions, with two different FuncGraphs, which should lead + # to two unique executed graph IDs (see assertion below). 
+ self.assertAllClose(test_object_1.ceil_times_two(x), 8.0) + self.assertAllClose(test_object_2.ceil_times_two(x), 8.0) + self.assertAllClose(test_object_1.ceil_times_two(x), 8.0) + self.assertAllClose(test_object_2.ceil_times_two(x), 8.0) + writer.FlushNonExecutionFiles() + writer.FlushExecutionFiles() + + (executed_op_types, executed_graph_ids, + _, _, _, _) = self._readAndCheckExecutionFile() + self.assertLen(executed_op_types, 4) + for executed_op_type in executed_op_types: + self.assertStartsWith(executed_op_type, "__inference_ceil_times_two_") + self.assertLen(executed_graph_ids, 4) + self.assertEqual(executed_graph_ids[0], executed_graph_ids[2]) + self.assertEqual(executed_graph_ids[1], executed_graph_ids[3]) + self.assertLen(set(executed_graph_ids), 2) + @parameterized.named_parameters( ("AddV2", "AddV2"), ("Log", "Log"), @@ -438,7 +519,7 @@ class TracingCallbackTest( # After the flushing, the .execution file should hold the appropriate # contents. if context.executing_eagerly(): - (executed_op_types, input_tensor_ids, output_tensor_ids, + (executed_op_types, _, input_tensor_ids, output_tensor_ids, tensor_debug_modes, tensor_values) = self._readAndCheckExecutionFile() # NOTE(b/142486213): Execution of the TF function happens with # Session.run() in v1 graph mode, hence it doesn't get logged to the @@ -558,7 +639,7 @@ class TracingCallbackTest( writer.FlushExecutionFiles() stack_frame_by_id = self._readAndCheckSourceFilesAndStackFrames() context_ids, _, _, _ = self._readAndCheckGraphsFile(stack_frame_by_id) - _, _, _, _, tensor_values = self._readAndCheckExecutionFile() + _, _, _, _, _, tensor_values = self._readAndCheckExecutionFile() self.assertEqual(tensor_values, [[]]) (_, _, _, tensor_values) = self._readAndCheckGraphExecutionTracesFile(context_ids) @@ -702,7 +783,7 @@ class TracingCallbackTest( self.assertAllClose(v1.read_value(), -67084290.0) self.assertAllClose(v2.read_value(), -6.0) - (executed_op_types, _, _, _, + (executed_op_types, _, _, _, _, tensor_values) = self._readAndCheckExecutionFile(dump_root=dump_root_1) v1_squared_values = [ tensor_values[i] for i, op_type in enumerate(executed_op_types) @@ -714,7 +795,7 @@ class TracingCallbackTest( self.assertAllClose( negative_v1_squared_values, [[-100.0], [-8100.0], [-67076100.0]]) - (executed_op_types, _, _, _, + (executed_op_types, _, _, _, _, tensor_values) = self._readAndCheckExecutionFile(dump_root=dump_root_2) self.assertNotIn("Neg", executed_op_types) v2_squared_values = tensor_values[executed_op_types.index("Pow")] @@ -800,7 +881,7 @@ class TracingCallbackTest( # NOTE(b/142486213): Execution of the TF function happens with # Session.run() in v1 graph mode, hence it doesn't get logged to the # .execution file. - (executed_op_types, _, _, _, + (executed_op_types, _, _, _, _, tensor_values) = self._readAndCheckExecutionFile() self.assertTrue(executed_op_types) @@ -867,7 +948,7 @@ class TracingCallbackTest( # NOTE(b/142486213): Execution of the TF function happens with # Session.run() in v1 graph mode, hence it doesn't get logged to the # .execution file. - (executed_op_types, _, _, _, + (executed_op_types, _, _, _, _, tensor_values) = self._readAndCheckExecutionFile() self.assertTrue(executed_op_types) if tensor_debug_mode == "NO_TENSOR": @@ -940,7 +1021,7 @@ class TracingCallbackTest( # NOTE(b/142486213): Execution of the TF function happens with # Session.run() in v1 graph mode, hence it doesn't get logged to the # .execution file. 
- executed_op_types, _, _, _, _ = self._readAndCheckExecutionFile() + executed_op_types, _, _, _, _, _ = self._readAndCheckExecutionFile() self.assertTrue(executed_op_types) (op_names, _, _, diff --git a/tensorflow/python/debug/lib/dumping_callback_test_lib.py b/tensorflow/python/debug/lib/dumping_callback_test_lib.py index 74261f918ce..6144f2ba9cc 100644 --- a/tensorflow/python/debug/lib/dumping_callback_test_lib.py +++ b/tensorflow/python/debug/lib/dumping_callback_test_lib.py @@ -193,6 +193,11 @@ class DumpingCallbackTestBase(test_util.TensorFlowTestCase): Returns: executed_op_types: Types of ops that are created, as a `list` of `str`. + executed_graph_ids: A `list` of the same length as `executed_op_types`. + If the executed op is a FuncGraph, the corresponding element of the + `list` will be the ID of the FuncGraph. Else, the corresponding element + will be an empty string. This allows establishing connection between + eagerly executed FuncGraphs and their prior graph building. input_tensor_ids: Input tensor IDs for each of the ops executed, as a `list` of `list` of `int`s, with the same length as `executed_op_types`. output_tensor_ids: Output tensor IDs for each of the ops executed, as a @@ -209,6 +214,7 @@ class DumpingCallbackTestBase(test_util.TensorFlowTestCase): execution_iter = reader.execution_iterator() prev_wall_time = 1 executed_op_types = [] + executed_graph_ids = [] # Empty string for execution of inidividual ops. input_tensor_ids = [] output_tensor_ids = [] tensor_debug_modes = [] @@ -218,6 +224,7 @@ class DumpingCallbackTestBase(test_util.TensorFlowTestCase): prev_wall_time = debug_event.wall_time execution = debug_event.execution executed_op_types.append(execution.op_type) + executed_graph_ids.append(execution.graph_id) input_tensor_ids.append(execution.input_tensor_ids) output_tensor_ids.append(execution.output_tensor_ids) tensor_debug_modes.append(execution.tensor_debug_mode) @@ -227,8 +234,8 @@ class DumpingCallbackTestBase(test_util.TensorFlowTestCase): ]) # TODO(cais): When tensor debug modes other than NO_TENSOR is supported, # return tensor_values as well. - return (executed_op_types, input_tensor_ids, output_tensor_ids, - tensor_debug_modes, tensor_values) + return (executed_op_types, executed_graph_ids, input_tensor_ids, + output_tensor_ids, tensor_debug_modes, tensor_values) def _readAndCheckGraphExecutionTracesFile(self, context_ids): """Read & verify the content of the .graph_execution_trace debug-event file. diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py index 2d8b442e1af..89d731332f8 100644 --- a/tensorflow/python/eager/function.py +++ b/tensorflow/python/eager/function.py @@ -345,19 +345,59 @@ class _InterpolateFunctionError(object): return False +_function_callbacks = set() + + +def add_function_callback(function_callback): + """Add a callback function for Function creation. + + The callback function has the signature: + + `def function_callback(function):` + + wherein `function` is the just-created _EagerDefinedFunction. + The callback is invoked immediately after a new `_EagerDefinedFunction` + is created. The return value(s) of the callback fucntion (if any) is ignored. + + Repeated registration of the same callback function is idempotent. + After a callback is added, it can be removed with the + `remove_function_callback()` method. + + Args: + function_callback: The callback to add. 
+ """ + _function_callbacks.add(function_callback) + + +def remove_function_callback(function_callback): + """Remove an already-added function callback. + + See the doc string of `add_function_callback()` for more information. + + Args: + function_callback: The callback to remove. + """ + _function_callbacks.remove(function_callback) + + +_FORWARD_PREFIX = "__forward_" +_BACKWARD_PREFIX = "__backward_" +_INFERENCE_PREFIX = "__inference_" + + def _forward_name(n): """The name of a generated forward defun named n.""" - return "__forward_%s_%s" % (n, ops.uid()) + return "%s%s_%s" % (_FORWARD_PREFIX, n, ops.uid()) def _backward_name(n): """The name of a generated backward defun named n.""" - return "__backward_%s_%s" % (n, ops.uid()) + return "%s%s_%s" % (_BACKWARD_PREFIX, n, ops.uid()) def _inference_name(n): """The name of a forward-but-no-gradient defun named n.""" - return "__inference_%s_%s" % (n, ops.uid()) + return "%s%s_%s" % (_INFERENCE_PREFIX, n, ops.uid()) def _enclosing_xla_context(): @@ -463,7 +503,7 @@ class _EagerDefinedFunction(object): proto_data = pywrap_tensorflow.TF_GetBuffer(buffer_) function_def = function_pb2.FunctionDef() function_def.ParseFromString(compat.as_bytes(proto_data)) - self.name = compat.as_bytes(function_def.signature.name) + self._name = compat.as_bytes(function_def.signature.name) with ops.init_scope(): if context.executing_eagerly(): context.ensure_initialized() @@ -485,6 +525,9 @@ class _EagerDefinedFunction(object): self.graph = graph self._stateful_ops = tuple(op for op in operations if op._is_stateful) # pylint: disable=protected-access + for function_callback in _function_callbacks: + function_callback(self) + def add_to_graph(self, g=None): # pylint: disable=protected-access if not g and context.executing_eagerly(): @@ -497,6 +540,10 @@ class _EagerDefinedFunction(object): g._add_function(f) # pylint: enable=protected-access + @property + def name(self): + return self._name + @property def stateful_ops(self): return self._stateful_ops @@ -533,6 +580,7 @@ class _EagerDefinedFunction(object): executor_type = function_call_options.executor_type or "" executing_eagerly = ctx.executing_eagerly() + attrs = ("executor_type", executor_type, "config_proto", config) if executing_eagerly: with _InterpolateFunctionError(self): if cancellation_manager is None: @@ -540,14 +588,14 @@ class _EagerDefinedFunction(object): str(self.signature.name), num_outputs=self._num_outputs, inputs=args, - attrs=("executor_type", executor_type, "config_proto", config), + attrs=attrs, ctx=ctx) else: outputs = execute.execute_with_cancellation( str(self.signature.name), num_outputs=self._num_outputs, inputs=args, - attrs=("executor_type", executor_type, "config_proto", config), + attrs=attrs, ctx=ctx, cancellation_manager=cancellation_manager) # Replace empty list with None diff --git a/tensorflow/python/saved_model/function_deserialization.py b/tensorflow/python/saved_model/function_deserialization.py index c7e1d028ac3..3ae6c8414f1 100644 --- a/tensorflow/python/saved_model/function_deserialization.py +++ b/tensorflow/python/saved_model/function_deserialization.py @@ -447,11 +447,15 @@ def _list_function_deps(fdef, library_function_names): return deps +_FUNCTION_WARPPER_NAME_REGEX = r"^%s(.*)_\d+$" % ( + function_lib._INFERENCE_PREFIX) # pylint:disable=protected-access + + def _clean_function_name(name): """Vanity function to keep the function names comprehensible.""" # Note: each time a function is wrapped into `function_lib.ConcreteFunction` # its name becomes 
"__inference__xyz". - match = re.search(r"^__inference_(.*)_\d+$", name) + match = re.search(_FUNCTION_WARPPER_NAME_REGEX, name) if match: return match.group(1) else: From be5429dd035315da76bd8681d4a3f0e05c209ec1 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 11:42:25 -0800 Subject: [PATCH 281/383] Update Eigen to https://gitlab.com/libeigen/eigen/commit/4e696901f873a2347f76d931cf2f701e31e15d05 PiperOrigin-RevId: 284229330 Change-Id: I5cc4bbe373cfef69bc9664ed5c56b86dc71de6d1 --- .../eigen_tensor_reduced_instantiations_google.h | 2 -- .../optimized/eigen_tensor_reduced_instantiations_oss.h | 2 -- tensorflow/python/keras/activations.py | 4 ++-- tensorflow/workspace.bzl | 8 ++++---- 4 files changed, 6 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h index de10f2c9259..1eb65c5bd5c 100644 --- a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h +++ b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_google.h @@ -91,7 +91,6 @@ typedef unsigned __int64 uint64_t; #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" @@ -149,7 +148,6 @@ typedef unsigned __int64 uint64_t; #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h" -#include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" diff --git a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h index 5b54024ac5a..027dd479af5 100644 --- a/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h +++ b/tensorflow/lite/kernels/internal/optimized/eigen_tensor_reduced_instantiations_oss.h @@ -91,7 +91,6 @@ typedef unsigned __int64 uint64_t; #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceDefault.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceGpu.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorDeviceSycl.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h" @@ -149,7 +148,6 @@ typedef unsigned __int64 uint64_t; #include "unsupported/Eigen/CXX11/src/Tensor/TensorAssign.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorScan.h" 
#include "unsupported/Eigen/CXX11/src/Tensor/TensorTrace.h" -#include "unsupported/Eigen/CXX11/src/Tensor/TensorSycl.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorDevice.h" #include "unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h" diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 17af5d36b41..f26c5a117c2 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -260,8 +260,8 @@ def sigmoid(x): >>> a = tf.constant([-20, -1.0, 0.0, 1.0, 20], dtype = tf.float32) >>> b = tf.keras.activations.sigmoid(a) - >>> b.numpy() > 0.0 - array([False, True, True, True, True]) + >>> b.numpy() >= 0.0 + array([ True, True, True, True, True]) Arguments: x: Input tensor. diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 93969656e67..1fb148e078a 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -172,11 +172,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): name = "eigen_archive", build_file = clean_dep("//third_party:eigen.BUILD"), patch_file = clean_dep("//third_party/eigen3:gpu_packet_math.patch"), - sha256 = "8a4d3ef6c18c9d8e047c6444ec0a28b43d587e7a3363eb9819eb49dd6b390aed", - strip_prefix = "eigen-ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8", + sha256 = "65d732985b593b553c20566e1f236f48dcc626730c418aed7b2aa1d0e3f1a0af", + strip_prefix = "eigen-4e696901f873a2347f76d931cf2f701e31e15d05", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8/eigen-ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8.tar.gz", - "https://gitlab.com/libeigen/eigen/-/archive/ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8/eigen-ea51a9eace7e4f0ea839e61eb2df85ccfb94aee8.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/gitlab.com/libeigen/eigen/-/archive/4e696901f873a2347f76d931cf2f701e31e15d05/eigen-4e696901f873a2347f76d931cf2f701e31e15d05.tar.gz", + "https://gitlab.com/libeigen/eigen/-/archive/4e696901f873a2347f76d931cf2f701e31e15d05/eigen-4e696901f873a2347f76d931cf2f701e31e15d05.tar.gz", ], ) From 8a2ad877e9100934ed3f4725424dce86185b6b36 Mon Sep 17 00:00:00 2001 From: Jian Li Date: Fri, 6 Dec 2019 11:46:24 -0800 Subject: [PATCH 282/383] Add quantization support to all variants of LSTM. - peephole coefficients are quantized to 16 bits symmetric. Int16 is used because the calculation is a 16x16 vector vector elementwise multiplication. - without projection, hidden tensor becomes the output and reuses the quantization parameters of the output - with layer normalization, the gate matmul uses intermediate result as output; without layer normalization, gate matmul is fed into activation directly so 2^(-12) is the output scale. 
PiperOrigin-RevId: 284230412 Change-Id: Ibfa66dc6fc2614de28b0ba92e8fb2d42a338aab4 --- tensorflow/lite/tools/optimize/BUILD | 2 + .../lite/tools/optimize/operator_property.cc | 475 +++++++++++++++++- .../tools/optimize/quantize_model_test.cc | 59 +++ tensorflow/lite/tools/optimize/test_util.cc | 3 + tensorflow/lite/tools/optimize/test_util.h | 8 +- .../optimize/testdata/lstm_calibrated2.bin | Bin 0 -> 2808 bytes .../optimize/testdata/lstm_quantized2.bin | Bin 0 -> 4152 bytes 7 files changed, 544 insertions(+), 3 deletions(-) create mode 100644 tensorflow/lite/tools/optimize/testdata/lstm_calibrated2.bin create mode 100644 tensorflow/lite/tools/optimize/testdata/lstm_quantized2.bin diff --git a/tensorflow/lite/tools/optimize/BUILD b/tensorflow/lite/tools/optimize/BUILD index 185969fee55..462841d9053 100644 --- a/tensorflow/lite/tools/optimize/BUILD +++ b/tensorflow/lite/tools/optimize/BUILD @@ -234,7 +234,9 @@ tf_cc_test( "//tensorflow/lite/tools/optimize:testdata/concat.bin", "//tensorflow/lite/tools/optimize:testdata/fc.bin", "//tensorflow/lite/tools/optimize:testdata/lstm_calibrated.bin", + "//tensorflow/lite/tools/optimize:testdata/lstm_calibrated2.bin", "//tensorflow/lite/tools/optimize:testdata/lstm_quantized.bin", + "//tensorflow/lite/tools/optimize:testdata/lstm_quantized2.bin", "//tensorflow/lite/tools/optimize:testdata/mixed.bin", "//tensorflow/lite/tools/optimize:testdata/multi_input_add_reshape.bin", "//tensorflow/lite/tools/optimize:testdata/single_avg_pool_min_minus_5_max_plus_5.bin", diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index 7b4056c7f9c..2eacd62725b 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -196,6 +196,77 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, // TODO(jianlijianli): extend this to other variants of LSTM. // LSTM needs 5 intermediate tensors. 
This agrees with the fully quantized // kernels in lstm_eval.cc + if (op_variant.use_layer_norm && op_variant.use_projection && + op_variant.use_peephole) { + static const float alpha = static_cast(std::pow(2, -10)); + TensorProperty tensor_property_9; + tensor_property_9.number_of_bits = 16; + tensor_property_9.symmetric = true; + TensorProperty tensor_property_12; + tensor_property_12.use_derived_scale = true; + tensor_property_12.number_of_bits = 32; + tensor_property_12.derived_scale = {{20}, {}, {alpha}}; + TensorProperty tensor_property_13; + tensor_property_13.use_derived_scale = true; + tensor_property_13.number_of_bits = 32; + tensor_property_13.derived_scale = {{21}, {}, {alpha}}; + TensorProperty tensor_property_14; + tensor_property_14.use_derived_scale = true; + tensor_property_14.number_of_bits = 32; + tensor_property_14.derived_scale = {{22}, {}, {alpha}}; + TensorProperty tensor_property_15; + tensor_property_15.use_derived_scale = true; + tensor_property_15.number_of_bits = 32; + tensor_property_15.derived_scale = {{23}, {}, {alpha}}; + TensorProperty tensor_property_17; + tensor_property_17.use_derived_scale = true; + tensor_property_17.number_of_bits = 32; + tensor_property_17.derived_scale = {{16}, {4}, {}}; + TensorProperty tensor_property_19; + tensor_property_19.extend_to_power_of_two = true; + tensor_property_19.number_of_bits = 16; + tensor_property_19.state_tensor = true; + tensor_property_19.symmetric = true; + TensorProperty tensor_property_20; + tensor_property_20.number_of_bits = 16; + tensor_property_20.symmetric = true; + + property.inputs = { + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + {4, {}}, + {5, {}}, + {6, {}}, + {7, {}}, + {8, {}}, + {9, tensor_property_9}, + {10, tensor_property_9}, + {11, tensor_property_9}, + {16, {}}, + {19, tensor_property_19}, + {20, tensor_property_20}, + {21, tensor_property_20}, + {22, tensor_property_20}, + {23, tensor_property_20}, + {12, tensor_property_12}, + {13, tensor_property_13}, + {14, tensor_property_14}, + {15, tensor_property_15}, + {17, tensor_property_17}, + }; + property.outputs = {{0, {}}}; + property.intermediates = { + {0, tensor_property_20}, + {1, tensor_property_20}, + {2, tensor_property_20}, + {3, tensor_property_20}, + {4, {}}, + }; + property.restrict_scale = {{18, 0}}; + property.version = 2; + } if (op_variant.use_layer_norm && op_variant.use_projection && !op_variant.use_peephole) { static const float alpha = static_cast(std::pow(2, -10)); @@ -261,8 +332,408 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, }; property.restrict_scale = {{18, 0}}; property.version = 2; - } else { - property.quantizable = false; + } + if (op_variant.use_layer_norm && !op_variant.use_projection && + op_variant.use_peephole) { + static const float alpha = static_cast(std::pow(2, -10)); + TensorProperty tensor_property_9; + tensor_property_9.number_of_bits = 16; + tensor_property_9.symmetric = true; + TensorProperty tensor_property_12; + tensor_property_12.use_derived_scale = true; + tensor_property_12.number_of_bits = 32; + tensor_property_12.derived_scale = {{20}, {}, {alpha}}; + TensorProperty tensor_property_13; + tensor_property_13.use_derived_scale = true; + tensor_property_13.number_of_bits = 32; + tensor_property_13.derived_scale = {{21}, {}, {alpha}}; + TensorProperty tensor_property_14; + tensor_property_14.use_derived_scale = true; + tensor_property_14.number_of_bits = 32; + tensor_property_14.derived_scale = {{22}, {}, {alpha}}; + TensorProperty tensor_property_15; + 
tensor_property_15.use_derived_scale = true; + tensor_property_15.number_of_bits = 32; + tensor_property_15.derived_scale = {{23}, {}, {alpha}}; + TensorProperty tensor_property_19; + tensor_property_19.extend_to_power_of_two = true; + tensor_property_19.number_of_bits = 16; + tensor_property_19.state_tensor = true; + tensor_property_19.symmetric = true; + TensorProperty tensor_property_20; + tensor_property_20.number_of_bits = 16; + tensor_property_20.symmetric = true; + + property.inputs = { + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + {4, {}}, + {5, {}}, + {6, {}}, + {7, {}}, + {8, {}}, + {9, tensor_property_9}, + {10, tensor_property_9}, + {11, tensor_property_9}, + {19, tensor_property_19}, + {20, tensor_property_20}, + {21, tensor_property_20}, + {22, tensor_property_20}, + {23, tensor_property_20}, + {12, tensor_property_12}, + {13, tensor_property_13}, + {14, tensor_property_14}, + {15, tensor_property_15}, + }; + property.outputs = {{0, {}}}; + property.intermediates = { + {0, tensor_property_20}, + {1, tensor_property_20}, + {2, tensor_property_20}, + {3, tensor_property_20}, + // Without projection, hidden state (4), output (0) and input + // activation state (18) are the same except that the very first + // inference of input activation is not captured in hidden and + // output. + // This is not an issue because this intermediate tensor is not used + // in the kernel and its quantization parameters are ignored. + {4, {}}, + }; + property.restrict_scale = {{18, 0}}; + property.version = 2; + } + if (op_variant.use_layer_norm && !op_variant.use_projection && + !op_variant.use_peephole) { + static const float alpha = static_cast(std::pow(2, -10)); + TensorProperty tensor_property_12; + tensor_property_12.use_derived_scale = true; + tensor_property_12.number_of_bits = 32; + tensor_property_12.derived_scale = {{20}, {}, {alpha}}; + TensorProperty tensor_property_13; + tensor_property_13.use_derived_scale = true; + tensor_property_13.number_of_bits = 32; + tensor_property_13.derived_scale = {{21}, {}, {alpha}}; + TensorProperty tensor_property_14; + tensor_property_14.use_derived_scale = true; + tensor_property_14.number_of_bits = 32; + tensor_property_14.derived_scale = {{22}, {}, {alpha}}; + TensorProperty tensor_property_15; + tensor_property_15.use_derived_scale = true; + tensor_property_15.number_of_bits = 32; + tensor_property_15.derived_scale = {{23}, {}, {alpha}}; + TensorProperty tensor_property_19; + tensor_property_19.extend_to_power_of_two = true; + tensor_property_19.number_of_bits = 16; + tensor_property_19.state_tensor = true; + tensor_property_19.symmetric = true; + TensorProperty tensor_property_20; + tensor_property_20.number_of_bits = 16; + tensor_property_20.symmetric = true; + + property.inputs = { + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + {4, {}}, + {5, {}}, + {6, {}}, + {7, {}}, + {8, {}}, + {19, tensor_property_19}, + {20, tensor_property_20}, + {21, tensor_property_20}, + {22, tensor_property_20}, + {23, tensor_property_20}, + {12, tensor_property_12}, + {13, tensor_property_13}, + {14, tensor_property_14}, + {15, tensor_property_15}, + }; + property.outputs = {{0, {}}}; + property.intermediates = { + {0, tensor_property_20}, + {1, tensor_property_20}, + {2, tensor_property_20}, + {3, tensor_property_20}, + // Without projection, hidden state (4), output (0) and input + // activation state (18) are the same except that the very first + // inference of input activation is not captured in hidden and + // output. 
+ // This is not an issue because this intermediate tensor is not used + // in the kernel and its quantization parameters are ignored. + {4, {}}, + }; + property.restrict_scale = {{18, 0}}; + property.version = 2; + } + if (!op_variant.use_layer_norm && op_variant.use_projection && + op_variant.use_peephole) { + TensorProperty tensor_property_9; + tensor_property_9.number_of_bits = 16; + tensor_property_9.symmetric = true; + // Without layer norm, we choose to quantize bias with the scale of + // input and its correpsonding weight. The other choice will + // be to ues the scale of recurrent and its correpsonding weight but we + // choose to use the smaller scale, which means higher resolution. + TensorProperty tensor_property_12; + tensor_property_12.use_derived_scale = true; + tensor_property_12.number_of_bits = 32; + tensor_property_12.derived_scale = {{0, 1}, {}, {}}; + TensorProperty tensor_property_13; + tensor_property_13.use_derived_scale = true; + tensor_property_13.number_of_bits = 32; + tensor_property_13.derived_scale = {{0, 2}, {}, {}}; + TensorProperty tensor_property_14; + tensor_property_14.use_derived_scale = true; + tensor_property_14.number_of_bits = 32; + tensor_property_14.derived_scale = {{0, 3}, {}, {}}; + TensorProperty tensor_property_15; + tensor_property_15.use_derived_scale = true; + tensor_property_15.number_of_bits = 32; + tensor_property_15.derived_scale = {{0, 4}, {}, {}}; + TensorProperty tensor_property_17; + tensor_property_17.use_derived_scale = true; + tensor_property_17.number_of_bits = 32; + tensor_property_17.derived_scale = {{16}, {4}, {}}; + TensorProperty tensor_property_19; + tensor_property_19.extend_to_power_of_two = true; + tensor_property_19.number_of_bits = 16; + tensor_property_19.state_tensor = true; + tensor_property_19.symmetric = true; + + property.inputs = { + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + {4, {}}, + {5, {}}, + {6, {}}, + {7, {}}, + {8, {}}, + {9, tensor_property_9}, + {10, tensor_property_9}, + {11, tensor_property_9}, + {16, {}}, + {19, tensor_property_19}, + {12, tensor_property_12}, + {13, tensor_property_13}, + {14, tensor_property_14}, + {15, tensor_property_15}, + {17, tensor_property_17}, + }; + property.outputs = {{0, {}}}; + property.intermediates = { + // Without layer normliazation, intermediate tensors 0, 1, 2, 3 are + // not used and and their quantization parameters are ignored. + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + // Hidden state is quantized as usual. + {4, {}}, + }; + property.restrict_scale = {{18, 0}}; + property.version = 2; + } + if (!op_variant.use_layer_norm && op_variant.use_projection && + !op_variant.use_peephole) { + // Without layer norm, we choose to quantize bias with the scale of + // input and its correpsonding weight. The other choice will + // be to ues the scale of recurrent and its correpsonding weight but we + // choose to use the smaller scale, which means higher resolution. 
+ TensorProperty tensor_property_12; + tensor_property_12.use_derived_scale = true; + tensor_property_12.number_of_bits = 32; + tensor_property_12.derived_scale = {{0, 1}, {}, {}}; + TensorProperty tensor_property_13; + tensor_property_13.use_derived_scale = true; + tensor_property_13.number_of_bits = 32; + tensor_property_13.derived_scale = {{0, 2}, {}, {}}; + TensorProperty tensor_property_14; + tensor_property_14.use_derived_scale = true; + tensor_property_14.number_of_bits = 32; + tensor_property_14.derived_scale = {{0, 3}, {}, {}}; + TensorProperty tensor_property_15; + tensor_property_15.use_derived_scale = true; + tensor_property_15.number_of_bits = 32; + tensor_property_15.derived_scale = {{0, 4}, {}, {}}; + TensorProperty tensor_property_17; + tensor_property_17.use_derived_scale = true; + tensor_property_17.number_of_bits = 32; + tensor_property_17.derived_scale = {{16}, {4}, {}}; + TensorProperty tensor_property_19; + tensor_property_19.extend_to_power_of_two = true; + tensor_property_19.number_of_bits = 16; + tensor_property_19.state_tensor = true; + tensor_property_19.symmetric = true; + + property.inputs = { + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + {4, {}}, + {5, {}}, + {6, {}}, + {7, {}}, + {8, {}}, + {16, {}}, + {19, tensor_property_19}, + {12, tensor_property_12}, + {13, tensor_property_13}, + {14, tensor_property_14}, + {15, tensor_property_15}, + {17, tensor_property_17}, + }; + property.outputs = {{0, {}}}; + property.intermediates = { + // Without layer normliazation, intermediate tensors 0, 1, 2, 3 are + // not used and their quantization parameters are ignored. + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + // Hidden state is quantized as usual. + {4, {}}, + }; + property.restrict_scale = {{18, 0}}; + property.version = 2; + } + if (!op_variant.use_layer_norm && !op_variant.use_projection && + op_variant.use_peephole) { + TensorProperty tensor_property_9; + tensor_property_9.number_of_bits = 16; + tensor_property_9.symmetric = true; + // Without layer norm, we choose to quantize bias with the scale of + // input and its correpsonding weight. The other choice will + // be to ues the scale of recurrent and its correpsonding weight but we + // choose to use the smaller scale, which means higher resolution. 
+ TensorProperty tensor_property_12; + tensor_property_12.use_derived_scale = true; + tensor_property_12.number_of_bits = 32; + tensor_property_12.derived_scale = {{0, 1}, {}, {}}; + TensorProperty tensor_property_13; + tensor_property_13.use_derived_scale = true; + tensor_property_13.number_of_bits = 32; + tensor_property_13.derived_scale = {{0, 2}, {}, {}}; + TensorProperty tensor_property_14; + tensor_property_14.use_derived_scale = true; + tensor_property_14.number_of_bits = 32; + tensor_property_14.derived_scale = {{0, 3}, {}, {}}; + TensorProperty tensor_property_15; + tensor_property_15.use_derived_scale = true; + tensor_property_15.number_of_bits = 32; + tensor_property_15.derived_scale = {{0, 4}, {}, {}}; + TensorProperty tensor_property_19; + tensor_property_19.extend_to_power_of_two = true; + tensor_property_19.number_of_bits = 16; + tensor_property_19.state_tensor = true; + tensor_property_19.symmetric = true; + + property.inputs = { + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + {4, {}}, + {5, {}}, + {6, {}}, + {7, {}}, + {8, {}}, + {9, tensor_property_9}, + {10, tensor_property_9}, + {11, tensor_property_9}, + {19, tensor_property_19}, + {12, tensor_property_12}, + {13, tensor_property_13}, + {14, tensor_property_14}, + {15, tensor_property_15}, + }; + property.outputs = {{0, {}}}; + property.intermediates = { + // Without layer normliazation, intermediate tensors 0, 1, 2, 3 are + // not used and their quantization parameters are ignored. + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + // Without projection, hidden state (4), output (0) and input + // activation state (18) are the same except that the very first + // inference of input activation is not captured in hidden and + // output. + // This is not an issue because this intermediate tensor is not used + // in the kernel and its quantization parameters are ignored. + {4, {}}, + }; + property.restrict_scale = {{18, 0}}; + property.version = 2; + } + if (!op_variant.use_layer_norm && !op_variant.use_projection && + !op_variant.use_peephole) { + // Without layer norm, we choose to quantize bias with the scale of + // input and its correpsonding weight. The other choice will + // be to ues the scale of recurrent and its correpsonding weight but we + // choose to use the smaller scale, which means higher resolution. 
+ TensorProperty tensor_property_12; + tensor_property_12.use_derived_scale = true; + tensor_property_12.number_of_bits = 32; + tensor_property_12.derived_scale = {{0, 1}, {}, {}}; + TensorProperty tensor_property_13; + tensor_property_13.use_derived_scale = true; + tensor_property_13.number_of_bits = 32; + tensor_property_13.derived_scale = {{0, 2}, {}, {}}; + TensorProperty tensor_property_14; + tensor_property_14.use_derived_scale = true; + tensor_property_14.number_of_bits = 32; + tensor_property_14.derived_scale = {{0, 3}, {}, {}}; + TensorProperty tensor_property_15; + tensor_property_15.use_derived_scale = true; + tensor_property_15.number_of_bits = 32; + tensor_property_15.derived_scale = {{0, 4}, {}, {}}; + TensorProperty tensor_property_19; + tensor_property_19.extend_to_power_of_two = true; + tensor_property_19.number_of_bits = 16; + tensor_property_19.state_tensor = true; + tensor_property_19.symmetric = true; + + property.inputs = { + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + {4, {}}, + {5, {}}, + {6, {}}, + {7, {}}, + {8, {}}, + {19, tensor_property_19}, + {12, tensor_property_12}, + {13, tensor_property_13}, + {14, tensor_property_14}, + {15, tensor_property_15}, + }; + property.outputs = {{0, {}}}; + property.intermediates = { + // Without layer normliazation, intermediate tensors 0, 1, 2, 3 are + // not used and their quantization parameters are ignored. + {0, {}}, + {1, {}}, + {2, {}}, + {3, {}}, + // Without projection, hidden state (4), output (0) and input + // activation state (18) are the same except that the very first + // inference of input activation is not captured in hidden and + // output. + // This is not an issue because this intermediate tensor is not used + // in the kernel and its quantization parameters are ignored. + {4, {}}, + }; + property.restrict_scale = {{18, 0}}; + property.version = 2; } break; } diff --git a/tensorflow/lite/tools/optimize/quantize_model_test.cc b/tensorflow/lite/tools/optimize/quantize_model_test.cc index 0e13949f7af..023b3183577 100644 --- a/tensorflow/lite/tools/optimize/quantize_model_test.cc +++ b/tensorflow/lite/tools/optimize/quantize_model_test.cc @@ -1038,6 +1038,65 @@ TEST_F(QuantizeLSTMTest, VerifyLSTM) { } } +class QuantizeLSTM2Test : public QuantizeModelTest { + protected: + QuantizeLSTM2Test() { + input_model_ = ReadModel(internal::kLstmCalibrated2); + readonly_model_ = input_model_->GetModel(); + readonly_model_->UnPackTo(&model_); + } +}; + +TEST_F(QuantizeLSTM2Test, VerifyLSTM) { + // Quantize model. + auto status = QuantizeModel(&builder_, &model_, TensorType_FLOAT32, + TensorType_FLOAT32, &error_reporter_); + ASSERT_EQ(kTfLiteOk, status); + + // Read expected model. + auto expected_fb_model = ReadModel(internal::kLstmQuantized2); + auto expected_read_only_model = expected_fb_model->GetModel(); + ModelT expected_model; + expected_read_only_model->UnPackTo(&expected_model); + + // Comparison. 
+ ASSERT_EQ(model_.subgraphs.size(), expected_model.subgraphs.size()); + for (size_t subgraph_idx = 0; subgraph_idx < model_.subgraphs.size(); + subgraph_idx++) { + const auto graph = model_.subgraphs[subgraph_idx].get(); + const auto expected_graph = expected_model.subgraphs[subgraph_idx].get(); + ASSERT_EQ(graph->tensors.size(), expected_graph->tensors.size()); + for (size_t i = 0; i < graph->tensors.size(); i++) { + const auto tensor = graph->tensors[i].get(); + const auto expected_tensor = expected_graph->tensors[i].get(); + EXPECT_EQ(tensor->buffer, expected_tensor->buffer); + EXPECT_EQ(tensor->is_variable, expected_tensor->is_variable); + EXPECT_EQ(tensor->shape, expected_tensor->shape); + EXPECT_EQ(tensor->name, expected_tensor->name); + EXPECT_EQ(tensor->type, expected_tensor->type); + const auto quantization_params = tensor->quantization.get(); + const auto expected_quantization_params = + expected_tensor->quantization.get(); + if (quantization_params != nullptr || + expected_quantization_params != nullptr) { + EXPECT_NE(quantization_params, nullptr); + EXPECT_NE(expected_quantization_params, nullptr); + EXPECT_EQ(quantization_params->scale, + expected_quantization_params->scale); + EXPECT_EQ(quantization_params->zero_point, + expected_quantization_params->zero_point); + } + } + } + ASSERT_EQ(model_.buffers.size(), expected_model.buffers.size()); + for (size_t buffer_idx = 0; buffer_idx < model_.buffers.size(); + ++buffer_idx) { + const auto buffer = model_.buffers[buffer_idx].get()->data; + const auto expected_buffer = expected_model.buffers[buffer_idx].get()->data; + EXPECT_EQ(buffer, expected_buffer); + } +} + class QuantizeFCTest : public QuantizeModelTest { protected: QuantizeFCTest() { diff --git a/tensorflow/lite/tools/optimize/test_util.cc b/tensorflow/lite/tools/optimize/test_util.cc index ecceacb278c..7fb91e02761 100644 --- a/tensorflow/lite/tools/optimize/test_util.cc +++ b/tensorflow/lite/tools/optimize/test_util.cc @@ -52,6 +52,9 @@ const char* kModelSplit = "split.bin"; const char* kLstmCalibrated = "lstm_calibrated.bin"; const char* kLstmQuantized = "lstm_quantized.bin"; +const char* kLstmCalibrated2 = "lstm_calibrated2.bin"; +const char* kLstmQuantized2 = "lstm_quantized2.bin"; + const char* kModelWithUnpack = "unpack.bin"; int FailOnErrorReporter::Report(const char* format, va_list args) { diff --git a/tensorflow/lite/tools/optimize/test_util.h b/tensorflow/lite/tools/optimize/test_util.h index 7690ab212cf..03b27f66870 100644 --- a/tensorflow/lite/tools/optimize/test_util.h +++ b/tensorflow/lite/tools/optimize/test_util.h @@ -76,10 +76,16 @@ extern const char* kModelMixed; // Test model with split op. extern const char* kModelSplit; -// Test model with LSTM op. +// Test model with LSTM op that has layer norm, has projection, without +// peephole, without cifg. extern const char* kLstmCalibrated; extern const char* kLstmQuantized; +// Test model with LSTM op that has peephole, without layer norm, without +// projection, without cifg. +extern const char* kLstmCalibrated2; +extern const char* kLstmQuantized2; + // Test model with an unpack op. 
extern const char* kModelWithUnpack; diff --git a/tensorflow/lite/tools/optimize/testdata/lstm_calibrated2.bin b/tensorflow/lite/tools/optimize/testdata/lstm_calibrated2.bin new file mode 100644 index 0000000000000000000000000000000000000000..cf1fae62da1f21e4431eda94a590b88d5c4fc625 GIT binary patch literal 2808 zcmb7G%S%*I96nQ1noVYAYEKL#Tud>eRvI=HNVu3qgouQ2ot)b^G0w<%?Lqzl64Gim zwh3xcgh&vCfrNyF3)_Vev}z?G)xyBaoWAeebM)SEy5NOx&YbgmKhE#oS|LQ|gft<{V{`H4Lm*y27qD2!3Ei=^!8LB}BDc z*%KvFQb4j6r}8OheY=4Upbnso9l$~05O5fvo})l5u=(-Pl zZGaz`N02$77ia)}z@cei6zBx%fMqy31M~tbaC{aR1A2gZU=had0JXpr>~#YW6J@{h z`F5=JYQMUY&$B;CqkqBI_aj{S)Bwq}Wo8G=crs?0M#yM3`jhdvnU=idxdwe#uxfo# z?49}^O`eka9)Ejs{`h8nO-3qq*UTE}Om@IXrU!>D@M!z53Uoi)n(js~-Ic%He*3ZC zg5O~;-RT0|y<5{A^3u(&WB>8tG-VI!+|L!*srm3-=Kg1JG{PRv3|mNWBgdua#$0Q_ z%DVFo*fb--GN;nUZ8MqZw}wuJL6Y@gFONZ{4tvV%1NzKVs*qXNZmK~4jZ8LSTL0G% zbz%uJIu^V4;BFBSZDRsHLCkj%KU!}7kGqbE_D}2m%=;;O%#fZBSqL`VYTtY6#I?Z` zs#0?zfd6_b315f9MrhS-L+2SRGMBcESf7==6|<6=v|(FnA)Pj~t~22gF%?J2hLo@3o5zoSxGruna(N6se7GHz#mX(D`j!;iS)r%v+o9>@&I zZjjuvXSClr#m^b2^*ie;CGf`A{IiOmeM^4!j-8_o1C2P-{LA2HZ!#z3KLF65Miqn# zIn3tNG7YNVIHO%M$}ueya>(?ke&cMX3}?5rV>_orTGwr`|LeiSyQKNEW?z9@EwaX} zGcluN_#UAQW2cPFud;9!XPTeuvXa58z-7lB${CkT1A2+R;dLQpSaaHOg@T$p?esuK z&j9+yJWKw9h**55Tu#}-t7qLmTh19(lS^h!*`gk|Eq7Gx1E($CLh|z)f&D{&S#zFM z0k&ONlxz!^4$e$li>-%!-Ub8#>ZESYQr6Y^4tO{|E@#L4`Dlc-kXFz@^iTI4SL(38 MlS<@w5;hh80G-uQaaBrFin=Je>&Bo6rHfjm3kd-tZ#~c z2k)7%&-H(Ne*^EzgB$x4?{qOKV9X{PXXUWT_A=x&h1Nc8Ihq4tFX#Z8<6iI>cpN+d zl&2>_Fdr{fPlKpzjWqGA9P%(t$1Z1s8z)(|$_K}z7&HNmYXKXt~0kx6EipPEBl(U>4w-IlO*F_PQ?W+A)*aS<#tfd_72(2R@%vly5JConbz z&SH2PoF`a&!DjF)PI({!7IE?vm;`BX6~{-wB8DlKmzj11Yz4Bj)kO{;57&Y|(a_w0m9uvp5zoi9vGdWWCy;*+f-mGLxVI*`TtAQ;aV zi}B8`uFu5ge{-rlk@x<8ex@^vjOnD+{K2|l?bR6KIL=}dk!jdVe;1Dv=%Vr9Q{@TA z^=p5#HJ-H|9QJU!YoL0KCt1bKb_ZxB{%%)qX*kGO#ZNhDwY!My4NmVUuUq$Yv++|N z^gZAm8}|xF$GpN+DI2hBO1rnnM(r_65kIt7PnW0O%Dzz;Q%aow;#aJKlOb}}yZ(_Z zx-lpJw6(lpls?+?GeC1B(ATPGKG{d(e|+|vKDF8kJ`@fvqsaQs9M<#Z?82Btd z8vnY@r*kF)#OJM(PjRnepDAjKzx_rtF<5Vsq$!!mHEN#YNI)P1X(fd z4ig!cO_vtG<6VgG>g;#CgXrt#4Fl|WHJ{_%b2IRF6rHXK86bb>;aw}fwS3jR#`+or zIiwt@@O>TdwdClhyMgnw6aAg|d9Cr(OS1i^E1OE`0#rt7|SN3Fp{9UQr-|FmYkBa)MdrOKH)WjL3vs8OOkIlE%MFz-@8SGn6+;zte ze6Py3wygiyp_5!|N|rb~Mv;d>((2tm@T5J_YuV$@bJ@PobK2giPtVO&?$p|gttIK; whNW1juUh4M5a_-oKK+ks3vYDVOL-dAPyaX?`DdtgXv!P0CAjs4eUj?)FUSr2DF6Tf literal 0 HcmV?d00001 From 0f60d0b537a397f1b3600e82d4a87796934aad2e Mon Sep 17 00:00:00 2001 From: Anthony Liu Date: Fri, 6 Dec 2019 11:47:51 -0800 Subject: [PATCH 283/383] [tfdbg] Add ConciseHealth mode to DebugNumericSummaryV2Op. - The TensorDebugMode added is CONCISE_HEALTH, a mode that computes a shape-[5] rank-1 tensor given any float-type tensor. The first element is the id of the tensor. The second element is the number of elements in the input, and the remaining 3 elements are the counts of -inf, +inf, and nan values respectively. - The CPU and GPU kernels of the op are added. 
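For illustration, a minimal CPU-side sketch of how a CONCISE_HEALTH-style summary could be
assembled. The function name, the std::vector return type, and the float input type are
assumptions made for this sketch only, not the op's actual interface:

// Sketch only: builds a CONCISE_HEALTH-style summary
// [tensor_id, num_elements, neg_inf_count, pos_inf_count, nan_count]
// for a buffer of floats. Names and types here are illustrative.
#include <cmath>
#include <cstdint>
#include <vector>

std::vector<double> ConciseHealthSummary(int64_t tensor_id,
                                         const float* data, int64_t size) {
  std::vector<double> out = {static_cast<double>(tensor_id),
                             static_cast<double>(size), 0.0, 0.0, 0.0};
  for (int64_t i = 0; i < size; ++i) {
    if (std::isinf(data[i])) {
      ++out[data[i] < 0.0f ? 2 : 3];  // slot 2: -inf count, slot 3: +inf count
    } else if (std::isnan(data[i])) {
      ++out[4];  // slot 4: nan count
    }
  }
  return out;
}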
PiperOrigin-RevId: 284230777 Change-Id: Ibe2220cc614bcfe03c59ba7e9439c5c4a201bb91 --- tensorflow/core/kernels/debug_ops.h | 70 ++++++++++++++ tensorflow/core/kernels/debug_ops_gpu.cu.cc | 50 ++++++++++ .../python/debug/lib/debug_v2_ops_test.py | 93 +++++++++++++++++++ 3 files changed, 213 insertions(+) diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 31f5e1ca6de..963b2bb58fc 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -467,6 +467,18 @@ extern template struct CurtHealthLaunch; extern template struct CurtHealthLaunch; extern template struct CurtHealthLaunch; +template +struct ConciseHealthLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]); +}; + +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; +extern template struct ConciseHealthLaunch; + template struct ReduceInfNanThreeSlotsLaunch { void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]); @@ -502,6 +514,7 @@ class DebugNumericSummaryV2Op : public OpKernel { const int64 size = in.size(); Tensor* output_tensor; Tout tensor_id = static_cast(tensor_id_); + const float num_elem = static_cast(context->input(0).NumElements()); // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because // that mode does not make use of tensor_id. if (tensor_debug_mode_ != 8) { @@ -527,6 +540,31 @@ class DebugNumericSummaryV2Op : public OpKernel { if (fp_props) { output_tensor->flat()(1) = 1.0; } + } else if (tensor_debug_mode_ == 3) { // CONCISE_HEALTH + TensorShape shape({5}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + output_tensor->flat()(0) = tensor_id; + output_tensor->flat()(1) = num_elem; + + // Accumlator value [neg_inf_count, pos_inf_count, nan_count] + Tout fp_props[3] = {0.0, 0.0, 0.0}; + std::for_each(data, data + size, [&fp_props](const Tin& y) { + if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) { + // Do nothing: common case. + } else if (Eigen::numext::isinf(y)) { + if (y < static_cast(0.f)) { + ++fp_props[0]; + } else { + ++fp_props[1]; + } + } else if (Eigen::numext::isnan(y)) { + ++fp_props[2]; + } + }); + output_tensor->flat()(2) = fp_props[0]; // Slot for -inf count + output_tensor->flat()(3) = fp_props[1]; // Slot for inf count + output_tensor->flat()(4) = fp_props[2]; // Slot for nan count } else if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. TensorShape shape({3}); OP_REQUIRES_OK(context, @@ -590,6 +628,7 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { void ComputeAsync(OpKernelContext* context, DoneCallback done) override { Tensor* output_tensor; Tout tensor_id = static_cast(tensor_id_); + const float num_elem = static_cast(context->input(0).NumElements()); // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because // that mode does not make use of tensor_id. if (tensor_debug_mode_ != 8) { @@ -631,6 +670,37 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { auto check_cb = [this, done]() { done(); }; + context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + stream, std::move(check_cb)); + } else if (tensor_debug_mode_ == 3) { // CONCISE_HEALTH. 
+ TensorShape shape({5}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream != nullptr, + errors::Internal("No GPU stream available."), done); + + se::DeviceMemoryBase output_tensor_ptr( + output_tensor->flat().data(), + output_tensor->flat().size()); + stream->ThenMemset32(&output_tensor_ptr, 0, 5 * sizeof(Tout)); + const Tout static_output[] = {tensor_id, num_elem}; + stream->ThenMemcpy(&output_tensor_ptr, &static_output, 2 * sizeof(Tout)); + if (num_elem == 0) { + done(); + return; + } + + // Call the GPU kernels for the numerical (inf/nan) checks. + const Device& d = context->eigen_device(); + auto input = context->input(0).flat(); + ConciseHealthLaunch().Run( + d, input.data(), input.size(), + output_tensor->flat().data() + 2); + + auto check_cb = [this, done]() { done(); }; + context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( stream, std::move(check_cb)); } else if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. diff --git a/tensorflow/core/kernels/debug_ops_gpu.cu.cc b/tensorflow/core/kernels/debug_ops_gpu.cu.cc index 5597c12a5ad..e6f42f3d4b6 100644 --- a/tensorflow/core/kernels/debug_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/debug_ops_gpu.cu.cc @@ -51,6 +51,36 @@ __global__ void CurtHealthKernel(const Tin* __restrict__ data, int size, } } +// A CUDA kernel that fills the three elements of an output +// vector with the number of NaNs, -infs, and infs in the input respectively. +template +__global__ void ConciseHealthKernel(const Tin* __restrict__ data, int size, + Tout output[3]) { + const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32 total_thread_count = gridDim.x * blockDim.x; + + int32 offset = thread_id; + Tout accum[3] = {0.0, 0.0, 0.0}; + + while (offset < size) { + if (isinf(data[offset])) { + if (data[offset] < static_cast(0.f)) { + ++accum[0]; + } else { + ++accum[1]; + } + } + if (isnan(data[offset])) { + ++accum[2]; + } + offset += total_thread_count; + } + + atomicAdd(output, accum[0]); + atomicAdd(output + 1, accum[1]); + atomicAdd(output + 2, accum[2]); +} + // A CUDA kernel that fills a length-3 vector according to whether any of the // input data contains negative infinity, positive infinity, or NaN. The first // element is filled with -infinity if any of the elements is -infinity. 
@@ -101,6 +131,26 @@ template struct CurtHealthLaunch; template struct CurtHealthLaunch; template struct CurtHealthLaunch; +template +struct ConciseHealthLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]) { + const int32 block_size = d.maxGpuThreadsPerBlock(); + const int32 num_blocks = + (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) / + block_size; + + TF_CHECK_OK(GpuLaunchKernel(ConciseHealthKernel, num_blocks, + block_size, 0, d.stream(), data, size, output)); + } +}; + +template struct ConciseHealthLaunch; +template struct ConciseHealthLaunch; +template struct ConciseHealthLaunch; +template struct ConciseHealthLaunch; +template struct ConciseHealthLaunch; +template struct ConciseHealthLaunch; + template struct ReduceInfNanThreeSlotsLaunch { void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]) { diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index 474b2330e99..76d077c6286 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -271,6 +271,7 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): def testDebugNumericSummaryV2OpLargeTensorIDError(self): modes = [ debug_event_pb2.TensorDebugMode.CURT_HEALTH, + debug_event_pb2.TensorDebugMode.CONCISE_HEALTH, ] # Maximum allowed tensor_id tensor_id = np.power(2, 53) @@ -388,6 +389,98 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): self.assertAllEqual(tensor_1, tensor_2) self.assertEqual(tensor_id_1, tensor_id_2) + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpConciseHealthSmall(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=( + debug_event_pb2.TensorDebugMode.CONCISE_HEALTH), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + tensor, tensor_id = debug_summary(constant_op.constant([])) + self.assertAllEqual(tensor, [tensor_id, 0.0, 0.0, 0.0, 0.0]) + + tensor, tensor_id = debug_summary(constant_op.constant(42.0)) + self.assertAllEqual(tensor, [tensor_id, 1.0, 0.0, 0.0, 0.0]) + + tensor, tensor_id = debug_summary(constant_op.constant([3.0, 4.0])) + self.assertAllEqual(tensor, [tensor_id, 2.0, 0.0, 0.0, 0.0]) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([3.0, -np.inf]))) + self.assertAllEqual(tensor, [tensor_id, 2.0, 1.0, 0.0, 0.0]) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([[0, 0], [np.nan, 0]]))) + self.assertAllEqual(tensor, [tensor_id, 4.0, 0.0, 0.0, 1.0]) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([[0, 0], [np.nan, np.inf]]))) + self.assertAllEqual(tensor, [tensor_id, 4.0, 0.0, 1.0, 1.0]) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([[0, np.inf], [np.nan, -np.inf]]))) + self.assertAllEqual(tensor, [tensor_id, 4.0, 1.0, 1.0, 1.0]) + + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpConciseHealthLarge(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=( + debug_event_pb2.TensorDebugMode.CONCISE_HEALTH), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + x = np.zeros([100, 100], dtype=np.float16) + x[32, :] = np.nan + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual(tensor, [tensor_id, 10000.0, 0.0, 0.0, 100.0]) + x = 
np.zeros([97, 97], dtype=np.float32) + x[50, 83:85] = -np.inf + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual(tensor, [tensor_id, 97 * 97, 2.0, 0.0, 0.0]) + x[1:9, 41] = np.nan + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual(tensor, [tensor_id, 97 * 97, 2.0, 0.0, 8.0]) + x = np.zeros([9701], dtype=np.float64) + x[9700] = np.nan + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual(tensor, [tensor_id, 9701, 0.0, 0.0, 1.0]) + + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpConciseHealthConsistency(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=( + debug_event_pb2.TensorDebugMode.CONCISE_HEALTH), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + # Assert the same op is returns a consistent value + x = np.zeros([100, 100], dtype=np.float16) + x[3, 4] = -np.inf + c = constant_op.constant(x) + tensor_1, tensor_id_1 = debug_summary(c) + tensor_2, tensor_id_2 = debug_summary(c) + self.assertAllEqual(tensor_1, tensor_2) + self.assertEqual(tensor_id_1, tensor_id_2) + + c = constant_op.constant(np.ones((100, 200), np.double)) + tensor_1, tensor_id_1 = debug_summary(c) + tensor_2, tensor_id_2 = debug_summary(c) + self.assertAllEqual(tensor_1, tensor_2) + self.assertEqual(tensor_id_1, tensor_id_2) + if __name__ == "__main__": ops.enable_eager_execution() From b278a8b3a8f20ca5feb6cc06da0c2951899dbcef Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 11:59:59 -0800 Subject: [PATCH 284/383] Move GPU::FuncOp definition to ODS - NFC Move the definition of the GPU function opreation from hand-rolled C++ code to ODS framework. This only does the moves, a follow-up is necessary to clean up users of custom functions that could be auto-generated by ODS. PiperOrigin-RevId: 284233245 Change-Id: Ie9cf86de6365cde0d87101cbf5c8586ce6ae69b2 --- third_party/mlir/g3doc/Dialects/GPU.md | 100 +++---------- .../include/mlir/Dialect/GPU/GPUDialect.h | 82 ----------- .../mlir/include/mlir/Dialect/GPU/GPUOps.td | 139 ++++++++++++++++++ third_party/mlir/include/mlir/IR/OpBase.td | 2 + .../mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 59 ++++---- 5 files changed, 193 insertions(+), 189 deletions(-) diff --git a/third_party/mlir/g3doc/Dialects/GPU.md b/third_party/mlir/g3doc/Dialects/GPU.md index faa07219e03..d34ce1891e8 100644 --- a/third_party/mlir/g3doc/Dialects/GPU.md +++ b/third_party/mlir/g3doc/Dialects/GPU.md @@ -12,6 +12,28 @@ manipulations to launch a GPU kernel and provide a simple path towards GPU execution from MLIR. It may be targeted, for example, by DSLs using MLIR. The dialect uses `gpu` as its canonical prefix. +## Memory attribution + +Memory buffers are defined at the function level, either in "gpu.launch" or in +"gpu.func" ops. This encoding makes it clear where the memory belongs and makes +the lifetime of the memory visible. The memory is only accessible while the +kernel is launched/the function is currently invoked. The latter is more strict +than actual GPU implementations but using static memory at the function level is +just for convenience. It is also always possible to pass pointers to the +workgroup memory into other functions, provided they expect the correct memory +space. + +The buffers are considered live throughout the execution of the GPU function +body. The absence of memory attribution syntax means that the function does not +require special buffers. 
Rationale: although the underlying models declare +memory buffers at the module level, we chose to do it at the function level to +provide some structuring for the lifetime of those buffers; this avoids the +incentive to use the buffers for communicating between different kernels or +launches of the same kernel, which should be done through function arguments +instead; we chose not to use `alloca`-style approach that would require more +complex lifetime analysis following the principles of MLIR that promote +structure and representing analysis results in the IR. + ## Operations ### `gpu.block_dim` @@ -47,84 +69,6 @@ Example: %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) ``` -### `gpu.func` - -Defines a function that can be executed on a GPU. This supports memory -attribution and its body has a particular execution model. - -GPU functions are either kernels (as indicated by the `kernel` attribute) or -regular functions. The former can be launched from the host side, while the -latter are device side only. - -The memory attribution defines SSA values that correspond to memory buffers -allocated in the memory hierarchy of the GPU (see below). - -The operation has one attached region that corresponds to the body of the -function. The region arguments consist of the function arguments without -modification, followed by buffers defined in memory annotations. The body of a -GPU function, when launched, is executed by multiple work items. There are no -guarantees on the order in which work items execute, or on the connection -between them. In particular, work items are not necessarily executed in -lock-step. Synchronization ops such as "gpu.barrier" should be used to -coordinate work items. Declarations of GPU functions, i.e. not having the body -region, are not supported. - -#### Memory attribution - -Memory buffers are defined at the function level, either in "gpu.launch" or in -"gpu.func" ops. This encoding makes it clear where the memory belongs and makes -the lifetime of the memory visible. The memory is only accessible while the -kernel is launched/the function is currently invoked. The latter is more strict -than actual GPU implementations but using static memory at the function level is -just for convenience. It is also always possible to pass pointers to the -workgroup memory into other functions, provided they expect the correct memory -space. - -The buffers are considered live throughout the execution of the GPU function -body. The absence of memory attribution syntax means that the function does not -require special buffers. Rationale: although the underlying models declare -memory buffers at the module level, we chose to do it at the function level to -provide some structuring for the lifetime of those buffers; this avoids the -incentive to use the buffers for communicating between different kernels or -launches of the same kernel, which should be done through function arguments -instead; we chose not to use `alloca`-style approach that would require more -complex lifetime analysis following the principles of MLIR that promote -structure and representing analysis results in the IR. - -Syntax: - -``` {.ebnf} -op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->` -function-result-list)? - memory-attribution `kernel`? function-attributes? region - -memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)? - (`private` `(` ssa-id-and-type-list `)`)? 
-``` - -Example: - -```mlir {.mlir} -gpu.func @foo(%arg0: index) - workgroup(%workgroup: memref<32xf32, 3>) - private(%private: memref<1xf32, 5>) - kernel - attributes {qux: "quux"} { - gpu.return -} -``` - -The generic form illustrates the concept - -```mlir {.mlir} -"gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({ -^bb0(%arg0: index, %workgroup: memref<32xf32, 3>, %private: memref<1xf32, 5>): - "gpu.return"() : () -> () -}) : (index) -> () -``` - -Note the non-default memory spaces used in memref types in memory-attribution. - ### `gpu.launch` Launch a kernel on the specified grid of thread blocks. The body of the kernel diff --git a/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h index 619f76937bc..194dd9c1e1d 100644 --- a/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -197,88 +197,6 @@ private: static StringRef getKernelModuleAttrName() { return "kernel_module"; } }; -class GPUFuncOp : public Op { -public: - using Op::Op; - - /// Returns the name of the operation. - static StringRef getOperationName() { return "gpu.func"; } - - /// Constructs a FuncOp, hook for Builder methods. - static void build(Builder *builder, OperationState &result, StringRef name, - FunctionType type, ArrayRef workgroupAttributions, - ArrayRef privateAttributions, - ArrayRef attrs); - - /// Prints the Op in custom format. - void print(OpAsmPrinter &p); - - /// Parses the Op in custom format. - static ParseResult parse(OpAsmParser &parser, OperationState &result); - - /// Returns `true` if the GPU function defined by this Op is a kernel, i.e. - /// it is intended to be launched from host. - bool isKernel() { - return getAttrOfType(GPUDialect::getKernelFuncAttrName()) != - nullptr; - } - - /// Returns the type of the function this Op defines. - FunctionType getType() { - return getTypeAttr().getValue().cast(); - } - - /// Returns the number of buffers located in the workgroup memory. - unsigned getNumWorkgroupAttributions() { - return getAttrOfType(getNumWorkgroupAttributionsAttrName()) - .getInt(); - } - - /// Returns a list of block arguments that correspond to buffers located in - /// the workgroup memory - ArrayRef getWorkgroupAttributions() { - auto begin = - std::next(getBody().front().args_begin(), getType().getNumInputs()); - auto end = std::next(begin, getNumWorkgroupAttributions()); - return {begin, end}; - } - - /// Returns a list of block arguments that correspond to buffers located in - /// the private memory. - ArrayRef getPrivateAttributions() { - auto begin = - std::next(getBody().front().args_begin(), - getType().getNumInputs() + getNumWorkgroupAttributions()); - return {begin, getBody().front().args_end()}; - } - - /// Returns the name of the attribute containing the number of buffers located - /// in the workgroup memory. - static StringRef getNumWorkgroupAttributionsAttrName() { - return "workgroup_attibutions"; - } - -private: - // FunctionLike trait needs access to the functions below. - friend class OpTrait::FunctionLike; - - /// Hooks for the input/output type enumeration in FunctionLike . - unsigned getNumFuncArguments() { return getType().getNumInputs(); } - unsigned getNumFuncResults() { return getType().getNumResults(); } - - /// Returns the keywords used in the custom syntax for this Op. 
- static StringRef getWorkgroupKeyword() { return "workgroup"; } - static StringRef getPrivateKeyword() { return "private"; } - static StringRef getKernelKeyword() { return "kernel"; } - - /// Hook for FunctionLike verifier. - LogicalResult verifyType(); - - /// Verifies the body of the function. - LogicalResult verifyBody(); -}; - #define GET_OP_CLASSES #include "mlir/Dialect/GPU/GPUOps.h.inc" diff --git a/third_party/mlir/include/mlir/Dialect/GPU/GPUOps.td b/third_party/mlir/include/mlir/Dialect/GPU/GPUOps.td index 4329084ff50..fcaa77ce779 100644 --- a/third_party/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/third_party/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -42,6 +42,145 @@ def GPU_BlockIdOp : GPU_IndexOp<"block_id">; def GPU_GridDimOp : GPU_IndexOp<"grid_dim">; def GPU_ThreadIdOp : GPU_IndexOp<"thread_id">; +def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> { + let summary = "Function executable on a GPU"; + + let description = [{ + Defines a function that can be executed on a GPU. This supports memory + attribution and its body has a particular execution model. + + GPU functions are either kernels (as indicated by the `kernel` attribute) or + regular functions. The former can be launched from the host side, while the + latter are device side only. + + The memory attribution defines SSA values that correspond to memory buffers + allocated in the memory hierarchy of the GPU (see below). + + The operation has one attached region that corresponds to the body of the + function. The region arguments consist of the function arguments without + modification, followed by buffers defined in memory annotations. The body of + a GPU function, when launched, is executed by multiple work items. There are + no guarantees on the order in which work items execute, or on the connection + between them. In particular, work items are not necessarily executed in + lock-step. Synchronization ops such as "gpu.barrier" should be used to + coordinate work items. Declarations of GPU functions, i.e. not having the + body region, are not supported. + + Syntax: + + ``` {.ebnf} + op ::= `gpu.func` symbol-ref-id `(` argument-list `)` (`->` + function-result-list)? + memory-attribution `kernel`? function-attributes? region + + memory-attribution ::= (`workgroup` `(` ssa-id-and-type-list `)`)? + (`private` `(` ssa-id-and-type-list `)`)? + ``` + + Example: + + ```mlir + gpu.func @foo(%arg0: index) + workgroup(%workgroup: memref<32xf32, 3>) + private(%private: memref<1xf32, 5>) + kernel + attributes {qux: "quux"} { + gpu.return + } + ``` + + The generic form illustrates the concept + + ```mlir + "gpu.func"(%arg: index) {sym_name: "foo", kernel, qux: "quux"} ({ + ^bb0(%arg0: index, %workgroup: memref<32xf32, 3>, + %private: memref<1xf32, 5>): + "gpu.return"() : () -> () + }) : (index) -> () + ``` + + Note the non-default memory spaces used in memref types in memory + attribution. + }]; + + let regions = (region AnyRegion:$body); + + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, StringRef name, " + "FunctionType type, ArrayRef workgroupAttributions, " + "ArrayRef privateAttributions, " + "ArrayRef attrs"> + ]; + + let extraClassDeclaration = [{ + /// Returns `true` if the GPU function defined by this Op is a kernel, i.e. + /// it is intended to be launched from host. + bool isKernel() { + return getAttrOfType(GPUDialect::getKernelFuncAttrName()) != + nullptr; + } + + /// Returns the type of the function this Op defines. 
+ FunctionType getType() { + return getTypeAttr().getValue().cast(); + } + + /// Returns the number of buffers located in the workgroup memory. + unsigned getNumWorkgroupAttributions() { + return getAttrOfType(getNumWorkgroupAttributionsAttrName()) + .getInt(); + } + + /// Returns a list of block arguments that correspond to buffers located in + /// the workgroup memory + ArrayRef getWorkgroupAttributions() { + auto begin = + std::next(getBody().front().args_begin(), getType().getNumInputs()); + auto end = std::next(begin, getNumWorkgroupAttributions()); + return {begin, end}; + } + + /// Returns a list of block arguments that correspond to buffers located in + /// the private memory. + ArrayRef getPrivateAttributions() { + auto begin = + std::next(getBody().front().args_begin(), + getType().getNumInputs() + getNumWorkgroupAttributions()); + return {begin, getBody().front().args_end()}; + } + + /// Returns the name of the attribute containing the number of buffers + /// located in the workgroup memory. + static StringRef getNumWorkgroupAttributionsAttrName() { + return "workgroup_attibutions"; + } + + // FunctionLike trait needs access to the functions below. + friend class OpTrait::FunctionLike; + + /// Hooks for the input/output type enumeration in FunctionLike . + unsigned getNumFuncArguments() { return getType().getNumInputs(); } + unsigned getNumFuncResults() { return getType().getNumResults(); } + + /// Returns the keywords used in the custom syntax for this Op. + static StringRef getWorkgroupKeyword() { return "workgroup"; } + static StringRef getPrivateKeyword() { return "private"; } + static StringRef getKernelKeyword() { return "kernel"; } + + /// Hook for FunctionLike verifier. + LogicalResult verifyType(); + + /// Verifies the body of the function. + LogicalResult verifyBody(); + }]; + + // let verifier = [{ return ::verifFuncOpy(*this); }]; + let printer = [{ printGPUFuncOp(p, *this); }]; + let parser = [{ return parseGPUFuncOp(parser, result); }]; +} + def GPU_ReturnOp : GPU_Op<"return", [Terminator]>, Arguments<(ins)>, Results<(outs)> { let summary = "Terminator for GPU launch regions."; diff --git a/third_party/mlir/include/mlir/IR/OpBase.td b/third_party/mlir/include/mlir/IR/OpBase.td index 6a884f2e948..dd7fac27a00 100644 --- a/third_party/mlir/include/mlir/IR/OpBase.td +++ b/third_party/mlir/include/mlir/IR/OpBase.td @@ -1335,6 +1335,8 @@ class PredOpTrait : OpTrait { def Broadcastable : NativeOpTrait<"BroadcastableTwoOperandsOneResult">; // X op Y == Y op X def Commutative : NativeOpTrait<"IsCommutative">; +// Op behaves like a function. +def FunctionLike : NativeOpTrait<"FunctionLike">; // Op is isolated from above. def IsolatedFromAbove : NativeOpTrait<"IsIsolatedFromAbove">; // Op results are float or vectors/tensors thereof. diff --git a/third_party/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/third_party/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 8d84fadae8a..38998b968ad 100644 --- a/third_party/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/third_party/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -46,7 +46,7 @@ bool GPUDialect::isKernel(Operation *op) { GPUDialect::GPUDialect(MLIRContext *context) : Dialect(getDialectName(), context) { - addOperations(); @@ -165,14 +165,6 @@ static LogicalResult verifyAllReduce(gpu::AllReduceOp allReduce) { return success(); } -// Namespace avoids ambiguous ReturnOpOperandAdaptor. 
-namespace mlir { -namespace gpu { -#define GET_OP_CLASSES -#include "mlir/Dialect/GPU/GPUOps.cpp.inc" -} // namespace gpu -} // namespace mlir - //===----------------------------------------------------------------------===// // LaunchOp //===----------------------------------------------------------------------===// @@ -639,7 +631,7 @@ parseAttributions(OpAsmParser &parser, StringRef keyword, /// ::= `gpu.func` symbol-ref-id `(` argument-list `)` /// (`->` function-result-list)? memory-attribution `kernel`? /// function-attributes? region -ParseResult GPUFuncOp::parse(OpAsmParser &parser, OperationState &result) { +static ParseResult parseGPUFuncOp(OpAsmParser &parser, OperationState &result) { SmallVector entryArgs; SmallVector, 1> argAttrs; SmallVector, 1> resultAttrs; @@ -667,26 +659,26 @@ ParseResult GPUFuncOp::parse(OpAsmParser &parser, OperationState &result) { // not to the functiont type. Builder &builder = parser.getBuilder(); auto type = builder.getFunctionType(argTypes, resultTypes); - result.addAttribute(getTypeAttrName(), TypeAttr::get(type)); + result.addAttribute(GPUFuncOp::getTypeAttrName(), TypeAttr::get(type)); // Parse workgroup memory attributions. - if (failed(parseAttributions(parser, getWorkgroupKeyword(), entryArgs, - argTypes))) + if (failed(parseAttributions(parser, GPUFuncOp::getWorkgroupKeyword(), + entryArgs, argTypes))) return failure(); // Store the number of operands we just parsed as the number of workgroup // memory attributions. unsigned numWorkgroupAttrs = argTypes.size() - type.getNumInputs(); - result.addAttribute(getNumWorkgroupAttributionsAttrName(), + result.addAttribute(GPUFuncOp::getNumWorkgroupAttributionsAttrName(), builder.getI64IntegerAttr(numWorkgroupAttrs)); // Parse private memory attributions. - if (failed( - parseAttributions(parser, getPrivateKeyword(), entryArgs, argTypes))) + if (failed(parseAttributions(parser, GPUFuncOp::getPrivateKeyword(), + entryArgs, argTypes))) return failure(); // Parse the kernel attribute if present. - if (succeeded(parser.parseOptionalKeyword(getKernelKeyword()))) + if (succeeded(parser.parseOptionalKeyword(GPUFuncOp::getKernelKeyword()))) result.addAttribute(GPUDialect::getKernelFuncAttrName(), builder.getUnitAttr()); @@ -712,24 +704,25 @@ static void printAttributions(OpAsmPrinter &p, StringRef keyword, p << ')'; } -void GPUFuncOp::print(OpAsmPrinter &p) { - p << getOperationName() << ' '; - p.printSymbolName(getName()); +/// Prints a GPU Func op. 
+void printGPUFuncOp(OpAsmPrinter &p, GPUFuncOp op) { + p << GPUFuncOp::getOperationName() << ' '; + p.printSymbolName(op.getName()); - FunctionType type = getType(); - impl::printFunctionSignature(p, this->getOperation(), type.getInputs(), + FunctionType type = op.getType(); + impl::printFunctionSignature(p, op.getOperation(), type.getInputs(), /*isVariadic=*/false, type.getResults()); - printAttributions(p, getWorkgroupKeyword(), getWorkgroupAttributions()); - printAttributions(p, getPrivateKeyword(), getPrivateAttributions()); - if (isKernel()) - p << ' ' << getKernelKeyword(); + printAttributions(p, op.getWorkgroupKeyword(), op.getWorkgroupAttributions()); + printAttributions(p, op.getPrivateKeyword(), op.getPrivateAttributions()); + if (op.isKernel()) + p << ' ' << op.getKernelKeyword(); - impl::printFunctionAttributes(p, this->getOperation(), type.getNumInputs(), + impl::printFunctionAttributes(p, op.getOperation(), type.getNumInputs(), type.getNumResults(), - {getNumWorkgroupAttributionsAttrName(), + {op.getNumWorkgroupAttributionsAttrName(), GPUDialect::getKernelFuncAttrName()}); - p.printRegion(getBody(), /*printEntryBlockArgs=*/false); + p.printRegion(op.getBody(), /*printEntryBlockArgs=*/false); } /// Hook for FunctionLike verifier. @@ -762,3 +755,11 @@ LogicalResult GPUFuncOp::verifyBody() { return success(); } + +// Namespace avoids ambiguous ReturnOpOperandAdaptor. +namespace mlir { +namespace gpu { +#define GET_OP_CLASSES +#include "mlir/Dialect/GPU/GPUOps.cpp.inc" +} // namespace gpu +} // namespace mlir From 86a6cfd2509226ac325f88345897f65bfecb6c5f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 12:00:01 -0800 Subject: [PATCH 285/383] LLVM::GlobalOp: take address space as builder argument Accept the address space of the global as a builder argument when constructing an LLVM::GlobalOp instance. This decreases the reliance of LLVM::GlobalOp users on the internal name of the attribute used for this purpose. Update several uses of the address space in GPU to NVVM conversion. 
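A rough before/after sketch of the call sites this updates; variables such as builder, loc,
and arrayType are assumed from the surrounding conversion pattern and the oldStyle/newStyle
names are illustrative only:

// Before: the address space had to be passed through the attribute list,
// hard-coding the internal "addr_space" attribute name.
auto addrSpaceAttr =
    builder.getNamedAttr("addr_space", builder.getI32IntegerAttr(3));
auto oldStyle = builder.create<LLVM::GlobalOp>(
    loc, arrayType, /*isConstant=*/false, LLVM::Linkage::Internal,
    "reduce_buffer", /*value=*/Attribute(), llvm::makeArrayRef(addrSpaceAttr));

// After: the address space is a plain builder argument.
auto newStyle = builder.create<LLVM::GlobalOp>(
    loc, arrayType, /*isConstant=*/false, LLVM::Linkage::Internal,
    "reduce_buffer", /*value=*/Attribute(),
    /*addrSpace=*/gpu::GPUDialect::getWorkgroupAddressSpace());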
PiperOrigin-RevId: 284233254 Change-Id: Ia7d6ceaa52399efef49ad988814ae44f4a6e5c30 --- third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 3 ++- .../lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 9 ++------- third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 5 ++++- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 66d9ba3f750..8aa5397651c 100644 --- a/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -553,7 +553,8 @@ def LLVM_GlobalOp let builders = [ OpBuilder<"Builder *builder, OperationState &result, LLVMType type, " "bool isConstant, Linkage linkage, StringRef name, " - "Attribute value, ArrayRef attrs = {}"> + "Attribute value, unsigned addrSpace = 0, " + "ArrayRef attrs = {}"> ]; let extraClassDeclaration = [{ diff --git a/third_party/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/third_party/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 50a2e2efd2c..949294cf692 100644 --- a/third_party/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/third_party/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -385,12 +385,10 @@ private: auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); StringRef name = "reduce_buffer"; - auto addrSpace = - builder.getNamedAttr("addr_space", builder.getI32IntegerAttr(3)); auto globalOp = builder.create( loc, arrayType.cast(), /*isConstant=*/false, LLVM::Linkage::Internal, name, - /*value=*/Attribute(), llvm::makeArrayRef(addrSpace)); + /*value=*/Attribute(), gpu::GPUDialect::getWorkgroupAddressSpace()); return rewriter.create(loc, globalOp); } @@ -481,15 +479,12 @@ struct FuncOpLowering : LLVMOpLowering { auto elementType = lowering.convertType(type.getElementType()).cast(); auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); - auto addSpaceAttr = rewriter.getNamedAttr( - "addr_space", rewriter.getI32IntegerAttr( - gpu::GPUDialect::getWorkgroupAddressSpace())); std::string name = llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()); auto globalOp = rewriter.create( gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, LLVM::Linkage::Internal, name, /*value=*/Attribute(), - llvm::makeArrayRef(addSpaceAttr)); + gpu::GPUDialect::getWorkgroupAddressSpace()); workgroupBuffers.push_back(globalOp); } diff --git a/third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index fb4555674eb..78da9998c6d 100644 --- a/third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/third_party/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -869,7 +869,8 @@ static StringRef getLinkageAttrName() { return "linkage"; } void GlobalOp::build(Builder *builder, OperationState &result, LLVMType type, bool isConstant, Linkage linkage, StringRef name, - Attribute value, ArrayRef attrs) { + Attribute value, unsigned addrSpace, + ArrayRef attrs) { result.addAttribute(SymbolTable::getSymbolAttrName(), builder->getStringAttr(name)); result.addAttribute("type", TypeAttr::get(type)); @@ -879,6 +880,8 @@ void GlobalOp::build(Builder *builder, OperationState &result, LLVMType type, result.addAttribute("value", value); result.addAttribute(getLinkageAttrName(), builder->getI64IntegerAttr( static_cast(linkage))); + if (addrSpace != 0) + result.addAttribute("addr_space", builder->getI32IntegerAttr(addrSpace)); 
result.attributes.append(attrs.begin(), attrs.end()); result.addRegion(); } From 7c79535982f808f12adfef57ed51568318fe600c Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Fri, 6 Dec 2019 12:06:20 -0800 Subject: [PATCH 286/383] Group variable initialization when calling lift_to_graph. When initializing variables defined inside a @tf.function which are lifted to the outer graph, group the variables together and call lift_to_graph once. lift_to_graph supports passing in multiple tensors and the graph to lift to is the same for all of the variable initialization. This improves setup time. PiperOrigin-RevId: 284234788 Change-Id: I5b0ab9b69d823e8057a44dd778b04a3e75dc5c47 --- tensorflow/python/eager/def_function.py | 10 ++++++++-- tensorflow/python/eager/def_function_test.py | 13 +++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index 2f20179d1da..1c804835d91 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -728,13 +728,19 @@ class Function(object): resource_variable_ops.var_is_initialized_op(v.handle)) var_is_initialized = array_ops.stack(var_is_initialized).numpy() + inits = [] for (v, init), is_initialized in zip(initializers, var_is_initialized): with ops.init_scope(): if is_initialized: continue + inits.append(init) - op_map = lift_to_graph.lift_to_graph( - [init], ops.get_default_graph(), op_map=op_map) + op_map = lift_to_graph.lift_to_graph( + inits, ops.get_default_graph(), op_map=op_map) + for (v, init), is_initialized in zip(initializers, var_is_initialized): + with ops.init_scope(): + if is_initialized: + continue v.assign(op_map[init], read_value=False) with ops.init_scope(): diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py index dca257f91a3..b558412fd9a 100644 --- a/tensorflow/python/eager/def_function_test.py +++ b/tensorflow/python/eager/def_function_test.py @@ -137,6 +137,19 @@ class DefFunctionTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(fn(constant_op.constant(1.0)), 2.0) + def testFunctionMultipleVariableInitializer(self): + + state = [] + + @def_function.function + def fn(x): + if not state: + state.append(variables.Variable(lambda: 2.0)) + state.append(variables.Variable(lambda: 5.0)) + return state[0] * x, state[1] * x + + self.assertAllEqual(fn(constant_op.constant(1.0)), [2.0, 5.0]) + def testFunctionInitializationFunction(self): state = [] From 56b242e7abe8aa957aa5d4ecf10cb5cf0de31bf8 Mon Sep 17 00:00:00 2001 From: Frank Chen Date: Fri, 6 Dec 2019 12:12:19 -0800 Subject: [PATCH 287/383] Experimental C API headers for maximal compatibility during linking PiperOrigin-RevId: 284235883 Change-Id: I73d19ad2fd1bb80aad25eba7a5f0f160c0585cd6 --- .../compiler/xla/python/tpu_driver/BUILD | 5 ++ .../compiler/xla/python/tpu_driver/c_api.h | 30 +++++++++++ .../python/tpu_driver/client/c_api_client.c | 50 +++++++++++++++++++ 3 files changed, 85 insertions(+) create mode 100644 tensorflow/compiler/xla/python/tpu_driver/c_api.h create mode 100644 tensorflow/compiler/xla/python/tpu_driver/client/c_api_client.c diff --git a/tensorflow/compiler/xla/python/tpu_driver/BUILD b/tensorflow/compiler/xla/python/tpu_driver/BUILD index 96c6636323b..99a07c31256 100644 --- a/tensorflow/compiler/xla/python/tpu_driver/BUILD +++ b/tensorflow/compiler/xla/python/tpu_driver/BUILD @@ -31,6 +31,11 @@ tf_proto_library_cc( use_grpc_namespace = True, ) +cc_library( + name = 
"c_api", + hdrs = ["c_api.h"], +) + cc_library( name = "tpu_driver", srcs = [ diff --git a/tensorflow/compiler/xla/python/tpu_driver/c_api.h b/tensorflow/compiler/xla/python/tpu_driver/c_api.h new file mode 100644 index 00000000000..5b892dfdaa3 --- /dev/null +++ b/tensorflow/compiler/xla/python/tpu_driver/c_api.h @@ -0,0 +1,30 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_C_API_H_ +#define TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_C_API_H_ + +#define TPUDRIVER_CAPI_EXPORT __attribute__((visibility("default"))) + +extern "C" { + +TPUDRIVER_CAPI_EXPORT extern void TpuDriver_Initialize(); + +TPUDRIVER_CAPI_EXPORT extern void TpuDriver_Open(const char* worker); + +TPUDRIVER_CAPI_EXPORT extern const char* TpuDriver_Version(void); +} + +#endif // TENSORFLOW_COMPILER_XLA_PYTHON_TPU_DRIVER_C_API_H_ diff --git a/tensorflow/compiler/xla/python/tpu_driver/client/c_api_client.c b/tensorflow/compiler/xla/python/tpu_driver/client/c_api_client.c new file mode 100644 index 00000000000..70ab4af85fd --- /dev/null +++ b/tensorflow/compiler/xla/python/tpu_driver/client/c_api_client.c @@ -0,0 +1,50 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// To compile: gcc -o c_api_client c_api_client.c -ldl +// To run, make sure c_api.so and c_api_client in the same directory, and then +// sudo ./c_api_client + +#include +#include +#include + +int main(int argc, char** argv) { + void* handle; + handle = dlopen("./c_api.so", RTLD_NOW); + if (!handle) { + fprintf(stderr, "Error: %s\n", dlerror()); + exit(EXIT_FAILURE); + } + + const char* (*TpuDriver_Version)(void); + void (*TpuDriver_Initialize)(void); + void (*TpuDriver_Open)(const char* worker); + + fprintf(stdout, "------ Going to Find Out Version ------\n"); + *(void**)(&TpuDriver_Version) = dlsym(handle, "TpuDriver_Version"); + fprintf(stdout, "TPU Driver Version: %s\n", TpuDriver_Version()); + + fprintf(stdout, "------ Going to Initialize ------\n"); + *(void**)(&TpuDriver_Initialize) = dlsym(handle, "TpuDriver_Initialize"); + TpuDriver_Initialize(); + + fprintf(stdout, "------ Going to Open a TPU Driver ------\n"); + *(void**)(&TpuDriver_Open) = dlsym(handle, "TpuDriver_Open"); + TpuDriver_Open("local://"); + + dlclose(handle); + exit(EXIT_SUCCESS); +} From 4998629955d42ef6ae6e3cd8e72d7099dd97fa98 Mon Sep 17 00:00:00 2001 From: Aart Bik Date: Fri, 6 Dec 2019 12:38:52 -0800 Subject: [PATCH 288/383] [VecOps] Rename vector.[insert|extract]element to just vector.[insert|extract] Since these operations lower to [insert|extract][element|value] at LLVM dialect level, neither element nor value would correctly reflect the meaning. PiperOrigin-RevId: 284240727 Change-Id: Idccb5232954a73a702f8036c2c1394d38850aac2 --- .../mlir/Dialect/VectorOps/VectorOps.td | 20 ++++----- .../VectorToLLVM/ConvertVectorToLLVM.cpp | 6 +-- .../mlir/lib/Dialect/VectorOps/VectorOps.cpp | 44 ++++++++----------- 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td index ebeecfbb715..6c2b4e6bb16 100644 --- a/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td +++ b/third_party/mlir/include/mlir/Dialect/VectorOps/VectorOps.td @@ -216,21 +216,21 @@ def Vector_BroadcastOp : }]; } -def Vector_ExtractElementOp : - Vector_Op<"extractelement", [NoSideEffect, +def Vector_ExtractOp : + Vector_Op<"extract", [NoSideEffect, PredOpTrait<"operand and result have same element type", TCresVTEtIsSameAsOpBase<0, 0>>]>, Arguments<(ins AnyVector:$vector, I32ArrayAttr:$position)>, Results<(outs AnyType)> { - let summary = "extractelement operation"; + let summary = "extract operation"; let description = [{ Takes an n-D vector and a k-D position and extracts the (n-k)-D vector at the proper position. Degenerates to an element type in the 0-D case. 
Examples: ``` - %1 = vector.extractelement %0[3]: vector<4x8x16xf32> - %2 = vector.extractelement %0[3, 3, 3]: vector<4x8x16xf32> + %1 = vector.extract %0[3]: vector<4x8x16xf32> + %2 = vector.extract %0[3, 3, 3]: vector<4x8x16xf32> ``` }]; let builders = [OpBuilder< @@ -243,15 +243,15 @@ def Vector_ExtractElementOp : }]; } -def Vector_InsertElementOp : - Vector_Op<"insertelement", [NoSideEffect, +def Vector_InsertOp : + Vector_Op<"insert", [NoSideEffect, PredOpTrait<"source operand and result have same element type", TCresVTEtIsSameAsOpBase<0, 0>>, PredOpTrait<"dest operand and result have same type", TCresIsSameAsOpBase<0, 1>>]>, Arguments<(ins AnyType:$source, AnyVector:$dest, I32ArrayAttr:$position)>, Results<(outs AnyVector)> { - let summary = "insertelement operation"; + let summary = "insert operation"; let description = [{ Takes an n-D source vector, an (n+k)-D destination vector and a k-D position and inserts the n-D source into the (n+k)-D destination at the proper @@ -259,9 +259,9 @@ def Vector_InsertElementOp : Examples: ``` - %2 = vector.insertelement %0, %1[3 : i32]: + %2 = vector.insert %0, %1[3 : i32]: vector<8x16xf32> into vector<4x8x16xf32> - %5 = vector.insertelement %3, %4[3 : i32, 3 : i32, 3 : i32]: + %5 = vector.insert %3, %4[3 : i32, 3 : i32, 3 : i32]: f32 into vector<4x8x16xf32> ``` }]; diff --git a/third_party/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/third_party/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index c40c7c5242a..8adc415f820 100644 --- a/third_party/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/third_party/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -238,15 +238,15 @@ class VectorExtractElementOpConversion : public LLVMOpLowering { public: explicit VectorExtractElementOpConversion(MLIRContext *context, LLVMTypeConverter &typeConverter) - : LLVMOpLowering(vector::ExtractElementOp::getOperationName(), context, + : LLVMOpLowering(vector::ExtractOp::getOperationName(), context, typeConverter) {} PatternMatchResult matchAndRewrite(Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const override { auto loc = op->getLoc(); - auto adaptor = vector::ExtractElementOpOperandAdaptor(operands); - auto extractOp = cast(op); + auto adaptor = vector::ExtractOpOperandAdaptor(operands); + auto extractOp = cast(op); auto vectorType = extractOp.vector()->getType().cast(); auto resultType = extractOp.getResult()->getType(); auto llvmResultType = lowering.convertType(resultType); diff --git a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp index 65441674165..c1e88aa0076 100644 --- a/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp +++ b/third_party/mlir/lib/Dialect/VectorOps/VectorOps.cpp @@ -324,35 +324,33 @@ SmallVector ContractionOp::getIndexingMaps() { } //===----------------------------------------------------------------------===// -// ExtractElementOp +// ExtractOp //===----------------------------------------------------------------------===// -static Type inferExtractElementOpResultType(VectorType vectorType, - ArrayAttr position) { +static Type inferExtractOpResultType(VectorType vectorType, + ArrayAttr position) { if (static_cast(position.size()) == vectorType.getRank()) return vectorType.getElementType(); return VectorType::get(vectorType.getShape().drop_front(position.size()), vectorType.getElementType()); } -void vector::ExtractElementOp::build(Builder *builder, OperationState &result, - Value *source, - ArrayRef 
position) { +void vector::ExtractOp::build(Builder *builder, OperationState &result, + Value *source, ArrayRef position) { result.addOperands(source); auto positionAttr = builder->getI32ArrayAttr(position); - result.addTypes(inferExtractElementOpResultType( - source->getType().cast(), positionAttr)); + result.addTypes(inferExtractOpResultType(source->getType().cast(), + positionAttr)); result.addAttribute(getPositionAttrName(), positionAttr); } -static void print(OpAsmPrinter &p, vector::ExtractElementOp op) { +static void print(OpAsmPrinter &p, vector::ExtractOp op) { p << op.getOperationName() << " " << *op.vector() << op.position(); p.printOptionalAttrDict(op.getAttrs(), {"position"}); p << " : " << op.vector()->getType(); } -static ParseResult parseExtractElementOp(OpAsmParser &parser, - OperationState &result) { +static ParseResult parseExtractOp(OpAsmParser &parser, OperationState &result) { llvm::SMLoc attributeLoc, typeLoc; SmallVector attrs; OpAsmParser::OperandType vector; @@ -375,13 +373,13 @@ static ParseResult parseExtractElementOp(OpAsmParser &parser, attributeLoc, "expected position attribute of rank smaller than vector rank"); - Type resType = inferExtractElementOpResultType(vectorType, positionAttr); + Type resType = inferExtractOpResultType(vectorType, positionAttr); result.attributes = attrs; return failure(parser.resolveOperand(vector, type, result.operands) || parser.addTypeToList(resType, result.types)); } -static LogicalResult verify(vector::ExtractElementOp op) { +static LogicalResult verify(vector::ExtractOp op) { auto positionAttr = op.position().getValue(); if (positionAttr.empty()) return op.emitOpError("expected non-empty position attribute"); @@ -447,29 +445,26 @@ static ParseResult parseBroadcastOp(OpAsmParser &parser, } //===----------------------------------------------------------------------===// -// InsertElementOp +// InsertOp //===----------------------------------------------------------------------===// -void InsertElementOp::build(Builder *builder, OperationState &result, - Value *source, Value *dest, - ArrayRef position) { +void InsertOp::build(Builder *builder, OperationState &result, Value *source, + Value *dest, ArrayRef position) { result.addOperands({source, dest}); auto positionAttr = builder->getI32ArrayAttr(position); result.addTypes(dest->getType()); result.addAttribute(getPositionAttrName(), positionAttr); } -static void print(OpAsmPrinter &p, InsertElementOp op) { +static void print(OpAsmPrinter &p, InsertOp op) { p << op.getOperationName() << " " << *op.source() << ", " << *op.dest() << op.position(); - p.printOptionalAttrDict(op.getAttrs(), - {InsertElementOp::getPositionAttrName()}); + p.printOptionalAttrDict(op.getAttrs(), {InsertOp::getPositionAttrName()}); p << " : " << op.getSourceType(); p << " into " << op.getDestVectorType(); } -static ParseResult parseInsertElementOp(OpAsmParser &parser, - OperationState &result) { +static ParseResult parseInsertOp(OpAsmParser &parser, OperationState &result) { SmallVector attrs; OpAsmParser::OperandType source, dest; Type sourceType; @@ -477,8 +472,7 @@ static ParseResult parseInsertElementOp(OpAsmParser &parser, Attribute attr; return failure(parser.parseOperand(source) || parser.parseComma() || parser.parseOperand(dest) || - parser.parseAttribute(attr, - InsertElementOp::getPositionAttrName(), + parser.parseAttribute(attr, InsertOp::getPositionAttrName(), result.attributes) || parser.parseOptionalAttrDict(attrs) || parser.parseColonType(sourceType) || @@ -488,7 +482,7 @@ static ParseResult 
parseInsertElementOp(OpAsmParser &parser, parser.addTypeToList(destType, result.types)); } -static LogicalResult verify(InsertElementOp op) { +static LogicalResult verify(InsertOp op) { auto positionAttr = op.position().getValue(); if (positionAttr.empty()) return op.emitOpError("expected non-empty position attribute"); From 47953aa5c0952d5d457fe04d9ffbbad6ec5c8706 Mon Sep 17 00:00:00 2001 From: Rick Chao Date: Fri, 6 Dec 2019 12:47:09 -0800 Subject: [PATCH 289/383] Re-write docstring for ModelCheckpoint. Add example. PiperOrigin-RevId: 284242148 Change-Id: I1cb9265fcad710100bd7bcf4d6193779ce250923 --- tensorflow/python/keras/callbacks.py | 49 +++++++++++++++++++++++----- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index ca9507f0bdc..1239ab40f98 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -811,18 +811,51 @@ class History(Callback): @keras_export('keras.callbacks.ModelCheckpoint') class ModelCheckpoint(Callback): - """Save the model after every epoch. + """Callback to save the Keras model or model weights at some frequency. - `filepath` can contain named formatting options, - which will be filled the value of `epoch` and - keys in `logs` (passed in `on_epoch_end`). + `ModelCheckpoint` callback is used in conjunction with training using + `model.fit()` to save a model or weights (in a checkpoint file) at some + interval, so the model or weights can be loaded later to continue the training + from the state saved. - For example: if `filepath` is `weights.{epoch:02d}-{val_loss:.2f}.hdf5`, - then the model checkpoints will be saved with the epoch number and - the validation loss in the filename. + A few options this callback provides include: + + - Whether to only keep the model that has achieved the "best performance" so + far, or whether to save the model at the end of every epoch regardless of + performance. + - Definition of 'best'; which quantity to monitor and whether it should be + maximized or minimized. + - The frequency it should save at. Currently, the callback supports saving at + the end of every epoch, or after a fixed number of training samples. + - Whether only weights are saved, or the whole model is saved. + + Example: + + ```python + EPOCHS = 10 + checkpoint_filepath = '/tmp/checkpoint' + model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( + filepath=checkpoint_filepath, + save_weights_only=True, + monitor='val_acc', + mode='max', + save_best_only=True) + + # Model weights are saved at the end of every epoch, if it's the best seen + # so far. + model.fit(epochs=EPOCHS, callbacks=[model_checkpoint_callback]) + + # The model weights (that are considered the best) are loaded into the model. + model.load_weights(checkpoint_filepath) + ``` Arguments: - filepath: string, path to save the model file. + filepath: string, path to save the model file. `filepath` can contain + named formatting options, which will be filled the value of `epoch` and + keys in `logs` (passed in `on_epoch_end`). For example: if `filepath` is + `weights.{epoch:02d}-{val_loss:.2f}.hdf5`, then the model checkpoints + will be saved with the epoch number and the validation loss in the + filename. monitor: quantity to monitor. verbose: verbosity mode, 0 or 1. 
save_best_only: if `save_best_only=True`, the latest best model according From 3371ea49735ced60c48a25732b622f59abb8bd5a Mon Sep 17 00:00:00 2001 From: Ken Franko Date: Fri, 6 Dec 2019 12:51:42 -0800 Subject: [PATCH 290/383] Group variable initialization when calling lift_to_graph. When initializing variables defined inside a @tf.function which are lifted to the outer graph, group the variables together and call lift_to_graph once. lift_to_graph supports passing in multiple tensors and the graph to lift to is the same for all of the variable initialization. This improves setup time. PiperOrigin-RevId: 284242959 Change-Id: I1f84b7ac5037658b40a4a8a9a8781485b5be80df --- tensorflow/python/eager/def_function.py | 10 ++-------- tensorflow/python/eager/def_function_test.py | 13 ------------- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py index 1c804835d91..2f20179d1da 100644 --- a/tensorflow/python/eager/def_function.py +++ b/tensorflow/python/eager/def_function.py @@ -728,19 +728,13 @@ class Function(object): resource_variable_ops.var_is_initialized_op(v.handle)) var_is_initialized = array_ops.stack(var_is_initialized).numpy() - inits = [] for (v, init), is_initialized in zip(initializers, var_is_initialized): with ops.init_scope(): if is_initialized: continue - inits.append(init) - op_map = lift_to_graph.lift_to_graph( - inits, ops.get_default_graph(), op_map=op_map) - for (v, init), is_initialized in zip(initializers, var_is_initialized): - with ops.init_scope(): - if is_initialized: - continue + op_map = lift_to_graph.lift_to_graph( + [init], ops.get_default_graph(), op_map=op_map) v.assign(op_map[init], read_value=False) with ops.init_scope(): diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py index b558412fd9a..dca257f91a3 100644 --- a/tensorflow/python/eager/def_function_test.py +++ b/tensorflow/python/eager/def_function_test.py @@ -137,19 +137,6 @@ class DefFunctionTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(fn(constant_op.constant(1.0)), 2.0) - def testFunctionMultipleVariableInitializer(self): - - state = [] - - @def_function.function - def fn(x): - if not state: - state.append(variables.Variable(lambda: 2.0)) - state.append(variables.Variable(lambda: 5.0)) - return state[0] * x, state[1] * x - - self.assertAllEqual(fn(constant_op.constant(1.0)), [2.0, 5.0]) - def testFunctionInitializationFunction(self): state = [] From 53a864a1e2d761a42159fd2a00f8981aad10f73d Mon Sep 17 00:00:00 2001 From: Smit Hinsu Date: Fri, 6 Dec 2019 12:51:44 -0800 Subject: [PATCH 291/383] Lower TensorFlow AddN to sequence of AddV2 ops PiperOrigin-RevId: 284242962 Change-Id: Icea1f1e77b6be10c193090868b04d16645b76cf7 --- .../mlir/tensorflow/ir/tf_generated_ops.td | 2 ++ .../compiler/mlir/tensorflow/ir/tf_ops.cc | 9 +++++ .../mlir/tensorflow/tests/canonicalize.mlir | 7 ++++ .../mlir/tensorflow/tests/lower_tf.mlir | 16 +++++++++ .../mlir/tensorflow/transforms/lower_tf.cc | 35 +++++++++++++++++++ 5 files changed, 69 insertions(+) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 4b0670256c1..164bbe57ee3 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -94,6 +94,8 @@ Inputs must be of same size and shape. 
TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; + + let hasFolder = 1; } def TF_AddV2Op : TF_Op<"AddV2", [Broadcastable, Commutative, NoSideEffect]>, diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc index d7dee85188f..81e64d9c4f0 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.cc @@ -301,6 +301,15 @@ void AddOp::getCanonicalizationPatterns(OwningRewritePatternList &results, results.insert(context); } +//===----------------------------------------------------------------------===// +// AddNOp +//===----------------------------------------------------------------------===// + +OpFoldResult AddNOp::fold(ArrayRef operands) { + if (operands.size() == 1) return *inputs().begin(); + return {}; +} + //===----------------------------------------------------------------------===// // AddV2Op //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index a2cc33a8201..18c63912a86 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -382,3 +382,10 @@ func @nonIdentityTranspose(%arg0: tensor<2x3x4x5x6xf32>) -> tensor<2x3x4x6x5xf32 // CHECK: %1 = "tf.Transpose"(%arg0, %0) : (tensor<2x3x4x5x6xf32>, tensor<5xi32>) -> tensor<2x3x4x6x5xf32> // CHECK: return %1 } + +// CHECK-LABEL: func @addN +func @addN(%arg0: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: return %arg0 + %0 = "tf.AddN"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir index 120e73f6e94..60ffc924ae5 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/lower_tf.mlir @@ -250,3 +250,19 @@ func @ZerosLike_variant(%arg0: tensor>>) -> tensor>>) -> tensor>> return %0 : tensor>> } + +// CHECK-LABEL: func @addN +func @addN(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xf32>) -> tensor<*xf32> { + // CHECK: %[[SUM0:.*]] = "tf.AddV2"(%arg0, %arg1) + // CHECK: %[[SUM1:.*]] = "tf.AddV2"(%[[SUM0]], %arg2) + // return %[[SUM1]] + %0 = "tf.AddN"(%arg0, %arg1, %arg2) : (tensor<*xf32>, tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> +} + +// CHECK-LABEL: func @addN_variant +func @addN_variant(%arg0: tensor>>, %arg1: tensor>>, %arg2: tensor>>) -> tensor>> { + // CHECK: tf.AddN + %0 = "tf.AddN"(%arg0, %arg1, %arg2) : (tensor>>, tensor>>, tensor>>) -> tensor>> + return %0 : tensor>> +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc index 65c6ac86288..89941c2fab4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "mlir/IR/StandardTypes.h" // TF:local_config_mlir #include "mlir/IR/TypeUtilities.h" // TF:local_config_mlir #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/core/util/tensor_format.h" namespace mlir { @@ -109,6 +110,39 @@ Type InferExpandDimsType(Type ty, int64_t axis, Builder *builder) { return RankedTensorType::get(shape, ranked_ty.getElementType()); } +// Lowers AddN op to a sequence of AddV2 ops to accumulate operands. +// +// %result = "tf.AddN"(%0, %1, %2) +// +// is lowered to: +// +// %sum_0 = "tf.AddV2"(%0, %1) +// %result = "tf.AddV2"(%sum_0, %2) +// +class LowerAddNOp : public OpRewritePattern { + public: + explicit LowerAddNOp(MLIRContext *context) + : OpRewritePattern(context) {} + + PatternMatchResult matchAndRewrite(TF::AddNOp op, + PatternRewriter &rewriter) const override { + // TODO(hinsu): Support variant with TensorList type. tf.AddV2 doesn't + // support variant type so variant types require special handling. + if (getElementTypeOrSelf(op.getType()).isa()) + return matchFailure(); + + // TODO(hinsu): Improve parallelism by splitting operands in two halves and + // accumulating them first. + Value *result = *op.inputs().begin(); + for (Value *operand : llvm::drop_begin(op.inputs(), 1)) { + result = rewriter.create(op.getLoc(), result, operand); + } + + rewriter.replaceOp(op, result); + return matchSuccess(); + } +}; + // Lowers Pack op to ConcatV2 op after changing shape of the inputs with // ExpandDims op. // @@ -159,6 +193,7 @@ class LowerPackOp : public OpRewritePattern { void PopulateLoweringTFPatterns(MLIRContext *context, OwningRewritePatternList *patterns) { + patterns->insert(context); patterns->insert(context); populateWithGenerated(context, patterns); } From faf5ed46c0cf3bf2cf2c7f7834684ff86cf0835b Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 12:53:13 -0800 Subject: [PATCH 292/383] prepare to export to XSpace for device trace. I realize that I need to port more XPlaneBuilder to OSS to continue. PiperOrigin-RevId: 284243248 Change-Id: Ib5f35ba5ebc0136c878e75a65a079af2df56c3b0 --- .../profiler/internal/gpu/device_tracer.cc | 83 +++++++++++++------ 1 file changed, 58 insertions(+), 25 deletions(-) diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index 58d414413f9..9b3254ed905 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -38,22 +38,19 @@ limitations under the License. namespace tensorflow { namespace profiler { -// Adapter from CuptiTraceCollector to StepStatsCollector: This class convert -// and filter from CuptiTracerEvent to tensorflow::NodeExecStats. -// We can not just forward event on the fly because StepStatsCollector have -// a single mutex for all devices, Therefore we will cache events and forward -// only when Flush(). -class StepStatsCuptiTracerAdaptor : public CuptiTraceCollector { +// CuptiTraceCollectorImpl store the CuptiTracerEvents from CuptiTracer and +// eventually convert and filter them to StepStats or XSpace. 
+class CuptiTraceCollectorImpl : public CuptiTraceCollector { public: - StepStatsCuptiTracerAdaptor(const CuptiTracerCollectorOptions& option, - uint64 start_walltime_ns, uint64 start_gpu_ns) + CuptiTraceCollectorImpl(const CuptiTracerCollectorOptions& option, + uint64 start_walltime_ns, uint64 start_gpu_ns) : CuptiTraceCollector(option), num_callback_events_(0), num_activity_events_(0), start_walltime_ns_(start_walltime_ns), start_gpu_ns_(start_gpu_ns), num_gpus_(option.num_gpus), - per_device_adaptor_(option.num_gpus) {} + per_device_collector_(option.num_gpus) {} void AddEvent(CuptiTracerEvent&& event) override { if (event.device_id >= num_gpus_) return; @@ -70,7 +67,7 @@ class StepStatsCuptiTracerAdaptor : public CuptiTraceCollector { } num_activity_events_++; } - per_device_adaptor_[event.device_id].AddEvent(std::move(event)); + per_device_collector_[event.device_id].AddEvent(std::move(event)); } void OnEventsDropped(const std::string& reason, uint32 num_events) override {} void Flush() override {} @@ -79,8 +76,21 @@ class StepStatsCuptiTracerAdaptor : public CuptiTraceCollector { << " callback api events and " << num_activity_events_ << " activity events."; for (int i = 0; i < num_gpus_; ++i) { - per_device_adaptor_[i].Flush(trace_collector, i, start_walltime_ns_, - start_gpu_ns_); + per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_, + trace_collector); + } + } + void Export(XSpace* space) { + LOG(INFO) << " GpuTracer has collected " << num_callback_events_ + << " callback api events and " << num_activity_events_ + << " activity events."; + for (int i = 0; i < num_gpus_; ++i) { + // TODO(jiesun): determine if we need to export the launching events into + // the same plane that host tracer uses. + XPlane* host_plane = nullptr; + XPlane* device_plane = space->add_planes(); + per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_, + device_plane, host_plane); } } @@ -96,7 +106,7 @@ class StepStatsCuptiTracerAdaptor : public CuptiTraceCollector { uint32 thread_id; uint64 enqueue_time_ns; }; - struct PerDeviceAdaptor { + struct PerDeviceCollector { void AddEvent(CuptiTracerEvent&& event) { absl::MutexLock lock(&mutex); if (event.source == CuptiTracerEventSource::DriverCallback) { @@ -114,8 +124,9 @@ class StepStatsCuptiTracerAdaptor : public CuptiTraceCollector { events.emplace_back(std::move(event)); } } - void Flush(StepStatsCollector* collector, int32 device_ordinal, - uint64 start_walltime_ns, uint64 start_gpu_ns) { + + void Flush(int32 device_ordinal, uint64 start_walltime_ns, + uint64 start_gpu_ns, StepStatsCollector* collector) { absl::MutexLock lock(&mutex); stream_device = absl::StrCat("/device:GPU:", device_ordinal, "/stream:"); memcpy_device = absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"); @@ -199,6 +210,9 @@ class StepStatsCuptiTracerAdaptor : public CuptiTraceCollector { } } + void Flush(int32 device_ordinal, uint64 start_walltime_ns, + uint64 start_gpu_ns, XPlane* device_plane, XPlane* host_plane) {} + absl::Mutex mutex; std::string stream_device GUARDED_BY(mutex); std::string memcpy_device GUARDED_BY(mutex); @@ -207,9 +221,9 @@ class StepStatsCuptiTracerAdaptor : public CuptiTraceCollector { absl::flat_hash_map correlation_info GUARDED_BY(mutex); }; - absl::FixedArray per_device_adaptor_; + absl::FixedArray per_device_collector_; - TF_DISALLOW_COPY_AND_ASSIGN(StepStatsCuptiTracerAdaptor); + TF_DISALLOW_COPY_AND_ASSIGN(CuptiTraceCollectorImpl); }; // GpuTracer for GPU. 
@@ -246,7 +260,7 @@ class GpuTracer : public profiler::ProfilerInterface { CuptiTracer* cupti_tracer_; CuptiTracerOptions options_; StepStats step_stats_; - std::unique_ptr step_stats_cupti_adaptor_; + std::unique_ptr cupti_collector_; }; Status GpuTracer::DoStart() { @@ -307,11 +321,11 @@ Status GpuTracer::DoStart() { collector_options.num_gpus = cupti_tracer_->NumGpus(); uint64 start_gputime_ns = CuptiTracer::GetTimestamp(); uint64 start_walltime_ns = tensorflow::EnvTime::NowNanos(); - step_stats_cupti_adaptor_ = absl::make_unique( + cupti_collector_ = absl::make_unique( collector_options, start_walltime_ns, start_gputime_ns); AnnotationStack::Enable(true); - cupti_tracer_->Enable(options_, step_stats_cupti_adaptor_.get()); + cupti_tracer_->Enable(options_, cupti_collector_.get()); return Status::OK(); } @@ -355,11 +369,11 @@ Status GpuTracer::CollectData(RunMetadata* run_metadata) { return Status::OK(); case State::kStoppedOk: { // Input run_metadata is shared by profiler interfaces, we need append. - StepStatsCollector trace_collector(&step_stats_); - if (step_stats_cupti_adaptor_) { - step_stats_cupti_adaptor_->Export(&trace_collector); + StepStatsCollector step_stats_collector(&step_stats_); + if (cupti_collector_) { + cupti_collector_->Export(&step_stats_collector); } - trace_collector.Finalize(); + step_stats_collector.Finalize(); for (auto& dev_stats : *step_stats_.mutable_dev_stats()) { run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats); } @@ -370,7 +384,26 @@ Status GpuTracer::CollectData(RunMetadata* run_metadata) { } Status GpuTracer::CollectData(XSpace* space) { - return errors::Unimplemented("Collect data into XSpace not yet implemented"); + switch (profiling_state_) { + case State::kNotStarted: + VLOG(1) << "No trace data collected, session wasn't started"; + return Status::OK(); + case State::kStartedOk: + return errors::FailedPrecondition("Cannot collect trace before stopping"); + case State::kStartedError: + LOG(ERROR) << "Cannot collect, xprof failed to start"; + return Status::OK(); + case State::kStoppedError: + VLOG(1) << "No trace data collected"; + return Status::OK(); + case State::kStoppedOk: { + if (cupti_collector_) { + cupti_collector_->Export(space); + } + return Status::OK(); + } + } + return errors::Internal("Invalid profiling state: ", profiling_state_); } // Not in anonymous namespace for testing purposes. From 893d9a67ffb37c607ce6b551a53323cb2207f768 Mon Sep 17 00:00:00 2001 From: Anthony Liu Date: Fri, 6 Dec 2019 12:53:17 -0800 Subject: [PATCH 293/383] [tfdbg] Add Shape mode to DebugNumericSummaryV2Op. - The TensorDebugMode added is SHAPE, a mode that computes a shape-[10] rank-1 tensor given any float-type tensor. The first element is the id of the tensor. The second element is the dtype of the tensor, represented by the enumerated type defined in tensorflow/core/framework/types.proto. The third and fourth elements are the rank and size of the tensor respectively, and finally the fourth to tenth elements represent the shape of the tensor. Shorter shapes are right-padded with zero and longer shapes have the head truncated. - The CPU and GPU kernels of the op are added. 
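For reference, a minimal pure-NumPy sketch of the SHAPE-mode output layout
described above; the helper name `shape_summary` and the `DTYPE_ENUM` table are
purely illustrative and are not part of the op or its kernels:

  import numpy as np

  # Enum values follow tensorflow/core/framework/types.proto
  # (DT_FLOAT = 1, DT_DOUBLE = 2, DT_HALF = 19).
  DTYPE_ENUM = {np.dtype(np.float32): 1, np.dtype(np.float64): 2,
                np.dtype(np.float16): 19}

  def shape_summary(tensor, tensor_id):
      # Layout: [id, dtype, rank, size, then up to six shape slots].
      dims = list(tensor.shape)
      # Keep only the last six dimensions (truncate the head) and
      # right-pad shorter shapes with zeros.
      dims = dims[max(0, len(dims) - 6):]
      dims += [0.0] * (6 - len(dims))
      return np.array([tensor_id, DTYPE_ENUM[tensor.dtype], tensor.ndim,
                       tensor.size] + dims, dtype=np.float64)

  print(shape_summary(np.zeros([3, 4], np.float32), tensor_id=7))
  # Ten values: [7, 1, 2, 12, 3, 4, 0, 0, 0, 0]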
PiperOrigin-RevId: 284243269 Change-Id: I2adc2c68792ee284ac2401bedd816c0ea960f87b --- tensorflow/core/kernels/debug_ops.h | 80 ++++++++++++++++--- .../python/debug/lib/debug_v2_ops_test.py | 65 +++++++++++++++ 2 files changed, 132 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 963b2bb58fc..643dfdad38f 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -514,7 +514,7 @@ class DebugNumericSummaryV2Op : public OpKernel { const int64 size = in.size(); Tensor* output_tensor; Tout tensor_id = static_cast(tensor_id_); - const float num_elem = static_cast(context->input(0).NumElements()); + const Tout num_elem = static_cast(context->input(0).NumElements()); // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because // that mode does not make use of tensor_id. if (tensor_debug_mode_ != 8) { @@ -565,6 +565,32 @@ class DebugNumericSummaryV2Op : public OpKernel { output_tensor->flat()(2) = fp_props[0]; // Slot for -inf count output_tensor->flat()(3) = fp_props[1]; // Slot for inf count output_tensor->flat()(4) = fp_props[2]; // Slot for nan count + } else if (tensor_debug_mode_ == 5) { // SHAPE + TensorShape shape({10}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + + int num_dims = tensor.dims(); + output_tensor->flat()(0) = tensor_id; + output_tensor->flat()(1) = static_cast(tensor.dtype()); + output_tensor->flat()(2) = static_cast(num_dims); + output_tensor->flat()(3) = num_elem; + + // Tensor shape - stored as (6 columns) + // if num_dim is less than 6, we right pad the shape with zeros + // if num_dim is greater than 6, we truncate the head (left most) of the + // dimensions as they are more predictable than the last few (e.g. batch + // size as first dimension) + int dim_idx = 4; + for (int i = std::max(0, num_dims - kShapeDims); + i < std::max(6, num_dims); ++i) { + if (i < num_dims) { + output_tensor->flat()(dim_idx++) = + static_cast(tensor.dim_size(i)); + } else { + output_tensor->flat()(dim_idx++) = 0.0; + } + } } else if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. TensorShape shape({3}); OP_REQUIRES_OK(context, @@ -605,6 +631,7 @@ class DebugNumericSummaryV2Op : public OpKernel { private: int tensor_debug_mode_; int64 tensor_id_; + static constexpr int kShapeDims = 6; static constexpr int kNegInfBit = 0x01; static constexpr int kPosInfBit = 0x02; static constexpr int kNaNBit = 0x04; @@ -628,7 +655,11 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { void ComputeAsync(OpKernelContext* context, DoneCallback done) override { Tensor* output_tensor; Tout tensor_id = static_cast(tensor_id_); - const float num_elem = static_cast(context->input(0).NumElements()); + const Tensor& tensor = context->input(0); + const Tout num_elem = static_cast(tensor.NumElements()); + const Device& d = context->eigen_device(); + auto input = tensor.flat(); + auto check_cb = [this, done]() { done(); }; // Disregard lossy cast if mode is REDUCE_INF_NAN_THREE_SLOTS because // that mode does not make use of tensor_id. if (tensor_debug_mode_ != 8) { @@ -657,19 +688,16 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { stream->ThenMemZero(&output_tensor_ptr, 2 * sizeof(Tout)); // Copy tensor_id to slot zero stream->ThenMemcpy(&output_tensor_ptr, &tensor_id, sizeof(Tout)); - if (context->input(0).NumElements() == 0) { + if (num_elem == 0) { done(); return; } // Call the GPU kernels for the numerical (inf/nan) checks. 
- const Device& d = context->eigen_device(); auto input = context->input(0).flat(); CurtHealthLaunch().Run(d, input.data(), input.size(), output_tensor->flat().data() + 1); - auto check_cb = [this, done]() { done(); }; - context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( stream, std::move(check_cb)); } else if (tensor_debug_mode_ == 3) { // CONCISE_HEALTH. @@ -693,14 +721,43 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { } // Call the GPU kernels for the numerical (inf/nan) checks. - const Device& d = context->eigen_device(); - auto input = context->input(0).flat(); ConciseHealthLaunch().Run( d, input.data(), input.size(), output_tensor->flat().data() + 2); - auto check_cb = [this, done]() { done(); }; + context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + stream, std::move(check_cb)); + } else if (tensor_debug_mode_ == 5) { // SHAPE + TensorShape shape({10}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream != nullptr, + errors::Internal("No GPU stream available."), done); + + se::DeviceMemoryBase output_tensor_ptr( + output_tensor->flat().data(), + output_tensor->flat().size()); + + int num_dims = tensor.dims(); + Tout static_output[10] = {tensor_id, + static_cast(tensor.dtype()), + static_cast(num_dims), + num_elem, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0, + 0.0}; + // Tensor shape: right pad zeros, truncate head + int dim_idx = 4; + for (int i = std::max(0, num_dims - 6); i < num_dims; ++i) { + static_output[dim_idx++] = static_cast(tensor.dim_size(i)); + } + // Write to device stream + stream->ThenMemcpy(&output_tensor_ptr, &static_output, sizeof(Tout) * 10); context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( stream, std::move(check_cb)); } else if (tensor_debug_mode_ == 8) { // REDUCE_INF_NAN_THREE_SLOTS. @@ -717,19 +774,16 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { output_tensor->flat().size()); stream->ThenMemset32(&output_tensor_ptr, 0, output_tensor->flat().size() * sizeof(Tout)); - if (context->input(0).NumElements() == 0) { + if (num_elem == 0) { done(); return; } // Call the GPU kernels for the numerical (inf/nan) checks. 
- const Device& d = context->eigen_device(); auto input = context->input(0).flat(); ReduceInfNanThreeSlotsLaunch().Run( d, input.data(), input.size(), output_tensor->flat().data()); - auto check_cb = [this, done]() { done(); }; - context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( stream, std::move(check_cb)); } else { diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index 76d077c6286..ea3d8970993 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -272,6 +272,7 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): modes = [ debug_event_pb2.TensorDebugMode.CURT_HEALTH, debug_event_pb2.TensorDebugMode.CONCISE_HEALTH, + debug_event_pb2.TensorDebugMode.SHAPE, ] # Maximum allowed tensor_id tensor_id = np.power(2, 53) @@ -481,6 +482,70 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): self.assertAllEqual(tensor_1, tensor_2) self.assertEqual(tensor_id_1, tensor_id_2) + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpShapeEmpty(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=(debug_event_pb2.TensorDebugMode.SHAPE), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + tensor, tensor_id = debug_summary(constant_op.constant(0.0)) + self.assertAllEqual( + tensor, [tensor_id, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpShapeSmall(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=(debug_event_pb2.TensorDebugMode.SHAPE), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + x = np.zeros([3, 4], dtype=np.float32) + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual( + tensor, [tensor_id, 1.0, 2.0, 12.0, 3.0, 4.0, 0.0, 0.0, 0.0, 0.0]) + + x = np.ones([1, 2, 3, 4, 5, 6], dtype=np.float16) + x[0, 1, 2, 2, 2, 2] = np.nan + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual( + tensor, + [tensor_id, 19, 6.0, 2 * 3 * 4 * 5 * 6, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0]) + + x = np.zeros([2], dtype=np.float32) + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual( + tensor, [tensor_id, 1.0, 1.0, 2.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + + tensor, tensor_id = debug_summary(constant_op.constant([])) + self.assertAllEqual( + tensor, [tensor_id, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpShapeLarge(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=(debug_event_pb2.TensorDebugMode.SHAPE), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + x = np.ones([1, 2, 3, 4, 5, 6, 7], dtype=np.double) + tensor, tensor_id = debug_summary(constant_op.constant(x)) + self.assertAllEqual(tensor, [ + tensor_id, 2.0, 7.0, 2 * 3 * 4 * 5 * 6 * 7, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 + ]) + if __name__ == "__main__": ops.enable_eager_execution() From 8a0fc8f23c112e7a38251c66a29a3513881df1f0 Mon Sep 17 00:00:00 2001 From: Taylor Robie Date: Fri, 6 Dec 2019 12:58:13 -0800 Subject: [PATCH 294/383] Update keras standardization code to error out when a namedtuple is encountered. 
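Namedtuples behave like both an ordered tuple and a field-name mapping, so the
two interpretations can disagree and there is no safe way to unpack them into
x, y, and sample_weight. A short, self-contained illustration (the namedtuple
below is hypothetical and only for illustration):

  import collections

  ExampleBatch = collections.namedtuple("example_tuple", ["y", "x"])
  batch = ExampleBatch(y=[1.0], x=[2.0])

  # Tuple semantics: positional unpacking yields the 'y' field first...
  first, second = batch
  print(first, second)      # [1.0] [2.0]
  # ...while the field names (dict-like semantics) say the opposite order.
  print(batch.x, batch.y)   # [2.0] [1.0]

Rather than silently guessing, the data-adapter code now raises a ValueError
asking the user to explicitly convert the namedtuple to a plain tuple before
passing it to fit/evaluate/predict.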
PiperOrigin-RevId: 284244075 Change-Id: I2bee4628df9e0e7cbc0fde126d99020698731fa6 --- .../python/keras/engine/data_adapter.py | 17 +++ tensorflow/python/keras/engine/training.py | 39 ++++++ .../python/keras/engine/training_test.py | 121 ++++++++++++++++++ tensorflow/python/keras/engine/training_v2.py | 6 + 4 files changed, 183 insertions(+) diff --git a/tensorflow/python/keras/engine/data_adapter.py b/tensorflow/python/keras/engine/data_adapter.py index 1474cf7a127..50db978e77a 100644 --- a/tensorflow/python/keras/engine/data_adapter.py +++ b/tensorflow/python/keras/engine/data_adapter.py @@ -19,6 +19,7 @@ from __future__ import division from __future__ import print_function import abc +import collections import itertools import math import random @@ -744,6 +745,7 @@ class GeneratorDataAdapter(DataAdapter): # Since we have to know the dtype of the python generator when we build the # dataset, we have to look at a batch to infer the structure. peek, x = self._peek_and_restore(x) + assert_not_namedtuple(peek) (peek, wrap_in_tuple, elements_to_keep, partial_sample_weight, sample_weight_modes, nested_shape, nested_dtypes @@ -1093,3 +1095,18 @@ def broadcast_sample_weight_modes(target_structure, sample_weight_modes): "structure:\n {}\n to \n {}".format(target_str, mode_str)) return sample_weight_modes + + +def assert_not_namedtuple(x): + if (isinstance(x, tuple) and + # TODO(b/144192902): Use a namedtuple checking utility. + hasattr(x, "_fields") and + isinstance(x._fields, collections.Sequence) and + all(isinstance(f, six.string_types) for f in x._fields)): + raise ValueError( + "Received namedtuple ({}) with fields `{}` as input. namedtuples " + "cannot, in general, be unambiguously resolved into `x`, `y`, " + "and `sample_weight`. For this reason Keras has elected not to " + "support them. If you would like the value to be unpacked, " + "please explicitly convert it to a tuple before passing it to " + "Keras.".format(x.__class__, x._fields)) diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py index 86930a4cfad..78c4feb7be9 100644 --- a/tensorflow/python/keras/engine/training.py +++ b/tensorflow/python/keras/engine/training.py @@ -493,6 +493,8 @@ class Model(network.Network, version_utils.VersionSelector): `(inputs, targets, sample_weights)`. - A generator or `keras.utils.Sequence` returning `(inputs, targets)` or `(inputs, targets, sample weights)`. + A more detailed description of unpacking behavior for iterator types + (Dataset, generator, Sequence) is given below. y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` (you cannot have Numpy inputs and @@ -614,6 +616,30 @@ class Model(network.Network, version_utils.VersionSelector): the generator as they can't be passed easily to children processes. **kwargs: Used for backwards compatibility. + Unpacking behavior for iterator-like inputs: + A common pattern is to pass a tf.data.Dataset, generator, or + tf.keras.utils.Sequence to the `x` argument of fit, which will in fact + yield not only features (x) but optionally targets (y) and sample weights. + Keras requires that the output of such iterator-likes be unambiguous. The + iterator should return a tuple of length 1, 2, or 3, where the optional + second and third elements will be used for y and sample_weight + respectively. Any other type provided will be wrapped in a length one + tuple, effectively treating everything as 'x'. 
When yielding dicts, they + should still adhere to the top-level tuple structure. + e.g. `({"x0": x0, "x1": x1}, y)`. Keras will not attempt to separate + features, targets, and weights from the keys of a single dict. + A notable unsupported data type is the namedtuple. The reason is that + it behaves like both an ordered datatype (tuple) and a mapping + datatype (dict). So given a namedtuple of the form: + `namedtuple("example_tuple", ["y", "x"])` + it is ambiguous whether to reverse the order of the elements when + interpreting the value. Even worse is a tuple of the form: + `namedtuple("other_tuple", ["x", "y", "z"])` + where it is unclear if the tuple was intended to be unpacked into x, y, + and sample_weight or passed through as a single element to `x`. As a + result the data processing code will simply raise a ValueError if it + encounters a namedtuple. (Along with instructions to remedy the issue.) + Returns: A `History` object. Its `History.history` attribute is a record of training loss values and metrics values @@ -685,6 +711,9 @@ class Model(network.Network, version_utils.VersionSelector): if the model has named inputs. - A `tf.data` dataset. - A generator or `keras.utils.Sequence` instance. + A more detailed description of unpacking behavior for iterator types + (Dataset, generator, Sequence) is given in the `Unpacking behavior + for iterator-like inputs` section of `Model.fit`. y: Target data. Like the input data `x`, it could be either Numpy array(s) or TensorFlow tensor(s). It should be consistent with `x` (you cannot have Numpy inputs and @@ -738,6 +767,9 @@ class Model(network.Network, version_utils.VersionSelector): multiprocessing, you should not pass non-picklable arguments to the generator as they can't be passed easily to children processes. + See the discussion of `Unpacking behavior for iterator-like inputs` for + `Model.fit`. + Returns: Scalar test loss (if the model has a single output and no metrics) or list of scalars (if the model has multiple outputs @@ -787,6 +819,9 @@ class Model(network.Network, version_utils.VersionSelector): (in case the model has multiple inputs). - A `tf.data` dataset. - A generator or `keras.utils.Sequence` instance. + A more detailed description of unpacking behavior for iterator types + (Dataset, generator, Sequence) is given in the `Unpacking behavior + for iterator-like inputs` section of `Model.fit`. batch_size: Integer or `None`. Number of samples per gradient update. If unspecified, `batch_size` will default to 32. @@ -817,6 +852,10 @@ class Model(network.Network, version_utils.VersionSelector): multiprocessing, you should not pass non-picklable arguments to the generator as they can't be passed easily to children processes. + See the discussion of `Unpacking behavior for iterator-like inputs` for + `Model.fit`. Note that Model.predict uses the same interpretation rules as + `Model.fit` and `Model.evaluate`, so inputs must be unambiguous for all + three methods. Returns: Numpy array(s) of predictions. 
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py index dc2495c6661..1209215c1b6 100644 --- a/tensorflow/python/keras/engine/training_test.py +++ b/tensorflow/python/keras/engine/training_test.py @@ -18,8 +18,10 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import collections import io import logging +import re import sys from absl.testing import parameterized @@ -735,6 +737,125 @@ class TrainingTest(keras_parameterized.TestCase): }) self.assertEqual(len(out), 2) + def _make_sequence_input_functions(self, input_type): + # train and test + xy_namedtuple = collections.namedtuple('xy_namedtuple', ['x', 'y']) + + # predict + x_namedtuple = collections.namedtuple('x_namedtuple', ['x']) + + if input_type == 'dataset': + dataset = dataset_ops.Dataset.range(16).map( + lambda _: array_ops.ones(shape=(1,))) + + xy_dataset = dataset_ops.Dataset.zip((dataset, dataset)).batch(4) + x_dataset = dataset.batch(4) + def xy_function(use_namedtuple): + return xy_dataset.map(xy_namedtuple) if use_namedtuple else xy_dataset + + def x_function(use_namedtuple): + return x_dataset.map(x_namedtuple) if use_namedtuple else x_dataset + + return xy_function, x_function + + elif input_type == 'generator': + def xy_generator(use_namedtuple): + x, y = np.ones((4, 1)), np.ones((4, 1)) + for _ in range(4): + if use_namedtuple: + yield xy_namedtuple(x, y) + else: + yield x, y + + def x_generator(use_namedtuple): + x = np.ones((4, 1)) + for _ in range(4): + if use_namedtuple: + yield x_namedtuple(x) + else: + yield x + + return xy_generator, x_generator + + elif input_type == 'sequence': + class XYSequence(data_utils.Sequence): + + def __init__(self, use_namedtuple): + self._use_namedtuple = use_namedtuple + super(XYSequence, self).__init__() + + def __getitem__(self, idx): + x, y = np.ones((4, 1)), np.ones((4, 1)) + if self._use_namedtuple: + return xy_namedtuple(x, y) + return x, y + + def __len__(self): + return 4 + + class XSequence(data_utils.Sequence): + + def __init__(self, use_namedtuple): + self._use_namedtuple = use_namedtuple + super(XSequence, self).__init__() + + def __getitem__(self, idx): + x = np.ones((4, 1)) + if self._use_namedtuple: + return x_namedtuple(x) + return x + + def __len__(self): + return 4 + + return XYSequence, XSequence + + @keras_parameterized.run_all_keras_modes(always_skip_v1=True) + @keras_parameterized.run_with_all_model_types + @parameterized.named_parameters( + ('dataset', 'dataset'), + ('generator', 'generator'), + ('sequence', 'sequence'), + ) + def test_sequence_input_types(self, input_type): + """Ensure that namedtuples and tuples are plumbed identically.""" + if not testing_utils.should_run_tf_function(): + self.skipTest('Improved checking is only present in data_adapter.') + + xy_function, x_function = self._make_sequence_input_functions(input_type) + fit_kwargs, evaluate_kwargs, predict_kwargs = {}, {}, {} + if input_type == 'generator': + fit_kwargs['steps_per_epoch'] = 4 + evaluate_kwargs['steps'] = 4 + predict_kwargs['steps'] = 4 + + model = testing_utils.get_small_mlp(1, 1, 1) + model.compile( + loss='mse', + optimizer='sgd', + run_eagerly=testing_utils.should_run_eagerly(), + experimental_run_tf_function=testing_utils.should_run_tf_function()) + + model.fit(xy_function(use_namedtuple=False), **fit_kwargs) + model.evaluate(xy_function(use_namedtuple=False), **evaluate_kwargs) + model.predict(x_function(use_namedtuple=False), **predict_kwargs) + + 
xy_pattern = re.escape( + "Received namedtuple () with fields " + "`('x', 'y')` as input.") + x_pattern = re.escape( + "Received namedtuple () with fields " + "`('x',)` as input.") + + with self.assertRaisesRegex(ValueError, xy_pattern): + model.fit(xy_function(use_namedtuple=True), **fit_kwargs) + + with self.assertRaisesRegex(ValueError, xy_pattern): + model.evaluate(xy_function(use_namedtuple=True), **evaluate_kwargs) + + with self.assertRaisesRegex(ValueError, x_pattern): + model.predict(x_function(use_namedtuple=True), **predict_kwargs) + @keras_parameterized.run_all_keras_modes @keras_parameterized.run_with_all_model_types def test_activity_regularizer_fit(self): diff --git a/tensorflow/python/keras/engine/training_v2.py b/tensorflow/python/keras/engine/training_v2.py index ad176a99a2e..a920b4f484e 100644 --- a/tensorflow/python/keras/engine/training_v2.py +++ b/tensorflow/python/keras/engine/training_v2.py @@ -649,6 +649,12 @@ def _process_inputs(model, # Then we map using only the tensor standardization portion. def map_fn(x, y=None, sample_weights=None): """Tensor manipulation portion of standardization for Dataset.map.""" + if (y is None and sample_weights is None): + # namedtuples are forbidden because it is ambiguous if they should be + # unpacked. If y or sample_weights is present then `x` was not the + # top level structure, and the correct behavior is unambiguous. + data_adapter.assert_not_namedtuple(x) + standardized = model._standardize_tensors( x, y, sample_weights, run_eagerly=False, From 6d4c47b632b09cf03b7487ede86e10523fc1990d Mon Sep 17 00:00:00 2001 From: Anthony Liu Date: Fri, 6 Dec 2019 13:19:24 -0800 Subject: [PATCH 295/383] [tfdbg] Add FullHealth mode to DebugNumericSummaryV2Op. - The TensorDebugMode added is FULL_HEALTH, a mode that computes a shape-[11] rank-1 tensor given any float-type tensor. The first element is the id of the tensor. The second element is the id of the device the tensor is on (TODO, default to -1 for now). The third element is the dtype of the tensor described by the enum in tensorflow/core/framework/types.proto. The fourth and fifth elements are the rank and element count respectively. Finally, the sixth to eleventh elements are the counts of -infs, infs, nans, negative finite values, zeroes, and positive finite values. - The CPU and GPU kernels of the op are added. 
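For reference, a minimal pure-NumPy sketch of the FULL_HEALTH output layout
described above; `full_health_summary` and `DTYPE_ENUM` are illustrative names
only, not part of the op or its kernels:

  import numpy as np

  # Enum values follow tensorflow/core/framework/types.proto
  # (DT_FLOAT = 1, DT_DOUBLE = 2, DT_HALF = 19).
  DTYPE_ENUM = {np.dtype(np.float32): 1, np.dtype(np.float64): 2,
                np.dtype(np.float16): 19}

  def full_health_summary(tensor, tensor_id):
      # Layout: [id, device (-1 for now), dtype, rank, size,
      #          -inf count, +inf count, nan count,
      #          negative count, zero count, positive count].
      x = np.ravel(tensor)
      finite = np.isfinite(x)
      counts = [np.isneginf(x).sum(), np.isposinf(x).sum(), np.isnan(x).sum(),
                np.sum(finite & (x < 0)), np.sum(x == 0),
                np.sum(finite & (x > 0))]
      return ([float(tensor_id), -1.0, float(DTYPE_ENUM[x.dtype]),
               float(tensor.ndim), float(x.size)] +
              [float(c) for c in counts])

  print(full_health_summary(np.array([3.0, -np.inf], np.float32), tensor_id=5))
  # Eleven values: [5, -1, 1, 1, 2, 1, 0, 0, 0, 0, 1]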
PiperOrigin-RevId: 284248404 Change-Id: I8d2accf5ff20f67e14d1e1405144117d92fc82e7 --- tensorflow/core/kernels/debug_ops.h | 85 ++++++++++- tensorflow/core/kernels/debug_ops_gpu.cu.cc | 61 ++++++++ .../python/debug/lib/debug_v2_ops_test.py | 134 ++++++++++++++++++ 3 files changed, 279 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/kernels/debug_ops.h b/tensorflow/core/kernels/debug_ops.h index 643dfdad38f..5d1c78e9d15 100644 --- a/tensorflow/core/kernels/debug_ops.h +++ b/tensorflow/core/kernels/debug_ops.h @@ -479,6 +479,18 @@ extern template struct ConciseHealthLaunch; extern template struct ConciseHealthLaunch; extern template struct ConciseHealthLaunch; +template +struct FullHealthLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[6]); +}; + +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; +extern template struct FullHealthLaunch; + template struct ReduceInfNanThreeSlotsLaunch { void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]); @@ -565,7 +577,46 @@ class DebugNumericSummaryV2Op : public OpKernel { output_tensor->flat()(2) = fp_props[0]; // Slot for -inf count output_tensor->flat()(3) = fp_props[1]; // Slot for inf count output_tensor->flat()(4) = fp_props[2]; // Slot for nan count - } else if (tensor_debug_mode_ == 5) { // SHAPE + } else if (tensor_debug_mode_ == 4) { // FULL HEALTH + TensorShape shape({11}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + int num_dims = tensor.dims(); + output_tensor->flat()(0) = tensor_id; + output_tensor->flat()(1) = -1.0; // TODO(144919262): Device ID + output_tensor->flat()(2) = static_cast(tensor.dtype()); + output_tensor->flat()(3) = static_cast(num_dims); + output_tensor->flat()(4) = num_elem; + + // Accumlator value [neg_inf_count, pos_inf_count, nan_count, neg_count, + // zero_count, pos_count] + Tout fp_props[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + std::for_each(data, data + size, [&fp_props](const Tin& y) { + if (TF_PREDICT_TRUE(Eigen::numext::isfinite(y))) { + if (y < static_cast(0.f)) { + ++fp_props[3]; + } else if (y == static_cast(0.f)) { + ++fp_props[4]; + } else { + ++fp_props[5]; + } + } else if (Eigen::numext::isinf(y)) { + if (y < static_cast(0.f)) { + ++fp_props[0]; + } else { + ++fp_props[1]; + } + } else if (Eigen::numext::isnan(y)) { + ++fp_props[2]; + } + }); + output_tensor->flat()(5) = fp_props[0]; // Slot for -inf count + output_tensor->flat()(6) = fp_props[1]; // Slot for inf count + output_tensor->flat()(7) = fp_props[2]; // Slot for nan count. + output_tensor->flat()(8) = fp_props[3]; // Slot for neg count. + output_tensor->flat()(9) = fp_props[4]; // Slot for zero count. + output_tensor->flat()(10) = fp_props[5]; // Slot for pos count. 
+ } else if (tensor_debug_mode_ == 5) { // SHAPE TensorShape shape({10}); OP_REQUIRES_OK(context, context->allocate_output(0, shape, &output_tensor)); @@ -725,6 +776,38 @@ class DebugNumericSummaryV2Op : public AsyncOpKernel { d, input.data(), input.size(), output_tensor->flat().data() + 2); + context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( + stream, std::move(check_cb)); + } else if (tensor_debug_mode_ == 4) { // FULL HEALTH + TensorShape shape({11}); + OP_REQUIRES_OK(context, + context->allocate_output(0, shape, &output_tensor)); + + auto* stream = context->op_device_context()->stream(); + OP_REQUIRES_ASYNC(context, stream != nullptr, + errors::Internal("No GPU stream available."), done); + + se::DeviceMemoryBase output_tensor_ptr( + output_tensor->flat().data(), + output_tensor->flat().size()); + stream->ThenMemset32(&output_tensor_ptr, 0, 11 * sizeof(Tout)); + + int num_dims = tensor.dims(); + const Tout static_output[] = {tensor_id, + -1.0, // TODO(144919262): Device ID + static_cast(tensor.dtype()), + static_cast(num_dims), num_elem}; + stream->ThenMemcpy(&output_tensor_ptr, &static_output, 5 * sizeof(Tout)); + if (num_elem == 0) { + done(); + return; + } + + // Call the GPU kernels for the numerical (inf/nan) checks and + // pos/neg/zero counts. + FullHealthLaunch().Run(d, input.data(), input.size(), + output_tensor->flat().data() + 5); + context->device()->tensorflow_gpu_device_info()->event_mgr->ThenExecute( stream, std::move(check_cb)); } else if (tensor_debug_mode_ == 5) { // SHAPE diff --git a/tensorflow/core/kernels/debug_ops_gpu.cu.cc b/tensorflow/core/kernels/debug_ops_gpu.cu.cc index e6f42f3d4b6..882c6be008e 100644 --- a/tensorflow/core/kernels/debug_ops_gpu.cu.cc +++ b/tensorflow/core/kernels/debug_ops_gpu.cu.cc @@ -81,6 +81,47 @@ __global__ void ConciseHealthKernel(const Tin* __restrict__ data, int size, atomicAdd(output + 2, accum[2]); } +// A CUDA kernel that fills the six elements of an output vector with the +// number of -infs, infs, nans, negatives, zeros, and positives in the input +// respectively. +template +__global__ void FullHealthKernel(const Tin* __restrict__ data, int size, + Tout output[6]) { + const int32 thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const int32 total_thread_count = gridDim.x * blockDim.x; + + int32 offset = thread_id; + Tout accum[6] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; + + while (offset < size) { + if (isinf(data[offset])) { + if (data[offset] < static_cast(0.f)) { + ++accum[0]; + } else { + ++accum[1]; + } + } else if (isnan(data[offset])) { + ++accum[2]; + } else { + if (data[offset] < static_cast(0.f)) { + ++accum[3]; + } else if (data[offset] == static_cast(0.f)) { + ++accum[4]; + } else { + ++accum[5]; + } + } + offset += total_thread_count; + } + + atomicAdd(output, accum[0]); + atomicAdd(output + 1, accum[1]); + atomicAdd(output + 2, accum[2]); + atomicAdd(output + 3, accum[3]); + atomicAdd(output + 4, accum[4]); + atomicAdd(output + 5, accum[5]); +} + // A CUDA kernel that fills a length-3 vector according to whether any of the // input data contains negative infinity, positive infinity, or NaN. The first // element is filled with -infinity if any of the elements is -infinity. 
@@ -151,6 +192,26 @@ template struct ConciseHealthLaunch; template struct ConciseHealthLaunch; template struct ConciseHealthLaunch; +template +struct FullHealthLaunch { + void Run(const GPUDevice& d, const Tin* data, int size, Tout output[6]) { + const int32 block_size = d.maxGpuThreadsPerBlock(); + const int32 num_blocks = + (d.getNumGpuMultiProcessors() * d.maxGpuThreadsPerMultiProcessor()) / + block_size; + + TF_CHECK_OK(GpuLaunchKernel(FullHealthKernel, num_blocks, + block_size, 0, d.stream(), data, size, output)); + } +}; + +template struct FullHealthLaunch; +template struct FullHealthLaunch; +template struct FullHealthLaunch; +template struct FullHealthLaunch; +template struct FullHealthLaunch; +template struct FullHealthLaunch; + template struct ReduceInfNanThreeSlotsLaunch { void Run(const GPUDevice& d, const Tin* data, int size, Tout output[3]) { diff --git a/tensorflow/python/debug/lib/debug_v2_ops_test.py b/tensorflow/python/debug/lib/debug_v2_ops_test.py index ea3d8970993..c665da7132d 100644 --- a/tensorflow/python/debug/lib/debug_v2_ops_test.py +++ b/tensorflow/python/debug/lib/debug_v2_ops_test.py @@ -546,6 +546,140 @@ class DebugIdentityV2OpTest(dumping_callback_test_lib.DumpingCallbackTestBase): tensor_id, 2.0, 7.0, 2 * 3 * 4 * 5 * 6 * 7, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0 ]) + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpFullHealthSmall(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=(debug_event_pb2.TensorDebugMode.FULL_HEALTH), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + tensor, tensor_id = debug_summary(constant_op.constant([])) + expected = [tensor_id, -1, 1, 1, 0, 0, 0, 0, 0, 0, 0] + self.assertAllEqual(tensor, expected) + + tensor, tensor_id = debug_summary(constant_op.constant(42.0)) + expected = [tensor_id, -1, 1, 0, 1, 0, 0, 0, 0, 0, 1] + self.assertAllEqual(tensor, expected) + + tensor, tensor_id = debug_summary(constant_op.constant([3.0, 4.0])) + expected = [tensor_id, -1, 1, 1, 2, 0, 0, 0, 0, 0, 2] + self.assertAllEqual(tensor, expected) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([3, -np.inf], dtype=np.float32))) + expected = [tensor_id, -1, 1, 1, 2, 1, 0, 0, 0, 0, 1] + self.assertAllEqual(tensor, expected) + + tensor, tensor_id = debug_summary( + constant_op.constant(np.array([[0, 0], [np.nan, 0]], dtype=np.float64))) + expected = [tensor_id, -1, 2, 2, 4, 0, 0, 1, 0, 3, 0] + self.assertAllEqual(tensor, expected) + + tensor, tensor_id = debug_summary( + constant_op.constant( + np.array([[0, 0], [np.nan, np.inf]], dtype=np.float16))) + expected = [tensor_id, -1, 19, 2, 4, 0, 1, 1, 0, 2, 0] + self.assertAllEqual(tensor, expected) + + tensor, tensor_id = debug_summary( + constant_op.constant( + np.array([[0, np.inf], [np.nan, -np.inf]], dtype=np.float32))) + expected = [tensor_id, -1, 1, 2, 4, 1, 1, 1, 0, 1, 0] + self.assertAllEqual(tensor, expected) + + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpFullHealthLarge(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=(debug_event_pb2.TensorDebugMode.FULL_HEALTH), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + def tensor_counts(arr): + counts = [len(np.shape(arr)), np.size(arr), 0, 0, 0, 0, 0, 0] + for n in np.ravel(arr): + if np.isneginf(n): + counts[2] += 1 + elif np.isposinf(n): + counts[3] += 1 + elif np.isnan(n): + counts[4] += 1 + elif n < 0.: + counts[5] 
+= 1 + elif n == 0.: + counts[6] += 1 + else: + counts[7] += 1 + return counts + + x = np.zeros([50, 50], dtype=np.float16) + x[32, 47] = np.nan + x[0:4, 3] = np.inf + x[40:50, 40:50] = 10 + x[3, 20] = -10 + tensor, tensor_id = debug_summary(constant_op.constant(x)) + expected = [tensor_id, -1, 19] + tensor_counts(x) + self.assertAllEqual(tensor, expected) + + x = np.ones([25, 25, 50], dtype=np.float32) * np.inf + x[:, :, 1] = np.nan + x[:, :, 2] = -np.inf + x[:, :, 3] = -1 + x[:, :, 4] = 0 + x[:, :, 5] = 1 + tensor, tensor_id = debug_summary(constant_op.constant(x)) + expected = [tensor_id, -1, 1] + tensor_counts(x) + self.assertAllEqual(tensor, expected) + x[0, 0, 0] = np.nan + tensor, tensor_id = debug_summary(constant_op.constant(x)) + expected = [tensor_id, -1, 1,] + tensor_counts(x) + self.assertAllEqual(tensor, expected) + x = np.zeros([9701], dtype=np.float64) + x[9700] = np.nan + tensor, tensor_id = debug_summary(constant_op.constant(x)) + expected = [tensor_id, -1, 2] + tensor_counts(x) + self.assertAllEqual(tensor, expected) + + @test_util.run_in_graph_and_eager_modes + def testDebugNumericSummaryV2OpFullHealthConsistency(self): + + def debug_summary(x): + return self.evaluate( + gen_debug_ops.debug_numeric_summary_v2( + x, + tensor_debug_mode=(debug_event_pb2.TensorDebugMode.FULL_HEALTH), + tensor_id=x._id, + output_dtype=dtypes.float64)), x._id + + # Assert the same op is returns a consistant value + x = np.zeros([100, 100], dtype=np.float16) + x[32, 47] = np.nan + x[0:4, 3] = np.inf + x[90:100, 90:100] = 10 + x[3, 20] = -10 + c = constant_op.constant(x) + tensor_1, tensor_id_1 = debug_summary(c) + tensor_2, tensor_id_2 = debug_summary(c) + self.assertAllEqual(tensor_1, tensor_2) + self.assertEqual(tensor_id_1, tensor_id_2) + + x = np.ones((100, 200, 3, 10), np.double) + x[1, 30, 2] = 10 + x[5, :, 0, 1] = np.nan + x[90:100, 150, :, :] = np.inf + c = constant_op.constant(x) + tensor_1, tensor_id_1 = debug_summary(c) + tensor_2, tensor_id_2 = debug_summary(c) + self.assertAllEqual(tensor_1, tensor_2) + self.assertEqual(tensor_id_1, tensor_id_2) + if __name__ == "__main__": ops.enable_eager_execution() From 00c6bb2b7ce164716c0a39d7fad757d1e25fbdb1 Mon Sep 17 00:00:00 2001 From: Jaesung Chung Date: Fri, 6 Dec 2019 13:19:53 -0800 Subject: [PATCH 296/383] Use mlir::TypeAttr for the type attribute instead of mangled string repr All TF types are modeled in either standard or TF dialect and can be directly represented without need for string mangling. 
PiperOrigin-RevId: 284248488 Change-Id: Id57c670808da30eaaaa85b0d4a96fd4e813df8f3 --- .../lite/tests/end2end/custom_opdef.pbtxt | 2 +- .../graphdef2mlir/graph-default-attr.pbtxt | 2 +- .../tests/graphdef2mlir/switch_n.pbtxt | 4 +- .../tensorflow/tests/resource_op_lifting.mlir | 42 +++++++++---------- .../transforms/resource_op_lifting.cc | 40 +++++------------- .../mlir/tensorflow/translate/import_model.cc | 8 ++-- 6 files changed, 40 insertions(+), 58 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/custom_opdef.pbtxt b/tensorflow/compiler/mlir/lite/tests/end2end/custom_opdef.pbtxt index 7036ef71b58..0fcee7d7e8f 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/custom_opdef.pbtxt +++ b/tensorflow/compiler/mlir/lite/tests/end2end/custom_opdef.pbtxt @@ -38,6 +38,6 @@ versions { # CHECK: func @main(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<*xi32> # CHECK: attributes {tf.entry_function = {inputs = "input0,input1", outputs = "output"}} { -# CHECK-NEXT: %0 = "tf.BannaPotatoSaladWithColeslaw"(%arg0, %arg1) {T = "tfdtype$DT_INT32", device = "", name = "output"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<*xi32> +# CHECK-NEXT: %0 = "tf.BannaPotatoSaladWithColeslaw"(%arg0, %arg1) {T = i32, device = "", name = "output"} : (tensor<4xi32>, tensor<4xi32>) -> tensor<*xi32> # CHECK-NEXT: return %0 : tensor<*xi32> # CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt index b26d7e7f2ba..ac248041994 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-default-attr.pbtxt @@ -8,7 +8,7 @@ # Verify that we can also pull some attributes that are needed to be able to # create a Graph in memory, like `T`. 
# CHECK: tf.MaxPool -# CHECK-SAME: T = "tfdtype$DT_FLOAT" +# CHECK-SAME: T = f32 node { name: "input" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt index d33ac2f3b5b..3dd5ce58ed2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/switch_n.pbtxt @@ -2,11 +2,11 @@ # CHECK: tf_executor.SwitchN # CHECK-SAME: of 3 : tensor -# CHECK-SAME: T = "tfdtype$DT_INT32" +# CHECK-SAME: T = i32 # CHECK-SAME: name = "Case/branch_index/_3" # CHECK: tf_executor.SwitchN # CHECK-SAME: of 2 : tensor -# CHECK-SAME: T = "tfdtype$DT_FLOAT" +# CHECK-SAME: T = f32 # CHECK-SAME: name = "Case/Case/input_0/_7" node { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index 8ff72dbc7fc..d4c006de288 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -8,7 +8,7 @@ func @only_resource_load() -> tensor<*xi32> { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = "tfdtype$DT_INT32"} + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = i32} // CHECK: "tf_device.launch" // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) // CHECK: tf_device.return %[[COMPUTE_RES]] @@ -16,7 +16,7 @@ func @only_resource_load() -> tensor<*xi32> { // CHECK-SAME: () -> tensor<*xi32> %1 = "tf_device.launch"() ( { - %2 = "tf.ReadVariableOp"(%0) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>) -> tensor<*xi32> + %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> %3 = "tf.SomeComputation"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) tf_device.return %3 : tensor<*xi32> }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xi32> @@ -39,11 +39,11 @@ func @only_resource_store() -> tensor<*xi32> { // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] // CHECK: {device = "tpu0", launch_attr = "launch_attr"} // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = "tfdtype$DT_INT32"} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = i32} %1 = "tf_device.launch"() ( { %2 = "tf.SomeComputation"() : () -> (tensor<*xi32>) - "tf.AssignVariableOp"(%0, %2) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () + "tf.AssignVariableOp"(%0, %2) {dtype = i32} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () tf_device.return %2 : tensor<*xi32> }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xi32> @@ -61,18 +61,18 @@ func @same_resource_load_and_store() -> tensor<*xi32> { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = "tfdtype$DT_INT32"} + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = i32} // CHECK: %[[LAUNCH_RES:[0-9]*]]:2 = "tf_device.launch" // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) // CHECK: 
tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] // CHECK: {device = "tpu0", launch_attr = "launch_attr"} // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = "tfdtype$DT_INT32"} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = i32} %1 = "tf_device.launch"() ( { - %2 = "tf.ReadVariableOp"(%0) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>) -> tensor<*xi32> + %2 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> %3 = "tf.SomeComputation"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) - "tf.AssignVariableOp"(%0, %3) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () + "tf.AssignVariableOp"(%0, %3) {dtype = i32} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () tf_device.return %3 : tensor<*xi32> }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xi32> @@ -91,19 +91,19 @@ func @decompose_assign_add_variable_op() -> tensor<*xi32> { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = "tfdtype$DT_INT32"} + // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = i32} // CHECK: %[[LAUNCH_RES:[0-9]*]]:2 = "tf_device.launch" // CHECK: %[[ONE:[0-9]*]] = "tf.Const"() {value = dense<1> : tensor} // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.AddV2"(%[[RES_READ_VAL]], %[[ONE]]) // CHECK: tf_device.return %[[COMPUTE_RES]], %[[COMPUTE_RES]] // CHECK: {device = "tpu0", launch_attr = "launch_attr"} // CHECK-SAME: () -> (tensor<*xi32>, tensor<*xi32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = "tfdtype$DT_INT32"} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = i32} %1 = "tf_device.launch"() ( { %2 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - "tf.AssignAddVariableOp"(%0, %2) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>, tensor) -> () - %3 = "tf.ReadVariableOp"(%0) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>) -> tensor<*xi32> + "tf.AssignAddVariableOp"(%0, %2) {dtype = i32} : (tensor<*x!tf.resource>, tensor) -> () + %3 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> tf_device.return %3 : tensor<*xi32> }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xi32> @@ -128,8 +128,8 @@ func @decompose_assign_sub_variable_op() -> tensor<*xi32> { %1 = "tf_device.launch"() ( { %2 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - "tf.AssignSubVariableOp"(%0, %2) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>, tensor) -> () - %3 = "tf.ReadVariableOp"(%0) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>) -> tensor<*xi32> + "tf.AssignSubVariableOp"(%0, %2) {dtype = i32} : (tensor<*x!tf.resource>, tensor) -> () + %3 = "tf.ReadVariableOp"(%0) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> tf_device.return %3 : tensor<*xi32> }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xi32> @@ -147,7 +147,7 @@ func @decompose_resource_apply_gradient_descent() -> tensor<*xf32> { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %0 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> - // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = "tfdtype$DT_FLOAT"} + // CHECK: %[[RES_READ_VAL:[0-9]*]] = 
"tf.ReadVariableOp"(%[[RES_HANDLE]]) {dtype = f32} // CHECK: %[[LAUNCH_RES:[0-9]*]]:2 = "tf_device.launch" // CHECK: %[[ALPHA:[0-9]*]] = "tf.Const" // CHECK: %[[DELTA:[0-9]*]] = "tf.Const" @@ -156,13 +156,13 @@ func @decompose_resource_apply_gradient_descent() -> tensor<*xf32> { // CHECK: tf_device.return %[[SUB]], %[[SUB]] // CHECK: {device = "tpu0", launch_attr = "launch_attr"} // CHECK-SAME: () -> (tensor<*xf32>, tensor<*xf32>) - // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = "tfdtype$DT_FLOAT"} + // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[LAUNCH_RES]]#1) {dtype = f32} %1 = "tf_device.launch"() ( { - %2 = "tf.Const"() {T = "tfdtype$DT_FLOAT", value = dense<[1.0]> : tensor<1xf32>} : () -> tensor - %3 = "tf.Const"() {T = "tfdtype$DT_FLOAT", value = dense<[0.5]> : tensor<1xf32>} : () -> tensor + %2 = "tf.Const"() {T = f32, value = dense<[1.0]> : tensor<1xf32>} : () -> tensor + %3 = "tf.Const"() {T = f32, value = dense<[0.5]> : tensor<1xf32>} : () -> tensor "tf.ResourceApplyGradientDescent"(%0, %2, %3) : (tensor<*x!tf.resource>, tensor, tensor) -> () - %4 = "tf.ReadVariableOp"(%0) {dtype = "tfdtype$DT_FLOAT"} : (tensor<*x!tf.resource>) -> tensor<*xf32> + %4 = "tf.ReadVariableOp"(%0) {dtype = f32} : (tensor<*x!tf.resource>) -> tensor<*xf32> tf_device.return %4 : tensor<*xf32> }) {device = "tpu0", launch_attr = "launch_attr"} : () -> tensor<*xf32> @@ -184,13 +184,13 @@ func @internal_resource() -> tensor<*xi32> { %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf.resource> // CHECK: %[[RES_READ_VAL:[0-9]*]] = "tf.ReadVariableOp"(%[[RES_HANDLE]]) - %2 = "tf.ReadVariableOp"(%1) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>) -> tensor<*xi32> + %2 = "tf.ReadVariableOp"(%1) {dtype = i32} : (tensor<*x!tf.resource>) -> tensor<*xi32> // CHECK: %[[COMPUTE_RES:[0-9]*]] = "tf.SomeComputation"(%[[RES_READ_VAL]]) %3 = "tf.SomeComputation"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) // CHECK: "tf.AssignVariableOp"(%[[RES_HANDLE]], %[[COMPUTE_RES]]) - "tf.AssignVariableOp"(%1, %3) {dtype = "tfdtype$DT_INT32"} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () + "tf.AssignVariableOp"(%1, %3) {dtype = i32} : (tensor<*x!tf.resource>, tensor<*xi32>) -> () // CHECK: tf_device.return %[[COMPUTE_RES]] tf_device.return %3 : tensor<*xi32> diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index 7aa5c19fead..e764c6b0b87 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -91,28 +91,17 @@ struct ResourceOpLiftingPass : public FunctionPass { // template LogicalResult RewriteCompositeAssignVariableOp(T src_op, OpBuilder* builder) { - // Read mangled dtype, which indicates type of data stored in resource + // Read dtype attribute, which indicates type of data stored in resource // variable. It can then be used to construct type needed for both // ReadVariableOp and AssignVariableOp. 
- StringAttr mangled_dtype_attr = - src_op.template getAttrOfType(kDTypeAttr); - std::string type_string = mangled_dtype_attr.getValue(); - tensorflow::DataType dtype_proto; - auto s = - tensorflow::mangling_util::DemangleDataType(type_string, &dtype_proto); - if (!s.ok()) return src_op.emitError() << s.error_message(); - - Type type; - s = tensorflow::ConvertDataType(dtype_proto, *builder, &type); - if (!s.ok()) return src_op.emitError() << s.error_message(); - type = UnrankedTensorType::get(type); + TypeAttr dtype_attr = src_op.template getAttrOfType(kDTypeAttr); + Type type = UnrankedTensorType::get(dtype_attr.getValue()); builder->setInsertionPoint(src_op); auto read_variable_op = builder->create( src_op.getLoc(), type, src_op.resource()); - read_variable_op.setAttr(builder->getIdentifier(kDTypeAttr), - mangled_dtype_attr); + read_variable_op.setAttr(builder->getIdentifier(kDTypeAttr), dtype_attr); Value* result; if (std::is_same()) { @@ -125,8 +114,7 @@ LogicalResult RewriteCompositeAssignVariableOp(T src_op, OpBuilder* builder) { auto assign_variable_op = builder->create( src_op.getLoc(), src_op.resource(), result); - assign_variable_op.setAttr(builder->getIdentifier(kDTypeAttr), - mangled_dtype_attr); + assign_variable_op.setAttr(builder->getIdentifier(kDTypeAttr), dtype_attr); src_op.erase(); return success(); @@ -147,22 +135,15 @@ LogicalResult RewriteCompositeAssignVariableOp(T src_op, OpBuilder* builder) { // tf.AssignVariableOp(%var, %new_var_val) LogicalResult RewriteResourceApplyGradientDescentOp( TF::ResourceApplyGradientDescentOp op, OpBuilder* builder) { - Type type = op.alpha()->getType(); - auto t = UnrankedTensorType::get(type.cast().getElementType()); + Type type = getElementTypeOrSelf(op.alpha()); + auto t = UnrankedTensorType::get(type); - tensorflow::DataType data_type; - auto s = tensorflow::ConvertToDataType(type, &data_type); - if (!s.ok()) return op.emitError() << s.error_message(); - - std::string mangled_data_type = - tensorflow::mangling_util::MangleDataType(data_type); - auto mangled_dtype_attr = builder->getStringAttr(mangled_data_type); + TypeAttr dtype_attr = TypeAttr::get(type); builder->setInsertionPoint(op); auto read_variable_op = builder->create(op.getLoc(), t, op.var()); - read_variable_op.setAttr(builder->getIdentifier(kDTypeAttr), - mangled_dtype_attr); + read_variable_op.setAttr(builder->getIdentifier(kDTypeAttr), dtype_attr); auto mul_op = builder->create(op.getLoc(), t, op.alpha(), op.delta()); @@ -170,8 +151,7 @@ LogicalResult RewriteResourceApplyGradientDescentOp( op.getLoc(), t, read_variable_op.value(), mul_op.z()); auto assign_variable_op = builder->create(op.getLoc(), op.var(), sub_op.z()); - assign_variable_op.setAttr(builder->getIdentifier(kDTypeAttr), - mangled_dtype_attr); + assign_variable_op.setAttr(builder->getIdentifier(kDTypeAttr), dtype_attr); op.erase(); diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 3659a6b5a2b..3bf2c34e2c7 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -945,9 +945,11 @@ StatusOr ImporterBase::ConvertAttributeValue( return builder_.getFloatAttr(builder_.getF32Type(), value.f()); case AttrValue::kB: return builder_.getBoolAttr(value.b()); - case AttrValue::kType: - return builder_.getStringAttr( - mangling_util::MangleDataType(value.type())); + case AttrValue::kType: { + mlir::Type type; + 
TF_RETURN_IF_ERROR(ConvertDataType(value.type(), builder_, &type)); + return mlir::TypeAttr::get(type); + } case AttrValue::kShape: return builder_.getStringAttr(mangling_util::MangleShape(value.shape())); case AttrValue::kTensor: From c8e94d4a6e76d079b01de59cafe9e923c178daf6 Mon Sep 17 00:00:00 2001 From: Yunlu Li Date: Fri, 6 Dec 2019 13:34:26 -0800 Subject: [PATCH 297/383] Fix msan failure. PiperOrigin-RevId: 284251368 Change-Id: I8e04e6340b1d82e467a3366aa04b68a7b43e1cb1 --- tensorflow/lite/c/common_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/c/common_test.cc b/tensorflow/lite/c/common_test.cc index 65c6ec63b28..7230adff0e9 100644 --- a/tensorflow/lite/c/common_test.cc +++ b/tensorflow/lite/c/common_test.cc @@ -106,7 +106,7 @@ TEST(Quantization, TestQuantizationFree) { } TEST(Sparsity, TestSparsityFree) { - TfLiteTensor t; + TfLiteTensor t = {}; // Set these values, otherwise TfLiteTensorFree has uninitialized values. t.allocation_type = kTfLiteArenaRw; t.dims = nullptr; From 0fd34ccc179c355ad86d97af539526e7036b9ae9 Mon Sep 17 00:00:00 2001 From: Shanqing Cai Date: Fri, 6 Dec 2019 13:57:21 -0800 Subject: [PATCH 298/383] [tfdbg] Add CURT_HEALTH mode to enable_dump_debug_info() - Add unit tests for CURT_HEALTH - Add CURT_HEALTH to example 'debug_mnist_v2' PiperOrigin-RevId: 284256308 Change-Id: Ia5b78da714cf3d43978d063fd44aba9bfed91cff --- .../debug/examples/v2/debug_mnist_v2.py | 4 +- .../python/debug/lib/dumping_callback.py | 58 +++++++++++++++++-- .../python/debug/lib/dumping_callback_test.py | 41 ++++++++++++- tensorflow/python/framework/tensor_util.py | 4 +- 4 files changed, 96 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/debug/examples/v2/debug_mnist_v2.py b/tensorflow/python/debug/examples/v2/debug_mnist_v2.py index fab52b9cfd6..9d410b36c98 100644 --- a/tensorflow/python/debug/examples/v2/debug_mnist_v2.py +++ b/tensorflow/python/debug/examples/v2/debug_mnist_v2.py @@ -99,8 +99,8 @@ def parse_args(): "--dump_tensor_debug_mode", type=str, default="NO_TENSOR", - help="Mode for dumping tensor values. Options: NO_TENSOR, FULL_TENSOR. " - "This is relevant only when --dump_dir is set.") + help="Mode for dumping tensor values. Options: NO_TENSOR, CURT_HEALTH, " + "FULL_TENSOR. This is relevant only when --dump_dir is set.") # TODO(cais): Add more tensor debug mode strings once they are supported. parser.add_argument( "--dump_circular_buffer_size", diff --git a/tensorflow/python/debug/lib/dumping_callback.py b/tensorflow/python/debug/lib/dumping_callback.py index ab3fceca532..b0cac891dde 100644 --- a/tensorflow/python/debug/lib/dumping_callback.py +++ b/tensorflow/python/debug/lib/dumping_callback.py @@ -27,6 +27,7 @@ import weakref from six.moves import xrange # pylint: disable=redefined-builtin +from tensorflow.core.framework import tensor_pb2 from tensorflow.core.protobuf import debug_event_pb2 from tensorflow.core.protobuf import graph_debug_info_pb2 from tensorflow.python.debug.lib import debug_events_writer @@ -61,6 +62,10 @@ def _get_id(): return str(uuid.uuid4()) +def _concrete_tensor_to_proto(tensor): + return tensor_util.make_tensor_proto(tensor.numpy()) + + class _DumpingCallback(object): """An object holding the states surrouding the dumping callback.""" @@ -276,7 +281,6 @@ class _DumpingCallback(object): automatic control dependencies (see `auto_control_deps.py`) instead of tensor overriding. """ - del tensor_ids # Unused currently. # TODO(b/144441464, b/144440920, b/144440922): Make use of it. 
tensor_debug_mode = self._tensor_debug_mode @@ -293,7 +297,6 @@ class _DumpingCallback(object): instrumented_tensors.append(tensor) continue if is_v1_graph_mode and not tensor.dtype.is_numpy_compatible: - instrumented_tensors.append(tensor) continue # Except in V1 graph mode + control flow, debug_identity_v2 trigger auto @@ -317,6 +320,30 @@ class _DumpingCallback(object): debug_tensor.op) instrumented_tensors.append(identity) return instrumented_tensors + elif tensor_debug_mode == debug_event_pb2.TensorDebugMode.CURT_HEALTH: + for output_slot, tensor in enumerate(tensors): + if (not self._should_dump_tensor(op_type, tensor.dtype) or + not tensor.dtype.is_floating): + if is_v1_graph_mode: + instrumented_tensors.append(tensor) + continue + debug_tensor = gen_debug_ops.debug_identity_v2( + gen_debug_ops.debug_numeric_summary_v2( + tensor, + tensor_id=tensor_ids[output_slot], + tensor_debug_mode=self._tensor_debug_mode, + output_dtype=dtypes.float64), + tfdbg_context_id=tfdbg_context_id, + op_name=op_name, + output_slot=output_slot, + tensor_debug_mode=self._tensor_debug_mode, + debug_urls=debug_urls) + if is_v1_graph_mode: + identity = array_ops.identity(tensor) + identity.op._add_control_input( # pylint: disable=protected-access + debug_tensor.op) + instrumented_tensors.append(identity) + return instrumented_tensors elif tensor_debug_mode == debug_event_pb2.TensorDebugMode.FULL_TENSOR: for output_slot, tensor in enumerate(tensors): if (not self._should_dump_tensor(op_type, tensor.dtype) or @@ -377,7 +404,8 @@ class _DumpingCallback(object): output_tensor_ids=output_tensor_ids, tensor_debug_mode=tensor_debug_mode, code_location=self._process_stack_frames()) - elif tensor_debug_mode == debug_event_pb2.TensorDebugMode.FULL_TENSOR: + elif tensor_debug_mode in (debug_event_pb2.TensorDebugMode.CURT_HEALTH, + debug_event_pb2.TensorDebugMode.FULL_TENSOR): execution_proto = debug_event_pb2.Execution( op_type=op_type, num_outputs=len(tensors), @@ -389,8 +417,20 @@ class _DumpingCallback(object): for tensor in tensors: if (self._should_dump_tensor(op_type, tensor.dtype) and tensor.dtype.is_numpy_compatible): - execution_proto.tensor_protos.append( - tensor_util.make_tensor_proto(tensor.numpy())) + if tensor_debug_mode == debug_event_pb2.TensorDebugMode.CURT_HEALTH: + if tensor.dtype.is_floating: + tensor_proto = _concrete_tensor_to_proto( + gen_debug_ops.debug_numeric_summary_v2( + tensor, + tensor_debug_mode=tensor_debug_mode, + output_dtype=dtypes.float64)) + else: + # A placeholder for non-floating-type output tensors. + tensor_proto = tensor_pb2.TensorProto() + elif tensor_debug_mode == debug_event_pb2.TensorDebugMode.FULL_TENSOR: + tensor_proto = _concrete_tensor_to_proto(tensor) + if tensor_proto: + execution_proto.tensor_protos.append(tensor_proto) return execution_proto else: raise NotImplementedError( @@ -427,6 +467,10 @@ class _DumpingCallback(object): return self._instrument_symbolic_tensors( outputs, op_type, op_name, context_id, output_tensor_ids) else: + if compat.as_bytes(op_type) == b"DebugNumericSummaryV2": + # TODO(b/140334369): Remove this special casing logic once op_callback. + # automatically prevents infinite recursion in eager mode. 
+ return None context_id = self._func_graph_id_from_func_name(op_type) input_ids = [t._id for t in inputs] # pylint:disable=protected-access writer.WriteExecution(self._dump_eager_tensors( @@ -605,10 +649,12 @@ def enable_dump_debug_info(dump_root, tensor_debug_mode = debug_event_pb2.TensorDebugMode.Value(tensor_debug_mode) if tensor_debug_mode not in (debug_event_pb2.TensorDebugMode.NO_TENSOR, + debug_event_pb2.TensorDebugMode.CURT_HEALTH, debug_event_pb2.TensorDebugMode.FULL_TENSOR): raise NotImplementedError( "tfdbg dumping: support for tensor debug mode %s is not " - "implemented yet" % tensor_debug_mode) + "implemented yet" % + debug_event_pb2.TensorDebugMode.Name(tensor_debug_mode)) # Validate the types of tensor_dtypes. if tensor_dtypes is not None: diff --git a/tensorflow/python/debug/lib/dumping_callback_test.py b/tensorflow/python/debug/lib/dumping_callback_test.py index a15fd2c20c1..9400610b946 100644 --- a/tensorflow/python/debug/lib/dumping_callback_test.py +++ b/tensorflow/python/debug/lib/dumping_callback_test.py @@ -87,10 +87,11 @@ class TracingCallbackTest( @parameterized.named_parameters( ("NoTensor", "NO_TENSOR"), + ("CurtHealth", "CURT_HEALTH"), ("FullTensor", "FULL_TENSOR"), ) def testPureEagerOpExecution(self, tensor_debug_mode): - """Test catching Infinity in eager op execution: float32.""" + """Test dumping data from eager op execution: float32.""" writer = dumping_callback.enable_dump_debug_info( self.dump_root, tensor_debug_mode=tensor_debug_mode) @@ -137,6 +138,14 @@ class TracingCallbackTest( # Due to the NO_TENSOR tensor debug mode, tensor_protos ought to # be empty. self.assertFalse(execution.tensor_protos) + elif tensor_debug_mode == "CURT_HEALTH": + self.assertLen(execution.tensor_protos, 1) + if execution.op_type in ("AddV2", "Mul", "RealDiv"): + # 1st element: -1 is the unset tensor_id for eager op execution. + # 2nd element: 0 means there is no inf or nan. + self.assertAllClose( + tensor_util.MakeNdarray(execution.tensor_protos[0]), + [-1.0, 0.0]) elif tensor_debug_mode == "FULL_TENSOR": # Under the FULL_TENSOR mode, the value of the tensor should be # available through `tensor_protos`. @@ -195,6 +204,7 @@ class TracingCallbackTest( @parameterized.named_parameters( ("NoTensor", "NO_TENSOR"), + ("CurtHealth", "CURT_HEALTH"), ("FullTensor", "FULL_TENSOR"), ) @test_util.run_in_graph_and_eager_modes @@ -229,6 +239,7 @@ class TracingCallbackTest( stack_frame_by_id = self._readAndCheckSourceFilesAndStackFrames() (context_ids, op_types, op_name_to_op_type, op_name_to_context_id) = self._readAndCheckGraphsFile(stack_frame_by_id) + self.assertIn("AddV2", op_types) self.assertIn("Log", op_types) self.assertIn("Sin", op_types) @@ -256,6 +267,15 @@ class TracingCallbackTest( for tensor_value in tensor_values: self.assertEqual(tensor_value.dtype, np.float32) self.assertEqual(tensor_value.shape, (0,)) + elif tensor_debug_mode == "CURT_HEALTH": + for tensor_value in tensor_values: + self.assertLen(tensor_value, 2) + # 1st element: tensor_id, should be >= 0. + # TODO(cais): Assert on detailed value once Function-graph association + # is in place. + self.assertGreaterEqual(tensor_value[0], 0) + # 2nd element: 0 means there is no inf or nan. + self.assertEqual(tensor_value[1], 0) elif tensor_debug_mode == "FULL_TENSOR": self.assertAllClose(tensor_values[0], 5.0) # 1st AddV2 op. self.assertAllClose(tensor_values[1], np.log(5.0)) # Log op. 
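# A minimal usage sketch, not part of the test file above; the dump directory
# is a made-up example path. This is the same call the tests make to turn on
# the new CURT_HEALTH mode.
from tensorflow.python.debug.lib import dumping_callback

dumping_callback.enable_dump_debug_info(
    "/tmp/tfdbg_dump_root", tensor_debug_mode="CURT_HEALTH")
# Each instrumented floating-point tensor is then recorded as a length-2
# float64 vector [tensor_id, any_inf_or_nan] rather than its full value.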
@@ -554,6 +574,15 @@ class TracingCallbackTest( for tensor_value in tensor_values: self.assertEqual(tensor_value.dtype, np.float32) self.assertEqual(tensor_value.shape, (0,)) + elif tensor_debug_mode == "CURT_TENSOR": + for tensor_value in tensor_values: + self.assertLen(tensor_value, 2) + # 1st element: tensor_id, should be >= 0. + # TODO(cais): Assert on detailed value once Function-graph association + # is in place. + self.assertGreaterEqual(tensor_value[0], 0) + # 2nd element: 0 means there is no inf or nan. + self.assertEqual(tensor_value[1], 0) elif tensor_debug_mode == "FULL_TENSOR": less_values = [ tensor_values[i] @@ -683,6 +712,7 @@ class TracingCallbackTest( @parameterized.named_parameters( ("NoTensor", "NO_TENSOR"), + ("CurtHealth", "CURT_HEALTH"), ("FullTensor", "FULL_TENSOR"), ) def testMultiThreadedExecutionWithSameSetting(self, tensor_debug_mode): @@ -735,6 +765,15 @@ class TracingCallbackTest( for tensor_value in tensor_values: self.assertEqual(tensor_value.dtype, np.float32) self.assertEqual(tensor_value.shape, (0,)) + elif tensor_debug_mode == "CURT_HEALTH": + for tensor_value in tensor_values: + self.assertLen(tensor_value, 2) + # 1st element: tensor_id, should be >= 0. + # TODO(cais): Assert on detailed value once Function-graph association + # is in place. + self.assertGreaterEqual(tensor_value[0], 0) + # 2nd element: 0 means there is no inf or nan. + self.assertEqual(tensor_value[1], 0) elif tensor_debug_mode == "FULL_TENSOR": mul_values = [ tensor_values[i] diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py index f28b10aaf1d..4fcee63f464 100644 --- a/tensorflow/python/framework/tensor_util.py +++ b/tensorflow/python/framework/tensor_util.py @@ -566,9 +566,9 @@ def MakeNdarray(tensor): """Create a numpy ndarray from a tensor. Create a numpy ndarray with the same shape and data as the tensor. - + For example: - + ```python # Tensor a has shape (2,3) a = tf.constant([[1,2,3],[4,5,6]]) From 3e31f6191ce11483ac8d52f2c471ef7bc1f9175a Mon Sep 17 00:00:00 2001 From: Ruoxin Sang Date: Fri, 6 Dec 2019 13:57:57 -0800 Subject: [PATCH 299/383] Throw an explicit error if user call TPUStrategy experimental_run_v2 in eager mode with a python function. 
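The sketch below, with a placeholder TPU name, illustrates the behavior this change enforces: passing a plain Python callable to `experimental_run_v2` in eager mode now raises NotImplementedError, while a `tf.function`-wrapped callable (or a call made from inside a `tf.function`) is accepted.

    import tensorflow as tf

    resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="my-tpu")  # placeholder name
    tf.config.experimental_connect_to_cluster(resolver)
    tf.tpu.experimental.initialize_tpu_system(resolver)
    strategy = tf.distribute.experimental.TPUStrategy(resolver)

    def step_fn():
      return tf.constant(1.0)

    try:
      strategy.experimental_run_v2(step_fn)  # plain Python function: rejected
    except NotImplementedError as e:
      print(e)

    strategy.experimental_run_v2(tf.function(step_fn))  # tf.function: accepted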
PiperOrigin-RevId: 284256438 Change-Id: I6c0ea86a4cdf84712a1940d44dae8d4044f0fdee --- .../distribute/custom_training_loop_test.py | 5 +- tensorflow/python/distribute/tpu_strategy.py | 60 ++++++++++++-- tensorflow/python/distribute/values_test.py | 79 ++++++++++++++++--- 3 files changed, 123 insertions(+), 21 deletions(-) diff --git a/tensorflow/python/distribute/custom_training_loop_test.py b/tensorflow/python/distribute/custom_training_loop_test.py index 55c2ae6a1ca..9be72c36c5f 100644 --- a/tensorflow/python/distribute/custom_training_loop_test.py +++ b/tensorflow/python/distribute/custom_training_loop_test.py @@ -36,9 +36,8 @@ class InputIterationTest(test.TestCase, parameterized.TestCase): @combinations.generate( combinations.combine( - distribution=strategy_combinations.all_strategies, - mode=["eager"] - )) + distribution=strategy_combinations.strategies_minus_tpu, + mode=["eager"])) def testFullEager(self, distribution): dataset = self._get_dataset() diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index 2dd4309537a..8f32e8e2226 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -37,6 +37,7 @@ from tensorflow.python.distribute import values from tensorflow.python.distribute.cluster_resolver import TPUClusterResolver from tensorflow.python.eager import context from tensorflow.python.eager import def_function +from tensorflow.python.eager import function from tensorflow.python.framework import constant_op from tensorflow.python.framework import device_spec from tensorflow.python.framework import dtypes @@ -82,6 +83,29 @@ def maybe_init_scope(): yield +def validate_experimental_run_function(fn): + """Validate the function passed into strategy.experimental_run_v2.""" + + # We allow three types of functions/objects passed into TPUStrategy + # experimental_run_v2 in eager mode: + # 1. a user annotated tf.function + # 2. a ConcreteFunction, this is mostly what you get from loading a saved + # model. + # 3. a callable object and the `__call__` method itself is a tf.function. + # + # Otherwise we return an error, because we don't support eagerly running + # experimental_run_v2 in TPUStrategy. + + if context.executing_eagerly() and not isinstance( + fn, def_function.Function) and not isinstance( + fn, function.ConcreteFunction) and not (callable(fn) and isinstance( + fn.__call__, def_function.Function)): + raise NotImplementedError( + "TPUStrategy.experimental_run_v2(fn, ...) does not support eager " + "execution. Either convert `fn` into a tf.function or consider " + "calling strategy.experimental_run_v2 inside a tf.function.") + + @tf_export("distribute.experimental.TPUStrategy", v1=[]) class TPUStrategy(distribute_lib.Strategy): """TPU distribution strategy implementation.""" @@ -89,14 +113,36 @@ class TPUStrategy(distribute_lib.Strategy): def __init__(self, tpu_cluster_resolver=None, device_assignment=None): - """Initializes the TPUStrategy object. + """Synchronous training in TPU donuts or Pods. 
+ + To construct a TPUStrategy object, you need to run the + initialization code as below: + + ```python + resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=FLAGS.tpu) + tf.config.experimental_connect_to_cluster(resolver) + tf.tpu.experimental.initialize_tpu_system(resolver) + strategy = tf.distribute.experimental.TPUStrategy(resolver) + ``` + + While using distribution strategies, the variables created within strategy's + scope will be replicated across all the replicas and can be kept in sync + using all-reduce algorithms. + + To run TF2 programs on TPUs, you can either use `.compile` and + `.fit` APIs in `tf.keras` with TPUStrategy, or write your own customized + training loop by calling `strategy.experimental_run_v2` directly. Note that + TPUStrategy doesn't support pure eager execution, so please make sure the + function passed into `strategy.experimental_run_v2` is a `tf.function` or + `strategy.experimental_run_v2` us called inside a `tf.function` if running + in eager mode. Args: tpu_cluster_resolver: A tf.distribute.cluster_resolver.TPUClusterResolver, - which provides information about the TPU cluster. + which provides information about the TPU cluster. device_assignment: Optional `tf.tpu.experimental.DeviceAssignment` to - specify the placement of replicas on the TPU cluster. Currently only - supports the usecase of using a single core within a TPU cluster. + specify the placement of replicas on the TPU cluster. Currently only + supports the usecase of using a single core within a TPU cluster. """ super(TPUStrategy, self).__init__(TPUExtended( self, tpu_cluster_resolver, device_assignment=device_assignment)) @@ -111,6 +157,8 @@ class TPUStrategy(distribute_lib.Strategy): # This implementation runs a single step. It does not use infeed or outfeed. def experimental_run_v2(self, fn, args=(), kwargs=None): """See base class.""" + validate_experimental_run_function(fn) + # Note: the target function is converted to graph even when in Eager mode, # so autograph is on by default here. fn = autograph.tf_convert(fn, ag_ctx.control_status_ctx()) @@ -157,6 +205,8 @@ class TPUStrategyV1(distribute_lib.StrategyV1): # This implementation runs a single step. It does not use infeed or outfeed. def experimental_run_v2(self, fn, args=(), kwargs=None): """See base class.""" + validate_experimental_run_function(fn) + fn = autograph.tf_convert(fn, ag_ctx.control_status_ctx()) return self.extended.tpu_run(fn, args, kwargs) @@ -699,7 +749,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): ] # Workaround for `tpu.replicate` behaviour when single `Tensor` returned. 
- if result[0] is None: + if result[0] is None or isinstance(result[0], ops.Operation): replicate_outputs = [None] * len(replicate_outputs) else: replicate_outputs = [ diff --git a/tensorflow/python/distribute/values_test.py b/tensorflow/python/distribute/values_test.py index d97d1155c82..26d0eb3ac32 100644 --- a/tensorflow/python/distribute/values_test.py +++ b/tensorflow/python/distribute/values_test.py @@ -818,13 +818,31 @@ class SyncOnReadVariablePropertiesTest(test.TestCase): self.assertEqual(2., self.evaluate(add1(replica_local))) -@combinations.generate( - combinations.combine( - distribution=[ - strategy_combinations.mirrored_strategy_with_gpu_and_cpu, - strategy_combinations.tpu_strategy, - ], - mode=["graph", "eager"])) +def mirrored_and_tpu_strategy_combinations(): + return combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + strategy_combinations.tpu_strategy, + ], + mode=["graph", "eager"]) + + +def strategy_and_run_tf_function_combinations(): + # Test the combination of different strategies and whether a tf.function + # is passed into strategy.experimental_run_v2.""" + return combinations.combine( + distribution=[ + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + ], + mode=["graph", "eager"], + experimental_run_tf_function=[True, False]) + combinations.combine( + distribution=[ + strategy_combinations.tpu_strategy, + ], + mode=["graph", "eager"], + experimental_run_tf_function=[True]) + + class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): def _assign_replica_local(self, v, new): @@ -842,6 +860,7 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): save_path, _ = self._save_return_saver(sess, var) return save_path + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution): with self.cached_session() as sess: v, replica_local = _make_replica_local( @@ -862,6 +881,7 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): saver.restore(sess, save_path) self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]])) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testSaveAndRestoreReplicaLocalMeanOneGraph(self, distribution): if context.num_gpus() < 1 and context.executing_eagerly(): self.skipTest("A GPU is not available for this test in eager mode.") @@ -978,36 +998,46 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): saver.restore(sess, save_path) self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]])) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testSaveReplicaLocalRestoreReplicaLocalMean(self, distribution): save_path = self._save_replica_local_mean(distribution) self._restore_replica_local_mean(save_path, distribution) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testSaveReplicaLocalRestoreReplicaLocalSum(self, distribution): save_path = self._save_replica_local_sum(distribution) self._restore_replica_local_sum(save_path, distribution) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testSaveReplicaLocalMeanRestoreNormal(self, distribution): save_path = self._save_replica_local_mean(distribution) self._restore_normal(save_path) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testSaveReplicaLocalSumRestoreNormal(self, distribution): save_path = self._save_replica_local_sum(distribution) self._restore_normal(save_path) + 
@combinations.generate(mirrored_and_tpu_strategy_combinations()) def testSaveNormalRestoreReplicaLocalMean(self, distribution): save_path = self._save_normal() self._restore_replica_local_mean(save_path, distribution) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testSaveNormalRestoreReplicaLocalSum(self, distribution): save_path = self._save_normal() self._restore_replica_local_sum(save_path, distribution) - def testAssign(self, distribution): + @combinations.generate(strategy_and_run_tf_function_combinations()) + def testAssign(self, distribution, experimental_run_tf_function): + def assign(fn, v, update_value, cross_replica): update_fn = lambda: getattr(v, fn)(update_value) if cross_replica: return update_fn() else: + if experimental_run_tf_function: + update_fn = def_function.function(update_fn) return distribution.experimental_local_results( distribution.experimental_run_v2(update_fn)) updates = [("assign", 1.), ("assign_add", 1.), ("assign_sub", -1.)] @@ -1033,12 +1063,17 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(self.evaluate(component.read_value()), self.evaluate(array_ops.ones_like(component))) - def testAssignDtypeConversion(self, distribution): + @combinations.generate(strategy_and_run_tf_function_combinations()) + def testAssignDtypeConversion(self, distribution, + experimental_run_tf_function): + def assign(fn, v, update_value, cross_replica): update_fn = lambda: getattr(v, fn)(update_value) if cross_replica: return update_fn() else: + if experimental_run_tf_function: + update_fn = def_function.function(update_fn) return distribution.experimental_local_results( distribution.experimental_run_v2(update_fn)) updates = [("assign", 1), ("assign_add", 1), ("assign_sub", -1)] @@ -1064,6 +1099,7 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(self.evaluate(component.read_value()), self.evaluate(array_ops.ones_like(component))) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testAssignWithAggregationSum(self, distribution): with distribution.scope(): v = variable_scope.variable( @@ -1076,6 +1112,7 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): self.assertAllEqual(self.evaluate(component.read_value()), self.evaluate(array_ops.ones_like(component))) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testAssignAddSubWithAggregationSum(self, distribution): with distribution.scope(): v = variable_scope.variable( @@ -1090,7 +1127,9 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): ValueError, "SyncOnReadVariable does not support "): self.evaluate(v.assign_sub(1.)) - def testReadValueInReplicaContext(self, distribution): + @combinations.generate(strategy_and_run_tf_function_combinations()) + def testReadValueInReplicaContext(self, distribution, + experimental_run_tf_function): aggregations = [ variables_lib.VariableAggregation.NONE, variables_lib.VariableAggregation.SUM, @@ -1104,12 +1143,19 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): synchronization=variables_lib.VariableSynchronization.ON_READ, aggregation=aggregation) self.evaluate(variables_lib.global_variables_initializer()) - results = self.evaluate(distribution.experimental_local_results( - distribution.experimental_run_v2(v.read_value))) + if experimental_run_tf_function: + read_var_fn = def_function.function(v.read_value) + else: + read_var_fn = v.read_value + results = self.evaluate( + 
distribution.experimental_local_results( + distribution.experimental_run_v2(read_var_fn))) for component, value in zip(v._values, results): self.assertAllEqual(self.evaluate(component.read_value()), value) - def testReadValueInCrossReplicaContext(self, distribution): + @combinations.generate(strategy_and_run_tf_function_combinations()) + def testReadValueInCrossReplicaContext(self, distribution, + experimental_run_tf_function): aggregations = [ variables_lib.VariableAggregation.SUM, variables_lib.VariableAggregation.MEAN, @@ -1125,10 +1171,15 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): synchronization=variables_lib.VariableSynchronization.ON_READ, aggregation=aggregation) self.evaluate(variables_lib.global_variables_initializer()) + def assign(v=v): ctx = distribution_strategy_context.get_replica_context() replica_id = ctx.replica_id_in_sync_group return v.assign(math_ops.cast(replica_id, dtypes.float32)) + + if experimental_run_tf_function: + assign = def_function.function(assign) + self.evaluate(distribution.experimental_local_results( distribution.experimental_run_v2(assign))) result = self.evaluate(v.read_value()) @@ -1142,6 +1193,7 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): expected = 0 self.assertEqual(expected, result, aggregation) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testReadValueWithAggregationNoneInCrossReplicaContext(self, distribution): with distribution.scope(): v = variable_scope.variable( @@ -1153,6 +1205,7 @@ class SyncOnReadVariableTest(test.TestCase, parameterized.TestCase): ValueError, "Could not convert from .* VariableAggregation\\.NONE"): self.evaluate(v.read_value()) + @combinations.generate(mirrored_and_tpu_strategy_combinations()) def testInitializedToSameValueInsideEagerRun(self, distribution): if not context.executing_eagerly(): self.skipTest("eager only") From d58e8594e146636605e2669c646bd0bfcd454e0e Mon Sep 17 00:00:00 2001 From: Francois Chollet Date: Fri, 6 Dec 2019 14:02:05 -0800 Subject: [PATCH 300/383] Move image preprocessing layers to `layers/preprocessing` PiperOrigin-RevId: 284257328 Change-Id: I1118722d3084c456f982a7baaa8b65c97dae3292 --- tensorflow/python/keras/BUILD | 4 ++-- .../keras/layers/{ => preprocessing}/image_preprocessing.py | 0 .../layers/{ => preprocessing}/image_preprocessing_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) rename tensorflow/python/keras/layers/{ => preprocessing}/image_preprocessing.py (100%) rename tensorflow/python/keras/layers/{ => preprocessing}/image_preprocessing_test.py (99%) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index d6fb60fd724..b90a208292b 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -450,7 +450,6 @@ py_library( "layers/cudnn_recurrent.py", "layers/dense_attention.py", "layers/embeddings.py", - "layers/image_preprocessing.py", "layers/kernelized.py", "layers/local.py", "layers/merge.py", @@ -458,6 +457,7 @@ py_library( "layers/normalization.py", "layers/normalization_v2.py", "layers/pooling.py", + "layers/preprocessing/image_preprocessing.py", "layers/preprocessing/normalization.py", "layers/preprocessing/normalization_v1.py", "layers/preprocessing/text_vectorization.py", @@ -766,7 +766,7 @@ cuda_py_test( cuda_py_test( name = "image_preprocessing_test", size = "medium", - srcs = ["layers/image_preprocessing_test.py"], + srcs = ["layers/preprocessing/image_preprocessing_test.py"], additional_deps = [ ":keras", 
"@absl_py//absl/testing:parameterized", diff --git a/tensorflow/python/keras/layers/image_preprocessing.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing.py similarity index 100% rename from tensorflow/python/keras/layers/image_preprocessing.py rename to tensorflow/python/keras/layers/preprocessing/image_preprocessing.py diff --git a/tensorflow/python/keras/layers/image_preprocessing_test.py b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py similarity index 99% rename from tensorflow/python/keras/layers/image_preprocessing_test.py rename to tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py index 672cb181974..19433b8290f 100644 --- a/tensorflow/python/keras/layers/image_preprocessing_test.py +++ b/tensorflow/python/keras/layers/preprocessing/image_preprocessing_test.py @@ -24,7 +24,7 @@ import numpy as np from tensorflow.python.framework import errors from tensorflow.python.keras import keras_parameterized from tensorflow.python.keras import testing_utils -from tensorflow.python.keras.layers import image_preprocessing +from tensorflow.python.keras.layers.preprocessing import image_preprocessing from tensorflow.python.keras.utils.generic_utils import CustomObjectScope from tensorflow.python.ops import image_ops_impl as image_ops from tensorflow.python.ops import stateless_random_ops From 46e7fca1b606bb2a115ec2ee4f751ad2d26834cd Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 14:06:03 -0800 Subject: [PATCH 301/383] Adds a `directory` accessor to CheckpointManager. PiperOrigin-RevId: 284258465 Change-Id: Ibf2627854e0e89816e8aced66c2ca586682a8724 --- tensorflow/python/training/checkpoint_management.py | 4 ++++ .../api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt | 4 ++++ .../api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/tensorflow/python/training/checkpoint_management.py b/tensorflow/python/training/checkpoint_management.py index 8d964f7543a..5e487833879 100644 --- a/tensorflow/python/training/checkpoint_management.py +++ b/tensorflow/python/training/checkpoint_management.py @@ -615,6 +615,10 @@ class CheckpointManager(object): if timestamp > self._last_preserved_timestamp: self._maybe_delete[filename] = timestamp + @property + def directory(self): + return self._directory + @property def latest_checkpoint(self): """The prefix of the most recent checkpoint in `directory`. 
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt index d981983e938..86e25d86d53 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.train.-checkpoint-manager.pbtxt @@ -6,6 +6,10 @@ tf_class { name: "checkpoints" mtype: "" } + member { + name: "directory" + mtype: "" + } member { name: "latest_checkpoint" mtype: "" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt index d981983e938..86e25d86d53 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-manager.pbtxt @@ -6,6 +6,10 @@ tf_class { name: "checkpoints" mtype: "" } + member { + name: "directory" + mtype: "" + } member { name: "latest_checkpoint" mtype: "" From 25c58b201f27b30cc2e3247d8d6abe76774a010d Mon Sep 17 00:00:00 2001 From: Peng Wang Date: Fri, 6 Dec 2019 14:15:12 -0800 Subject: [PATCH 302/383] Fixes docstring for tf.math.logical_not PiperOrigin-RevId: 284260332 Change-Id: I2c04fa469773e4b6400c9391d931f167833161a0 --- .../core/api_def/base_api/api_def_LogicalNot.pbtxt | 14 +++++++++++++- .../api_def/python_api/api_def_LogicalNot.pbtxt | 7 +++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/api_def/base_api/api_def_LogicalNot.pbtxt b/tensorflow/core/api_def/base_api/api_def_LogicalNot.pbtxt index af29e920c9b..eaee703cf54 100644 --- a/tensorflow/core/api_def/base_api/api_def_LogicalNot.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_LogicalNot.pbtxt @@ -1,4 +1,16 @@ op { graph_op_name: "LogicalNot" - summary: "Returns the truth value of NOT x element-wise." + in_arg { + name: "x" + description: <>> tf.math.logical_not(tf.constant([True, False])) + + +END } From e7b00a13d4174953cb4f1324bfa1f3165245e462 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 14:18:05 -0800 Subject: [PATCH 303/383] Use named traits in the ODS definition of LLVMFuncOp The "FunctionLike" and "IsIsolatedFromAbove" op traits are now defined as named records in base ODS file. Use those instead of NativeOpTrait referring to the C++ class name in the ODS definition of LLVMFuncOp. NFC. PiperOrigin-RevId: 284260891 Change-Id: I5b611d726101747c46355ba9ba72c5a0f76eee3e --- third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index 8aa5397651c..959b0e38b85 100644 --- a/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/third_party/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -586,9 +586,7 @@ def LLVM_GlobalOp } def LLVM_LLVMFuncOp - : LLVM_ZeroResultOp<"func", - [NativeOpTrait<"IsIsolatedFromAbove">, - NativeOpTrait<"FunctionLike">, Symbol]>, + : LLVM_ZeroResultOp<"func", [IsolatedFromAbove, FunctionLike, Symbol]>, Arguments<(ins DefaultValuedAttr:$linkage)> { let summary = "LLVM dialect function, has wrapped LLVM IR function type"; From d14373ec7949a501494114d225fdc6fe9560f36f Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 14:22:22 -0800 Subject: [PATCH 304/383] Override EIGEN strong inline for release builds as well. 
PiperOrigin-RevId: 284261705 Change-Id: I882c786169fb2d51716c884c9b1c91b59ae2df4e --- tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh | 2 +- tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh index 0151854c43c..0152e9decc7 100644 --- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh +++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh @@ -104,7 +104,7 @@ if [[ "$RELEASE_BUILD" == 1 ]]; then # Overriding eigen strong inline speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc # by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521 # Because this hurts the performance of TF, we don't override it in release build. - export TF_OVERRIDE_EIGEN_STRONG_INLINE=1 + export TF_OVERRIDE_EIGEN_STRONG_INLINE=0 else export TF_OVERRIDE_EIGEN_STRONG_INLINE=1 fi diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh index ff767f4bd47..6dd183ceb87 100644 --- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh +++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh @@ -104,7 +104,7 @@ if [[ "$RELEASE_BUILD" == 1 ]]; then # Overriding eigen strong inline speeds up the compiling of conv_grad_ops_3d.cc and conv_ops_3d.cc # by 20 minutes. See https://github.com/tensorflow/tensorflow/issues/10521 # Because this hurts the performance of TF, we don't override it in release build. - export TF_OVERRIDE_EIGEN_STRONG_INLINE=1 + export TF_OVERRIDE_EIGEN_STRONG_INLINE=0 else export TF_OVERRIDE_EIGEN_STRONG_INLINE=1 fi From 3cc5bda704f8088b342fd126fc138348a0671e67 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Fri, 6 Dec 2019 14:23:06 -0800 Subject: [PATCH 305/383] Move GPU::LaunchOp to ODS. NFC. Move the definition of the GPU launch opreation from hand-rolled C++ code to ODS framework. This only does the moves, a follow-up is necessary to clean up users of custom functions that could be auto-generated by ODS. PiperOrigin-RevId: 284261856 Change-Id: I3bc93035fc9364c2992ea90129001697fca70cf0 --- third_party/mlir/g3doc/Dialects/GPU.md | 71 --------- .../include/mlir/Dialect/GPU/GPUDialect.h | 68 --------- .../mlir/include/mlir/Dialect/GPU/GPUOps.td | 140 ++++++++++++++++++ .../mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 64 ++++---- 4 files changed, 175 insertions(+), 168 deletions(-) diff --git a/third_party/mlir/g3doc/Dialects/GPU.md b/third_party/mlir/g3doc/Dialects/GPU.md index d34ce1891e8..bcb677d7660 100644 --- a/third_party/mlir/g3doc/Dialects/GPU.md +++ b/third_party/mlir/g3doc/Dialects/GPU.md @@ -69,77 +69,6 @@ Example: %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) ``` -### `gpu.launch` - -Launch a kernel on the specified grid of thread blocks. The body of the kernel -is defined by the single region that this operation contains. The operation -takes at least six operands, with first three operands being grid sizes along -x,y,z dimensions, the following three arguments being block sizes along x,y,z -dimension, and the remaining operands are arguments of the kernel. When a -lower-dimensional kernel is required, unused sizes must be explicitly set to -`1`. 
- -The body region has at least _twelve_ arguments, grouped as follows: - -- three arguments that contain block identifiers along x,y,z dimensions; -- three arguments that contain thread identifiers along x,y,z dimensions; -- operands of the `gpu.launch` operation as is, including six leading operands - for grid and block sizes. - -Operations inside the body region, and any operations in the nested regions, are -_not_ allowed to use values defined outside the _body_ region, as if this region -was a function. If necessary, values must be passed as kernel arguments into the -body region. Nested regions inside the kernel body are allowed to use values -defined in their ancestor regions as long as they don't cross the kernel body -region boundary. - -Syntax: - -``` {.ebnf} -operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment - `threads` `(` ssa-id-list `)` `in` ssa-reassignment - (`args` ssa-reassignment `:` type-list)? - region attr-dict? -ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` -``` - -Example: - -```mlir {.mlir} -gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2) - threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) - args(%arg0 = %6, %arg1 = 7) : f32, memref { - // Block and thread identifiers, as well as block/grid sizes are - // immediately usable inside body region. - "some_op"(%bx, %tx) : (index, index) -> () - %42 = load %arg1[%bx] : memref -} - -// Generic syntax explains how the pretty syntax maps to the IR structure. -"gpu.launch"(%cst, %cst, %c1, // Grid sizes. - %cst, %c1, %c1, // Block sizes. - %arg0, %arg1) // Actual arguments. - {/*attributes*/} - // All sizes and identifiers have "index" size. - : (index, index, index, index, index, index, f32, memref) -> () { -// The operation passes block and thread identifiers, followed by grid and block -// sizes, followed by actual arguments to the entry block of the region. -^bb0(%bx : index, %by : index, %bz : index, - %tx : index, %ty : index, %tz : index, - %num_bx : index, %num_by : index, %num_bz : index, - %num_tx : index, %num_ty : index, %num_tz : index, - %arg0 : f32, %arg1 : memref): - "some_op"(%bx, %tx) : (index, index) -> () - %3 = "std.load"(%arg1, %bx) : (memref, index) -> f32 -} -``` - -Rationale: using operation/block arguments gives analyses a clear way of -understanding that a value has additional semantics (e.g., we will need to know -what value corresponds to threadIdx.x for coalescing). We can recover these -properties by analyzing the operations producing values, but it is easier just -to have that information by construction. - ### `gpu.launch_func` Launch a kernel function on the specified grid of thread blocks. `gpu.launch` diff --git a/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h index 194dd9c1e1d..3d63a45b8ef 100644 --- a/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/third_party/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -77,74 +77,6 @@ struct KernelDim3 { Value *z; }; -/// GPU kernel launch operation. Takes a 3D grid of thread blocks as leading -/// operands, followed by kernel data operands. Has one region representing -/// the kernel to be executed. This region is not allowed to use values defined -/// outside it. 
-class LaunchOp : public Op::Impl, - OpTrait::ZeroResult, OpTrait::IsIsolatedFromAbove> { -public: - using Op::Op; - - static void build(Builder *builder, OperationState &result, Value *gridSizeX, - Value *gridSizeY, Value *gridSizeZ, Value *blockSizeX, - Value *blockSizeY, Value *blockSizeZ, - ArrayRef operands); - - /// Get the kernel region. - Region &getBody(); - - /// Get the SSA values corresponding to kernel block identifiers. - KernelDim3 getBlockIds(); - /// Get the SSA values corresponding to kernel thread identifiers. - KernelDim3 getThreadIds(); - /// Get the SSA values corresponding to kernel grid size. - KernelDim3 getGridSize(); - /// Get the SSA values corresponding to kernel block size. - KernelDim3 getBlockSize(); - /// Get the operand values passed as kernel arguments. - operand_range getKernelOperandValues(); - /// Get the operand types passed as kernel arguments. - operand_type_range getKernelOperandTypes(); - - /// Get the SSA values passed as operands to specify the grid size. - KernelDim3 getGridSizeOperandValues(); - /// Get the SSA values passed as operands to specify the block size. - KernelDim3 getBlockSizeOperandValues(); - - /// Get the SSA values of the kernel arguments. - llvm::iterator_range getKernelArguments(); - - LogicalResult verify(); - - /// Custom syntax support. - void print(OpAsmPrinter &p); - static ParseResult parse(OpAsmParser &parser, OperationState &result); - - static StringRef getOperationName() { return "gpu.launch"; } - - /// Erase the `index`-th kernel argument. Both the entry block argument and - /// the operand will be dropped. The block argument must not have any uses. - void eraseKernelArgument(unsigned index); - - /// Append canonicalization patterns to `results`. - static void getCanonicalizationPatterns(OwningRewritePatternList &results, - MLIRContext *context); - -private: - static StringRef getBlocksKeyword() { return "blocks"; } - static StringRef getThreadsKeyword() { return "threads"; } - static StringRef getArgsKeyword() { return "args"; } - - /// The number of launch configuration operands, placed at the leading - /// positions of the operand list. - static constexpr unsigned kNumConfigOperands = 6; - - /// The number of region attributes containing the launch configuration, - /// placed in the leading positions of the argument list. - static constexpr unsigned kNumConfigRegionAttributes = 12; -}; - /// Operation to launch a kernel given as outlined function. class LaunchFuncOp : public Op::Impl, OpTrait::ZeroResult> { diff --git a/third_party/mlir/include/mlir/Dialect/GPU/GPUOps.td b/third_party/mlir/include/mlir/Dialect/GPU/GPUOps.td index fcaa77ce779..9b4e21800bd 100644 --- a/third_party/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/third_party/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -181,6 +181,146 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> { let parser = [{ return parseGPUFuncOp(parser, result); }]; } +def GPU_LaunchOp : GPU_Op<"launch", [IsolatedFromAbove]>, + Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ, + Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ, + Variadic:$operands)>, + Results<(outs)> { + let summary = "GPU kernel launch operation"; + + let description = [{ + Launch a kernel on the specified grid of thread blocks. The body of the + kernel is defined by the single region that this operation contains. 
The + operation takes at least six operands, with first three operands being grid + sizes along x,y,z dimensions, the following three arguments being block + sizes along x,y,z dimension, and the remaining operands are arguments of the + kernel. When a lower-dimensional kernel is required, unused sizes must be + explicitly set to `1`. + + The body region has at least _twelve_ arguments, grouped as follows: + + - three arguments that contain block identifiers along x,y,z dimensions; + - three arguments that contain thread identifiers along x,y,z dimensions; + - operands of the `gpu.launch` operation as is, including six leading + operands for grid and block sizes. + + Operations inside the body region, and any operations in the nested regions, + are _not_ allowed to use values defined outside the _body_ region, as if + this region was a function. If necessary, values must be passed as kernel + arguments into the body region. Nested regions inside the kernel body are + allowed to use values defined in their ancestor regions as long as they + don't cross the kernel body region boundary. + + Syntax: + + ``` {.ebnf} + operation ::= `gpu.launch` `block` `(` ssa-id-list `)` `in` ssa-reassignment + `threads` `(` ssa-id-list `)` `in` ssa-reassignment + (`args` ssa-reassignment `:` type-list)? + region attr-dict? + ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` + ``` + + Example: + + ```mlir {.mlir} + gpu.launch blocks(%bx, %by, %bz) in (%sz_bx = %0, %sz_by = %1, %sz_bz = %2) + threads(%tx, %ty, %tz) in (%sz_tx = %3, %sz_ty = %4, %sz_tz = %5) + args(%arg0 = %6, %arg1 = 7) : f32, memref { + // Block and thread identifiers, as well as block/grid sizes are + // immediately usable inside body region. + "some_op"(%bx, %tx) : (index, index) -> () + %42 = load %arg1[%bx] : memref + } + + // Generic syntax explains how the pretty syntax maps to the IR structure. + "gpu.launch"(%cst, %cst, %c1, // Grid sizes. + %cst, %c1, %c1, // Block sizes. + %arg0, %arg1) // Actual arguments. + {/*attributes*/} + // All sizes and identifiers have "index" size. + : (index, index, index, index, index, index, f32, memref) + -> () { + // The operation passes block and thread identifiers, followed by grid and + // block sizes, followed by actual arguments to the entry block of the + // region. + ^bb0(%bx : index, %by : index, %bz : index, + %tx : index, %ty : index, %tz : index, + %num_bx : index, %num_by : index, %num_bz : index, + %num_tx : index, %num_ty : index, %num_tz : index, + %arg0 : f32, %arg1 : memref): + "some_op"(%bx, %tx) : (index, index) -> () + %3 = "std.load"(%arg1, %bx) : (memref, index) -> f32 + } + ``` + + Rationale: using operation/block arguments gives analyses a clear way of + understanding that a value has additional semantics (e.g., we will need to + know what value corresponds to threadIdx.x for coalescing). We can recover + these properties by analyzing the operations producing values, but it is + easier just to have that information by construction. + }]; + + let regions = (region AnyRegion:$body); + + let skipDefaultBuilders = 1; + + let builders = [ + OpBuilder<"Builder *builder, OperationState &result, Value *gridSizeX," + "Value *gridSizeY, Value *gridSizeZ, Value *blockSizeX," + "Value *blockSizeY, Value *blockSizeZ," + "ArrayRef operands"> + ]; + + let hasCanonicalizer = 1; + + let extraClassDeclaration = [{ + /// Get the kernel region. + Region &getBody(); + + /// Get the SSA values corresponding to kernel block identifiers. 
+ KernelDim3 getBlockIds(); + /// Get the SSA values corresponding to kernel thread identifiers. + KernelDim3 getThreadIds(); + /// Get the SSA values corresponding to kernel grid size. + KernelDim3 getGridSize(); + /// Get the SSA values corresponding to kernel block size. + KernelDim3 getBlockSize(); + /// Get the operand values passed as kernel arguments. + operand_range getKernelOperandValues(); + /// Get the operand types passed as kernel arguments. + operand_type_range getKernelOperandTypes(); + + /// Get the SSA values passed as operands to specify the grid size. + KernelDim3 getGridSizeOperandValues(); + /// Get the SSA values passed as operands to specify the block size. + KernelDim3 getBlockSizeOperandValues(); + + /// Get the SSA values of the kernel arguments. + llvm::iterator_range getKernelArguments(); + + /// Erase the `index`-th kernel argument. Both the entry block argument and + /// the operand will be dropped. The block argument must not have any uses. + void eraseKernelArgument(unsigned index); + + static StringRef getBlocksKeyword() { return "blocks"; } + static StringRef getThreadsKeyword() { return "threads"; } + static StringRef getArgsKeyword() { return "args"; } + + /// The number of launch configuration operands, placed at the leading + /// positions of the operand list. + static constexpr unsigned kNumConfigOperands = 6; + + /// The number of region attributes containing the launch configuration, + /// placed in the leading positions of the argument list. + static constexpr unsigned kNumConfigRegionAttributes = 12; + }]; + + let parser = [{ return parseLaunchOp(parser, result); }]; + let printer = [{ printLaunchOp(p, *this); }]; + let verifier = [{ return ::verify(*this); }]; +} + def GPU_ReturnOp : GPU_Op<"return", [Terminator]>, Arguments<(ins)>, Results<(outs)> { let summary = "Terminator for GPU launch regions."; diff --git a/third_party/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/third_party/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 38998b968ad..87b170b6da8 100644 --- a/third_party/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/third_party/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -46,7 +46,7 @@ bool GPUDialect::isKernel(Operation *op) { GPUDialect::GPUDialect(MLIRContext *context) : Dialect(getDialectName(), context) { - addOperations(); @@ -244,19 +244,20 @@ llvm::iterator_range LaunchOp::getKernelArguments() { return llvm::drop_begin(args, LaunchOp::kNumConfigRegionAttributes); } -LogicalResult LaunchOp::verify() { +LogicalResult verify(LaunchOp op) { // Kernel launch takes kNumConfigOperands leading operands for grid/block // sizes and transforms them into kNumConfigRegionAttributes region arguments // for block/thread identifiers and grid/block sizes. - if (!getBody().empty()) { - Block &entryBlock = getBody().front(); - if (entryBlock.getNumArguments() != kNumConfigOperands + getNumOperands()) - return emitOpError("unexpected number of region arguments"); + if (!op.getBody().empty()) { + Block &entryBlock = op.getBody().front(); + if (entryBlock.getNumArguments() != + LaunchOp::kNumConfigOperands + op.getNumOperands()) + return op.emitOpError("unexpected number of region arguments"); } // Block terminators without successors are expected to exit the kernel region // and must be `gpu.launch`. 
- for (Block &block : getBody()) { + for (Block &block : op.getBody()) { if (block.empty()) continue; if (block.back().getNumSuccessors() != 0) @@ -265,8 +266,8 @@ LogicalResult LaunchOp::verify() { return block.back() .emitError("expected 'gpu.terminator' or a terminator with " "successors") - .attachNote(getLoc()) - << "in '" << getOperationName() << "' body region"; + .attachNote(op.getLoc()) + << "in '" << LaunchOp::getOperationName() << "' body region"; } } @@ -285,27 +286,31 @@ static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size, p << *size.z << " = " << *operands[2] << ')'; } -void LaunchOp::print(OpAsmPrinter &p) { - SmallVector operandContainer(operand_begin(), operand_end()); +void printLaunchOp(OpAsmPrinter &p, LaunchOp op) { + SmallVector operandContainer(op.operand_begin(), + op.operand_end()); ArrayRef operands(operandContainer); // Print the launch configuration. - p << getOperationName() << ' ' << getBlocksKeyword(); - printSizeAssignment(p, getGridSize(), operands.take_front(3), getBlockIds()); - p << ' ' << getThreadsKeyword(); - printSizeAssignment(p, getBlockSize(), operands.slice(3, 3), getThreadIds()); + p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword(); + printSizeAssignment(p, op.getGridSize(), operands.take_front(3), + op.getBlockIds()); + p << ' ' << op.getThreadsKeyword(); + printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3), + op.getThreadIds()); // From now on, the first kNumConfigOperands operands corresponding to grid // and block sizes are irrelevant, so we can drop them. - operands = operands.drop_front(kNumConfigOperands); + operands = operands.drop_front(LaunchOp::kNumConfigOperands); // Print the data argument remapping. - if (!getBody().empty() && !operands.empty()) { - p << ' ' << getArgsKeyword() << '('; + if (!op.getBody().empty() && !operands.empty()) { + p << ' ' << op.getArgsKeyword() << '('; for (unsigned i = 0, e = operands.size(); i < e; ++i) { if (i != 0) p << ", "; - p << *getBody().front().getArgument(kNumConfigRegionAttributes + i) + p << *op.getBody().front().getArgument( + LaunchOp::kNumConfigRegionAttributes + i) << " = " << *operands[i]; } p << ") "; @@ -321,8 +326,8 @@ void LaunchOp::print(OpAsmPrinter &p) { } } - p.printRegion(getBody(), /*printEntryBlockArgs=*/false); - p.printOptionalAttrDict(getAttrs()); + p.printRegion(op.getBody(), /*printEntryBlockArgs=*/false); + p.printOptionalAttrDict(op.getAttrs()); } // Parse the size assignment blocks for blocks and threads. These have the form @@ -361,10 +366,10 @@ parseSizeAssignment(OpAsmParser &parser, // (`args` ssa-reassignment `:` type-list)? // region attr-dict? // ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)` -ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { +ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) { // Sizes of the grid and block. - SmallVector sizes( - kNumConfigOperands); + SmallVector sizes( + LaunchOp::kNumConfigOperands); MutableArrayRef sizesRef(sizes); // Actual (data) operands passed to the kernel. @@ -372,7 +377,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // Region arguments to be created. 
SmallVector regionArgs( - kNumConfigRegionAttributes); + LaunchOp::kNumConfigRegionAttributes); MutableArrayRef regionArgsRef(regionArgs); // Parse the size assignment segments: the first segment assigns grid sizes @@ -380,11 +385,11 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // sizes and defines values for thread identifiers. In the region argument // list, identifiers precede sizes, and block-related values precede // thread-related values. - if (parser.parseKeyword(getBlocksKeyword().data()) || + if (parser.parseKeyword(LaunchOp::getBlocksKeyword().data()) || parseSizeAssignment(parser, sizesRef.take_front(3), regionArgsRef.slice(6, 3), regionArgsRef.slice(0, 3)) || - parser.parseKeyword(getThreadsKeyword().data()) || + parser.parseKeyword(LaunchOp::getThreadsKeyword().data()) || parseSizeAssignment(parser, sizesRef.drop_front(3), regionArgsRef.slice(9, 3), regionArgsRef.slice(3, 3)) || @@ -397,7 +402,7 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // so is the trailing type list. Parse it as well and use the parsed types // to resolve the operands passed to the kernel arguments. SmallVector dataTypes; - if (!parser.parseOptionalKeyword(getArgsKeyword())) { + if (!parser.parseOptionalKeyword(LaunchOp::getArgsKeyword())) { llvm::SMLoc argsLoc = parser.getCurrentLocation(); regionArgs.push_back({}); @@ -425,7 +430,8 @@ ParseResult LaunchOp::parse(OpAsmParser &parser, OperationState &result) { // block/thread identifiers and grid/block sizes, all of the `index` type. // Follow the actual kernel arguments. Type index = parser.getBuilder().getIndexType(); - dataTypes.insert(dataTypes.begin(), kNumConfigRegionAttributes, index); + dataTypes.insert(dataTypes.begin(), LaunchOp::kNumConfigRegionAttributes, + index); Region *body = result.addRegion(); return failure(parser.parseRegion(*body, regionArgs, dataTypes) || parser.parseOptionalAttrDict(result.attributes)); From fa7ff04d632bcd0d3299383d93292ade485daea6 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Fri, 6 Dec 2019 14:23:39 -0800 Subject: [PATCH 306/383] Add TPUReplicatedInput and TPUReplicatedOutput to MLIR Op Definition Spec. Updated associated descriptions for TPUReplicatedInput and TPUReplicatedOutput. PiperOrigin-RevId: 284261948 Change-Id: I16197e103128c7c1153f70367118b129cae6789d --- .../mlir/tensorflow/ir/tf_generated_ops.td | 59 +++++++++++++++++++ .../base_api/api_def_TPUReplicatedInput.pbtxt | 13 ++++ .../api_def_TPUReplicatedOutput.pbtxt | 13 +++- 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 164bbe57ee3..865ec5981a4 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -5499,6 +5499,65 @@ output. For the internal use of the distributed TPU compiler. TF_DerivedResultTypeListAttr Tresults = TF_DerivedResultTypeListAttr<0>; } +def TF_TPUReplicatedInputOp : TF_Op<"TPUReplicatedInput", [NoSideEffect]> { + let summary = "Connects N inputs to an N-way replicated TPU computation."; + + let description = [{ +This operation holds a replicated input to a `tpu.replicate()` computation subgraph. +Each replicated input has the same shape and type alongside the output. 
+ +For example: +``` +%a = "tf.opA"() +%b = "tf.opB"() +%replicated_input = "tf.TPUReplicatedInput"(%a, %b) +%computation = "tf.Computation"(%replicated_input) +``` +The above computation has a replicated input of two replicas. + }]; + + let arguments = (ins + Variadic:$inputs, + + DefaultValuedAttr:$is_mirrored_variable, + DefaultValuedAttr:$index + ); + + let results = (outs + TF_Tensor:$output + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; + TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<0>; +} + +def TF_TPUReplicatedOutputOp : TF_Op<"TPUReplicatedOutput", [NoSideEffect]> { + let summary = "Connects N outputs from an N-way replicated TPU computation."; + + let description = [{ +This operation holds a replicated output from a `tpu.replicate()` computation subgraph. +Each replicated output has the same shape and type alongside the input. + +For example: +``` +%computation = "tf.Computation"() +%replicated_output:2 = "tf.TPUReplicatedOutput"(%computation) +``` +The above computation has a replicated output of two replicas. + }]; + + let arguments = (ins + TF_Tensor:$input + ); + + let results = (outs + Variadic:$outputs + ); + + TF_DerivedResultSizeAttr num_replicas = TF_DerivedResultSizeAttr<0>; + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_TanhOp : TF_Op<"Tanh", [NoSideEffect, SameOperandsAndResultType]> { let summary = "Computes hyperbolic tangent of `x` element-wise."; diff --git a/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt index acd52a735cb..d632da17ad9 100644 --- a/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt +++ b/tensorflow/core/api_def/base_api/api_def_TPUReplicatedInput.pbtxt @@ -2,4 +2,17 @@ op { graph_op_name: "TPUReplicatedInput" visibility: HIDDEN summary: "Connects N inputs to an N-way replicated TPU computation." + description: <