From 95a7f08f1eacebb6d436dde8ab3d03e29c5a9536 Mon Sep 17 00:00:00 2001 From: Alex Hoffman Date: Mon, 3 Feb 2020 18:07:42 +0100 Subject: [PATCH 0001/1447] Removed non-existant `arm_cmplx_mag_squared_q10p6.c` from `micro_speech` example Make --- tensorflow/lite/micro/examples/micro_speech/CMSIS/Makefile.inc | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/lite/micro/examples/micro_speech/CMSIS/Makefile.inc b/tensorflow/lite/micro/examples/micro_speech/CMSIS/Makefile.inc index 245221aec96..9080e49b0f5 100644 --- a/tensorflow/lite/micro/examples/micro_speech/CMSIS/Makefile.inc +++ b/tensorflow/lite/micro/examples/micro_speech/CMSIS/Makefile.inc @@ -18,7 +18,6 @@ ifneq ($(filter CMSIS,$(ALL_TAGS)),) tensorflow/lite/micro/examples/micro_speech/CMSIS/hanning.h \ tensorflow/lite/micro/examples/micro_speech/CMSIS/sin_1k.h \ third_party/CMSIS_ext/README.md \ - third_party/CMSIS_ext/arm_cmplx_mag_squared_q10p6.h PREPROCESSOR_TEST_SRCS += $(CMSIS_PREPROCESSOR_SRCS) PREPROCESSOR_TEST_HDRS += $(CMSIS_PREPROCESSOR_HDRS) @@ -33,7 +32,6 @@ ifneq ($(filter CMSIS,$(ALL_TAGS)),) MICRO_SPEECH_HDRS += $(CMSIS_PREPROCESSOR_HDRS) THIRD_PARTY_CC_SRCS += \ - $(MAKEFILE_DIR)/downloads/CMSIS_ext/arm_cmplx_mag_squared_q10p6.c \ $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/BasicMathFunctions/arm_mult_q15.c \ $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c \ $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c \ From 62ae2478670ff9b8f0c5b553afab9444b5dc59ae Mon Sep 17 00:00:00 2001 From: zilinzhu Date: Wed, 15 Apr 2020 14:53:59 +0800 Subject: [PATCH 0002/1447] fix bug in keras when only pass run options will trigger segmentation fault --- tensorflow/python/client/tf_session_helper.cc | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/tensorflow/python/client/tf_session_helper.cc b/tensorflow/python/client/tf_session_helper.cc index 78a1613c86c..67dddebe602 100644 --- 
a/tensorflow/python/client/tf_session_helper.cc +++ b/tensorflow/python/client/tf_session_helper.cc @@ -235,18 +235,13 @@ void RunCallableHelper(tensorflow::Session* session, int64_t handle, } } - // Allocate a RunMetadata protobuf object to receive the metadata, - // if the caller is expecting any. - std::unique_ptr run_metadata_proto; - if (run_metadata != nullptr) { - run_metadata_proto.reset(new RunMetadata); - } + RunMetadata run_metadata_proto; // Run the callable. std::vector output_tensors; Py_BEGIN_ALLOW_THREADS; s = session->RunCallable(handle, input_tensors, &output_tensors, - run_metadata_proto.get()); + &run_metadata_proto); Py_END_ALLOW_THREADS; if (!s.ok()) { @@ -256,7 +251,7 @@ void RunCallableHelper(tensorflow::Session* session, int64_t handle, // If requested, serialize the RunMetadata to pass it back to the caller. if (run_metadata != nullptr) { - s = MessageToBuffer(*run_metadata_proto, run_metadata); + s = MessageToBuffer(run_metadata_proto, run_metadata); if (!s.ok()) { Set_TF_Status_from_Status(out_status, s); return; From 93787127d8edb9cfb740ebd14451fbd838a4f8b9 Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Thu, 7 May 2020 05:54:17 -0400 Subject: [PATCH 0003/1447] We need to bring in the classes from advanced_activations if there are no custom objects specified. When no custom objects are specified, our module_objects/globals() in activations.deserialize() won't contain any advanced_activations. 
--- tensorflow/python/keras/activations.py | 12 +++++++++++- tensorflow/python/keras/activations_test.py | 7 +++++++ .../python/keras/layers/advanced_activations.py | 2 ++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 0ee4a91f417..9b958af9321 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -26,6 +26,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import keras_export +from tensorflow.python.keras.layers import advanced_activations # b/123041942 # In TF 2.x, if the `tf.nn.softmax` is used as an activation function in Keras @@ -454,9 +455,18 @@ def deserialize(name, custom_objects=None): ValueError: `Unknown activation function` if the input string does not denote any defined Tensorflow activation function. """ + globs = globals() + + # only replace missing activations, when there are no custom objects + if custom_objects is None: + advanced_activations_globs = advanced_activations.get_globals() + for key,val in advanced_activations_globs.items(): + if key not in globs: + globs[key] = val + return deserialize_keras_object( name, - module_objects=globals(), + module_objects=globs, custom_objects=custom_objects, printable_module_name='activation function') diff --git a/tensorflow/python/keras/activations_test.py b/tensorflow/python/keras/activations_test.py index f951076efbb..756ab131148 100644 --- a/tensorflow/python/keras/activations_test.py +++ b/tensorflow/python/keras/activations_test.py @@ -64,12 +64,19 @@ class KerasActivationsTest(test.TestCase, parameterized.TestCase): activation = advanced_activations.LeakyReLU(alpha=0.1) layer = core.Dense(3, activation=activation) config = serialization.serialize(layer) + # with custom objects deserialized_layer = serialization.deserialize( config, 
custom_objects={'LeakyReLU': activation}) self.assertEqual(deserialized_layer.__class__.__name__, layer.__class__.__name__) self.assertEqual(deserialized_layer.activation.__class__.__name__, activation.__class__.__name__) + # without custom objects + deserialized_layer = serialization.deserialize(config) + self.assertEqual(deserialized_layer.__class__.__name__, + layer.__class__.__name__) + self.assertEqual(deserialized_layer.activation.__class__.__name__, + activation.__class__.__name__) def test_softmax(self): x = backend.placeholder(ndim=2) diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py index 7cb40c172b7..762c66461fb 100644 --- a/tensorflow/python/keras/layers/advanced_activations.py +++ b/tensorflow/python/keras/layers/advanced_activations.py @@ -28,6 +28,8 @@ from tensorflow.python.keras.utils import tf_utils from tensorflow.python.ops import math_ops from tensorflow.python.util.tf_export import keras_export +def get_globals(): + return globals() @keras_export('keras.layers.LeakyReLU') class LeakyReLU(Layer): From b42f30e171ec7db7c8b91a2f5c5b61a72eaa125f Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Wed, 3 Jun 2020 14:01:06 -0400 Subject: [PATCH 0004/1447] no need for if statement since custom object dict is checked before module objects --- tensorflow/python/keras/activations.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 9b958af9321..c1b89346e5a 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -457,12 +457,11 @@ def deserialize(name, custom_objects=None): """ globs = globals() - # only replace missing activations, when there are no custom objects - if custom_objects is None: - advanced_activations_globs = advanced_activations.get_globals() - for key,val in advanced_activations_globs.items(): - if key not in 
globs: - globs[key] = val + # only replace missing activations + advanced_activations_globs = advanced_activations.get_globals() + for key,val in advanced_activations_globs.items(): + if key not in globs: + globs[key] = val return deserialize_keras_object( name, From bb3cec3b33aa5cb8a2a8d4918ae4de203dd941e1 Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Wed, 3 Jun 2020 16:33:38 -0400 Subject: [PATCH 0005/1447] fixing pylint issues --- tensorflow/python/keras/activations.py | 2 +- tensorflow/python/keras/layers/advanced_activations.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index c1b89346e5a..960f5b18819 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -459,7 +459,7 @@ def deserialize(name, custom_objects=None): # only replace missing activations advanced_activations_globs = advanced_activations.get_globals() - for key,val in advanced_activations_globs.items(): + for key, val in advanced_activations_globs.items(): if key not in globs: globs[key] = val diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py index 762c66461fb..058ed0c8f51 100644 --- a/tensorflow/python/keras/layers/advanced_activations.py +++ b/tensorflow/python/keras/layers/advanced_activations.py @@ -28,9 +28,11 @@ from tensorflow.python.keras.utils import tf_utils from tensorflow.python.ops import math_ops from tensorflow.python.util.tf_export import keras_export + def get_globals(): return globals() + @keras_export('keras.layers.LeakyReLU') class LeakyReLU(Layer): """Leaky version of a Rectified Linear Unit. From 99d2416b614c94d9b4e74fb334ea81c3e8e15635 Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Fri, 5 Jun 2020 18:20:16 -0400 Subject: [PATCH 0006/1447] New feature. 
Use new param log_all in CSVLogger to log all elements in training even if some epochs don't contain the same elements. --- tensorflow/python/keras/callbacks.py | 35 +++++++++++++++++++---- tensorflow/python/keras/callbacks_test.py | 6 +++- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 1bca5419774..90e2b8003a7 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -2381,12 +2381,18 @@ class CSVLogger(Callback): separator: String used to separate elements in the CSV file. append: Boolean. True: append if file exists (useful for continuing training). False: overwrite existing file. + log_all: Boolean. True: log all elements, even elements that are + only recorded every x epochs (ex. validation sometimes is + only recorded every validation_freq). False: Don't log all + elements, only log the elements that are present in every epoch. """ - def __init__(self, filename, separator=',', append=False): + def __init__(self, filename, separator=',', append=False, log_all=False): self.sep = separator self.filename = path_to_string(filename) self.append = append + self.log_all = log_all + self._row_dicts = [] self.writer = None self.keys = None self.append_header = True @@ -2424,6 +2430,10 @@ class CSVLogger(Callback): if self.keys is None: self.keys = sorted(logs.keys()) + elif self.log_all and len(self.keys) < len(logs.keys()): + # have to make a new writer to accommodate for the new keys + self.keys = sorted(logs.keys()) + self.writer = None if self.model.stop_training: # We set NA so that csv parsers do not fail for this last epoch. 
@@ -2442,15 +2452,30 @@ class CSVLogger(Callback): self.csv_file, fieldnames=fieldnames, dialect=CustomDialect) - if self.append_header: + # if user wants to log all, then we append_header + # at the end of training + if self.append_header and not self.log_all: self.writer.writeheader() row_dict = collections.OrderedDict({'epoch': epoch}) - row_dict.update((key, handle_value(logs[key])) for key in self.keys) - self.writer.writerow(row_dict) - self.csv_file.flush() + row_dict.update((key, handle_value(logs[key])) + for key in self.keys if key in logs) + # if user wants to log all, then we write all rows to csv file + # at the end of training + if not self.log_all: + self.writer.writerow(row_dict) + self.csv_file.flush() + else: + self._row_dicts.append(row_dict) def on_train_end(self, logs=None): + if self.log_all: + if self.append_header: + self.writer.writeheader() + self.writer.writerows(self._row_dicts) + self._row_dicts = [] + self.csv_file.flush() + self.csv_file.close() self.writer = None diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 28f85304688..be86e19c7a2 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -1243,7 +1243,7 @@ class KerasCallbacksTest(keras_parameterized.TestCase): self.assertTrue(hasattr(reduce_on_plateau, 'min_delta')) self.assertEqual(reduce_on_plateau.min_delta, 1e-13) - def test_CSVLogger(self): + def test_CSVLogger(self, log_all=False): with self.cached_session(): np.random.seed(1337) temp_dir = self.get_temp_dir() @@ -1306,6 +1306,7 @@ class KerasCallbacksTest(keras_parameterized.TestCase): y_train, batch_size=BATCH_SIZE, validation_data=(x_test, y_test), + validation_freq=1 if not log_all else 2, callbacks=cbks, epochs=2, verbose=0) @@ -1320,6 +1321,9 @@ class KerasCallbacksTest(keras_parameterized.TestCase): os.remove(filepath) + def test_CSVLogger_log_all(self): + self.test_CSVLogger(log_all=True) + def 
test_stop_training_csv(self): # Test that using the CSVLogger callback with the TerminateOnNaN callback # does not result in invalid CSVs. From f268f059e16e619cab89a347d0ca597945e7f0d2 Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Fri, 5 Jun 2020 18:29:08 -0400 Subject: [PATCH 0007/1447] Revert "New feature. Use new param log_all in CSVLogger to log all elements in training even if some epochs don't contain the same elements." This reverts commit 204913109700abfa7fd620bf05c4603dc7795f34. --- tensorflow/python/keras/callbacks.py | 35 ++++------------------- tensorflow/python/keras/callbacks_test.py | 6 +--- 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 90e2b8003a7..1bca5419774 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -2381,18 +2381,12 @@ class CSVLogger(Callback): separator: String used to separate elements in the CSV file. append: Boolean. True: append if file exists (useful for continuing training). False: overwrite existing file. - log_all: Boolean. True: log all elements, even elements that are - only recorded every x epochs (ex. validation sometimes is - only recorded every validation_freq). False: Don't log all - elements, only log the elements that are present in every epoch. 
""" - def __init__(self, filename, separator=',', append=False, log_all=False): + def __init__(self, filename, separator=',', append=False): self.sep = separator self.filename = path_to_string(filename) self.append = append - self.log_all = log_all - self._row_dicts = [] self.writer = None self.keys = None self.append_header = True @@ -2430,10 +2424,6 @@ class CSVLogger(Callback): if self.keys is None: self.keys = sorted(logs.keys()) - elif self.log_all and len(self.keys) < len(logs.keys()): - # have to make a new writer to accommodate for the new keys - self.keys = sorted(logs.keys()) - self.writer = None if self.model.stop_training: # We set NA so that csv parsers do not fail for this last epoch. @@ -2452,30 +2442,15 @@ class CSVLogger(Callback): self.csv_file, fieldnames=fieldnames, dialect=CustomDialect) - # if user wants to log all, then we append_header - # at the end of training - if self.append_header and not self.log_all: + if self.append_header: self.writer.writeheader() row_dict = collections.OrderedDict({'epoch': epoch}) - row_dict.update((key, handle_value(logs[key])) - for key in self.keys if key in logs) - # if user wants to log all, then we write all rows to csv file - # at the end of training - if not self.log_all: - self.writer.writerow(row_dict) - self.csv_file.flush() - else: - self._row_dicts.append(row_dict) + row_dict.update((key, handle_value(logs[key])) for key in self.keys) + self.writer.writerow(row_dict) + self.csv_file.flush() def on_train_end(self, logs=None): - if self.log_all: - if self.append_header: - self.writer.writeheader() - self.writer.writerows(self._row_dicts) - self._row_dicts = [] - self.csv_file.flush() - self.csv_file.close() self.writer = None diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index be86e19c7a2..28f85304688 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -1243,7 +1243,7 @@ class 
KerasCallbacksTest(keras_parameterized.TestCase): self.assertTrue(hasattr(reduce_on_plateau, 'min_delta')) self.assertEqual(reduce_on_plateau.min_delta, 1e-13) - def test_CSVLogger(self, log_all=False): + def test_CSVLogger(self): with self.cached_session(): np.random.seed(1337) temp_dir = self.get_temp_dir() @@ -1306,7 +1306,6 @@ class KerasCallbacksTest(keras_parameterized.TestCase): y_train, batch_size=BATCH_SIZE, validation_data=(x_test, y_test), - validation_freq=1 if not log_all else 2, callbacks=cbks, epochs=2, verbose=0) @@ -1321,9 +1320,6 @@ class KerasCallbacksTest(keras_parameterized.TestCase): os.remove(filepath) - def test_CSVLogger_log_all(self): - self.test_CSVLogger(log_all=True) - def test_stop_training_csv(self): # Test that using the CSVLogger callback with the TerminateOnNaN callback # does not result in invalid CSVs. From ce78181d264092f18fb42fe237a94bbcaafe9a08 Mon Sep 17 00:00:00 2001 From: Yiwen Li Date: Thu, 25 Jun 2020 15:45:47 -0700 Subject: [PATCH 0008/1447] Fix docs for model_from_config --- tensorflow/python/keras/saving/model_config.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/saving/model_config.py b/tensorflow/python/keras/saving/model_config.py index 63f82b404a4..9fa8f1f4105 100644 --- a/tensorflow/python/keras/saving/model_config.py +++ b/tensorflow/python/keras/saving/model_config.py @@ -31,9 +31,15 @@ except ImportError: # pylint: enable=g-import-not-at-top -@keras_export('keras.models.model_from_config') +@keras_export('keras.models.from_config') def model_from_config(config, custom_objects=None): """Instantiates a Keras model from its config. + + Usage: + # for a Functional API model + >>> tf.keras.Model().from_config(model.get_config()) + # for a Sequential model + >>> tf.keras.Sequential().from_config(model.get_config()) Arguments: config: Configuration dictionary. 
From 6ab2c5215a3d111bfc0e3c17734301e389628190 Mon Sep 17 00:00:00 2001 From: Elena Zhelezina Date: Mon, 13 Jul 2020 16:48:14 +0100 Subject: [PATCH 0009/1447] Added 16x8 op tests for conv_activations. Change-Id: Ie5e53c48284eac0ad80b75b1514cd5e3e4494232 --- tensorflow/lite/testing/op_tests/conv_activation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/lite/testing/op_tests/conv_activation.py b/tensorflow/lite/testing/op_tests/conv_activation.py index 1ee1210ec9e..c612026a5bf 100644 --- a/tensorflow/lite/testing/op_tests/conv_activation.py +++ b/tensorflow/lite/testing/op_tests/conv_activation.py @@ -40,6 +40,7 @@ def make_conv_activation_tests(activation_op): "constant_filter": [True, False], "channel_multiplier": [1, 2], "fully_quantize": [False], + "quant_16x8": [False], "dynamic_range_quantize": [False], }, # TODO(b/134702301): The fully_quantize param is just ignored by the @@ -55,6 +56,7 @@ def make_conv_activation_tests(activation_op): "constant_filter": [True], "channel_multiplier": [1, 2], "fully_quantize": [True], + "quant_16x8": [False, True], "dynamic_range_quantize": [False], }, { @@ -67,6 +69,7 @@ def make_conv_activation_tests(activation_op): "constant_filter": [True], "channel_multiplier": [1, 2], "fully_quantize": [False], + "quant_16x8": [False], "dynamic_range_quantize": [True], }, ] From 546ba296655be51b9306641c144cd1d80be00413 Mon Sep 17 00:00:00 2001 From: Ty Mick Date: Tue, 21 Jul 2020 12:36:33 -0400 Subject: [PATCH 0010/1447] Fix markdown bullet list --- tensorflow/python/keras/saving/save.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/keras/saving/save.py b/tensorflow/python/keras/saving/save.py index 9c83914d380..b5073820487 100644 --- a/tensorflow/python/keras/saving/save.py +++ b/tensorflow/python/keras/saving/save.py @@ -66,9 +66,9 @@ def save_model(model, The saved model contains: - - the model's configuration (topology) - - the model's weights - - the model's optimizer's 
state (if any) + - the model's configuration (topology) + - the model's weights + - the model's optimizer's state (if any) Thus the saved model can be reinstantiated in the exact same state, without any of the code From 7248c71c11c24ac3af26f1fbd7ea09570cec7b08 Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Wed, 5 Aug 2020 11:58:33 +0100 Subject: [PATCH 0011/1447] Add int16x8 support for BATCH_MATMUL operator --- tensorflow/lite/kernels/batch_matmul.cc | 91 +++++++++++++++---- tensorflow/lite/kernels/batch_matmul_test.cc | 35 ++++++- .../kernels/internal/reference/batch_matmul.h | 43 ++++----- tensorflow/lite/kernels/register.cc | 2 +- tensorflow/lite/toco/tflite/op_version.cc | 1 + .../lite/tools/versioning/op_version.cc | 2 +- .../lite/tools/versioning/runtime_version.cc | 1 + 7 files changed, 133 insertions(+), 42 deletions(-) diff --git a/tensorflow/lite/kernels/batch_matmul.cc b/tensorflow/lite/kernels/batch_matmul.cc index a414a226504..9b888820f89 100644 --- a/tensorflow/lite/kernels/batch_matmul.cc +++ b/tensorflow/lite/kernels/batch_matmul.cc @@ -294,7 +294,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { // Note that quantized inference requires that all tensors have their // parameters set. This is usually done during quantized training. - if (lhs_data->type == kTfLiteInt8) { + if (lhs_data->type == kTfLiteInt8 || lhs_data->type == kTfLiteInt16) { double real_multiplier = 0.0; TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( context, lhs_data, rhs_data, output, &real_multiplier)); @@ -302,16 +302,34 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { QuantizeMultiplier(real_multiplier, &op_data->output_multiplier, &exponent); op_data->output_shift = exponent; // BatchMatMul has no fused activation functions. Therefore, set - // output activation min and max to min and max of int8_t type, - // respecitvely. 
- op_data->output_activation_min = std::numeric_limits::min(); - op_data->output_activation_max = std::numeric_limits::max(); + // output activation min and max to min and max of int8_t or int16_t + // type. + if (lhs_data->type == kTfLiteInt8) { + op_data->output_activation_min = std::numeric_limits::min(); + op_data->output_activation_max = std::numeric_limits::max(); + } else { + op_data->output_activation_min = std::numeric_limits::min(); + op_data->output_activation_max = std::numeric_limits::max(); + } + } + + if (lhs_data->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, lhs_data->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, rhs_data->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); } TF_LITE_ENSURE(context, lhs_data->type == kTfLiteFloat32 || - lhs_data->type == kTfLiteInt8); + lhs_data->type == kTfLiteInt8 || + lhs_data->type == kTfLiteInt16); TF_LITE_ENSURE(context, rhs_data->type == kTfLiteFloat32 || - rhs_data->type == kTfLiteInt8); + rhs_data->type == kTfLiteInt8 || + rhs_data->type == kTfLiteInt16); + // Either we have a hybrid quantization with a float32 and an int8 input, + // otherwise both inputs should be of the same type. + TF_LITE_ENSURE(context, (lhs_data->type == kTfLiteFloat32 && + rhs_data->type == kTfLiteInt8) || + lhs_data->type == rhs_data->type); // Support dimensions between 2 and 4, inclusive. 
TF_LITE_ENSURE(context, NumDimensions(lhs_data) >= 2); TF_LITE_ENSURE(context, NumDimensions(lhs_data) <= 4); @@ -382,9 +400,14 @@ TfLiteStatus TransposeRowsColumns(TfLiteContext* context, tensor_in, GetTensorData(tensor_in), tensor_out, GetTensorData(tensor_out)); return kTfLiteOk; + } else if (tensor_in->type == kTfLiteInt16) { + TransposeRowsColumnsImpl( + tensor_in, GetTensorData(tensor_in), tensor_out, + GetTensorData(tensor_out)); + return kTfLiteOk; } else { - TF_LITE_KERNEL_LOG(context, - "Can only transpose tensors with float and int8 type."); + TF_LITE_KERNEL_LOG( + context, "Can only transpose tensors with float, int8 and int16 type."); return kTfLiteError; } } @@ -481,10 +504,10 @@ TfLiteStatus EvalInt8(TfLiteContext* context, const OpData* data, op_params.rhs_cacheable = IsConstantTensor(rhs); if (kernel_type == kReference) { - reference_ops::BatchMatMul(op_params, rhs_shape, GetTensorData(rhs), - lhs_shape, GetTensorData(lhs), - GetTensorShape(output), - GetTensorData(output)); + reference_ops::BatchMatMul( + op_params, rhs_shape, GetTensorData(rhs), lhs_shape, + GetTensorData(lhs), GetTensorShape(output), + GetTensorData(output)); } else { optimized_ops::BatchMatMul(op_params, rhs_shape, GetTensorData(rhs), lhs_shape, GetTensorData(lhs), @@ -495,13 +518,40 @@ TfLiteStatus EvalInt8(TfLiteContext* context, const OpData* data, return kTfLiteOk; } +template +TfLiteStatus EvalInt16(TfLiteContext* context, const OpData* data, + const RuntimeShape& lhs_shape, const TfLiteTensor* lhs, + const RuntimeShape& rhs_shape, const TfLiteTensor* rhs, + const RuntimeShape& output_shape, TfLiteTensor* output) { + // Reuse params struct from FullyConnected Op. 
+ FullyConnectedParams op_params; + int32_t input_offset = -lhs->params.zero_point; + int32_t filter_offset = -rhs->params.zero_point; + int32_t output_offset = output->params.zero_point; + op_params.input_offset = input_offset; + op_params.weights_offset = filter_offset; + op_params.output_offset = output_offset; + op_params.output_multiplier = data->output_multiplier; + op_params.output_shift = data->output_shift; + op_params.quantized_activation_min = data->output_activation_min; + op_params.quantized_activation_max = data->output_activation_max; + + // optimized_ops not yet implemnted for int16_t, use reference_ops in all + // cases. + reference_ops::BatchMatMul( + op_params, rhs_shape, GetTensorData(rhs), lhs_shape, + GetTensorData(lhs), GetTensorShape(output), + GetTensorData(output)); + return kTfLiteOk; +} + template TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, OpData* data, const RuntimeShape& lhs_shape, const TfLiteTensor* lhs, const RuntimeShape& rhs_shape, const TfLiteTensor* rhs, TfLiteTensor* output) { - if (lhs->type == kTfLiteFloat32) { + if (lhs->type == kTfLiteFloat32 && rhs->type == kTfLiteInt8) { TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/2); TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/3); TfLiteTensor* accum_scratch = GetTemporary(context, node, /*index=*/4); @@ -510,12 +560,16 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, return EvalHybrid( context, node, data, lhs_shape, lhs, rhs_shape, rhs, input_quantized, scaling_factors, accum_scratch, row_sums, input_offsets, output); - } else if (lhs->type == kTfLiteInt8) { + } else if (lhs->type == kTfLiteInt8 && rhs->type == kTfLiteInt8) { return EvalInt8(context, data, lhs_shape, lhs, rhs_shape, rhs, GetTensorShape(output), output); + } else if (lhs->type == kTfLiteInt16 && rhs->type == kTfLiteInt16) { + return EvalInt16(context, data, lhs_shape, lhs, rhs_shape, rhs, + GetTensorShape(output), 
output); } else { TF_LITE_KERNEL_LOG( - context, "Currently only hybrid and int8 quantization is supported.\n"); + context, + "Currently only hybrid, int8 and int16 quantization are supported.\n"); return kTfLiteError; } return kTfLiteOk; @@ -524,7 +578,7 @@ TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, TfLiteTensor* GetTempRhs(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* rhs) { TfLiteTensor* transposed_rhs = GetTemporary(context, node, 1); - if (rhs->type == kTfLiteInt8) { + if (rhs->type == kTfLiteInt8 || rhs->type == kTfLiteInt16) { // Get the quantization params from the RHS tensor. transposed_rhs->params.scale = rhs->params.scale; transposed_rhs->params.zero_point = rhs->params.zero_point; @@ -535,7 +589,7 @@ TfLiteTensor* GetTempRhs(TfLiteContext* context, TfLiteNode* node, TfLiteTensor* GetTempLhs(TfLiteContext* context, TfLiteNode* node, const TfLiteTensor* lhs) { TfLiteTensor* transposed_lhs = GetTemporary(context, node, 0); - if (lhs->type == kTfLiteInt8) { + if (lhs->type == kTfLiteInt8 || lhs->type == kTfLiteInt16) { // Get the quantization params from the LHS tensor. 
transposed_lhs->params.scale = lhs->params.scale; transposed_lhs->params.zero_point = lhs->params.zero_point; @@ -602,6 +656,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { } break; case kTfLiteInt8: + case kTfLiteInt16: EvalQuantized(context, node, op_data, lhs_shape, lhs_tensor, rhs_shape, rhs_tensor, output); break; diff --git a/tensorflow/lite/kernels/batch_matmul_test.cc b/tensorflow/lite/kernels/batch_matmul_test.cc index 98df8ebe3db..0c6f92ce8bf 100644 --- a/tensorflow/lite/kernels/batch_matmul_test.cc +++ b/tensorflow/lite/kernels/batch_matmul_test.cc @@ -483,7 +483,12 @@ class QuantizedBatchMatMulOpModel : public SingleOpModel { input_size_ = total_input_size / batches_; lhs_id_ = AddInput(lhs); - rhs_id_ = AddInput({lhs.type, {input_size_, units_}, lhs.min, lhs.max}); + rhs_id_ = AddInput({lhs.type, + {input_size_, units_}, + 0, + 0, + GetScale(lhs_id_), + GetZeroPoint(lhs_id_)}); output_id_ = AddOutput(output); @@ -553,6 +558,34 @@ TEST_P(QuantizedBatchMatMulOpTest, SimpleTestQuantizedInt8) { EXPECT_THAT(m.GetOutput(), ElementsAre(22, 22, 22, 56, 56, 56)); } +TEST_P(QuantizedBatchMatMulOpTest, SimpleTestQuantizedInt16) { + const float inputs_scale = 2.0 * 10 / std::numeric_limits::max(); + const float output_scale = 1.0; + + QuantizedBatchMatMulOpModel m( + /*units=*/3, /*batches*/ 2, + /*lhs=*/ + {TensorType_INT16, {2, 10}, 0, 0, inputs_scale, 0}, + /*output=*/ + {TensorType_INT16, {}, 0, 0, output_scale, 0}); + + m.SetWeights({ + 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, + 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10, 10, 10, + }); + + m.SetInput({ + 1, 2, 3, 4, 5, 6, 7, 8, -9, -10, // b = 0 + 1, 2, 3, 4, 5, 6, 7, -8, 9, -10, // b = 1 + }); + + m.Invoke(); + + EXPECT_THAT(m.GetDequantizedOutput(), + ElementsAreArray(ArrayFloatNear({23, 23, 23, 57, 57, 57}))); + EXPECT_THAT(m.GetOutput(), ElementsAre(23, 23, 23, 57, 57, 57)); +} + INSTANTIATE_TEST_SUITE_P( QuantizedBatchMatMulOpTest, QuantizedBatchMatMulOpTest, 
::testing::ValuesIn(SingleOpTest::GetKernelTags(*kKernelMap))); diff --git a/tensorflow/lite/kernels/internal/reference/batch_matmul.h b/tensorflow/lite/kernels/internal/reference/batch_matmul.h index 24c3ffe3d7e..f06199c7700 100644 --- a/tensorflow/lite/kernels/internal/reference/batch_matmul.h +++ b/tensorflow/lite/kernels/internal/reference/batch_matmul.h @@ -217,10 +217,11 @@ inline void BatchMatMul(const RuntimeShape& lhs_shape, const int8_t* lhs_data, } } +template inline void BatchMatMul(const FullyConnectedParams& params, - const RuntimeShape& lhs_shape, const int8_t* lhs_data, - const RuntimeShape& rhs_shape, const int8_t* rhs_data, - const RuntimeShape& output_shape, int8_t* output_data) { + const RuntimeShape& lhs_shape, const T* lhs_data, + const RuntimeShape& rhs_shape, const T* rhs_data, + const RuntimeShape& output_shape, T* output_data) { const RuntimeShape extended_lhs_shape = RuntimeShape::ExtendedShape(5, lhs_shape); const RuntimeShape extended_rhs_shape = @@ -276,33 +277,33 @@ inline void BatchMatMul(const FullyConnectedParams& params, TFLITE_DCHECK_LE(output_activation_min, output_activation_max); for (int b0 = 0; b0 < batch_dim0; ++b0) { - const int8_t* lhs_ptr0 = lhs_data + (b0 * lhs_ext0); - const int8_t* rhs_ptr0 = rhs_data + (b0 * rhs_ext0); + const T* lhs_ptr0 = lhs_data + (b0 * lhs_ext0); + const T* rhs_ptr0 = rhs_data + (b0 * rhs_ext0); for (int b1 = 0; b1 < batch_dim1; ++b1) { - const int8_t* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1; - const int8_t* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1; + const T* lhs_ptr1 = lhs_ptr0 + b1 * lhs_ext1; + const T* rhs_ptr1 = rhs_ptr0 + b1 * rhs_ext1; for (int b2 = 0; b2 < batch_dim2; ++b2) { - const int8_t* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; - const int8_t* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; - int8_t* out_ptr = output_data + ((b0 * batch_dim1 * batch_dim2) + - b1 * batch_dim2 + b2) * - lhs_rows * rhs_cols; + const T* lhs_ptr2 = lhs_ptr1 + b2 * lhs_ext2; + const T* rhs_ptr2 = rhs_ptr1 + b2 * rhs_ext2; + T* 
out_ptr = output_data + + ((b0 * batch_dim1 * batch_dim2) + b1 * batch_dim2 + b2) * + lhs_rows * rhs_cols; for (int j = 0; j < rhs_cols; ++j) { for (int i = 0; i < lhs_rows; ++i) { - int32_t total = 0; + AccumT total = 0; for (int k = 0; k < accum_depth; ++k) { - int32_t lhs_val = lhs_ptr2[accum_depth * i + k]; - int32_t rhs_val = rhs_ptr2[accum_depth * j + k]; + AccumT lhs_val = lhs_ptr2[accum_depth * i + k]; + AccumT rhs_val = rhs_ptr2[accum_depth * j + k]; total += (lhs_val + filter_offset) * (rhs_val + input_offset); } - total = MultiplyByQuantizedMultiplier(total, output_multiplier, - output_shift); - total += output_offset; - total = std::max(total, output_activation_min); - total = std::min(total, output_activation_max); + int32_t total_scaled = MultiplyByQuantizedMultiplier( + total, output_multiplier, output_shift); + total_scaled += output_offset; + total_scaled = std::max(total_scaled, output_activation_min); + total_scaled = std::min(total_scaled, output_activation_max); const int idx = lhs_rows * j + i; - out_ptr[idx] = static_cast(total); + out_ptr[idx] = static_cast(total_scaled); } } } diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc index 1d1db9e0403..28a0f3fd849 100644 --- a/tensorflow/lite/kernels/register.cc +++ b/tensorflow/lite/kernels/register.cc @@ -291,7 +291,7 @@ BuiltinOpResolver::BuiltinOpResolver() { AddBuiltin(BuiltinOperator_SEGMENT_SUM, Register_SEGMENT_SUM()); AddBuiltin(BuiltinOperator_BATCH_MATMUL, Register_BATCH_MATMUL(), /* min_version = */ 1, - /* max_version = */ 2); + /* max_version = */ 3); AddCustom("NumericVerify", tflite::ops::custom::Register_NUMERIC_VERIFY()); // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that // custom ops aren't always included by default. 
diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc index 222be969560..5908ccf07ba 100644 --- a/tensorflow/lite/toco/tflite/op_version.cc +++ b/tensorflow/lite/toco/tflite/op_version.cc @@ -66,6 +66,7 @@ std::string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kBatchToSpaceND, 1}, "1.6.0"}, {{OperatorType::kBatchToSpaceND, 2}, "1.14.0"}, {{OperatorType::kBatchMatMul, 1}, kPendingReleaseOpVersion}, + {{OperatorType::kBatchMatMul, 3}, kPendingReleaseOpVersion}, {{OperatorType::kCast, 1}, "1.5.0"}, {{OperatorType::kConcatenation, 1}, "1.5.0"}, {{OperatorType::kConcatenation, 2}, "1.14.0"}, diff --git a/tensorflow/lite/tools/versioning/op_version.cc b/tensorflow/lite/tools/versioning/op_version.cc index ef4825c397e..140fa6e376a 100644 --- a/tensorflow/lite/tools/versioning/op_version.cc +++ b/tensorflow/lite/tools/versioning/op_version.cc @@ -534,6 +534,7 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { return 1; case BuiltinOperator_CONCATENATION: + case BuiltinOperator_BATCH_MATMUL: case BuiltinOperator_SOFTMAX: case BuiltinOperator_MEAN: case BuiltinOperator_PAD: @@ -576,7 +577,6 @@ int GetBuiltinOperatorVersion(const OpSignature& op_sig) { case BuiltinOperator_LESS: case BuiltinOperator_LESS_EQUAL: case BuiltinOperator_SELECT: - case BuiltinOperator_BATCH_MATMUL: if (op_sig.input_types.at(0) == TensorType_INT8) { return 2; } diff --git a/tensorflow/lite/tools/versioning/runtime_version.cc b/tensorflow/lite/tools/versioning/runtime_version.cc index 5a454224b92..56ea12db43f 100644 --- a/tensorflow/lite/tools/versioning/runtime_version.cc +++ b/tensorflow/lite/tools/versioning/runtime_version.cc @@ -59,6 +59,7 @@ std::string FindMinimumRuntimeVersionForOp(tflite::BuiltinOperator op_code, {{BuiltinOperator_AVERAGE_POOL_2D, 3}, "2.3.0"}, {{BuiltinOperator_BATCH_MATMUL, 1}, "2.3.0"}, {{BuiltinOperator_BATCH_MATMUL, 2}, "2.3.0"}, + {{BuiltinOperator_BATCH_MATMUL, 3}, 
kPendingReleaseVersion}, {{BuiltinOperator_CONV_2D, 1}, "1.5.0"}, {{BuiltinOperator_CONV_2D, 2}, "1.14.0"}, {{BuiltinOperator_CONV_2D, 3}, "1.14.0"}, From e9a7eec1dc6f2e2e72f468c167b93795ff60544b Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Thu, 7 May 2020 05:54:17 -0400 Subject: [PATCH 0012/1447] We need to bring in the classes from advanced_activations if there are no custom objects specified. When no custom objects are specified, our module_objects/globals() in activations.deserialize() won't contain any advanced_activations. --- tensorflow/python/keras/activations.py | 12 +++++++++++- tensorflow/python/keras/activations_test.py | 7 +++++++ .../python/keras/layers/advanced_activations.py | 2 ++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index fe0bf5977f9..28d6f18dcf8 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -26,6 +26,7 @@ from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn from tensorflow.python.util import dispatch from tensorflow.python.util.tf_export import keras_export +from tensorflow.python.keras.layers import advanced_activations # b/123041942 # In TF 2.x, if the `tf.nn.softmax` is used as an activation function in Keras @@ -525,9 +526,18 @@ def deserialize(name, custom_objects=None): ValueError: `Unknown activation function` if the input string does not denote any defined Tensorflow activation function. 
""" + globs = globals() + + # only replace missing activations, when there are no custom objects + if custom_objects is None: + advanced_activations_globs = advanced_activations.get_globals() + for key,val in advanced_activations_globs.items(): + if key not in globs: + globs[key] = val + return deserialize_keras_object( name, - module_objects=globals(), + module_objects=globs, custom_objects=custom_objects, printable_module_name='activation function') diff --git a/tensorflow/python/keras/activations_test.py b/tensorflow/python/keras/activations_test.py index ddd3863a3f6..e2bdec0dd45 100644 --- a/tensorflow/python/keras/activations_test.py +++ b/tensorflow/python/keras/activations_test.py @@ -65,12 +65,19 @@ class KerasActivationsTest(test.TestCase, parameterized.TestCase): activation = advanced_activations.LeakyReLU(alpha=0.1) layer = core.Dense(3, activation=activation) config = serialization.serialize(layer) + # with custom objects deserialized_layer = serialization.deserialize( config, custom_objects={'LeakyReLU': activation}) self.assertEqual(deserialized_layer.__class__.__name__, layer.__class__.__name__) self.assertEqual(deserialized_layer.activation.__class__.__name__, activation.__class__.__name__) + # without custom objects + deserialized_layer = serialization.deserialize(config) + self.assertEqual(deserialized_layer.__class__.__name__, + layer.__class__.__name__) + self.assertEqual(deserialized_layer.activation.__class__.__name__, + activation.__class__.__name__) def test_softmax(self): x = backend.placeholder(ndim=2) diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py index e4323b45dc4..e9ce23654fd 100644 --- a/tensorflow/python/keras/layers/advanced_activations.py +++ b/tensorflow/python/keras/layers/advanced_activations.py @@ -29,6 +29,8 @@ from tensorflow.python.keras.utils import tf_utils from tensorflow.python.ops import math_ops from tensorflow.python.util.tf_export import 
keras_export +def get_globals(): + return globals() @keras_export('keras.layers.LeakyReLU') class LeakyReLU(Layer): From f9ff67c03adf5f0a7b9e0455f26486902f9d4e8a Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Wed, 3 Jun 2020 14:01:06 -0400 Subject: [PATCH 0013/1447] no need for if statement since custom object dict is checked before module objects --- tensorflow/python/keras/activations.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 28d6f18dcf8..32e54f8059f 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -528,12 +528,11 @@ def deserialize(name, custom_objects=None): """ globs = globals() - # only replace missing activations, when there are no custom objects - if custom_objects is None: - advanced_activations_globs = advanced_activations.get_globals() - for key,val in advanced_activations_globs.items(): - if key not in globs: - globs[key] = val + # only replace missing activations + advanced_activations_globs = advanced_activations.get_globals() + for key,val in advanced_activations_globs.items(): + if key not in globs: + globs[key] = val return deserialize_keras_object( name, From 7b8d3d5894e95b0774ca8615b8b542824a973448 Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Wed, 3 Jun 2020 16:33:38 -0400 Subject: [PATCH 0014/1447] fixing pylint issues --- tensorflow/python/keras/activations.py | 2 +- tensorflow/python/keras/layers/advanced_activations.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py index 32e54f8059f..37e119b24ca 100644 --- a/tensorflow/python/keras/activations.py +++ b/tensorflow/python/keras/activations.py @@ -530,7 +530,7 @@ def deserialize(name, custom_objects=None): # only replace missing activations advanced_activations_globs = advanced_activations.get_globals() - for key,val 
in advanced_activations_globs.items(): + for key, val in advanced_activations_globs.items(): if key not in globs: globs[key] = val diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py index e9ce23654fd..456b6758dc6 100644 --- a/tensorflow/python/keras/layers/advanced_activations.py +++ b/tensorflow/python/keras/layers/advanced_activations.py @@ -29,9 +29,11 @@ from tensorflow.python.keras.utils import tf_utils from tensorflow.python.ops import math_ops from tensorflow.python.util.tf_export import keras_export + def get_globals(): return globals() + @keras_export('keras.layers.LeakyReLU') class LeakyReLU(Layer): """Leaky version of a Rectified Linear Unit. From 5f631dd558f84eb3c15912be4ca7558a89ac856d Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Fri, 5 Jun 2020 18:20:16 -0400 Subject: [PATCH 0015/1447] New feature. Use new param log_all in CSVLogger to log all elements in training even if some epochs don't contain the same elements. --- tensorflow/python/keras/callbacks.py | 35 +++++++++++++++++++---- tensorflow/python/keras/callbacks_test.py | 6 +++- 2 files changed, 35 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 3469ccb68ef..08e54323b08 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -2486,12 +2486,18 @@ class CSVLogger(Callback): separator: String used to separate elements in the CSV file. append: Boolean. True: append if file exists (useful for continuing training). False: overwrite existing file. + log_all: Boolean. True: log all elements, even elements that are + only recorded every x epochs (ex. validation sometimes is + only recorded every validation_freq). False: Don't log all + elements, only log the elements that are present in every epoch. 
""" - def __init__(self, filename, separator=',', append=False): + def __init__(self, filename, separator=',', append=False, log_all=False): self.sep = separator self.filename = path_to_string(filename) self.append = append + self.log_all = log_all + self._row_dicts = [] self.writer = None self.keys = None self.append_header = True @@ -2529,6 +2535,10 @@ class CSVLogger(Callback): if self.keys is None: self.keys = sorted(logs.keys()) + elif self.log_all and len(self.keys) < len(logs.keys()): + # have to make a new writer to accommodate for the new keys + self.keys = sorted(logs.keys()) + self.writer = None if self.model.stop_training: # We set NA so that csv parsers do not fail for this last epoch. @@ -2547,15 +2557,30 @@ class CSVLogger(Callback): self.csv_file, fieldnames=fieldnames, dialect=CustomDialect) - if self.append_header: + # if user wants to log all, then we append_header + # at the end of training + if self.append_header and not self.log_all: self.writer.writeheader() row_dict = collections.OrderedDict({'epoch': epoch}) - row_dict.update((key, handle_value(logs[key])) for key in self.keys) - self.writer.writerow(row_dict) - self.csv_file.flush() + row_dict.update((key, handle_value(logs[key])) + for key in self.keys if key in logs) + # if user wants to log all, then we write all rows to csv file + # at the end of training + if not self.log_all: + self.writer.writerow(row_dict) + self.csv_file.flush() + else: + self._row_dicts.append(row_dict) def on_train_end(self, logs=None): + if self.log_all: + if self.append_header: + self.writer.writeheader() + self.writer.writerows(self._row_dicts) + self._row_dicts = [] + self.csv_file.flush() + self.csv_file.close() self.writer = None diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 9fd8bf86609..933ce15fe8f 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ b/tensorflow/python/keras/callbacks_test.py @@ -1301,7 +1301,7 @@ class 
KerasCallbacksTest(keras_parameterized.TestCase): self.assertTrue(hasattr(reduce_on_plateau, 'min_delta')) self.assertEqual(reduce_on_plateau.min_delta, 1e-13) - def test_CSVLogger(self): + def test_CSVLogger(self, log_all=False): with self.cached_session(): np.random.seed(1337) temp_dir = self.get_temp_dir() @@ -1364,6 +1364,7 @@ class KerasCallbacksTest(keras_parameterized.TestCase): y_train, batch_size=BATCH_SIZE, validation_data=(x_test, y_test), + validation_freq=1 if not log_all else 2, callbacks=cbks, epochs=2, verbose=0) @@ -1378,6 +1379,9 @@ class KerasCallbacksTest(keras_parameterized.TestCase): os.remove(filepath) + def test_CSVLogger_log_all(self): + self.test_CSVLogger(log_all=True) + def test_stop_training_csv(self): # Test that using the CSVLogger callback with the TerminateOnNaN callback # does not result in invalid CSVs. From b4b65a40882907a4655d65d82567463ee50f2177 Mon Sep 17 00:00:00 2001 From: PiyushDatta Date: Fri, 5 Jun 2020 18:29:08 -0400 Subject: [PATCH 0016/1447] Revert "New feature. Use new param log_all in CSVLogger to log all elements in training even if some epochs don't contain the same elements." This reverts commit 204913109700abfa7fd620bf05c4603dc7795f34. --- tensorflow/python/keras/callbacks.py | 35 ++++------------------- tensorflow/python/keras/callbacks_test.py | 6 +--- 2 files changed, 6 insertions(+), 35 deletions(-) diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py index 08e54323b08..3469ccb68ef 100644 --- a/tensorflow/python/keras/callbacks.py +++ b/tensorflow/python/keras/callbacks.py @@ -2486,18 +2486,12 @@ class CSVLogger(Callback): separator: String used to separate elements in the CSV file. append: Boolean. True: append if file exists (useful for continuing training). False: overwrite existing file. - log_all: Boolean. True: log all elements, even elements that are - only recorded every x epochs (ex. validation sometimes is - only recorded every validation_freq). 
False: Don't log all - elements, only log the elements that are present in every epoch. """ - def __init__(self, filename, separator=',', append=False, log_all=False): + def __init__(self, filename, separator=',', append=False): self.sep = separator self.filename = path_to_string(filename) self.append = append - self.log_all = log_all - self._row_dicts = [] self.writer = None self.keys = None self.append_header = True @@ -2535,10 +2529,6 @@ class CSVLogger(Callback): if self.keys is None: self.keys = sorted(logs.keys()) - elif self.log_all and len(self.keys) < len(logs.keys()): - # have to make a new writer to accommodate for the new keys - self.keys = sorted(logs.keys()) - self.writer = None if self.model.stop_training: # We set NA so that csv parsers do not fail for this last epoch. @@ -2557,30 +2547,15 @@ class CSVLogger(Callback): self.csv_file, fieldnames=fieldnames, dialect=CustomDialect) - # if user wants to log all, then we append_header - # at the end of training - if self.append_header and not self.log_all: + if self.append_header: self.writer.writeheader() row_dict = collections.OrderedDict({'epoch': epoch}) - row_dict.update((key, handle_value(logs[key])) - for key in self.keys if key in logs) - # if user wants to log all, then we write all rows to csv file - # at the end of training - if not self.log_all: - self.writer.writerow(row_dict) - self.csv_file.flush() - else: - self._row_dicts.append(row_dict) + row_dict.update((key, handle_value(logs[key])) for key in self.keys) + self.writer.writerow(row_dict) + self.csv_file.flush() def on_train_end(self, logs=None): - if self.log_all: - if self.append_header: - self.writer.writeheader() - self.writer.writerows(self._row_dicts) - self._row_dicts = [] - self.csv_file.flush() - self.csv_file.close() self.writer = None diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py index 933ce15fe8f..9fd8bf86609 100644 --- a/tensorflow/python/keras/callbacks_test.py +++ 
b/tensorflow/python/keras/callbacks_test.py @@ -1301,7 +1301,7 @@ class KerasCallbacksTest(keras_parameterized.TestCase): self.assertTrue(hasattr(reduce_on_plateau, 'min_delta')) self.assertEqual(reduce_on_plateau.min_delta, 1e-13) - def test_CSVLogger(self, log_all=False): + def test_CSVLogger(self): with self.cached_session(): np.random.seed(1337) temp_dir = self.get_temp_dir() @@ -1364,7 +1364,6 @@ class KerasCallbacksTest(keras_parameterized.TestCase): y_train, batch_size=BATCH_SIZE, validation_data=(x_test, y_test), - validation_freq=1 if not log_all else 2, callbacks=cbks, epochs=2, verbose=0) @@ -1379,9 +1378,6 @@ class KerasCallbacksTest(keras_parameterized.TestCase): os.remove(filepath) - def test_CSVLogger_log_all(self): - self.test_CSVLogger(log_all=True) - def test_stop_training_csv(self): # Test that using the CSVLogger callback with the TerminateOnNaN callback # does not result in invalid CSVs. From 41f57af899a69a82a037d0689745694a77ac50a9 Mon Sep 17 00:00:00 2001 From: piyushdatta Date: Thu, 20 Aug 2020 20:55:28 -0400 Subject: [PATCH 0017/1447] added advanced_activations into activation lib dependencies --- tensorflow/python/keras/BUILD | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD index d8eff0f2260..1ece307f142 100755 --- a/tensorflow/python/keras/BUILD +++ b/tensorflow/python/keras/BUILD @@ -119,6 +119,7 @@ py_library( deps = [ ":backend", "//tensorflow/python/keras/utils:engine_utils", + "//tensorflow/python/keras/layers:advanced_activations", ], ) From 3ade2efec2e90c6237de32a19680caaa3ebc2845 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 8 Aug 2020 00:47:35 +0000 Subject: [PATCH 0018/1447] Fix segmentation fault in tf.image.crop_and_resize when boxes is inf or nan This fix tries to address the issue raised in 42129 where segmentation fault happened in tf.image.crop_and_resize when boxes is inf or nan.
This fix adds the check to make sure boxes is not inf or nan (isfinite) This fix fixes 42129. Signed-off-by: Yong Tang --- tensorflow/core/kernels/image/crop_and_resize_op.cc | 13 +++++++++++++ tensorflow/python/ops/image_ops_test.py | 12 ++++++++++++ 2 files changed, 25 insertions(+) diff --git a/tensorflow/core/kernels/image/crop_and_resize_op.cc b/tensorflow/core/kernels/image/crop_and_resize_op.cc index 1979b0514c6..3c4419a870f 100644 --- a/tensorflow/core/kernels/image/crop_and_resize_op.cc +++ b/tensorflow/core/kernels/image/crop_and_resize_op.cc @@ -71,6 +71,18 @@ static inline Status ParseAndCheckBoxSizes(const Tensor& boxes, if (boxes.dim_size(1) != 4) { return errors::InvalidArgument("boxes must have 4 columns"); } + for (int64 i = 0; i < *num_boxes; i++) { + for (int64 j = 0; j < 4; j++) { + if (!isfinite(boxes.tensor()(i, j))) { + return errors::InvalidArgument( + "boxes values must be finite, received boxes[", i, "]: ", + boxes.tensor()(i, 0), ", ", + boxes.tensor()(i, 1), ", ", + boxes.tensor()(i, 2), ", ", + boxes.tensor()(i, 3)); + } + } + } // The shape of 'box_index' is [num_boxes]. 
if (box_index.dims() != 1) { return errors::InvalidArgument("box_index must be 1-D", @@ -256,6 +268,7 @@ struct CropAndResize { continue; } if (method_name == "bilinear") { + const int top_y_index = floorf(in_y); const int bottom_y_index = ceilf(in_y); const float y_lerp = in_y - top_y_index; diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py index 751a8a00758..aae2946535a 100644 --- a/tensorflow/python/ops/image_ops_test.py +++ b/tensorflow/python/ops/image_ops_test.py @@ -5663,6 +5663,18 @@ class DecodeImageTest(test_util.TensorFlowTestCase): self.assertAllEqual(list(image2.shape), [12, 40, 20, 3]) self.assertAllEqual(image2, image3) + def testImageCropAndResize(self): + # Test case for GitHub issue 42129 + message = "boxes values must be finite" + with self.assertRaisesRegex( + (errors.InvalidArgumentError, ValueError), message): + v = image_ops_impl.crop_and_resize_v2( + image=array_ops.zeros((2, 1, 1, 1)), + boxes=[[1.0e+40, 0, 0, 0]], + box_indices=[1], + crop_size=[1, 1]) + self.evaluate(v) + if __name__ == "__main__": googletest.main() From 14589c930ab6a10cf1442ec8e94f4628227a753c Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Tue, 25 Aug 2020 17:58:39 +0000 Subject: [PATCH 0019/1447] Change to use Eigen to allow the check working on GPU Signed-off-by: Yong Tang --- .../core/kernels/image/crop_and_resize_op.cc | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/kernels/image/crop_and_resize_op.cc b/tensorflow/core/kernels/image/crop_and_resize_op.cc index 3c4419a870f..97f92113ec0 100644 --- a/tensorflow/core/kernels/image/crop_and_resize_op.cc +++ b/tensorflow/core/kernels/image/crop_and_resize_op.cc @@ -71,17 +71,9 @@ static inline Status ParseAndCheckBoxSizes(const Tensor& boxes, if (boxes.dim_size(1) != 4) { return errors::InvalidArgument("boxes must have 4 columns"); } - for (int64 i = 0; i < *num_boxes; i++) { - for (int64 j = 0; j < 4; j++) { - if 
(!isfinite(boxes.tensor()(i, j))) { - return errors::InvalidArgument( - "boxes values must be finite, received boxes[", i, "]: ", - boxes.tensor()(i, 0), ", ", - boxes.tensor()(i, 1), ", ", - boxes.tensor()(i, 2), ", ", - boxes.tensor()(i, 3)); - } - } + Eigen::Tensor check = boxes.tensor().isfinite().all(); + if (!check()) { + return errors::InvalidArgument("boxes values must be finite"); } // The shape of 'box_index' is [num_boxes]. if (box_index.dims() != 1) { From abfdbf5647188ba6cb7c4e8652c54d247c1caca8 Mon Sep 17 00:00:00 2001 From: Mihai Maruseac Date: Thu, 27 Aug 2020 18:45:08 +0000 Subject: [PATCH 0020/1447] Update tensorflow/core/kernels/image/crop_and_resize_op.cc --- tensorflow/core/kernels/image/crop_and_resize_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/tensorflow/core/kernels/image/crop_and_resize_op.cc b/tensorflow/core/kernels/image/crop_and_resize_op.cc index 97f92113ec0..18a66afe073 100644 --- a/tensorflow/core/kernels/image/crop_and_resize_op.cc +++ b/tensorflow/core/kernels/image/crop_and_resize_op.cc @@ -260,7 +260,6 @@ struct CropAndResize { continue; } if (method_name == "bilinear") { - const int top_y_index = floorf(in_y); const int bottom_y_index = ceilf(in_y); const float y_lerp = in_y - top_y_index; From 9dcc615feebf86e755bdc89d3589e91585b5c376 Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Fri, 28 Aug 2020 16:07:32 +0100 Subject: [PATCH 0021/1447] Fix the inputs_scale in the test --- tensorflow/lite/kernels/batch_matmul_test.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/lite/kernels/batch_matmul_test.cc b/tensorflow/lite/kernels/batch_matmul_test.cc index 0c6f92ce8bf..7abef73d5a2 100644 --- a/tensorflow/lite/kernels/batch_matmul_test.cc +++ b/tensorflow/lite/kernels/batch_matmul_test.cc @@ -559,15 +559,16 @@ TEST_P(QuantizedBatchMatMulOpTest, SimpleTestQuantizedInt8) { } TEST_P(QuantizedBatchMatMulOpTest, SimpleTestQuantizedInt16) { - const float inputs_scale = 2.0 * 
10 / std::numeric_limits::max(); + const float inputs_scale = 10.0 / std::numeric_limits::max(); const float output_scale = 1.0; + const int32_t zero_point = 0; QuantizedBatchMatMulOpModel m( /*units=*/3, /*batches*/ 2, /*lhs=*/ - {TensorType_INT16, {2, 10}, 0, 0, inputs_scale, 0}, + {TensorType_INT16, {2, 10}, 0, 0, inputs_scale, zero_point}, /*output=*/ - {TensorType_INT16, {}, 0, 0, output_scale, 0}); + {TensorType_INT16, {}, 0, 0, output_scale, zero_point}); m.SetWeights({ 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, From 7cb3064b6d8a491acb110c73c0960111bbb60319 Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer Date: Sat, 29 Aug 2020 14:21:58 +0100 Subject: [PATCH 0022/1447] Clarify transpose unsupported tensor type error message --- tensorflow/lite/kernels/batch_matmul.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/batch_matmul.cc b/tensorflow/lite/kernels/batch_matmul.cc index 9b888820f89..b813cf32552 100644 --- a/tensorflow/lite/kernels/batch_matmul.cc +++ b/tensorflow/lite/kernels/batch_matmul.cc @@ -407,7 +407,7 @@ TfLiteStatus TransposeRowsColumns(TfLiteContext* context, return kTfLiteOk; } else { TF_LITE_KERNEL_LOG( - context, "Can only transpose tensors with float, int8 and int16 type."); + context, "Can only transpose tensors with float, int8 or int16 type."); return kTfLiteError; } } From 5b4cae0a21df79d82799f7834b5bad96b04e3cc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Mon, 31 Aug 2020 14:30:37 +0200 Subject: [PATCH 0023/1447] TFLu: Fix Ethos-U build issue The issue is that a binary needs to be built twice, since it depends on recursive_find. The driver is not fully downloaded when recursive_find is called. The solution is to call the download script immediately via make's shell function. Also add missing exit to error case in download script.
--- .../micro/tools/make/download_and_extract.sh | 1 + .../lite/micro/tools/make/ext_libs/ethosu.inc | 22 ++++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index e72fd7a0184..7bdfde2abe2 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -223,6 +223,7 @@ download_and_extract() { fi else echo "Error unsupported archive type. Failed to extract tool after download." + exit 1 fi rm -rf ${tempdir2} ${tempdir} diff --git a/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc b/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc index 44de3ebfc7c..acb4d32ab65 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc @@ -1,19 +1,31 @@ ifneq ($(filter ethos-u,$(ALL_TAGS)),) - # Don't want -lm flag - MICROLITE_LIBS := + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) ifneq (,$(filter $(TARGET_ARCH), x86_64)) $(error target architecture x86_64 not supported) endif - THIRD_PARTY_DOWNLOADS += \ - $(eval $(call add_third_party_download,$(ETHOSU_URL),$(ETHOSU_MD5),ethosu,)) ETHOSU_DRIVER_PATH = $(MAKEFILE_DIR)/downloads/ethosu + # The driver need to be downloaded before the recursive_find below. + # That won't happen with the standard way of downloading by generating a + # target(call add_third_party_download), so instead use the shell function. 
+ NEED_DOWNLOAD := YES + ifeq ($(NEED_DOWNLOAD),$(shell test -d $(ETHOSU_DRIVER_PATH) || echo $(NEED_DOWNLOAD))) + DOWNLOAD_SCRIPT := ./tensorflow/lite/micro/tools/make/download_and_extract.sh + DOWNLOAD_OK := OK + DOWNLOAD_STATUS := $(shell $(DOWNLOAD_SCRIPT) $(ETHOSU_URL) $(ETHOSU_MD5) $(ETHOSU_DRIVER_PATH) >&2 && echo $(DOWNLOAD_OK)) + ifneq ($(DOWNLOAD_OK),$(DOWNLOAD_STATUS)) + $(error $(DOWNLOAD_SCRIPT) failed) + endif + endif + # Currently there is a dependency to CMSIS-NN THIRD_PARTY_DOWNLOADS += \ $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,patch_cmsis)) - CMSIS_PATH = $(MAKEFILE_DIR)/downloads/cmsis/ + ifeq ($(CMSIS_PATH),) + CMSIS_PATH = $(MAKEFILE_DIR)/downloads/cmsis/ + endif THIRD_PARTY_CC_HDRS += $(call recursive_find,$(CMSIS_PATH)/CMSIS/Core/Include,*.h) THIRD_PARTY_CC_HDRS += $(call recursive_find,$(ETHOSU_DRIVER_PATH)/include,*.h) From c800c529f8233004195206c3bf0c0e717e88c7ec Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 9 Sep 2020 11:39:32 +0200 Subject: [PATCH 0024/1447] TFLu: Don't construct PoolParams if not needed in CMSIS int8 kernel --- .../lite/micro/kernels/cmsis-nn/pooling.cc | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc index 4229b2c244c..cd2d799e734 100644 --- a/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc +++ b/tensorflow/lite/micro/kernels/cmsis-nn/pooling.cc @@ -99,17 +99,17 @@ void AverageEvalQuantized(TfLiteContext* context, const TfLiteNode* node, TfLiteEvalTensor* output) { TFLITE_DCHECK(input->type == kTfLiteUInt8 || input->type == kTfLiteInt8); - PoolParams op_params; - op_params.stride_height = params->stride_height; - op_params.stride_width = params->stride_width; - op_params.filter_height = params->filter_height; - op_params.filter_width = params->filter_width; - op_params.padding_values.height = data.padding.height; - 
op_params.padding_values.width = data.padding.width; - op_params.quantized_activation_min = data.activation_min; - op_params.quantized_activation_max = data.activation_max; - if (input->type == kTfLiteUInt8) { + PoolParams op_params; + op_params.stride_height = params->stride_height; + op_params.stride_width = params->stride_width; + op_params.filter_height = params->filter_height; + op_params.filter_width = params->filter_width; + op_params.padding_values.height = data.padding.height; + op_params.padding_values.width = data.padding.width; + op_params.quantized_activation_min = data.activation_min; + op_params.quantized_activation_max = data.activation_max; + reference_ops::AveragePool(op_params, tflite::micro::GetTensorShape(input), tflite::micro::GetTensorData(input), tflite::micro::GetTensorShape(output), From 3e761638397df30229ac3dfe85d573a41d37ac6d Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Fri, 11 Sep 2020 14:00:00 +0100 Subject: [PATCH 0025/1447] Ensure that all TFLite int16x8 operators check that the zero-point is null --- tensorflow/lite/kernels/activations.cc | 7 +++ tensorflow/lite/kernels/activations_test.cc | 53 ++++++++++++------ tensorflow/lite/kernels/concatenation.cc | 9 ++++ tensorflow/lite/kernels/concatenation_test.cc | 21 +++----- tensorflow/lite/kernels/conv.cc | 7 ++- tensorflow/lite/kernels/depthwise_conv.cc | 7 ++- tensorflow/lite/kernels/dequantize.cc | 4 ++ tensorflow/lite/kernels/dequantize_test.cc | 4 +- tensorflow/lite/kernels/quantize.cc | 7 +++ tensorflow/lite/kernels/quantize_test.cc | 43 +++++++-------- tensorflow/lite/kernels/reduce.cc | 6 +++ tensorflow/lite/kernels/reduce_test.cc | 54 ++++++++++--------- 12 files changed, 139 insertions(+), 83 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index 654ccbc27ec..fe40bf47a64 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -357,6 +357,12 @@ TfLiteStatus 
LeakyReluPrepare(TfLiteContext* context, TfLiteNode* node) { QuantizeMultiplier(identity_multiplier, &data->output_multiplier_identity, &data->output_shift_identity); } + + if (output->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + } + return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims)); } @@ -585,6 +591,7 @@ TfLiteStatus SoftmaxPrepare(TfLiteContext* context, TfLiteNode* node) { } if (input->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); data->params.exp_lut = data->exp_lut; diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc index d8f883b9c1d..c7db45341e8 100644 --- a/tensorflow/lite/kernels/activations_test.cc +++ b/tensorflow/lite/kernels/activations_test.cc @@ -552,7 +552,9 @@ TEST(QuantizedActivationsOpTest, LeakyReluUint8) { template void QuantizedActivationsOpTestLeakyRelu() { const float kMin = -1; - const float kMax = 127.f / 128.f; + const float kMax = + std::numeric_limits::max() / + static_cast(std::numeric_limits::max() + 1); QuantizedActivationsOpModel m( /*input=*/{tensor_type, {5, 5}, 5 * kMin, 5 * kMax}, 0.1); @@ -1219,9 +1221,12 @@ TEST(QuantizedActivationsOpTest, Softmax1DInt8) { // Test quantized softmax with int16 input and output. With the same input as in // QuantizedActivationsOpTest.Softmax2D, the dequantized output is identical. 
TEST(QuantizedActivationsOpTest, Softmax1DInt16) { - QuantizedActivationsOpModel m(1, - /*input=*/{TensorType_INT16, {3}, -3, 3}, - /*output_type-*/ TensorType_INT16); + const float kMin = -1; + const float kMax = 32767.f / 32768.f; + QuantizedActivationsOpModel m( + 1, + /*input=*/{TensorType_INT16, {3}, 3 * kMin, 3 * kMax}, + /*output_type-*/ TensorType_INT16); m.SetInput({1, 2, 3}); m.Invoke(); EXPECT_THAT( @@ -1231,9 +1236,11 @@ TEST(QuantizedActivationsOpTest, Softmax1DInt16) { } TEST(QuantizedActivationsOpTest, Softmax1DInt16ZeroElement) { - QuantizedActivationsOpModel m(0.1, - /*input=*/{TensorType_INT16, {1}, -1, 1}, - TensorType_INT16); + const float kMin = -1; + const float kMax = 32767.f / 32768.f; + QuantizedActivationsOpModel m( + 0.1, + /*input=*/{TensorType_INT16, {1}, 1 * kMin, 1 * kMax}, TensorType_INT16); m.SetInput({0}); m.Invoke(); EXPECT_THAT(m.GetDequantizedOutput(), @@ -1241,9 +1248,12 @@ TEST(QuantizedActivationsOpTest, Softmax1DInt16ZeroElement) { } TEST(QuantizedActivationsOpTest, Softmax2DInt16) { - QuantizedActivationsOpModel m(0.1, - /*input=*/{TensorType_INT16, {2, 4}, -10, 10}, - TensorType_INT16); + const float kMin = -1; + const float kMax = 32767.f / 32768.f; + QuantizedActivationsOpModel m( + 0.1, + /*input=*/{TensorType_INT16, {2, 4}, 10 * kMin, 10 * kMax}, + TensorType_INT16); m.SetInput({ 0, -6, 2, 4, // 3, -2, 10, 1, // @@ -1258,9 +1268,10 @@ TEST(QuantizedActivationsOpTest, Softmax2DInt16) { kQuantizedToleranceInt16))); // Same input, but a different shape. 
- QuantizedActivationsOpModel m2(0.1, - /*input=*/{TensorType_INT16, {4, 2}, -10, 10}, - TensorType_INT16); + QuantizedActivationsOpModel m2( + 0.1, + /*input=*/{TensorType_INT16, {4, 2}, 10 * kMin, 10 * kMax}, + TensorType_INT16); m2.SetInput({ 0, -6, // 2, 4, // @@ -1280,9 +1291,12 @@ TEST(QuantizedActivationsOpTest, Softmax2DInt16) { } TEST(QuantizedActivationsOpTest, Softmax3DInt16) { + const float kMin = -1; + const float kMax = 32767.f / 32768.f; QuantizedActivationsOpModel m( 1, - /*input=*/{TensorType_INT16, {1, 2, 4}, -10, 10}, TensorType_INT16); + /*input=*/{TensorType_INT16, {1, 2, 4}, 10 * kMin, 10 * kMax}, + TensorType_INT16); m.SetInput({ 0, -6, 2, 4, // depth = 0 3, -2, 10, 1, // depth = 1 @@ -1299,7 +1313,8 @@ TEST(QuantizedActivationsOpTest, Softmax3DInt16) { // Same input, but a different shape. QuantizedActivationsOpModel m2( 1, - /*input=*/{TensorType_INT16, {4, 1, 2}, -10, 10}, TensorType_INT16); + /*input=*/{TensorType_INT16, {4, 1, 2}, 10 * kMin, 10 * kMax}, + TensorType_INT16); m2.SetInput({ 0, -6, // 2, 4, // @@ -1321,9 +1336,12 @@ TEST(QuantizedActivationsOpTest, Softmax3DInt16) { // Test quantized softmax with int16 input and output. With the same input as in // QuantizedActivationsOpTest.Softmax4D, the dequantized output is identical. TEST(QuantizedActivationsOpTest, Softmax4DInt16) { + const float kMin = -1; + const float kMax = 32767.f / 32768.f; QuantizedActivationsOpModel m( 0.1, - /*input=*/{TensorType_INT16, {1, 2, 1, 4}, -10, 10}, TensorType_INT16); + /*input=*/{TensorType_INT16, {1, 2, 1, 4}, 10 * kMin, 10 * kMax}, + TensorType_INT16); m.SetInput({ 0, -6, 2, 4, // depth = 0 3, -2, 10, 1, // depth = 1 @@ -1340,7 +1358,8 @@ TEST(QuantizedActivationsOpTest, Softmax4DInt16) { // Same input, but a different shape. 
QuantizedActivationsOpModel m2( 0.1, - /*input=*/{TensorType_INT16, {4, 1, 1, 2}, -10, 10}, TensorType_INT16); + /*input=*/{TensorType_INT16, {4, 1, 1, 2}, 10 * kMin, 10 * kMax}, + TensorType_INT16); m2.SetInput({ 0, -6, // 2, 4, // diff --git a/tensorflow/lite/kernels/concatenation.cc b/tensorflow/lite/kernels/concatenation.cc index 5d5f06ba013..acf9780a5bf 100644 --- a/tensorflow/lite/kernels/concatenation.cc +++ b/tensorflow/lite/kernels/concatenation.cc @@ -95,6 +95,15 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } } + if (input_type == kTfLiteInt16) { + // Make sure there all Int16 inputs have a null zero-point. + for (int i = 0; i < node->inputs->size; ++i) { + const TfLiteTensor* t = GetInput(context, node, i); + TF_LITE_ENSURE_EQ(context, t->params.zero_point, 0); + } + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + } + return context->ResizeTensor(context, output, output_size); } diff --git a/tensorflow/lite/kernels/concatenation_test.cc b/tensorflow/lite/kernels/concatenation_test.cc index 4e362598aae..1d899c3c840 100644 --- a/tensorflow/lite/kernels/concatenation_test.cc +++ b/tensorflow/lite/kernels/concatenation_test.cc @@ -287,8 +287,13 @@ TYPED_TEST_CASE(ConcatenationOpTestTyped, TestTypes); TYPED_TEST(ConcatenationOpTestTyped, FourInputsQuantizedInt8) { using TestType = typename TestFixture::TestType; + const float kMin = -1; + const float kMax = + std::numeric_limits::max() / + static_cast(std::numeric_limits::max() + 1); + QuantizedConcatenationOpModel m0( - {TestFixture::tensor_type, {2, 1, 2}, -12.7, 12.8}, + {TestFixture::tensor_type, {2, 1, 2}, 12.8 * kMin, 12.8 * kMax}, /*axis=*/2, /*num_inputs=*/4); @@ -302,20 +307,6 @@ TYPED_TEST(ConcatenationOpTestTyped, FourInputsQuantizedInt8) { 1, 3, 1.1, 3.1, 1.2, 3.2, 1.3, 3.3, // 4, 7, 4.1, 7.1, 4.2, 7.2, 4.3, 7.3 // }))); - - if (TestFixture::tensor_type == TensorType_INT8) { - EXPECT_THAT(m0.GetOutput(), ElementsAreArray({ - 9, 29, 10, 30, 11, 31, 12, 32, // - 
39, 69, 40, 70, 41, 71, 42, 72, // - })); - } - - if (TestFixture::tensor_type == TensorType_INT16) { - EXPECT_THAT(m0.GetOutput(), - ElementsAreArray({2441, 7581, 2698, 7838, 2955, // - 8095, 3212, 8352, 10151, 17861, // - 10408, 18118, 10665, 18375, 10922, 18632})); - } } TEST(ConcatenationOpTest, FourInputsQuantizedMixedRange) { diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc index 1b12945b2f3..eb5010231a9 100644 --- a/tensorflow/lite/kernels/conv.cc +++ b/tensorflow/lite/kernels/conv.cc @@ -333,6 +333,11 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, input_type == kTfLiteInt8 || input_type == kTfLiteInt16); TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_type); + if (input_type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + } + const TfLiteTensor* bias = nullptr; // TODO(ahentz): At this point the optimized versions require 'bias'. 
We can @@ -347,8 +352,6 @@ TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, } else if (input_type == kTfLiteInt16) { TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt64); TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); - TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); - TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); } else { TF_LITE_ENSURE_TYPES_EQ(context, bias->type, input_type); } diff --git a/tensorflow/lite/kernels/depthwise_conv.cc b/tensorflow/lite/kernels/depthwise_conv.cc index 961a987cf02..79af27fcaef 100644 --- a/tensorflow/lite/kernels/depthwise_conv.cc +++ b/tensorflow/lite/kernels/depthwise_conv.cc @@ -128,6 +128,11 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { filter->type == data_type || data_type == kTfLiteInt16); } + if (data_type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + } + // Filter in DepthwiseConv is expected to be [1, H, W, O]. 
TF_LITE_ENSURE_EQ(context, SizeOfDimension(filter, 0), 1); @@ -139,8 +144,6 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } else if (data_type == kTfLiteInt16) { TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt64); TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); - TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); - TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); } else { TF_LITE_ENSURE_TYPES_EQ(context, bias->type, data_type); } diff --git a/tensorflow/lite/kernels/dequantize.cc b/tensorflow/lite/kernels/dequantize.cc index a2a1bd495cf..9aa752406c4 100644 --- a/tensorflow/lite/kernels/dequantize.cc +++ b/tensorflow/lite/kernels/dequantize.cc @@ -60,6 +60,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { op_context.input->type == kTfLiteInt16 || op_context.input->type == kTfLiteFloat16); + if (op_context.input->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, op_context.input->params.zero_point, 0); + } + op_context.output->type = kTfLiteFloat32; // If the input tensor is constant, we can persist the dequantized value in // the output tensor. Otherwise we run dequantize upon each eval. 
diff --git a/tensorflow/lite/kernels/dequantize_test.cc b/tensorflow/lite/kernels/dequantize_test.cc index da795474400..e609d32aa50 100644 --- a/tensorflow/lite/kernels/dequantize_test.cc +++ b/tensorflow/lite/kernels/dequantize_test.cc @@ -108,8 +108,8 @@ TEST(DequantizeOpTest, Float16) { } TEST(DequantizeOpTest, Int16) { - DequantizeOpModel m(TensorType_INT16, {2, 5}, 0.5, -1, 4); - m.SetInput({-130, -127, -126, -125, -124, 123, 124, 125, 126, 130}); + DequantizeOpModel m(TensorType_INT16, {2, 5}, 0.5, 0, 4); + m.SetInput({-129, -126, -125, -124, -123, 124, 125, 126, 127, 131}); m.Invoke(); EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear( diff --git a/tensorflow/lite/kernels/quantize.cc b/tensorflow/lite/kernels/quantize.cc index 1779500e6a2..c305501269a 100644 --- a/tensorflow/lite/kernels/quantize.cc +++ b/tensorflow/lite/kernels/quantize.cc @@ -133,6 +133,13 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { &data->output_shift); } + if (input->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); + } + if (output->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); + } + return context->ResizeTensor(context, output, TfLiteIntArrayCopy(input->dims)); } diff --git a/tensorflow/lite/kernels/quantize_test.cc b/tensorflow/lite/kernels/quantize_test.cc index d7392b3e3ea..b0df3a99646 100644 --- a/tensorflow/lite/kernels/quantize_test.cc +++ b/tensorflow/lite/kernels/quantize_test.cc @@ -91,26 +91,27 @@ TEST(QuantizeOpTest, INT16) { 12700, 12800})); } -// rescale factor is around 2 +// Input scale 1.000000, output scale 0.500000, input zeropoint 0, output +// zeropoint 0 TEST(QuantizeOpTest, Int16Int16) { - QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, -16383, 16384}, - {TensorType_INT16, {1, 1, 2, 5}, 0, 16384}); + QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, 0, 0, 1.0, 0}, + {TensorType_INT16, {1, 1, 2, 5}, 0, 0, 0.5, 0}); m.SetInputAndQuantize({1, 2, 3, 4, 5, 6, 7, 
8, 9, 10}); m.Invoke(); EXPECT_THAT(m.GetOutput(), - ElementsAreArray({-32764, -32760, -32756, -32752, -32748, -32744, - -32740, -32736, -32732, -32728})); + ElementsAreArray({2, 4, 6, 8, 10, 12, 14, 16, 18, 20})); } -// zero point is -1, scale is 0.5 +// Input scale 0.500000, output scale 0.500000, input zeropoint 0, output +// zeropoint 0 TEST(QuantizeOpTest, Int16Int16SameScale) { - QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, -16384, 16384}, - {TensorType_INT16, {1, 1, 2, 5}, -16384, 16384}); + QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, 0, 0, 0.5, 0}, + {TensorType_INT16, {1, 1, 2, 5}, 0, 0, 0.5, 0}); m.SetInputAndQuantize({0, 1, 2, 3, 4, 5, 6, 7, 8, 37767}); m.Invoke(); EXPECT_THAT(m.GetOutput(), - ElementsAreArray({-1, 1, 3, 5, 7, 9, 11, 13, 15, 32767})); + ElementsAreArray({0, 2, 4, 6, 8, 10, 12, 14, 16, 32767})); } // Input scale 0.500000, output scale 0.500000, input zeropoint -1, output @@ -408,24 +409,24 @@ TEST(QuantizeOpTest, Uint8Int8SmallerScale) { ElementsAreArray({1, 3, 5, 7, 9, 11, 13, 15, 17, 19})); } -// Input scale 0.500000, output scale 0.500000, input zeropoint -1, output +// Input scale 0.500000, output scale 0.500000, input zeropoint 0, output // zeropoint -1 TEST(QuantizeOpTest, Int16Int8SameScale) { - QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, -63.5, 64}, - {TensorType_INT8, {1, 1, 2, 5}, -63.5, 64}); + QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, 0, 0, 0.5, 0}, + {TensorType_INT8, {1, 1, 2, 5}, 0, 0, 0.5, -1}); - // Input will quantized to {1,3,5,7,9,11,13,15,17,19}. + // Input will quantized to {2,4,6,8,10,12,14,16,18,20}. 
m.SetInputAndQuantize({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); m.Invoke(); EXPECT_THAT(m.GetOutput(), ElementsAreArray({1, 3, 5, 7, 9, 11, 13, 15, 17, 19})); } -// Input scale 0.500000, output scale 1.000000, input zeropoint -1, output +// Input scale 0.500000, output scale 1.000000, input zeropoint 0, output // zeropoint -1 TEST(QuantizeOpTest, Int16Int8LargerScale) { - QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, -63.5, 64}, - {TensorType_INT8, {1, 1, 2, 5}, -127, 128}); + QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, 0, 0, 0.5, 0}, + {TensorType_INT8, {1, 1, 2, 5}, 0, 0, 1.0, -1}); m.SetInputAndQuantize({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); m.Invoke(); @@ -433,11 +434,11 @@ TEST(QuantizeOpTest, Int16Int8LargerScale) { ElementsAreArray({0, 1, 2, 3, 4, 5, 6, 7, 8, 9})); } -// Input scale 1.000000, output scale 0.500000, input zeropoint -1, output +// Input scale 1.000000, output scale 0.500000, input zeropoint 0, output // zeropoint -1 TEST(QuantizeOpTest, Int16Int8SmallerScale) { - QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, -127, 128}, - {TensorType_INT8, {1, 1, 2, 5}, -63.5, 64}); + QuantizeOpModel m({TensorType_INT16, {1, 1, 2, 5}, 0, 0, 1.0, 0}, + {TensorType_INT8, {1, 1, 2, 5}, 0, 0, 0.5, -1}); m.SetInputAndQuantize({1, 2, 3, 4, 5, 6, 7, 8, 9, 10}); m.Invoke(); @@ -447,8 +448,8 @@ TEST(QuantizeOpTest, Int16Int8SmallerScale) { // Same as previous test, except more data to hit the neon path. 
TEST(QuantizeOpTest, Int16Int8SmallerScaleNeonPath) { - QuantizeOpModel m({TensorType_INT16, {1, 1, 4, 5}, -127, 128}, - {TensorType_INT8, {1, 1, 4, 5}, -63.5, 64}); + QuantizeOpModel m({TensorType_INT16, {1, 1, 4, 5}, 0, 0, 1.0, 0}, + {TensorType_INT8, {1, 1, 4, 5}, 0, 0, 0.5, -1}); m.SetInputAndQuantize( {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1}); diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc index 10fd7b02b61..acb49e0932b 100644 --- a/tensorflow/lite/kernels/reduce.cc +++ b/tensorflow/lite/kernels/reduce.cc @@ -254,6 +254,12 @@ TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) { QuantizeMultiplier(real_multiplier, &data->multiplier, &exponent); data->shift = exponent; } + + if (op_context.input->type == kTfLiteInt16) { + TF_LITE_ENSURE_EQ(context, op_context.input->params.zero_point, 0); + TF_LITE_ENSURE_EQ(context, op_context.output->params.zero_point, 0); + } + TfLiteTensor* temp_sum = GetTemporary(context, node, /*index=*/2); if (!IsConstantTensor(op_context.axis)) { SetTensorToDynamic(temp_sum); diff --git a/tensorflow/lite/kernels/reduce_test.cc b/tensorflow/lite/kernels/reduce_test.cc index fd5724a102b..5363f64974b 100644 --- a/tensorflow/lite/kernels/reduce_test.cc +++ b/tensorflow/lite/kernels/reduce_test.cc @@ -52,6 +52,24 @@ class BaseOpModel : public SingleOpModel { int Input() { return input_; } + protected: + TensorData& SymmetricInt16Scaling(TensorData& tensor) { + // Symmetric range and null zero-point is required for INT16 tensors. As + // SingleOpModel::QuantizationParams calculates the scale on an asymmetric + // base [int_type::min, int_type::max], manually calculate the scale on a + // symmetric range [int_type::min+1, int_type::max] to ensure a null + // zero-point. 
+ if (tensor.type == TensorType_INT16) { + CHECK_EQ(std::abs(tensor.min), tensor.max); + tensor.scale = tensor.max / std::numeric_limits::max(); + tensor.zero_point = 0; + tensor.min = 0; + tensor.max = 0; + } + + return tensor; + } + protected: int input_; int axis_; @@ -61,12 +79,12 @@ class BaseOpModel : public SingleOpModel { // Model for the tests case where axis is a const tensor. class MeanOpConstModel : public BaseOpModel { public: - MeanOpConstModel(const TensorData& input, const TensorData& output, + MeanOpConstModel(TensorData input, TensorData output, std::initializer_list axis_shape, std::initializer_list axis, bool keep_dims) { - input_ = AddInput(input); + input_ = AddInput(SymmetricInt16Scaling(input)); axis_ = AddConstInput(TensorType_INT32, axis, axis_shape); - output_ = AddOutput(output); + output_ = AddOutput(SymmetricInt16Scaling(output)); SetBuiltinOp(BuiltinOperator_MEAN, BuiltinOptions_ReducerOptions, CreateReducerOptions(builder_, keep_dims).Union()); BuildInterpreter({GetShape(input_)}); @@ -439,13 +457,10 @@ TEST(ConstUint8MeanOpTest, KeepDims) { template void MeanOpConstModelTest() { - float kQuantizedTolerance = GetTolerance(-5.0, 5.0); + float kQuantizedTolerance = GetTolerance(-255.0, 255.0); std::vector data = {105.0, 71.0, 233.0, 92.0, 227.0, 11.0, 14.0, 43.0}; - - float scale = tensor_dtype == TensorType_INT16 ? 255 / 32767.0f : 0.0f; - - MeanOpConstModel m({tensor_dtype, {1, 1, 2, 4}, 0.0, 255.0, scale, 0}, - {tensor_dtype, {1, 2, 4}, 0.0, 255.0, scale, 0}, {1}, {1}, + MeanOpConstModel m({tensor_dtype, {1, 1, 2, 4}, -255.0, 255.0}, + {tensor_dtype, {1, 2, 4}, -255, 255.0}, {1}, {1}, false); m.QuantizeAndPopulate(m.Input(), data); m.Invoke(); @@ -468,11 +483,8 @@ template void ConstMeanOpTestNonSameScale() { float kQuantizedTolerance = GetTolerance(-5.0, 5.0); std::vector data = {0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}; - - float scale = tensor_dtype == TensorType_INT16 ? 
1 / 32767.f : 0.0f; - - MeanOpConstModel m({tensor_dtype, {1, 1, 2, 4}, -1.0, 1.0, scale, 0}, - {tensor_dtype, {1, 2}, -5.0, 5.0, scale, 0}, {2}, {1, 3}, + MeanOpConstModel m({tensor_dtype, {1, 1, 2, 4}, -1.0, 1.0}, + {tensor_dtype, {1, 2}, -5.0, 5.0}, {2}, {1, 3}, false); m.QuantizeAndPopulate(m.Input(), data); m.Invoke(); @@ -495,15 +507,12 @@ TEST_F(ConstMeanOpTestNonSameScale, NonSpecialAxisNonSameScaleInt16) { template void MeanOpTestQuantizedSameScale() { float kQuantizedTolerance = GetTolerance(-5.0, 5.0); - - float scale = tensor_dtype == TensorType_INT16 ? 1 / 32767.f : 0.0f; - std::vector data = {0.1, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.5, 0.1, 0.1, 0.1, 0.1, 0.4, 0.2, 0.2, 0.2, 0.9, 0.9, 0.9, 0.9, 0.2, 0.3, 0.7, 0.7, 0.1, 0.1, 0.3, 0.3, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4}; - MeanOpConstModel m({tensor_dtype, {1, 2, 2, 9}, -1.0, 1.0, scale, 0}, - {tensor_dtype, {2}, -1.0, 1.0, scale, 0}, {2}, {1, 2}, + MeanOpConstModel m({tensor_dtype, {1, 2, 2, 9}, -1.0, 1.0}, + {tensor_dtype, {2}, -1.0, 1.0}, {2}, {1, 2}, true); m.QuantizeAndPopulate(m.Input(), data); m.Invoke(); @@ -527,15 +536,12 @@ TEST_F(MeanOpTestQuantizedSameScale, QuantizedSameScaleInt16) { template void MeanOpTestQuantizedDifferentScale() { float kQuantizedTolerance = GetTolerance(-5.0, 5.0); - - float scale = tensor_dtype == TensorType_INT16 ? 
1 / 32767.f : 0.0f; - std::vector data = {0.1, 0.2, 0.3, 0.4, 0.2, 0.3, 0.4, 0.5, 0.1, 0.1, 0.1, 0.1, 0.4, 0.2, 0.2, 0.2, 0.9, 0.9, 0.9, 0.9, 0.2, 0.3, 0.7, 0.7, 0.1, 0.1, 0.3, 0.3, 0.1, 0.2, 0.3, 0.4, 0.1, 0.2, 0.3, 0.4}; - MeanOpConstModel m({tensor_dtype, {1, 2, 2, 9}, -1.0, 1.0, scale, 0}, - {tensor_dtype, {2}, -4.0, 4.0, scale, 0}, {2}, {1, 2}, + MeanOpConstModel m({tensor_dtype, {1, 2, 2, 9}, -1.0, 1.0}, + {tensor_dtype, {2}, -4.0, 4.0}, {2}, {1, 2}, true); m.QuantizeAndPopulate(m.Input(), data); m.Invoke(); From c20adea7680e2d43f64d39c7179fa59bc2dcfe2b Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Fri, 11 Sep 2020 14:05:14 +0100 Subject: [PATCH 0026/1447] PRELU operator doesn't support int16 yet, remove kTfLiteInt16 from PreluPrepare --- tensorflow/lite/kernels/activations.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc index fe40bf47a64..cfc1fe17cf3 100644 --- a/tensorflow/lite/kernels/activations.cc +++ b/tensorflow/lite/kernels/activations.cc @@ -667,8 +667,7 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) { output->type = input->type; - if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 || - output->type == kTfLiteInt16) { + if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8) { // prelu(x) = x if x >= 0 else x * alpha. 
// So if we translate that for quantized computation: // From d8ec97466cbae03171944638af1ab9ce7ced36ca Mon Sep 17 00:00:00 2001 From: Eugene Kuznetsov Date: Wed, 22 Jan 2020 12:16:47 -0800 Subject: [PATCH 0027/1447] Fixing and enabling TopK on ROCm --- tensorflow/core/kernels/gpu_prim.h | 14 ++++- tensorflow/core/kernels/in_topk_op_test.cc | 4 +- tensorflow/core/kernels/topk_op.cc | 4 +- tensorflow/core/kernels/topk_op_gpu.h | 58 +++++++++++-------- .../core/kernels/topk_op_gpu_double.cu.cc | 4 +- .../core/kernels/topk_op_gpu_float.cu.cc | 4 +- .../core/kernels/topk_op_gpu_half.cu.cc | 4 +- .../core/kernels/topk_op_gpu_int16.cu.cc | 4 +- .../core/kernels/topk_op_gpu_int32.cu.cc | 4 +- .../core/kernels/topk_op_gpu_int64.cu.cc | 4 +- .../core/kernels/topk_op_gpu_int8.cu.cc | 4 +- .../core/kernels/topk_op_gpu_uint16.cu.cc | 4 +- .../core/kernels/topk_op_gpu_uint8.cu.cc | 4 +- .../python/kernel_tests/topk_op_test.py | 12 ++-- 14 files changed, 76 insertions(+), 52 deletions(-) diff --git a/tensorflow/core/kernels/gpu_prim.h b/tensorflow/core/kernels/gpu_prim.h index 82fcb21e0ac..85d6a6050b7 100644 --- a/tensorflow/core/kernels/gpu_prim.h +++ b/tensorflow/core/kernels/gpu_prim.h @@ -31,17 +31,27 @@ limitations under the license, the license you must see. 
#include "third_party/gpus/cuda/include/cusparse.h" namespace gpuprim = ::cub; + +// Required for sorting Eigen::half +namespace cub { +template <> +struct NumericTraits + : BaseTraits {}; +} // namespace cub + #elif TENSORFLOW_USE_ROCM #include "rocm/include/hipcub/hipcub.hpp" namespace gpuprim = ::hipcub; +// Required for sorting Eigen::half namespace rocprim { namespace detail { template <> struct radix_key_codec_base - : radix_key_codec_floating {}; + : radix_key_codec_floating {}; }; // namespace detail }; // namespace rocprim -#endif // GOOGLE_CUDA + +#endif // TENSORFLOW_USE_ROCM #endif // TENSORFLOW_CORE_KERNELS_GPU_PRIM_H_ diff --git a/tensorflow/core/kernels/in_topk_op_test.cc b/tensorflow/core/kernels/in_topk_op_test.cc index aacecb08bbe..e6baa92b764 100644 --- a/tensorflow/core/kernels/in_topk_op_test.cc +++ b/tensorflow/core/kernels/in_topk_op_test.cc @@ -76,9 +76,9 @@ static Graph* InTopK(int num_targets, int num_classes, T top_k) { BM_InTopK(int64, 64, 1000, 10, cpu); BM_InTopK(int64, 64, 10000, 10, cpu); -#ifdef GOOGLE_CUDA +#ifdef GOOGLE_CUDA || TENSORFLOW_USE_ROCM BM_InTopK(int64, 64, 1000, 10, gpu); BM_InTopK(int64, 64, 10000, 10, gpu); -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // namespace tensorflow diff --git a/tensorflow/core/kernels/topk_op.cc b/tensorflow/core/kernels/topk_op.cc index 50325b7bcfe..2b9c2181a2d 100644 --- a/tensorflow/core/kernels/topk_op.cc +++ b/tensorflow/core/kernels/topk_op.cc @@ -244,7 +244,7 @@ TF_CALL_REAL_NUMBER_TYPES(REGISTER_KERNELS); #undef REGISTER_KERNELS_NAME #undef REGISTER_KERNELS -#ifdef GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM namespace functor { #define DECLARE_GPU_SPEC(T) \ @@ -277,6 +277,6 @@ TF_CALL_GPU_NUMBER_TYPES(REGISTER_KERNELS); TF_CALL_INTEGRAL_TYPES(REGISTER_KERNELS); #undef REGISTER_KERNELS -#endif // end GOOGLE_CUDA +#endif // end GOOGLE_CUDA || TENSORFLOW_USE_ROCM } // end namespace tensorflow diff --git a/tensorflow/core/kernels/topk_op_gpu.h 
b/tensorflow/core/kernels/topk_op_gpu.h index d26dd7a8bc3..0cb448bb86e 100644 --- a/tensorflow/core/kernels/topk_op_gpu.h +++ b/tensorflow/core/kernels/topk_op_gpu.h @@ -15,11 +15,12 @@ limitations under the License. #ifndef TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_ #define TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include +#include #include #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" @@ -34,15 +35,6 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/gpu_kernel_helper.h" -#if GOOGLE_CUDA -// Required for sorting Eigen::half -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub -#endif // GOOGLE_CUDA - namespace tensorflow { typedef Eigen::GpuDevice GPUDevice; @@ -93,7 +85,7 @@ struct IndirectLinearData { Entry* const backing_data; }; -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM template struct StridedData { typedef impl::Entry Entry; @@ -115,6 +107,7 @@ template ::Entry Entry; const Data data; + __device__ IndexedHeap(const Data& d) : data(d) {} __device__ bool is_above(int left, int right) { T left_value = data.get_value(left); @@ -337,12 +330,21 @@ __device__ void mergeShards(int num_shards, int k, } } +#if GOOGLE_CUDA extern __shared__ char shared_memory[]; +#endif template -__global__ void TopKKernel(const T* __restrict__ input, int length, int k, - bool sorted, T* __restrict__ output, - int* __restrict__ indices) { +#if TENSORFLOW_USE_ROCM +__attribute__((amdgpu_flat_work_group_size(1, 256))) +#endif +__global__ void TopKKernel( + const T* __restrict__ input, int length, int k, bool sorted, + T* __restrict__ output, int* __restrict__ indices) { +#if TENSORFLOW_USE_ROCM + HIP_DYNAMIC_SHARED(char, shared_memory); +#endif + const int batch_index = blockIdx.x; const T* batch_input = input + batch_index * length; @@ -370,7 +372,7 @@ __global__ void TopKKernel(const T* 
__restrict__ input, int length, int k, } template -cudaError LaunchTopKKernel(const cudaStream_t& stream, int num_shards, +cudaError LaunchTopKKernel(const gpuStream_t& stream, int num_shards, const T* input, int batch_size, int length, int k, bool sorted, T* output, int* indices) { // This code assumes that k is small enough that the computation @@ -395,9 +397,17 @@ cudaError LaunchTopKKernel(const cudaStream_t& stream, int num_shards, } if (num_shards <= 0) { num_shards = 1; +#if GOOGLE_CUDA } else if (num_shards > 1024) { num_shards = 1024; } +#else + // ROCm can't execute with 1024 and requires an explicit + // amdgpu_flat_work_group_size attribute with >256 + } else if (num_shards > 256) { + num_shards = 256; + } +#endif } // We are limited by the amount of shared memory we have per block. auto shared_memory_size = (num_shards + 1) * k * sizeof(Entry); @@ -448,9 +458,9 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows, input_indices_t.device(d) = input_indices_t.generate(ColumnIndexCreator(num_cols)); - cub::CountingInputIterator counting_iter(0); - cub::TransformInputIterator> + gpuprim::CountingInputIterator counting_iter(0); + gpuprim::TransformInputIterator> segment_offsets_t(counting_iter, SegmentOffsetCreator(num_cols)); Tensor temp_values; @@ -472,7 +482,7 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows, sorted_values_ptr = temp_values.flat().data(); } - auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + auto err = gpuprim::DeviceSegmentedRadixSort::SortPairsDescending( /* d_temp_storage */ nullptr, /* temp_storage_bytes */ temp_storage_bytes, /* d_keys_in */ input, @@ -489,7 +499,8 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows, if (err != cudaSuccess) { return errors::Internal( "TopKOp: Could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate " + "cub::gpuprim::DeviceSegmentedRadixSort::SortPairsDescending to " + 
"calculate " "temp_storage_bytes, status: ", cudaGetErrorString(err)); } @@ -497,7 +508,7 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows, TF_RETURN_IF_ERROR(ctx->allocate_temp( DT_INT8, TensorShape({static_cast(temp_storage_bytes)}), &temp_storage)); - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + err = gpuprim::DeviceSegmentedRadixSort::SortPairsDescending( /* d_temp_storage */ temp_storage.flat().data(), /* temp_storage_bytes */ temp_storage_bytes, /* d_keys_in */ input, @@ -514,7 +525,8 @@ Status LaunchSortKernel(OpKernelContext* ctx, const T* input, int num_rows, if (err != cudaSuccess) { return errors::Internal( "TopKOp: Could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, " + "cub::gpuprim::DeviceSegmentedRadixSort::SortPairsDescending to sort " + "input, " "temp_storage_bytes: ", temp_storage_bytes, ", status: ", cudaGetErrorString(err)); } @@ -567,6 +579,6 @@ struct TopKFunctor { } // end namespace functor } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM #endif // TENSORFLOW_CORE_KERNELS_TOPK_OP_GPU_H_ diff --git a/tensorflow/core/kernels/topk_op_gpu_double.cu.cc b/tensorflow/core/kernels/topk_op_gpu_double.cu.cc index 8a5a7e71b1b..787aafdfd07 100644 --- a/tensorflow/core/kernels/topk_op_gpu_double.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu_double.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/topk_op.h" @@ -25,4 +25,4 @@ using Eigen::GpuDevice; template struct functor::TopKFunctor; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/topk_op_gpu_float.cu.cc b/tensorflow/core/kernels/topk_op_gpu_float.cu.cc index 0b69396bb13..10d106248f9 100644 --- a/tensorflow/core/kernels/topk_op_gpu_float.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu_float.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/topk_op.h" @@ -25,4 +25,4 @@ using Eigen::GpuDevice; template struct functor::TopKFunctor; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/topk_op_gpu_half.cu.cc b/tensorflow/core/kernels/topk_op_gpu_half.cu.cc index e53586aeca2..bde26cb0951 100644 --- a/tensorflow/core/kernels/topk_op_gpu_half.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu_half.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/topk_op.h" @@ -25,4 +25,4 @@ using Eigen::GpuDevice; template struct functor::TopKFunctor; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc index 5bd310523c9..fba39300700 100644 --- a/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu_int16.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/topk_op.h" @@ -25,4 +25,4 @@ using Eigen::GpuDevice; template struct functor::TopKFunctor; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc index 55b393a0c02..a017234597d 100644 --- a/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu_int32.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/topk_op.h" @@ -25,4 +25,4 @@ using Eigen::GpuDevice; template struct functor::TopKFunctor; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc index 3e4a7750563..ed9f6ea52c6 100644 --- a/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu_int64.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/topk_op.h" @@ -25,4 +25,4 @@ using Eigen::GpuDevice; template struct functor::TopKFunctor; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc b/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc index ac73cd170b8..647700ebcda 100644 --- a/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu_int8.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/topk_op.h" @@ -25,4 +25,4 @@ using Eigen::GpuDevice; template struct functor::TopKFunctor; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc index bc64a2ecd63..41ab6ffa601 100644 --- a/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu_uint16.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/topk_op.h" @@ -27,4 +27,4 @@ template struct functor::TopKFunctor; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc b/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc index fc1a8a2c8cc..6d544291fed 100644 --- a/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc +++ b/tensorflow/core/kernels/topk_op_gpu_uint8.cu.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#if GOOGLE_CUDA +#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM #define EIGEN_USE_GPU #include "tensorflow/core/kernels/topk_op.h" @@ -25,4 +25,4 @@ using Eigen::GpuDevice; template struct functor::TopKFunctor; } // namespace tensorflow -#endif // GOOGLE_CUDA +#endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py index eb74d96786b..b64488b042d 100644 --- a/tensorflow/python/kernel_tests/topk_op_test.py +++ b/tensorflow/python/kernel_tests/topk_op_test.py @@ -102,11 +102,13 @@ class TopKTest(test.TestCase): self._validateTopK(inputs, 2, [[0.4, 0.3], [0.4, 0.3]], [[3, 1], [2, 1]]) def testTop3(self): - k = 5 - inputs = np.random.permutation(np.linspace(0, 100, 6140, dtype=np.float64)) - indices = np.argsort(-inputs)[:k] - values = -np.sort(-inputs)[:k] - self._validateTopK(inputs, k, values, indices) + for k in range(3, 11, 2): + for dim in range(512, 12288, 512): + inputs = np.random.permutation(np.linspace(0, 100, dim, + dtype=np.float64)) + indices = np.argsort(-inputs)[:k] + values = -np.sort(-inputs)[:k] + self._validateTopK(inputs, k, values, indices) def testTop1AllNan(self): inputs = [[np.NaN, np.NaN], [np.NaN, np.NaN]] From aaea82e6bcd0948a9b2bf10396684c2f01fb60ea Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 6 Jul 2020 21:09:32 +1000 Subject: [PATCH 0028/1447] Add cublasLt wrappers to stream_executor - Adds ThenBlasLtMatmul routines that behave similarly to ThenBlasGemmWithAlgorithm but call into the cublasLt library and allow separation of plan creation and execution. - A list of heuristically-prioritized opaque algorithm objects can be obtained via GetBlasLtMatmulAlgorithms. - These routines are only supported when the CUDA version is >= 11.0. 
--- tensorflow/stream_executor/blas.cc | 25 + tensorflow/stream_executor/blas.h | 218 ++++++ tensorflow/stream_executor/cuda/BUILD | 24 + .../stream_executor/cuda/cublasLt_11_0.inc | 415 +++++++++++ .../stream_executor/cuda/cublasLt_stub.cc | 59 ++ tensorflow/stream_executor/cuda/cuda_blas.cc | 680 +++++++++++++++++- tensorflow/stream_executor/cuda/cuda_blas.h | 30 + .../platform/default/dlopen_checker.cc | 3 +- .../platform/default/dso_loader.cc | 9 + .../platform/default/dso_loader.h | 2 + tensorflow/stream_executor/stream.cc | 137 ++++ tensorflow/stream_executor/stream.h | 50 ++ .../stream_executor/stream_executor_pimpl.cc | 43 ++ .../stream_executor/stream_executor_pimpl.h | 29 + third_party/gpus/cuda/BUILD.tpl | 8 + third_party/gpus/cuda_configure.bzl | 12 + 16 files changed, 1741 insertions(+), 3 deletions(-) create mode 100644 tensorflow/stream_executor/cuda/cublasLt_11_0.inc create mode 100644 tensorflow/stream_executor/cuda/cublasLt_stub.cc diff --git a/tensorflow/stream_executor/blas.cc b/tensorflow/stream_executor/blas.cc index f499b3003d0..f55e318e88b 100644 --- a/tensorflow/stream_executor/blas.cc +++ b/tensorflow/stream_executor/blas.cc @@ -95,5 +95,30 @@ std::ostream& operator<<(std::ostream& os, ComputationType ty) { return os << ComputationTypeString(ty); } +string DataTypeString(DataType ty) { + switch (ty) { + case DataType::kF16: + return "f16"; + case DataType::kF32: + return "f32"; + case DataType::kF64: + return "f64"; + case DataType::kI8: + return "i8"; + case DataType::kI32: + return "i32"; + case DataType::kComplexF32: + return "complex f32"; + case DataType::kComplexF64: + return "complex f64"; + default: + LOG(FATAL) << "Unknown DataType " << static_cast(ty); + } +} + +std::ostream& operator<<(std::ostream& os, DataType ty) { + return os << DataTypeString(ty); +} + } // namespace blas } // namespace stream_executor diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index 5018d487ed1..583fba2a505 100644 
--- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -101,6 +101,10 @@ enum class ComputationType { kI32, // 32-bit integer kComplexF32, // Complex number comprised of two f32s. kComplexF64, // Complex number comprised of two f64s. + // The below values are only supported for BlasLt routines (both real and + // complex). + kF32FastTF32, // 32-bit floating-point with reduced (>=10-bit) mantissa + kF32FastBF16, // 32-bit floating-point with reduced (7-bit) mantissa }; // Converts a ComputationType to a string. @@ -108,6 +112,61 @@ std::string ComputationTypeString(ComputationType ty); std::ostream &operator<<(std::ostream &os, ComputationType ty); +// Type with which inputs and outputs of a blaslt routine are performed. +enum class DataType { + kF16, // 16-bit floating-point + kF32, // 32-bit floating-point + kF64, // 64-bit floating-point + kI8, // 8-bit integer + kI32, // 32-bit integer + kComplexF32, // Complex number comprised of two f32s + kComplexF64, // Complex number comprised of two f64s +}; + +// Describes the type of pointers for the scaling factors alpha and beta in +// blaslt routines. +enum class PointerMode { + kHost, + kDevice, +}; + +// Converts a DataType to a string. +string DataTypeString(DataType ty); + +std::ostream &operator<<(std::ostream &os, DataType ty); + +// Converts a compile-time type to a DataType value.
+template +struct ToDataType {}; +template <> +struct ToDataType { + static constexpr const DataType value = DataType::kF16; +}; +template <> +struct ToDataType { + static constexpr const DataType value = DataType::kF32; +}; +template <> +struct ToDataType { + static constexpr const DataType value = DataType::kF64; +}; +template <> +struct ToDataType { + static constexpr const DataType value = DataType::kI8; +}; +template <> +struct ToDataType { + static constexpr const DataType value = DataType::kI32; +}; +template <> +struct ToDataType> { + static constexpr const DataType value = DataType::kComplexF32; +}; +template <> +struct ToDataType> { + static constexpr const DataType value = DataType::kComplexF64; +}; + // Opaque identifier for an "algorithm" used by a blas routine. This functions // as a hint to the blas library. typedef int64 AlgorithmType; @@ -163,6 +222,19 @@ class AlgorithmConfig { AlgorithmType algorithm_; }; +struct IBlasLtMatmulPlan { + virtual ~IBlasLtMatmulPlan() {} +}; + +struct IBlasLtMatmulAlgorithm { + virtual ~IBlasLtMatmulAlgorithm() {} + // Returns the index of the algorithm within the list returned by + // GetBlasLtMatmulAlgorithms. + virtual AlgorithmType index() const = 0; + // Returns the workspace size required by the algorithm in bytes. + virtual size_t workspace_size() const = 0; +}; + // BLAS support interface -- this can be derived from a GPU executor when the // underlying platform has an BLAS library implementation available. See // StreamExecutor::AsBlas(). @@ -1383,6 +1455,93 @@ class BlasSupport { const DeviceMemory> &a, int lda, DeviceMemory> *b, int ldb) = 0; + // Creates a backend-specific plan object for a blaslt matmul operation, which + // can then be passed to DoBlasLtMatmul(). When possible, plans should be + // created once and reused for multiple calls to DoBlasLtMatmul(). + // Returns a null pointer on failure. 
+ std::unique_ptr CreateBlasLtMatmulPlan( + blas::DataType ab_type, blas::DataType c_type, + blas::ComputationType computation_type, blas::PointerMode pointer_mode, + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, int64 lda, int64 ldb, int64 ldc) { + return CreateBlasLtMatmulPlanStridedBatched( + ab_type, c_type, computation_type, pointer_mode, transa, transb, m, n, + k, 1, lda, 0, ldb, 0, ldc, 0); + } + + // A more general version of CreateBlasLtMatmulPlan supporting + // batched operations. + virtual std::unique_ptr + CreateBlasLtMatmulPlanStridedBatched( + blas::DataType ab_type, blas::DataType c_type, + blas::ComputationType computation_type, blas::PointerMode pointer_mode, + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, int batch_count, int64 lda, int64 stride_a, int64 ldb, + int64 stride_b, int64 ldc, int64 stride_c) = 0; + + // Gets a list of supported algorithms for DoBlasLtMatmul. The algorithms are + // returned in the order of increasing estimated compute time according to an + // internal heuristic. The first returned algorithm can be used as the default + // algorithm if no autotuning is to be performed. + virtual bool GetBlasLtMatmulAlgorithms( + const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, + int max_algorithm_count, + std::vector>* + out_algorithms) = 0; + + // Executes a blaslt matmul operation on the stream. If output_profile_result + // is not nullptr, the operation is profiled, error messages are + // suppressed, and output_profile_result->algorithm() is set to + // algorithm->index(). 
+ virtual bool DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr) = 0; + virtual bool DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + const DeviceMemory& a, const DeviceMemory& b, + const HostOrDeviceScalar& beta, DeviceMemory* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr) = 0; + virtual bool DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr) = 0; + virtual bool DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr) = 0; + virtual bool DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar>& alpha, + const DeviceMemory>& a, + const DeviceMemory>& b, + const HostOrDeviceScalar>& beta, + DeviceMemory>* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr) = 0; + virtual bool DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar>& alpha, + const DeviceMemory>& a, + 
const DeviceMemory>& b, + const HostOrDeviceScalar>& beta, + DeviceMemory>* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr) = 0; + virtual port::Status GetVersion(std::string *version) = 0; protected: @@ -2196,6 +2355,65 @@ class BlasSupport { uint64 n, std::complex alpha, \ const DeviceMemory> &a, int lda, \ DeviceMemory> *b, int ldb) override; \ + std::unique_ptr \ + CreateBlasLtMatmulPlanStridedBatched( \ + blas::DataType ab_type, blas::DataType cd_type, \ + blas::ComputationType computation_type, blas::PointerMode pointer_mode, \ + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, \ + uint64 k, int batch_count, int64 lda, int64 stride_a, int64 ldb, \ + int64 stride_b, int64 ldc, int64 stride_c) override; \ + bool GetBlasLtMatmulAlgorithms( \ + const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, \ + int max_algorithm_count, \ + std::vector>* \ + out_algorithms) override; \ + bool DoBlasLtMatmul( \ + Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ + const HostOrDeviceScalar& alpha, const DeviceMemory& a, \ + const DeviceMemory& b, const HostOrDeviceScalar& beta, \ + DeviceMemory* c, ScratchAllocator* scratch_allocator, \ + const blas::IBlasLtMatmulAlgorithm* algorithm, \ + blas::ProfileResult* output_profile_result = nullptr) override; \ + bool DoBlasLtMatmul( \ + Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ + const HostOrDeviceScalar& alpha, \ + const DeviceMemory& a, const DeviceMemory& b, \ + const HostOrDeviceScalar& beta, \ + DeviceMemory* c, ScratchAllocator* scratch_allocator, \ + const blas::IBlasLtMatmulAlgorithm* algorithm, \ + blas::ProfileResult* output_profile_result) override; \ + bool DoBlasLtMatmul( \ + Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ + const HostOrDeviceScalar& alpha, const DeviceMemory& a, \ + const DeviceMemory& b, const HostOrDeviceScalar& beta, \ + DeviceMemory* c, 
ScratchAllocator* scratch_allocator, \ + const blas::IBlasLtMatmulAlgorithm* algorithm, \ + blas::ProfileResult* output_profile_result) override; \ + bool DoBlasLtMatmul( \ + Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ + const HostOrDeviceScalar& alpha, const DeviceMemory& a, \ + const DeviceMemory& b, const HostOrDeviceScalar& beta, \ + DeviceMemory* c, ScratchAllocator* scratch_allocator, \ + const blas::IBlasLtMatmulAlgorithm* algorithm, \ + blas::ProfileResult* output_profile_result) override; \ + bool DoBlasLtMatmul(Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ + const HostOrDeviceScalar>& alpha, \ + const DeviceMemory>& a, \ + const DeviceMemory>& b, \ + const HostOrDeviceScalar>& beta, \ + DeviceMemory>* c, \ + ScratchAllocator* scratch_allocator, \ + const blas::IBlasLtMatmulAlgorithm* algorithm, \ + blas::ProfileResult* output_profile_result) override; \ + bool DoBlasLtMatmul(Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ + const HostOrDeviceScalar>& alpha, \ + const DeviceMemory>& a, \ + const DeviceMemory>& b, \ + const HostOrDeviceScalar>& beta, \ + DeviceMemory>* c, \ + ScratchAllocator* scratch_allocator, \ + const blas::IBlasLtMatmulAlgorithm* algorithm, \ + blas::ProfileResult* output_profile_result) override; \ port::Status GetVersion(std::string *version) override; } // namespace blas diff --git a/tensorflow/stream_executor/cuda/BUILD b/tensorflow/stream_executor/cuda/BUILD index dccdab8877e..87cb64490a6 100644 --- a/tensorflow/stream_executor/cuda/BUILD +++ b/tensorflow/stream_executor/cuda/BUILD @@ -242,6 +242,29 @@ alias( visibility = ["//visibility:public"], ) +cc_library( + name = "cublasLt_stub", + srcs = if_cuda_is_configured(["cublasLt_stub.cc"]), + textual_hdrs = glob(["cublasLt_*.inc"]), + deps = if_cuda_is_configured([ + # LINT.IfChange + "@local_config_cuda//cuda:cublas_headers", + # LINT.ThenChange(//tensorflow/copy.bara.sky:cublasLt_headers) + "@local_config_cuda//cuda:cuda_headers", + 
"//tensorflow/stream_executor/lib", + "//tensorflow/stream_executor/platform:dso_loader", + ]), +) + +alias( + name = "cublasLt_lib", + actual = select({ + "//tensorflow:oss": ":cublasLt_stub", + "//conditions:default": "@local_config_cuda//cuda:cublasLt", + }), + visibility = ["//visibility:public"], +) + cc_library( name = "cublas_plugin", srcs = if_cuda_is_configured(["cuda_blas.cc"]), @@ -249,6 +272,7 @@ cc_library( visibility = ["//visibility:public"], deps = if_cuda_is_configured([ ":cublas_lib", + ":cublasLt_lib", ":cuda_activation", ":cuda_gpu_executor", ":cuda_platform_id", diff --git a/tensorflow/stream_executor/cuda/cublasLt_11_0.inc b/tensorflow/stream_executor/cuda/cublasLt_11_0.inc new file mode 100644 index 00000000000..819dfced4ff --- /dev/null +++ b/tensorflow/stream_executor/cuda/cublasLt_11_0.inc @@ -0,0 +1,415 @@ +// Auto-generated, do not edit. + +extern "C" { + +cublasStatus_t CUBLASWINAPI +cublasLtCreate(cublasLtHandle_t *lightHandle) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtHandle_t *); + static auto func_ptr = LoadSymbol("cublasLtCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(lightHandle); +} + +cublasStatus_t CUBLASWINAPI +cublasLtDestroy(cublasLtHandle_t lightHandle) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtHandle_t); + static auto func_ptr = LoadSymbol("cublasLtDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(lightHandle); +} + +size_t CUBLASWINAPI +cublasLtGetVersion(void) { + using FuncPtr = size_t (CUBLASWINAPI *)(); + static auto func_ptr = LoadSymbol("cublasLtGetVersion"); + if (!func_ptr) return 0; + return func_ptr(); +} + +size_t CUBLASWINAPI +cublasLtGetCudartVersion(void) { + using FuncPtr = size_t (CUBLASWINAPI *)(); + static auto func_ptr = LoadSymbol("cublasLtGetCudartVersion"); + if (!func_ptr) return 0; + return func_ptr(); +} + +cublasStatus_t CUBLASWINAPI +cublasLtGetProperty(libraryPropertyType type, int *value) { + using 
FuncPtr = cublasStatus_t (CUBLASWINAPI *)(libraryPropertyType, int *); + static auto func_ptr = LoadSymbol("cublasLtGetProperty"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(type, value); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmul(cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t computeDesc, + const void *alpha, /* host or device pointer */ + const void *A, + cublasLtMatrixLayout_t Adesc, + const void *B, + cublasLtMatrixLayout_t Bdesc, + const void *beta, /* host or device pointer */ + const void *C, + cublasLtMatrixLayout_t Cdesc, + void *D, + cublasLtMatrixLayout_t Ddesc, + const cublasLtMatmulAlgo_t *algo, + void *workspace, + size_t workspaceSizeInBytes, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtHandle_t, cublasLtMatmulDesc_t, const void *, const void *, cublasLtMatrixLayout_t, const void *, cublasLtMatrixLayout_t, const void *, const void *, cublasLtMatrixLayout_t, void *, cublasLtMatrixLayout_t, const cublasLtMatmulAlgo_t *, void *, size_t, cudaStream_t); + static auto func_ptr = LoadSymbol("cublasLtMatmul"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(lightHandle, computeDesc, alpha, A, Adesc, B, Bdesc, beta, C, Cdesc, D, Ddesc, algo, workspace, workspaceSizeInBytes, stream); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixTransform(cublasLtHandle_t lightHandle, + cublasLtMatrixTransformDesc_t transformDesc, + const void *alpha, /* host or device pointer */ + const void *A, + cublasLtMatrixLayout_t Adesc, + const void *beta, /* host or device pointer */ + const void *B, + cublasLtMatrixLayout_t Bdesc, + void *C, + cublasLtMatrixLayout_t Cdesc, + cudaStream_t stream) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtHandle_t, cublasLtMatrixTransformDesc_t, const void *, const void *, cublasLtMatrixLayout_t, const void *, const void *, cublasLtMatrixLayout_t, void *, cublasLtMatrixLayout_t, cudaStream_t); + static auto func_ptr = 
LoadSymbol("cublasLtMatrixTransform"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(lightHandle, transformDesc, alpha, A, Adesc, beta, B, Bdesc, C, Cdesc, stream); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixLayoutInit_internal( // + cublasLtMatrixLayout_t matLayout, + size_t size, + cudaDataType type, + uint64_t rows, + uint64_t cols, + int64_t ld) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatrixLayout_t, size_t, cudaDataType, uint64_t, uint64_t, int64_t); + static auto func_ptr = LoadSymbol("cublasLtMatrixLayoutInit_internal"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matLayout, size, type, rows, cols, ld); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixLayoutCreate( // + cublasLtMatrixLayout_t *matLayout, + cudaDataType type, + uint64_t rows, + uint64_t cols, + int64_t ld) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatrixLayout_t *, cudaDataType, uint64_t, uint64_t, int64_t); + static auto func_ptr = LoadSymbol("cublasLtMatrixLayoutCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matLayout, type, rows, cols, ld); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixLayoutDestroy(cublasLtMatrixLayout_t matLayout) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtMatrixLayout_t); + static auto func_ptr = LoadSymbol("cublasLtMatrixLayoutDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matLayout); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixLayoutSetAttribute( // + cublasLtMatrixLayout_t matLayout, + cublasLtMatrixLayoutAttribute_t attr, + const void *buf, + size_t sizeInBytes) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatrixLayout_t, cublasLtMatrixLayoutAttribute_t, const void *, size_t); + static auto func_ptr = LoadSymbol("cublasLtMatrixLayoutSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matLayout, attr, buf, sizeInBytes); +} + 
+cublasStatus_t CUBLASWINAPI +cublasLtMatrixLayoutGetAttribute( // + cublasLtMatrixLayout_t matLayout, + cublasLtMatrixLayoutAttribute_t attr, + void *buf, + size_t sizeInBytes, + size_t *sizeWritten) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatrixLayout_t, cublasLtMatrixLayoutAttribute_t, void *, size_t, size_t *); + static auto func_ptr = LoadSymbol("cublasLtMatrixLayoutGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matLayout, attr, buf, sizeInBytes, sizeWritten); +} + +cublasStatus_t CUBLASWINAPI cublasLtMatmulDescInit_internal( // + cublasLtMatmulDesc_t matmulDesc, + size_t size, + cublasComputeType_t computeType, + cudaDataType_t scaleType) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatmulDesc_t, size_t, cublasComputeType_t, cudaDataType_t); + static auto func_ptr = LoadSymbol("cublasLtMatmulDescInit_internal"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matmulDesc, size, computeType, scaleType); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulDescCreate(cublasLtMatmulDesc_t *matmulDesc, cublasComputeType_t computeType, cudaDataType_t scaleType) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtMatmulDesc_t *, cublasComputeType_t, cudaDataType_t); + static auto func_ptr = LoadSymbol("cublasLtMatmulDescCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matmulDesc, computeType, scaleType); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulDescDestroy(cublasLtMatmulDesc_t matmulDesc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtMatmulDesc_t); + static auto func_ptr = LoadSymbol("cublasLtMatmulDescDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matmulDesc); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulDescSetAttribute( // + cublasLtMatmulDesc_t matmulDesc, + cublasLtMatmulDescAttributes_t attr, + const void *buf, + size_t sizeInBytes) { + using FuncPtr = cublasStatus_t 
(CUBLASWINAPI *)(// + cublasLtMatmulDesc_t, cublasLtMatmulDescAttributes_t, const void *, size_t); + static auto func_ptr = LoadSymbol("cublasLtMatmulDescSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matmulDesc, attr, buf, sizeInBytes); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulDescGetAttribute( // + cublasLtMatmulDesc_t matmulDesc, + cublasLtMatmulDescAttributes_t attr, + void *buf, + size_t sizeInBytes, + size_t *sizeWritten) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatmulDesc_t, cublasLtMatmulDescAttributes_t, void *, size_t, size_t *); + static auto func_ptr = LoadSymbol("cublasLtMatmulDescGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(matmulDesc, attr, buf, sizeInBytes, sizeWritten); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixTransformDescInit_internal(cublasLtMatrixTransformDesc_t transformDesc, size_t size, cudaDataType scaleType) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtMatrixTransformDesc_t, size_t, cudaDataType); + static auto func_ptr = LoadSymbol("cublasLtMatrixTransformDescInit_internal"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(transformDesc, size, scaleType); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixTransformDescCreate(cublasLtMatrixTransformDesc_t *transformDesc, cudaDataType scaleType) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtMatrixTransformDesc_t *, cudaDataType); + static auto func_ptr = LoadSymbol("cublasLtMatrixTransformDescCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(transformDesc, scaleType); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixTransformDescDestroy(cublasLtMatrixTransformDesc_t transformDesc) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtMatrixTransformDesc_t); + static auto func_ptr = LoadSymbol("cublasLtMatrixTransformDescDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return 
func_ptr(transformDesc); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixTransformDescSetAttribute( // + cublasLtMatrixTransformDesc_t transformDesc, + cublasLtMatrixTransformDescAttributes_t attr, + const void *buf, + size_t sizeInBytes) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatrixTransformDesc_t, cublasLtMatrixTransformDescAttributes_t, const void *, size_t); + static auto func_ptr = LoadSymbol("cublasLtMatrixTransformDescSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(transformDesc, attr, buf, sizeInBytes); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatrixTransformDescGetAttribute( // + cublasLtMatrixTransformDesc_t transformDesc, + cublasLtMatrixTransformDescAttributes_t attr, + void *buf, + size_t sizeInBytes, + size_t *sizeWritten) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatrixTransformDesc_t, cublasLtMatrixTransformDescAttributes_t, void *, size_t, size_t *); + static auto func_ptr = LoadSymbol("cublasLtMatrixTransformDescGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(transformDesc, attr, buf, sizeInBytes, sizeWritten); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulPreferenceInit_internal(cublasLtMatmulPreference_t pref, size_t size) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtMatmulPreference_t, size_t); + static auto func_ptr = LoadSymbol("cublasLtMatmulPreferenceInit_internal"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pref, size); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulPreferenceCreate(cublasLtMatmulPreference_t *pref) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtMatmulPreference_t *); + static auto func_ptr = LoadSymbol("cublasLtMatmulPreferenceCreate"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pref); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulPreferenceDestroy(cublasLtMatmulPreference_t pref) { + using FuncPtr = cublasStatus_t 
(CUBLASWINAPI *)(cublasLtMatmulPreference_t); + static auto func_ptr = LoadSymbol("cublasLtMatmulPreferenceDestroy"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pref); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulPreferenceSetAttribute( // + cublasLtMatmulPreference_t pref, + cublasLtMatmulPreferenceAttributes_t attr, + const void *buf, + size_t sizeInBytes) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatmulPreference_t, cublasLtMatmulPreferenceAttributes_t, const void *, size_t); + static auto func_ptr = LoadSymbol("cublasLtMatmulPreferenceSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pref, attr, buf, sizeInBytes); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulPreferenceGetAttribute( // + cublasLtMatmulPreference_t pref, + cublasLtMatmulPreferenceAttributes_t attr, + void *buf, + size_t sizeInBytes, + size_t *sizeWritten) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtMatmulPreference_t, cublasLtMatmulPreferenceAttributes_t, void *, size_t, size_t *); + static auto func_ptr = LoadSymbol("cublasLtMatmulPreferenceGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(pref, attr, buf, sizeInBytes, sizeWritten); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulAlgoGetHeuristic( + cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t operationDesc, + cublasLtMatrixLayout_t Adesc, + cublasLtMatrixLayout_t Bdesc, + cublasLtMatrixLayout_t Cdesc, + cublasLtMatrixLayout_t Ddesc, + cublasLtMatmulPreference_t preference, + int requestedAlgoCount, + cublasLtMatmulHeuristicResult_t heuristicResultsArray[], + int *returnAlgoCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtHandle_t, cublasLtMatmulDesc_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatmulPreference_t, int, cublasLtMatmulHeuristicResult_t [], int *); + static auto func_ptr = 
LoadSymbol("cublasLtMatmulAlgoGetHeuristic"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(lightHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, preference, requestedAlgoCount, heuristicResultsArray, returnAlgoCount); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulAlgoGetIds( + cublasLtHandle_t lightHandle, + cublasComputeType_t computeType, + cudaDataType_t scaleType, + cudaDataType_t Atype, + cudaDataType_t Btype, + cudaDataType_t Ctype, + cudaDataType_t Dtype, + int requestedAlgoCount, + int algoIdsArray[], + int *returnAlgoCount) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtHandle_t, cublasComputeType_t, cudaDataType_t, cudaDataType_t, cudaDataType_t, cudaDataType_t, cudaDataType_t, int, int [], int *); + static auto func_ptr = LoadSymbol("cublasLtMatmulAlgoGetIds"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(lightHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, requestedAlgoCount, algoIdsArray, returnAlgoCount); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulAlgoInit ( cublasLtHandle_t lightHandle, + cublasComputeType_t computeType, + cudaDataType_t scaleType, + cudaDataType_t Atype, + cudaDataType_t Btype, + cudaDataType_t Ctype, + cudaDataType_t Dtype, + int algoId, + cublasLtMatmulAlgo_t *algo) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtHandle_t, cublasComputeType_t, cudaDataType_t, cudaDataType_t, cudaDataType_t, cudaDataType_t, cudaDataType_t, int, cublasLtMatmulAlgo_t *); + static auto func_ptr = LoadSymbol("cublasLtMatmulAlgoInit"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(lightHandle, computeType, scaleType, Atype, Btype, Ctype, Dtype, algoId, algo); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulAlgoCheck( // + cublasLtHandle_t lightHandle, + cublasLtMatmulDesc_t operationDesc, + cublasLtMatrixLayout_t Adesc, + cublasLtMatrixLayout_t Bdesc, + cublasLtMatrixLayout_t Cdesc, + cublasLtMatrixLayout_t Ddesc, + const 
cublasLtMatmulAlgo_t *algo, ///< may point to result->algo + cublasLtMatmulHeuristicResult_t *result) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(// + cublasLtHandle_t, cublasLtMatmulDesc_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, cublasLtMatrixLayout_t, const cublasLtMatmulAlgo_t *, ///< may point to result->algo + cublasLtMatmulHeuristicResult_t *); + static auto func_ptr = LoadSymbol("cublasLtMatmulAlgoCheck"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(lightHandle, operationDesc, Adesc, Bdesc, Cdesc, Ddesc, algo, result); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulAlgoCapGetAttribute( + const cublasLtMatmulAlgo_t *algo, + cublasLtMatmulAlgoCapAttributes_t attr, + void *buf, + size_t sizeInBytes, + size_t *sizeWritten) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(const cublasLtMatmulAlgo_t *, cublasLtMatmulAlgoCapAttributes_t, void *, size_t, size_t *); + static auto func_ptr = LoadSymbol("cublasLtMatmulAlgoCapGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algo, attr, buf, sizeInBytes, sizeWritten); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulAlgoConfigSetAttribute( + cublasLtMatmulAlgo_t *algo, + cublasLtMatmulAlgoConfigAttributes_t attr, + const void *buf, + size_t sizeInBytes) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(cublasLtMatmulAlgo_t *, cublasLtMatmulAlgoConfigAttributes_t, const void *, size_t); + static auto func_ptr = LoadSymbol("cublasLtMatmulAlgoConfigSetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algo, attr, buf, sizeInBytes); +} + +cublasStatus_t CUBLASWINAPI +cublasLtMatmulAlgoConfigGetAttribute( + const cublasLtMatmulAlgo_t *algo, + cublasLtMatmulAlgoConfigAttributes_t attr, + void *buf, + size_t sizeInBytes, + size_t *sizeWritten) { + using FuncPtr = cublasStatus_t (CUBLASWINAPI *)(const cublasLtMatmulAlgo_t *, cublasLtMatmulAlgoConfigAttributes_t, void *, size_t, size_t *); + 
static auto func_ptr = LoadSymbol("cublasLtMatmulAlgoConfigGetAttribute"); + if (!func_ptr) return GetSymbolNotFoundError(); + return func_ptr(algo, attr, buf, sizeInBytes, sizeWritten); +} + +} // extern "C" diff --git a/tensorflow/stream_executor/cuda/cublasLt_stub.cc b/tensorflow/stream_executor/cuda/cublasLt_stub.cc new file mode 100644 index 00000000000..aae8a94285b --- /dev/null +++ b/tensorflow/stream_executor/cuda/cublasLt_stub.cc @@ -0,0 +1,59 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "third_party/gpus/cuda/include/cublasLt.h" +#include "third_party/gpus/cuda/include/cuda.h" +#include "tensorflow/stream_executor/lib/env.h" +#include "tensorflow/stream_executor/platform/dso_loader.h" + +// Implements the cuBLASLt API by forwarding to cuBLASLt loaded from the DSO. + +namespace { +// Returns DSO handle or null if loading the DSO fails. 
+void* GetDsoHandle() { +#ifdef PLATFORM_GOOGLE + return nullptr; +#else + static auto handle = []() -> void* { + auto handle_or = + stream_executor::internal::DsoLoader::GetCublasLtDsoHandle(); + if (!handle_or.ok()) return nullptr; + return handle_or.ValueOrDie(); + }(); + return handle; +#endif +} + +template +T LoadSymbol(const char* symbol_name) { + void* symbol = nullptr; + if (auto handle = GetDsoHandle()) { + stream_executor::port::Env::Default() + ->GetSymbolFromLibrary(handle, symbol_name, &symbol) + .IgnoreError(); + } + return reinterpret_cast(symbol); +} + +void LogFatalSymbolNotFound(const char* symbol_name) { + LOG(FATAL) << symbol_name << " symbol not found."; +} + +cublasStatus_t GetSymbolNotFoundError() { return CUBLAS_STATUS_INTERNAL_ERROR; } +} // namespace + +// We only use cublasLt from CUDA 11.0 onward. +#if CUDA_VERSION >= 11000 +#include "tensorflow/stream_executor/cuda/cublasLt_11_0.inc" +#endif diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 4b659bb81e1..565a1c02fb4 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "third_party/gpus/cuda/include/cublas_v2.h" +#include "third_party/gpus/cuda/include/cublasLt.h" #include "third_party/gpus/cuda/include/cuda.h" #define SE_CUDA_DATA_HALF CUDA_R_16F @@ -226,17 +227,38 @@ bool CUDABlas::Init() { return false; } +#if CUDA_VERSION >= 11000 + ret = cublasLtCreate(&blasLt_); + if (ret != CUBLAS_STATUS_SUCCESS) { + LOG(ERROR) << "failed to create cublasLt handle: " << ToString(ret); + return false; + } +#endif // CUDA_VERSION >= 11000 + return true; } -CUDABlas::CUDABlas(gpu::GpuExecutor *parent) - : parent_(CHECK_NOTNULL(parent)), blas_(nullptr) {} +CUDABlas::CUDABlas(gpu::GpuExecutor* parent) + : parent_(CHECK_NOTNULL(parent)), + blas_(nullptr) +#if CUDA_VERSION >= 11000 + , + blasLt_(nullptr) +#endif +{ +} CUDABlas::~CUDABlas() { if (blas_ != nullptr) { gpu::ScopedActivateExecutorContext sac{parent_}; cublasDestroy(blas_); } +#if CUDA_VERSION >= 11000 + if (blasLt_ != nullptr) { + gpu::ScopedActivateExecutorContext sac{parent_}; + cublasLtDestroy(blasLt_); + } +#endif } bool CUDABlas::SetStream(Stream *stream) { @@ -253,6 +275,13 @@ bool CUDABlas::SetStream(Stream *stream) { return true; } +cudaStream_t CUDABlas::CUDAStream(Stream* stream) { + CHECK(stream != nullptr); + CHECK(AsGpuStreamValue(stream) != nullptr); + gpu::ScopedActivateExecutorContext sac{parent_}; + return AsGpuStreamValue(stream); +} + namespace { // Helper functions transforming blas arguments into cuBLAS arguments. @@ -381,6 +410,82 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) { return CUDA_C_32F; case blas::ComputationType::kComplexF64: return CUDA_C_64F; + case blas::ComputationType::kF32FastTF32: // fall-through + case blas::ComputationType::kF32FastBF16: + // These cases are currently only supported in the blasLt routines, which + // use CUBLASComputationType() instead. 
+ LOG(FATAL) << "Invalid value of blas::ComputationType."; + } +} + +#if CUDA_VERSION >= 11000 +cublasComputeType_t CUBLASComputationType(blas::ComputationType ty) { + switch (ty) { + case blas::ComputationType::kF16: + return CUBLAS_COMPUTE_16F; + case blas::ComputationType::kF32: // fall-through + case blas::ComputationType::kComplexF32: + return CUBLAS_COMPUTE_32F; + case blas::ComputationType::kF64: // fall-through + case blas::ComputationType::kComplexF64: + return CUBLAS_COMPUTE_64F; + case blas::ComputationType::kI32: + return CUBLAS_COMPUTE_32I; + case blas::ComputationType::kF32FastTF32: + return CUBLAS_COMPUTE_32F_FAST_TF32; + case blas::ComputationType::kF32FastBF16: + return CUBLAS_COMPUTE_32F_FAST_16BF; + } +} +#endif // CUDA_VERSION >= 11000 + +blas::DataType GetScaleType(blas::DataType data_type, + blas::ComputationType compute_type) { + bool is_complex = data_type == blas::DataType::kComplexF32 || + data_type == blas::DataType::kComplexF64; + switch (compute_type) { + case blas::ComputationType::kF16: + return blas::DataType::kF16; + case blas::ComputationType::kF32: // fall-through + case blas::ComputationType::kComplexF32: // fall-through + case blas::ComputationType::kF32FastTF32: // fall-through + case blas::ComputationType::kF32FastBF16: + return is_complex ? blas::DataType::kComplexF32 : blas::DataType::kF32; + case blas::ComputationType::kF64: // fall-through + case blas::ComputationType::kComplexF64: + return is_complex ? 
blas::DataType::kComplexF64 : blas::DataType::kF64; + case blas::ComputationType::kI32: + return blas::DataType::kI32; + } +} + +#if CUDA_VERSION >= 11000 +cublasLtPointerMode_t CUBLASPointerMode(blas::PointerMode pointer_mode) { + switch (pointer_mode) { + case blas::PointerMode::kHost: + return CUBLASLT_POINTER_MODE_HOST; + case blas::PointerMode::kDevice: + return CUBLASLT_POINTER_MODE_DEVICE; + } +} +#endif // CUDA_VERSION >= 11000 + +cudaDataType_t GetCUDADataType(blas::DataType ty) { + switch (ty) { + case blas::DataType::kF16: + return CUDA_R_16F; + case blas::DataType::kF32: + return CUDA_R_32F; + case blas::DataType::kF64: + return CUDA_R_64F; + case blas::DataType::kI8: + return CUDA_R_8I; + case blas::DataType::kI32: + return CUDA_R_32I; + case blas::DataType::kComplexF32: + return CUDA_C_32F; + case blas::DataType::kComplexF64: + return CUDA_C_64F; } } } // namespace @@ -2912,6 +3017,577 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, GpuComplex(GpuMemoryMutable(b)), ldb); } +// We only use cublasLt from CUDA 11.0 onward. 
+#if CUDA_VERSION >= 11000 + +namespace { + +template +inline bool SetCublasLtAttr(cublasLtMatrixLayout_t handle, + cublasLtMatrixLayoutAttribute_t attr, + const T& value) { + cublasStatus_t status = + cublasLtMatrixLayoutSetAttribute(handle, attr, &value, sizeof(T)); + if (status != CUBLAS_STATUS_SUCCESS) { + VLOG(2) << "cublasLtMatrixLayoutSetAttribute(attr=" << attr + << ", value=" << value << ") failed: " << ToString(status); + return false; + } + return true; +} + +template +inline bool SetCublasLtAttr(cublasLtMatmulAlgo_t* handle, + cublasLtMatmulAlgoConfigAttributes_t attr, + const T& value) { + cublasStatus_t status = + cublasLtMatmulAlgoConfigSetAttribute(handle, attr, &value, sizeof(T)); + if (status != CUBLAS_STATUS_SUCCESS) { + VLOG(2) << "cublasLtMatmulAlgoConfigSetAttribute(attr=" << attr + << ", value=" << value << ") failed: " << ToString(status); + return false; + } + return true; +} + +template +inline bool SetCublasLtAttr(cublasLtMatmulPreference_t handle, + cublasLtMatmulPreferenceAttributes_t attr, + const T& value) { + cublasStatus_t status = + cublasLtMatmulPreferenceSetAttribute(handle, attr, &value, sizeof(value)); + if (status != CUBLAS_STATUS_SUCCESS) { + VLOG(2) << "cublasLtMatmulPreferenceSetAttribute(attr=" << attr + << ", value=" << value << ") failed: " << ToString(status); + return false; + } + return true; +} + +template +inline bool GetCublasLtAttr(const cublasLtMatmulAlgo_t* handle, + cublasLtMatmulAlgoConfigAttributes_t attr, + T* value) { + auto mutable_handle = const_cast(handle); + size_t bytes_written = 0; + return cublasLtMatmulAlgoConfigGetAttribute(mutable_handle, attr, value, + sizeof(T), &bytes_written) == + CUBLAS_STATUS_SUCCESS && + bytes_written == sizeof(T); +} + +template +inline bool SetCublasLtAttr(cublasLtMatmulDesc_t handle, + cublasLtMatmulDescAttributes_t attr, + const T& value) { + cublasStatus_t status = + cublasLtMatmulDescSetAttribute(handle, attr, &value, sizeof(value)); + if (status != 
CUBLAS_STATUS_SUCCESS) { + VLOG(2) << "cublasLtMatmulDescSetAttribute(attr=" << attr + << ", value=" << value << ") failed: " << ToString(status); + return false; + } + return true; +} + +struct MatmulDescDestroyer { + void operator()(cublasLtMatmulDesc_t matmul_desc) const { + cublasLtMatmulDescDestroy(matmul_desc); + } +}; +struct LayoutDestroyer { + void operator()(cublasLtMatrixLayout_t layout) const { + cublasLtMatrixLayoutDestroy(layout); + } +}; +struct MatmulPreferenceDestroyer { + void operator()(cublasLtMatmulPreference_t matmul_pref) const { + cublasLtMatmulPreferenceDestroy(matmul_pref); + } +}; +using UniqueOpDesc = + std::unique_ptr::type, + MatmulDescDestroyer>; +using UniqueLayoutDesc = + std::unique_ptr::type, + LayoutDestroyer>; +using UniqueMatmulPreference = + std::unique_ptr::type, + MatmulPreferenceDestroyer>; + +UniqueOpDesc CreateCublasLtOperationDesc( + blas::ComputationType computation_type, blas::DataType scale_type, + blas::PointerMode pointer_mode, blas::Transpose transa, + blas::Transpose transb) { + cublasOperation_t cuda_transa = CUDABlasTranspose(transa); + cublasOperation_t cuda_transb = CUDABlasTranspose(transb); + cublasLtMatmulDesc_t desc; + cublasComputeType_t cublas_compute_type = + CUBLASComputationType(computation_type); + cudaDataType_t cuda_scale_type = GetCUDADataType(scale_type); + cublasStatus_t status = + cublasLtMatmulDescCreate(&desc, cublas_compute_type, cuda_scale_type); + if (status != CUBLAS_STATUS_SUCCESS) { + VLOG(2) << "cublasLtMatmulDescCreate(computation_type=" << computation_type + << ") failed: " << ToString(status); + return nullptr; + } + UniqueOpDesc unique_desc(desc); + if (!SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, + CUBLASPointerMode(pointer_mode)) || + !SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_TRANSA, cuda_transa) || + !SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_TRANSB, cuda_transb)) { + return nullptr; + } + return unique_desc; +} + +UniqueLayoutDesc 
CreateCublasLtLayoutDesc(blas::DataType data_type, uint64 rows, + uint64 cols, int64 ld, int64 stride, + int batch_count) { + cublasLtMatrixLayout_t desc; + cublasStatus_t status = cublasLtMatrixLayoutCreate( + &desc, GetCUDADataType(data_type), rows, cols, ld); + if (status != CUBLAS_STATUS_SUCCESS) { + VLOG(2) << "cublasLtMatrixLayoutCreate failed: " << ToString(status); + return nullptr; + } + UniqueLayoutDesc unique_desc(desc); + if (!SetCublasLtAttr(desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, batch_count) || + !SetCublasLtAttr(desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, + stride)) { + return nullptr; + } + return unique_desc; +} + +UniqueMatmulPreference CreateCublasLtMatmulPreference( + size_t max_workspace_bytes) { + cublasLtMatmulPreference_t preference; + cublasStatus_t status = cublasLtMatmulPreferenceCreate(&preference); + if (status != CUBLAS_STATUS_SUCCESS) { + VLOG(2) << "cublasLtMatmulPreferenceCreate failed: " << ToString(status); + return nullptr; + } + UniqueMatmulPreference unique_preference(preference); + if (!SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + max_workspace_bytes)) { + return nullptr; + } + return unique_preference; +} + +// Helper function to allocate workspace. 
+port::Status AllocateWorkspace(void** workspace, + ScratchAllocator* scratch_allocator, + size_t num_bytes) { + SE_ASSIGN_OR_RETURN(DeviceMemory workspace_bytes, + scratch_allocator->AllocateBytes(num_bytes)); + *workspace = (void*)GpuMemoryMutable(&workspace_bytes); + return port::Status::OK(); +} + +template +blas::ComputationType ToComputationType(); +template <> +blas::ComputationType ToComputationType() { + return blas::ComputationType::kF16; +} +template <> +blas::ComputationType ToComputationType() { + return blas::ComputationType::kF32; +} +template <> +blas::ComputationType ToComputationType() { + return blas::ComputationType::kF64; +} +template <> +blas::ComputationType ToComputationType>() { + return blas::ComputationType::kComplexF32; +}template <> +blas::ComputationType ToComputationType>() { + return blas::ComputationType::kComplexF64; +} + +class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { + public: + CUDABlasLtMatmulPlan(blas::DataType ab_type, blas::DataType cd_type, + blas::ComputationType compute_type, + blas::PointerMode pointer_mode, blas::Transpose transa, + blas::Transpose transb, uint64 m, uint64 n, uint64 k, + int batch_count, int64 lda, int64 stride_a, int64 ldb, + int64 stride_b, int64 ldc, int64 stride_c, int64 ldd, + int64 stride_d); + + cublasLtMatmulDesc_t op_desc() const { return op_desc_.get(); } + cublasLtMatrixLayout_t a_desc() const { return a_desc_.get(); } + cublasLtMatrixLayout_t b_desc() const { return b_desc_.get(); } + cublasLtMatrixLayout_t c_desc() const { return c_desc_.get(); } + cublasLtMatrixLayout_t d_desc() const { return d_desc_.get(); } + bool ok() { return op_desc_ && a_desc_ && b_desc_ && c_desc_ && d_desc_; } + + blas::DataType ab_type() const { return ab_type_; } + blas::DataType cd_type() const { return cd_type_; } + blas::DataType scale_type() const { return scale_type_; } + blas::PointerMode pointer_mode() const { return pointer_mode_; } + + private: + UniqueOpDesc op_desc_; + 
UniqueLayoutDesc a_desc_; + UniqueLayoutDesc b_desc_; + UniqueLayoutDesc c_desc_; + UniqueLayoutDesc d_desc_; + blas::DataType ab_type_; + blas::DataType cd_type_; + blas::DataType scale_type_; + blas::PointerMode pointer_mode_; +}; + +CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan( + blas::DataType ab_type, blas::DataType cd_type, + blas::ComputationType computation_type, blas::PointerMode pointer_mode, + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, int batch_count, int64 lda, int64 stride_a, int64 ldb, + int64 stride_b, int64 ldc, int64 stride_c, int64 ldd, int64 stride_d) + : op_desc_(CreateCublasLtOperationDesc( + computation_type, GetScaleType(cd_type, computation_type), + pointer_mode, transa, transb)), + a_desc_(nullptr), + b_desc_(nullptr), + c_desc_( + CreateCublasLtLayoutDesc(cd_type, m, n, ldc, stride_c, batch_count)), + d_desc_( + CreateCublasLtLayoutDesc(cd_type, m, n, ldd, stride_d, batch_count)), + ab_type_(ab_type), + cd_type_(cd_type), + scale_type_(GetScaleType(cd_type, computation_type)), + pointer_mode_(pointer_mode) { + uint64 rows_a = transa == blas::Transpose::kNoTranspose ? m : k; + uint64 cols_a = transa == blas::Transpose::kNoTranspose ? k : m; + uint64 rows_b = transb == blas::Transpose::kNoTranspose ? k : n; + uint64 cols_b = transb == blas::Transpose::kNoTranspose ? 
n : k; + a_desc_ = CreateCublasLtLayoutDesc(ab_type, rows_a, cols_a, lda, stride_a, + batch_count); + b_desc_ = CreateCublasLtLayoutDesc(ab_type, rows_b, cols_b, ldb, stride_b, + batch_count); +} + +class CUDABlasLtMatmulAlgorithm final : public blas::IBlasLtMatmulAlgorithm { + public: + CUDABlasLtMatmulAlgorithm(blas::AlgorithmType index, + cublasLtMatmulAlgo_t algo, size_t workspace_size) + : index_(index), algo_(algo), workspace_size_(workspace_size) {} + + blas::AlgorithmType index() const override { return index_; } + + size_t workspace_size() const override { return workspace_size_; } + + const cublasLtMatmulAlgo_t* algo() const { return &algo_; } + + int algo_id() const { + int id = 0; // Stays 0 if the attribute query below fails. + GetCublasLtAttr(&algo_, CUBLASLT_ALGO_CONFIG_ID, &id); + return id; + } + + private: + blas::AlgorithmType index_; + cublasLtMatmulAlgo_t algo_; + size_t workspace_size_; +}; + +} // namespace + +#endif // CUDA_VERSION >= 11000 + +std::unique_ptr +CUDABlas::CreateBlasLtMatmulPlanStridedBatched( + blas::DataType ab_type, blas::DataType cd_type, + blas::ComputationType computation_type, blas::PointerMode pointer_mode, + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, int batch_count, int64 lda, int64 stride_a, int64 ldb, + int64 stride_b, int64 ldc, int64 stride_c) { +#if CUDA_VERSION >= 11000 + auto result = std::make_unique( + ab_type, cd_type, computation_type, pointer_mode, transa, transb, m, n, k, + batch_count, lda, stride_a, ldb, stride_b, ldc, stride_c, ldc, stride_c); + if (!result->ok()) { + result.reset(); + } + return result; +#else + return nullptr; +#endif +} + +bool CUDABlas::GetBlasLtMatmulAlgorithms( + const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, + int max_algorithm_count, + std::vector>* + out_algorithms) { +#if CUDA_VERSION >= 11000 + UniqueMatmulPreference preference = + CreateCublasLtMatmulPreference(max_workspace_size); + if (!preference) return false; + + std::vector results(max_algorithm_count); + { +
absl::MutexLock lock(&mu_); + + CHECK(blasLt_ != nullptr); + + gpu::ScopedActivateExecutorContext sac{parent_}; + + int found_algorithm_count = 0; + const auto& cuda_plan = *static_cast(plan); + cublasStatus_t status = cublasLtMatmulAlgoGetHeuristic( + blasLt_, cuda_plan.op_desc(), cuda_plan.a_desc(), cuda_plan.b_desc(), + cuda_plan.c_desc(), cuda_plan.d_desc(), preference.get(), + max_algorithm_count, results.data(), &found_algorithm_count); + if (status != CUBLAS_STATUS_SUCCESS) { + VLOG(2) << "cublasLtMatmulAlgoGetHeuristic failed: " << ToString(status); + return false; + } + results.resize(found_algorithm_count); + } + + for (size_t i = 0; i < results.size(); ++i) { + const auto& result = results[i]; + if (result.state != CUBLAS_STATUS_SUCCESS) continue; // Skip failed algos + out_algorithms->emplace_back(std::make_unique( + i, result.algo, result.workspaceSize)); + } + return true; +#else // if CUDA_VERSION < 11000 + return false; +#endif +} + +#if CUDA_VERSION >= 11000 +template +bool CUDABlas::DoBlasLtMatmulInternalImpl( + Stream* stream, bool err_on_failure, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const ABType* a, + const ABType* b, const HostOrDeviceScalar& beta, const CDType* c, + CDType* d, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm) { + const auto& cuda_plan = *static_cast(plan); + const auto& cuda_algo = + *static_cast(algorithm); + + if (cuda_plan.ab_type() != blas::ToDataType::value) { + VLOG(2) << "DoBlasLtMatmul returning false because plan has wrong ab_type: " + "expected " + << blas::ToDataType::value << ", got " + << cuda_plan.ab_type(); + return false; + } + if (cuda_plan.cd_type() != blas::ToDataType::value) { + VLOG(2) << "DoBlasLtMatmul returning false because plan has wrong cd_type: " + "expected " + << blas::ToDataType::value << ", got " + << cuda_plan.cd_type(); + return false; + } + if (cuda_plan.scale_type() != blas::ToDataType::value) { + VLOG(2) << 
"DoBlasLtMatmul returning false because plan has wrong " + "scale_type: expected " + << blas::ToDataType::value << ", got " + << cuda_plan.scale_type(); + return false; + } + if (alpha.is_pointer() != beta.is_pointer()) { + VLOG(2) << "DoBlasLtMatmul returning false because one of `alpha` " + "and `beta` is a pointer, but the other is not."; + return false; + } + bool is_pointer_mode_host = !alpha.is_pointer(); + if ((cuda_plan.pointer_mode() == blas::PointerMode::kHost) != + is_pointer_mode_host) { + VLOG(2) << "DoBlasLtMatmul returning false because plan has wrong " + "pointer_mode for the given alpha/beta."; + return false; + } + const ScaleType* alpha_ptr = + alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(); + const ScaleType* beta_ptr = + beta.is_pointer() ? GpuMemory(beta.pointer()) : &beta.value(); + + void* workspace = nullptr; + if (cuda_algo.workspace_size()) { + port::Status allocation_status = AllocateWorkspace( + &workspace, scratch_allocator, cuda_algo.workspace_size()); + if (!allocation_status.ok()) { + if (err_on_failure || VLOG_IS_ON(3)) { + LOG(ERROR) + << "Failed to allocate workspace for cublasLtMatmul algo with id: " + << cuda_algo.algo_id() << " requiring " + << cuda_algo.workspace_size() << " bytes of workspace"; + } + return false; + } + } + + cudaStream_t cuda_stream = CUDAStream(stream); + + absl::MutexLock lock(&mu_); + + CHECK(blasLt_ != nullptr); + + gpu::ScopedActivateExecutorContext sac{parent_}; + + cublasStatus_t ret = cublasLtMatmul( + blasLt_, cuda_plan.op_desc(), alpha_ptr, a, cuda_plan.a_desc(), b, + cuda_plan.b_desc(), beta_ptr, c, cuda_plan.c_desc(), d, + cuda_plan.d_desc(), cuda_algo.algo(), workspace, + cuda_algo.workspace_size(), cuda_stream); + if (ret != CUBLAS_STATUS_SUCCESS) { + if (err_on_failure || VLOG_IS_ON(3)) { + LOG(ERROR) << "failed to run cublasLtMatmul routine: " << ToString(ret); + } + return false; + } + return true; +} +#endif // CUDA_VERSION >= 11000 + +template +bool
CUDABlas::DoBlasLtMatmulInternal( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + const DeviceMemory& c, DeviceMemory* d, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { +#if CUDA_VERSION >= 11000 + std::unique_ptr timer; + if (output_profile_result) { + timer.reset(new GpuTimer(parent_)); + if (!timer->Init() || !timer->Start(AsGpuStream(stream))) { + return false; + } + } + + bool err_on_failure = timer != nullptr; + bool result = DoBlasLtMatmulInternalImpl( + stream, err_on_failure, plan, alpha, GpuMemory(a), GpuMemory(b), beta, + GpuMemory(c), GpuMemoryMutable(d), scratch_allocator, algorithm); + + if (timer && result) { + // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error + // state. + if (!timer->Stop(AsGpuStream(stream))) { + return false; + } + output_profile_result->set_is_valid(true); + output_profile_result->set_algorithm(algorithm->index()); + output_profile_result->set_elapsed_time_in_ms( + timer->GetElapsedMilliseconds()); + } + return result; +#else // if CUDA_VERSION < 11000 + return false; +#endif +} + +bool CUDABlas::DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, + scratch_allocator, algorithm, + output_profile_result); +} + +bool CUDABlas::DoBlasLtMatmul(Stream* stream, + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + const DeviceMemory& a, + const DeviceMemory& b, + const HostOrDeviceScalar& beta, + DeviceMemory* c, + 
ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { +#if CUDA_VERSION >= 11000 + const auto& cuda_plan = *static_cast(plan); + if (cuda_plan.scale_type() == blas::DataType::kF32) { + // F32* computation types require F32 alpha/beta type, so we must cast them. + if (alpha.is_pointer() || beta.is_pointer()) { + // We cannot easily convert a pointer to f16 memory to a pointer to f32 + // memory from here, so we don't support this for now. + return false; + } + HostOrDeviceScalar float_alpha(static_cast(alpha.value())); + HostOrDeviceScalar float_beta(static_cast(beta.value())); + return DoBlasLtMatmulInternal(stream, plan, float_alpha, a, b, float_beta, + *c, c, scratch_allocator, algorithm, + output_profile_result); + } + return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, + scratch_allocator, algorithm, + output_profile_result); +#else // if CUDA_VERSION < 11000 + return false; +#endif +} + +bool CUDABlas::DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, + scratch_allocator, algorithm, + output_profile_result); +} + +bool CUDABlas::DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, + scratch_allocator, algorithm, + output_profile_result); +} + +bool 
CUDABlas::DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar>& alpha, + const DeviceMemory>& a, + const DeviceMemory>& b, + const HostOrDeviceScalar>& beta, + DeviceMemory>* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, + scratch_allocator, algorithm, + output_profile_result); +} + +bool CUDABlas::DoBlasLtMatmul( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar>& alpha, + const DeviceMemory>& a, + const DeviceMemory>& b, + const HostOrDeviceScalar>& beta, + DeviceMemory>* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, + scratch_allocator, algorithm, + output_profile_result); +} + port::Status CUDABlas::GetVersion(std::string *version) { absl::MutexLock lock(&mu_); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 9ff63102aaa..351a7778c01 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -22,6 +22,8 @@ limitations under the License. #include "absl/synchronization/mutex.h" #include "third_party/gpus/cuda/include/cublas_v2.h" +#include "third_party/gpus/cuda/include/cublasLt.h" +#include "third_party/gpus/cuda/include/cuda.h" #include "tensorflow/core/platform/thread_annotations.h" #include "tensorflow/stream_executor/blas.h" #include "tensorflow/stream_executor/host_or_device_scalar.h" @@ -71,6 +73,9 @@ class CUDABlas : public blas::BlasSupport { // invoked before calling into cuBLAS. bool SetStream(Stream *stream) TF_EXCLUSIVE_LOCKS_REQUIRED(mu_); + // Returns the underlying CUDA stream. 
+ cudaStream_t CUDAStream(Stream* stream); + // A helper function that calls the real cuBLAS function together with error // handling. // @@ -134,6 +139,26 @@ class CUDABlas : public blas::BlasSupport { const T &beta, DeviceMemory *y, int incy, blas::ProfileResult *output_profile_result); + // Helper function for implementing DoBlasLtMatmul. + template + bool DoBlasLtMatmulInternal( + Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + const DeviceMemory& c, DeviceMemory* d, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result); + + // Helper function for implementing DoBlasLtMatmulInternal. + template + bool DoBlasLtMatmulInternalImpl( + Stream* stream, bool err_on_failure, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const ABType* a, + const ABType* b, const HostOrDeviceScalar& beta, + const CDType* c, CDType* d, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm); + // Guards the cuBLAS handle for this device. absl::Mutex mu_; @@ -144,6 +169,11 @@ class CUDABlas : public blas::BlasSupport { // cuBLAS library handle on the device. cublasHandle_t blas_ TF_GUARDED_BY(mu_); +#if CUDA_VERSION >= 11000 + // cuBLASLt library handle on the device. 
+ cublasLtHandle_t blasLt_ TF_GUARDED_BY(mu_); +#endif + SE_DISALLOW_COPY_AND_ASSIGN(CUDABlas); }; diff --git a/tensorflow/stream_executor/platform/default/dlopen_checker.cc b/tensorflow/stream_executor/platform/default/dlopen_checker.cc index b55c9f53793..7b38dfcfec0 100644 --- a/tensorflow/stream_executor/platform/default/dlopen_checker.cc +++ b/tensorflow/stream_executor/platform/default/dlopen_checker.cc @@ -23,6 +23,7 @@ namespace DsoLoader { port::Status TryDlopenCUDALibraries() { auto cudart_status = GetCudaRuntimeDsoHandle(); auto cublas_status = GetCublasDsoHandle(); + auto cublaslt_status = GetCublasLtDsoHandle(); auto cufft_status = GetCufftDsoHandle(); auto curand_status = GetCurandDsoHandle(); auto cusolver_status = GetCusolverDsoHandle(); @@ -31,7 +32,7 @@ port::Status TryDlopenCUDALibraries() { if (!cudart_status.status().ok() || !cublas_status.status().ok() || !cufft_status.status().ok() || !curand_status.status().ok() || !cusolver_status.status().ok() || !cusparse_status.status().ok() || - !cudnn_status.status().ok()) { + !cudnn_status.status().ok() || !cublaslt_status.status().ok()) { return port::Status(port::error::INTERNAL, absl::StrCat("Cannot dlopen all CUDA libraries.")); } else { diff --git a/tensorflow/stream_executor/platform/default/dso_loader.cc b/tensorflow/stream_executor/platform/default/dso_loader.cc index 70b1ebe070a..66cf3f2b43b 100644 --- a/tensorflow/stream_executor/platform/default/dso_loader.cc +++ b/tensorflow/stream_executor/platform/default/dso_loader.cc @@ -84,6 +84,10 @@ port::StatusOr GetCublasDsoHandle() { return GetDsoHandle("cublas", GetCublasVersion()); } +port::StatusOr GetCublasLtDsoHandle() { + return GetDsoHandle("cublasLt", GetCublasVersion()); +} + port::StatusOr GetCufftDsoHandle() { return GetDsoHandle("cufft", GetCufftVersion()); } @@ -160,6 +164,11 @@ port::StatusOr GetCublasDsoHandle() { return *result; } +port::StatusOr GetCublasLtDsoHandle() { + static auto result = new
auto(DsoLoader::GetCublasLtDsoHandle()); + return *result; +} + port::StatusOr GetCurandDsoHandle() { static auto result = new auto(DsoLoader::GetCurandDsoHandle()); return *result; diff --git a/tensorflow/stream_executor/platform/default/dso_loader.h b/tensorflow/stream_executor/platform/default/dso_loader.h index 91138f713bd..7f087349fcf 100644 --- a/tensorflow/stream_executor/platform/default/dso_loader.h +++ b/tensorflow/stream_executor/platform/default/dso_loader.h @@ -37,6 +37,7 @@ namespace DsoLoader { port::StatusOr GetCudaDriverDsoHandle(); port::StatusOr GetCudaRuntimeDsoHandle(); port::StatusOr GetCublasDsoHandle(); +port::StatusOr GetCublasLtDsoHandle(); port::StatusOr GetCufftDsoHandle(); port::StatusOr GetCurandDsoHandle(); port::StatusOr GetCusolverDsoHandle(); @@ -72,6 +73,7 @@ namespace CachedDsoLoader { port::StatusOr GetCudaDriverDsoHandle(); port::StatusOr GetCudaRuntimeDsoHandle(); port::StatusOr GetCublasDsoHandle(); +port::StatusOr GetCublasLtDsoHandle(); port::StatusOr GetCufftDsoHandle(); port::StatusOr GetCurandDsoHandle(); port::StatusOr GetCusolverDsoHandle(); diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 62689e61be1..144af92185c 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -4801,6 +4801,143 @@ Stream &Stream::ThenBlasGemmStridedBatched( c, ldc, stride_c, batch_count); } +Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + const DeviceMemory& a, + const DeviceMemory& b, + const HostOrDeviceScalar& beta, + DeviceMemory* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), + PARAM(c), PARAM(algorithm)); + + ThenBlasWithProfileImpl< + const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, + const DeviceMemory&, const 
DeviceMemory&, + const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*> + impl; + return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, + c, scratch_allocator, algorithm, output_profile_result); +} + +Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + const DeviceMemory& a, + const DeviceMemory& b, + const HostOrDeviceScalar& beta, + DeviceMemory* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), + PARAM(c), PARAM(algorithm)); + + ThenBlasWithProfileImpl< + const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, + const DeviceMemory&, const DeviceMemory&, + const HostOrDeviceScalar&, DeviceMemory*, + ScratchAllocator*, const blas::IBlasLtMatmulAlgorithm*> + impl; + return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, + c, scratch_allocator, algorithm, output_profile_result); +} + +Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + const DeviceMemory& a, + const DeviceMemory& b, + const HostOrDeviceScalar& beta, + DeviceMemory* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), + PARAM(c), PARAM(algorithm)); + + ThenBlasWithProfileImpl< + const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, + const DeviceMemory&, const DeviceMemory&, + const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*> + impl; + return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, + c, scratch_allocator, algorithm, output_profile_result); +} + +Stream& Stream::ThenBlasLtMatmul(const 
blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + const DeviceMemory& a, + const DeviceMemory& b, + const HostOrDeviceScalar& beta, + DeviceMemory* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), + PARAM(c), PARAM(algorithm)); + + ThenBlasWithProfileImpl< + const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, + const DeviceMemory&, const DeviceMemory&, + const HostOrDeviceScalar&, DeviceMemory*, + ScratchAllocator*, const blas::IBlasLtMatmulAlgorithm*> + impl; + return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, + c, scratch_allocator, algorithm, output_profile_result); +} + +Stream& Stream::ThenBlasLtMatmul( + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar>& alpha, + const DeviceMemory>& a, + const DeviceMemory>& b, + const HostOrDeviceScalar>& beta, + DeviceMemory>* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), + PARAM(c), PARAM(algorithm)); + + ThenBlasWithProfileImpl>&, + const DeviceMemory>&, + const DeviceMemory>&, + const HostOrDeviceScalar>&, + DeviceMemory>*, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*> + impl; + return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, + c, scratch_allocator, algorithm, output_profile_result); +} + +Stream& Stream::ThenBlasLtMatmul( + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar>& alpha, + const DeviceMemory>& a, + const DeviceMemory>& b, + const HostOrDeviceScalar>& beta, + DeviceMemory>* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result) { + VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), 
PARAM(beta), + PARAM(c), PARAM(algorithm)); + + ThenBlasWithProfileImpl>&, + const DeviceMemory>&, + const DeviceMemory>&, + const HostOrDeviceScalar>&, + DeviceMemory>*, + ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*> + impl; + return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, + c, scratch_allocator, algorithm, output_profile_result); +} + Stream &Stream::ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes) { VLOG_CALL(PARAM(seed), PARAM(seed_bytes)); diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index bfe442641ad..15f5dfc936f 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -1665,6 +1665,56 @@ class Stream { const DeviceMemory> &a, int lda, DeviceMemory> *b, int ldb); + // See BlasSupport::DoBlatLtMatmul. + Stream& ThenBlasLtMatmul( + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr); + Stream& ThenBlasLtMatmul( + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + const DeviceMemory& a, const DeviceMemory& b, + const HostOrDeviceScalar& beta, DeviceMemory* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr); + Stream& ThenBlasLtMatmul( + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr); + Stream& ThenBlasLtMatmul( + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, const DeviceMemory& 
a, + const DeviceMemory& b, const HostOrDeviceScalar& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr); + Stream& ThenBlasLtMatmul( + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar>& alpha, + const DeviceMemory>& a, + const DeviceMemory>& b, + const HostOrDeviceScalar>& beta, + DeviceMemory>* c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr); + Stream& ThenBlasLtMatmul( + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar>& alpha, + const DeviceMemory>& a, + const DeviceMemory>& b, + const HostOrDeviceScalar>& beta, + DeviceMemory>* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + blas::ProfileResult* output_profile_result = nullptr); + // See FftSupport::DoFft. Stream &ThenFft(fft::Plan *plan, const DeviceMemory> &input, diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index db4e8f9b694..3fbbc3f2aac 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -336,6 +336,49 @@ bool StreamExecutor::GetBlasGemmAlgorithms( return blas_support->GetBlasGemmAlgorithms(out_algorithms); } +std::unique_ptr StreamExecutor::CreateBlasLtMatmulPlan( + blas::DataType ab_type, blas::DataType cd_type, + blas::ComputationType computation_type, blas::PointerMode pointer_mode, + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, int64 lda, int64 ldb, int64 ldc) { + blas::BlasSupport *blas_support = AsBlas(); + if (!blas_support) { + return nullptr; + } + return blas_support->CreateBlasLtMatmulPlan( + ab_type, cd_type, computation_type, pointer_mode, transa, transb, m, n, k, + lda, ldb, ldc); +} + +std::unique_ptr 
+StreamExecutor::CreateBlasLtMatmulPlanStridedBatched( + blas::DataType ab_type, blas::DataType cd_type, + blas::ComputationType computation_type, blas::PointerMode pointer_mode, + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, uint64 batch_count, int64 lda, int64 stride_a, int64 ldb, + int64 stride_b, int64 ldc, int64 stride_c) { + blas::BlasSupport *blas_support = AsBlas(); + if (!blas_support) { + return nullptr; + } + return blas_support->CreateBlasLtMatmulPlanStridedBatched( + ab_type, cd_type, computation_type, pointer_mode, transa, transb, m, n, k, + batch_count, lda, stride_a, ldb, stride_b, ldc, stride_c); +} + +bool StreamExecutor::GetBlasLtMatmulAlgorithms( + const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, + int max_algorithm_count, + std::vector>* + out_algorithms) { + blas::BlasSupport *blas_support = AsBlas(); + if (!blas_support) { + return false; + } + return blas_support->GetBlasLtMatmulAlgorithms( + plan, max_workspace_size, max_algorithm_count, out_algorithms); +} + port::StatusOr> StreamExecutor::createRnnDescriptor( int num_layers, int hidden_size, int input_size, int cell_size, diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index b9b118ca42c..90137417250 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -394,6 +394,35 @@ class StreamExecutor { // Get the list of supported algorithms for BLAS gemm. bool GetBlasGemmAlgorithms(std::vector *out_algorithms); + // Creates a backend-specific plan object for a blaslt matmul operation, which + // can then be passed to DoBlasLtMatmul(). When possible, plans should be + // created once and reused for multiple calls to DoBlasLtMatmul(). + // Returns a null pointer on failure. 
+ std::unique_ptr CreateBlasLtMatmulPlan( + blas::DataType ab_type, blas::DataType cd_type, + blas::ComputationType computation_type, blas::PointerMode pointer_mode, + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, int64 lda, int64 ldb, int64 ldc); + + // A more general version of CreateBlasLtMatmulPlan supporting + // batched operations. + std::unique_ptr CreateBlasLtMatmulPlanStridedBatched( + blas::DataType ab_type, blas::DataType cd_type, + blas::ComputationType computation_type, blas::PointerMode pointer_mode, + blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, + uint64 k, uint64 batch_count, int64 lda, int64 stride_a, int64 ldb, + int64 stride_b, int64 ldc, int64 stride_c); + + // Gets a list of supported algorithms for DoBlasLtMatmul. The algorithms are + // returned in the order of increasing estimated compute time according to an + // internal heuristic. The first returned algorithm can be used as the default + // algorithm if no autotuning is to be performed. + bool GetBlasLtMatmulAlgorithms( + const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, + int max_algorithm_count, + std::vector>* + out_algorithms); + // Create an RNN descriptor based on model shapes and configurations. // The caller retains the ownership of the descriptor. 
port::StatusOr> createRnnDescriptor( diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl index a4a21abc367..70eacf82883 100644 --- a/third_party/gpus/cuda/BUILD.tpl +++ b/third_party/gpus/cuda/BUILD.tpl @@ -127,6 +127,13 @@ cc_library( linkstatic = 1, ) +cc_library( + name = "cublasLt", + srcs = ["cuda/lib/%{cublasLt_lib}"], + data = ["cuda/lib/%{cublasLt_lib}"], + linkstatic = 1, +) + cc_library( name = "cusolver", srcs = ["cuda/lib/%{cusolver_lib}"], @@ -168,6 +175,7 @@ cc_library( name = "cuda", deps = [ ":cublas", + ":cublasLt", ":cuda_headers", ":cudart", ":cudnn", diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl index ea33963fe19..55bcd6e5ccc 100644 --- a/third_party/gpus/cuda_configure.bzl +++ b/third_party/gpus/cuda_configure.bzl @@ -551,6 +551,13 @@ def _find_libs(repository_ctx, check_cuda_libs_script, cuda_config): cuda_config.cublas_version, static = False, ), + "cublasLt": _check_cuda_lib_params( + "cublasLt", + cpu_value, + cuda_config.config["cublas_library_dir"], + cuda_config.cublas_version, + static = False, + ), "cusolver": _check_cuda_lib_params( "cusolver", cpu_value, @@ -771,6 +778,7 @@ def _create_dummy_repository(repository_ctx): "%{cudart_static_linkopt}": _cudart_static_linkopt(cpu_value), "%{cudart_lib}": lib_name("cudart", cpu_value), "%{cublas_lib}": lib_name("cublas", cpu_value), + "%{cublasLt_lib}": lib_name("cublasLt", cpu_value), "%{cusolver_lib}": lib_name("cusolver", cpu_value), "%{cudnn_lib}": lib_name("cudnn", cpu_value), "%{cufft_lib}": lib_name("cufft", cpu_value), @@ -802,6 +810,7 @@ filegroup(name="cudnn-include") "cuda/cuda/lib/%s" % lib_name("cudart_static", cpu_value), ) repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublas", cpu_value)) + repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cublasLt", cpu_value)) repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cusolver", cpu_value)) repository_ctx.file("cuda/cuda/lib/%s" % lib_name("cudnn", cpu_value)) 
repository_ctx.file("cuda/cuda/lib/%s" % lib_name("curand", cpu_value)) @@ -992,11 +1001,13 @@ def _create_local_cuda_repository(repository_ctx): cublas_include_path + "/cublas.h", cublas_include_path + "/cublas_v2.h", cublas_include_path + "/cublas_api.h", + cublas_include_path + "/cublasLt.h", ], outs = [ "cublas/include/cublas.h", "cublas/include/cublas_v2.h", "cublas/include/cublas_api.h", + "cublas/include/cublasLt.h", ], )) @@ -1137,6 +1148,7 @@ def _create_local_cuda_repository(repository_ctx): "%{cudart_static_linkopt}": _cudart_static_linkopt(cuda_config.cpu_value), "%{cudart_lib}": _basename(repository_ctx, cuda_libs["cudart"]), "%{cublas_lib}": _basename(repository_ctx, cuda_libs["cublas"]), + "%{cublasLt_lib}": _basename(repository_ctx, cuda_libs["cublasLt"]), "%{cusolver_lib}": _basename(repository_ctx, cuda_libs["cusolver"]), "%{cudnn_lib}": _basename(repository_ctx, cuda_libs["cudnn"]), "%{cufft_lib}": _basename(repository_ctx, cuda_libs["cufft"]), From 0d172940c102b37300c3ddb7d8dbd3835382a474 Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 6 Jul 2020 21:25:04 +1000 Subject: [PATCH 0029/1447] Use BlasLtMatmul APIs in batch_matmul_op_impl - Integrates BlasLtMatmul with autotuning into the implementation of the BatchMatMul and Einsum ops. - This integration is only used when the CUDA version is >= 11.0. 
--- tensorflow/core/kernels/BUILD | 4 + .../core/kernels/batch_matmul_op_impl.h | 584 ++++++++++++------ tensorflow/core/kernels/gpu_utils.cc | 57 ++ tensorflow/core/kernels/gpu_utils.h | 36 ++ .../core/kernels/linalg/einsum_op_impl.h | 7 +- tensorflow/core/util/matmul_autotune.cc | 18 + tensorflow/core/util/matmul_autotune.h | 1 + 7 files changed, 522 insertions(+), 185 deletions(-) diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD index 9917b8e5c95..1eec9056040 100644 --- a/tensorflow/core/kernels/BUILD +++ b/tensorflow/core/kernels/BUILD @@ -3334,6 +3334,9 @@ tf_kernel_library( prefix = "batch_matmul_op", deps = MATH_DEPS + [":eigen_contraction_kernel"] + if_mkl_ml([ "//third_party/mkl:intel_binary_blob", + ]) + if_cuda([ + "//tensorflow/core/kernels:gpu_utils", + "//tensorflow/core/platform:tensor_float_32_utils", ]), ) @@ -3392,6 +3395,7 @@ tf_kernel_library( prefix = "fft_ops", deps = MATH_DEPS + [ ] + if_cuda([ + "//tensorflow/core/kernels:gpu_utils", "//tensorflow/core/platform/default/build_config:cufft_plugin", ]), ) diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h index d6cc980633f..5ca85c00835 100644 --- a/tensorflow/core/kernels/batch_matmul_op_impl.h +++ b/tensorflow/core/kernels/batch_matmul_op_impl.h @@ -22,7 +22,6 @@ limitations under the License. #include -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -34,17 +33,24 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/tensor_float_32_utils.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/util/matmul_autotune.h" #include "tensorflow/core/util/matmul_bcast.h" #include "tensorflow/core/util/work_sharder.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL) #include "tensorflow/core/kernels/eigen_contraction_kernel.h" #endif #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#include "tensorflow/core/kernels/gpu_utils.h" #include "tensorflow/core/platform/stream_executor.h" #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM +#if GOOGLE_CUDA +#include "third_party/gpus/cuda/include/cuda.h" // For CUDA_VERSION +#endif namespace tensorflow { @@ -219,7 +225,8 @@ template struct LaunchBatchMatMul { static void Launch(OpKernelContext* context, const Tensor& in_x, const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, - bool trans_y, const MatMulBCast& bcast, Tensor* out) { + bool trans_y, const MatMulBCast& bcast, bool use_autotune, + Tensor* out) { typedef ParallelMatMulKernel::IsComplex> ParallelMatMulKernel; bool conjugate_result = false; @@ -275,45 +282,201 @@ se::DeviceMemory AsDeviceMemory(const T* gpu_memory) { return typed; } -class BlasScratchAllocator : public se::ScratchAllocator { +using BlasScratchAllocator = GpuScratchAllocator; + +int64 GetBlasWorkspaceLimit(const string& envvar_in_mb, + int64 default_value_in_bytes) { + return GetWorkspaceLimit(envvar_in_mb, default_value_in_bytes); +} + +// Encapsulate all of the shape, dtype etc. information that defines a unique +// batched matmul operation. 
+class BatchMatmulParameters { public: - using Stream = se::Stream; - using DeviceMemoryBytes = se::DeviceMemory; + BatchMatmulParameters(bool trans_a, bool trans_b, bool adj_a, bool adj_b, + uint64 m, uint64 n, uint64 k, uint64 batch_count, + bool broadcast_a, bool broadcast_b, DataType dtype_ab, + DataType dtype_cd, bool allow_tf32, int device_id) + : trans_a_(trans_a), + trans_b_(trans_b), + adj_a_(adj_a), + adj_b_(adj_b), + m_(m), + n_(n), + k_(k), + batch_count_(batch_count), + broadcast_a_(broadcast_a), + broadcast_b_(broadcast_b), + dtype_ab_(dtype_ab), + dtype_cd_(dtype_cd), + allow_tf32_(allow_tf32), + device_id_(device_id) { + hash_code_ = trans_a; + hash_code_ = Hash64Combine(hash_code_, trans_b); + hash_code_ = Hash64Combine(hash_code_, adj_a); + hash_code_ = Hash64Combine(hash_code_, adj_b); + hash_code_ = Hash64Combine(hash_code_, m); + hash_code_ = Hash64Combine(hash_code_, n); + hash_code_ = Hash64Combine(hash_code_, k); + hash_code_ = Hash64Combine(hash_code_, batch_count); + hash_code_ = Hash64Combine(hash_code_, broadcast_a); + hash_code_ = Hash64Combine(hash_code_, broadcast_b); + hash_code_ = Hash64Combine(hash_code_, dtype_ab); + hash_code_ = Hash64Combine(hash_code_, dtype_cd); + hash_code_ = Hash64Combine(hash_code_, allow_tf32); + hash_code_ = Hash64Combine(hash_code_, device_id); + } + bool operator==(const BatchMatmulParameters& other) const { + return this->get_data_as_tuple() == other.get_data_as_tuple(); + } - BlasScratchAllocator(OpKernelContext* context) : context_(context) {} + bool operator!=(const BatchMatmulParameters& other) const { + return !(*this == other); + } + uint64 hash() const { return hash_code_; } - int64 GetMemoryLimitInBytes() override { return -1; } - - se::port::StatusOr AllocateBytes( - int64 byte_size) override { - Tensor temporary_memory; - - Status allocation_status(context_->allocate_temp( - DT_UINT8, TensorShape({byte_size}), &temporary_memory)); - if (!allocation_status.ok()) { - return se::port::StatusOr( 
- DeviceMemoryBytes::MakeFromByteSize(nullptr, 0)); - } - // Hold the reference of the allocated tensors until the end of the - // allocator. - allocated_tensors_.push_back(temporary_memory); - return se::port::StatusOr( - DeviceMemoryBytes::MakeFromByteSize( - temporary_memory.flat().data(), - temporary_memory.flat().size())); + string ToString() const { + // clang-format off + return strings::StrCat( + trans_a_, ", ", trans_b_, ", ", adj_a_, ", ", adj_b_, ", ", + m_, ", ", n_, ", ", k_, ", ", batch_count_, ", ", + broadcast_a_, ", ", broadcast_b_, ", ", + dtype_ab_, ", ", dtype_cd_, ", ", allow_tf32_, ", ", device_id_); + // clang-format on } private: - OpKernelContext* context_; - std::vector allocated_tensors_; + typedef std::tuple + ParameterDataType; + + ParameterDataType get_data_as_tuple() const { + return std::make_tuple(trans_a_, trans_b_, adj_a_, adj_b_, m_, n_, k_, + batch_count_, broadcast_a_, broadcast_b_, dtype_ab_, + dtype_cd_, allow_tf32_, device_id_); + } + + bool trans_a_; + bool trans_b_; + bool adj_a_; + bool adj_b_; + uint64 m_; + uint64 n_; + uint64 k_; + uint64 batch_count_; + bool broadcast_a_; + bool broadcast_b_; + DataType dtype_ab_; + DataType dtype_cd_; + bool allow_tf32_; + int device_id_; + uint64 hash_code_; }; + +bool GetBlasComputationType(const DataType& dtype, bool allow_tf32, + se::blas::ComputationType* compute_type) { + using se::blas::ComputationType; + static bool use_f32_for_f16_computation = MatmulDoFP32ComputationFP16Input(); + ComputationType f32_type = + allow_tf32 ? ComputationType::kF32FastTF32 : ComputationType::kF32; + switch (dtype) { + case DT_HALF: + case DT_BFLOAT16: + *compute_type = + use_f32_for_f16_computation ? 
f32_type : ComputationType::kF16; + return true; + case DT_FLOAT: + *compute_type = f32_type; + return true; + case DT_DOUBLE: + *compute_type = ComputationType::kF64; + return true; + case DT_COMPLEX64: + *compute_type = f32_type; + return true; + case DT_COMPLEX128: + *compute_type = ComputationType::kComplexF64; + return true; + default: + // Unsupported compute_type, return false. + return false; + } +} + +// Thread-safe map from matmul parameters to their corresponding plan and +// algorithms. +template +class BlasLtMatmulPlanMap { + public: + struct PlanAndAlgorithms { + std::unique_ptr plan; + std::vector> algorithms; + }; + + const PlanAndAlgorithms* Find(const Parameters& params) { + mutex_lock lock(mu_); + auto iter = params_plan_map_.find(params); + if (iter == params_plan_map_.end()) { + return nullptr; + } + return &iter->second; + } + const PlanAndAlgorithms* Insert(const Parameters& params, + PlanAndAlgorithms value) { + mutex_lock lock(mu_); + return ¶ms_plan_map_.emplace(params, std::move(value)).first->second; + } + + private: + struct Hasher { + std::size_t operator()(const Parameters& parameter) const { + return parameter.hash(); + } + }; + + mutable mutex mu_; + std::unordered_map params_plan_map_ + GUARDED_BY(mu_); +}; + +template +struct BlasLtPlanMapSingleton { + typedef BlasLtMatmulPlanMap PlanMapType; + static PlanMapType* GetInstance() { + static PlanMapType* instance = new PlanMapType(); + return instance; + } +}; + +typedef BlasLtPlanMapSingleton + BatchMatmulPlanMapSingleton; + +// A dummy type to group matmul autotune results together. 
+struct BatchMatmulAutoTuneGroup { + static string name() { return "MatmulLt"; } +}; + +typedef AutoTuneSingleton + AutoTuneBatchMatmul; + +template +struct CoefficientType { + typedef Scalar type; +}; +template <> +struct CoefficientType { + typedef float type; +}; + } // namespace template struct LaunchBatchMatMul { static void Launch(OpKernelContext* context, const Tensor& in_x, const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, - bool trans_y, const MatMulBCast& bcast, Tensor* out) { + bool trans_y, const MatMulBCast& bcast, bool use_autotune, + Tensor* out) { se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, se::blas::Transpose::kTranspose, se::blas::Transpose::kConjugateTranspose}; @@ -347,6 +510,198 @@ struct LaunchBatchMatMul { uint64 b_stride; uint64 c_stride; + typedef typename CoefficientType::type Coefficient; + + static const int64 max_scratch_size = GetBlasWorkspaceLimit( + "TF_CUBLAS_WORKSPACE_LIMIT_IN_MB", 1LL << 32); // 4GB by default + + // The BlasLtMatmul routines are only supported from CUDA 11.0 onward. +#if GOOGLE_CUDA && CUDA_VERSION >= 11000 + bool is_full_broadcast = + std::min(bcast.x_batch_size(), bcast.y_batch_size()) == 1; + bool requires_mixed_broadcasting = + bcast.IsBroadcastingRequired() && !is_full_broadcast; + if (!requires_mixed_broadcasting) { + bool broadcast_a = bcast.x_batch_size() == 1; + bool broadcast_b = bcast.y_batch_size() == 1; + a_stride = broadcast_a ? 0 : m * k; + b_stride = broadcast_b ? 
0 : k * n; + c_stride = m * n; + a_device_memory.push_back(AsDeviceMemory(a_base_ptr)); + b_device_memory.push_back(AsDeviceMemory(b_base_ptr)); + c_device_memory.push_back(AsDeviceMemory(c_base_ptr)); + a_ptrs.push_back(&a_device_memory.back()); + b_ptrs.push_back(&b_device_memory.back()); + c_ptrs.push_back(&c_device_memory.back()); + + DataType dtype = DataTypeToEnum::value; + bool allow_tf32 = tensor_float_32_execution_enabled(); + int device_id = stream->parent()->device_ordinal(); + BatchMatmulParameters matmul_parameters( + trans_x, trans_y, adj_x, adj_y, m, n, k, batch_size, broadcast_a, + broadcast_b, dtype, dtype, allow_tf32, device_id); + + static const bool max_autotune_algorithm_count = + MatmulMaxAutotuneAlgorithmCount(); + int max_algorithm_count = use_autotune ? max_autotune_algorithm_count : 1; + + const auto* plan_and_algorithms = + BatchMatmulPlanMapSingleton::GetInstance()->Find(matmul_parameters); + if (!plan_and_algorithms) { + se::blas::DataType blas_dtype = se::blas::ToDataType::value; + se::blas::ComputationType computation_type; + OP_REQUIRES( + context, + GetBlasComputationType(dtype, allow_tf32, &computation_type), + errors::Internal("Unsupported dtype for batched matmul")); + std::unique_ptr plan = + stream->parent()->CreateBlasLtMatmulPlanStridedBatched( + /*ab_type=*/blas_dtype, + /*cd_type=*/blas_dtype, computation_type, + se::blas::PointerMode::kHost, blas_transpose_b, + blas_transpose_a, n, m, k, batch_size, + /*lda=*/in_y.dim_size(2), b_stride, + /*ldb=*/in_x.dim_size(2), a_stride, /*ldc=*/n, c_stride); + OP_REQUIRES( + context, plan, + errors::Internal( + "CreateBlasLtMatmulPlanStridedBatched failed : a.shape=(", + in_x.dim_size(0), ", ", in_x.dim_size(1), ", ", + in_x.dim_size(2), "), b.shape=(", in_y.dim_size(0), ", ", + in_y.dim_size(1), ", ", in_y.dim_size(2), "), m=", m, ", n=", n, + ", k=", k, ", batch_size=", batch_size, ", adjoint_a=", adj_x, + ", adjoint_b=", adj_x, ", dtype=", dtype, + ", computation_type=", 
computation_type)); + std::vector> + algorithms; + OP_REQUIRES( + context, + stream->parent()->GetBlasLtMatmulAlgorithms( + plan.get(), max_scratch_size, max_algorithm_count, &algorithms), + errors::Internal("GetBlasLtMatmulAlgorithms failed: a.shape=(", + in_x.dim_size(0), ", ", in_x.dim_size(1), ", ", + in_x.dim_size(2), "), b.shape=(", in_y.dim_size(0), + ", ", in_y.dim_size(1), ", ", in_y.dim_size(2), + "), m=", m, ", n=", n, ", k=", k, + ", batch_size=", batch_size, ", adjoint_a=", adj_x, + ", adjoint_b=", adj_x, ", dtype=", dtype, + ", computation_type=", computation_type)); + plan_and_algorithms = + BatchMatmulPlanMapSingleton::GetInstance()->Insert( + matmul_parameters, {std::move(plan), std::move(algorithms)}); + } + const auto& plan = plan_and_algorithms->plan; + const auto& algorithms = plan_and_algorithms->algorithms; + + // The BlasLtMatmul routines (unlike BlasGemm, BlasGemmBatched etc.) take + // alpha and beta with the same type as the matrices. + Scalar alpha(1.0); + Scalar beta(0.0); + + // Note that algorithm_config.algorithm() here is used to refer + // to the index within the algorithms vector, not the algorithm + // itself. + se::blas::AlgorithmConfig algorithm_config(se::blas::kNoAlgorithm); + if (max_algorithm_count == 1) { + algorithm_config.set_algorithm(0); + } else if (!AutoTuneBatchMatmul::GetInstance()->Find(matmul_parameters, + &algorithm_config)) { + VLOG(4) << "Autotuning BlasLtMatmul over " << algorithms.size() + << " algorithms."; + se::blas::ProfileResult best_result; + se::blas::ProfileResult profile_result; + //for (const auto& profile_algorithm : plan_and_algorithms->algorithms) { + for (size_t i = 0; i != algorithms.size(); ++i) { + const auto& profile_algorithm = algorithms[i]; + // Create a new scratch allocator with every autotuning run so that + // scratch space is deallocated between runs. 
+ BlasScratchAllocator scratch_allocator(max_scratch_size, context); + + bool cublas_launch_status = + stream + ->ThenBlasLtMatmul(plan.get(), alpha, *b_ptrs[0], *a_ptrs[0], + beta, c_ptrs[0], &scratch_allocator, + profile_algorithm.get(), &profile_result) + .ok(); + + VLOG(4) << " Autotune algorithm " << i + << " result: " << profile_result.elapsed_time_in_ms() + << " ms, valid=" << profile_result.is_valid() + << ", workspace_size=" + << profile_algorithm->workspace_size(); + + if (cublas_launch_status && profile_result.is_valid() && + profile_result.elapsed_time_in_ms() < + best_result.elapsed_time_in_ms()) { + best_result = profile_result; + } + } + + if (best_result.is_valid()) { + algorithm_config.set_algorithm(best_result.algorithm()); + } + // We make sure that each matmul parameter set only gets one pass of + // autotune. If no algorithms works, we add kNoAlgorithm to the autotune + // map. + AutoTuneBatchMatmul::GetInstance()->Insert(matmul_parameters, + algorithm_config); + } + se::blas::AlgorithmType algorithm_idx = algorithm_config.algorithm(); + OP_REQUIRES(context, + 0 <= algorithm_idx && algorithm_idx < algorithms.size(), + errors::Internal("Missing/invalid BatchMatmul algorithm")); + const auto& algorithm = algorithms[algorithm_idx]; + BlasScratchAllocator scratch_allocator(max_scratch_size, context); + bool cublas_launch_status = + stream + ->ThenBlasLtMatmul(plan.get(), alpha, *b_ptrs[0], *a_ptrs[0], + beta, c_ptrs[0], &scratch_allocator, + algorithm.get()) + .ok(); + if (!cublas_launch_status) { + context->SetStatus(errors::Internal( + "Blas batched matmul launch failed : a.shape=(", + bcast.x_batch_size(), ", ", in_x.dim_size(0), ", ", + in_x.dim_size(1), "), b.shape=(", bcast.y_batch_size(), ", ", + in_y.dim_size(0), ", ", in_y.dim_size(1), "), m=", m, ", n=", n, + ", k=", k, ", batch_size=", batch_size)); + } + } else { // requires mixed broadcasting + const std::vector& a_batch_indices = bcast.x_batch_indices(); + const std::vector& 
b_batch_indices = bcast.y_batch_indices(); + for (int64 i = 0; i < bcast.x_batch_size(); ++i) { + a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k)); + } + for (int64 i = 0; i < bcast.y_batch_size(); ++i) { + b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n)); + } + for (int64 i = 0; i < batch_size; ++i) { + c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n)); + a_ptrs.push_back(&a_device_memory[a_batch_indices[i]]); + b_ptrs.push_back(&b_device_memory[b_batch_indices[i]]); + c_ptrs.push_back(&c_device_memory.back()); + } + + BlasScratchAllocator scratch_allocator(max_scratch_size, context); + bool blas_launch_status = + stream + ->ThenBlasGemmBatchedWithScratch( + blas_transpose_b, blas_transpose_a, n, m, k, + static_cast(1.0), b_ptrs, + adj_y || trans_y ? k : n, a_ptrs, adj_x || trans_x ? m : k, + static_cast(0.0), c_ptrs, n, batch_size, + &scratch_allocator) + .ok(); + if (!blas_launch_status) { + context->SetStatus(errors::Internal( + "Blas xGEMMBatched launch failed : a.shape=", + in_x.shape().DebugString(), + ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n, + ", k=", k, ", batch_size=", batch_size)); + } + } + return; +#else // if not GOOGLE_CUDA or CUDA_VERSION < 11000 bool is_full_broadcast = std::min(bcast.x_batch_size(), bcast.y_batch_size()) == 1; bool use_strided_batched = @@ -388,8 +743,6 @@ struct LaunchBatchMatMul { } } - typedef Scalar Coefficient; - // Blas does // C = A x B // where A, B and C are assumed to be in column major. @@ -399,7 +752,10 @@ struct LaunchBatchMatMul { if (batch_size == 1) { // This is a regular matrix*matrix or matrix*vector multiply. Avoid the // overhead of the scratch allocator and the batch interface. - if (n == 1 && + // Note that the GEMV call here does not support Eigen::half, so we do not + // use this path in that case. A workaround is applied to the pointers + // passed to the call itself to avoid compilation errors. 
+ if (!std::is_same::value && n == 1 && blas_transpose_b != se::blas::Transpose::kConjugateTranspose && blas_transpose_a != se::blas::Transpose::kConjugateTranspose) { // This is a matrix*vector multiply so use GEMV to compute A * b. @@ -410,13 +766,19 @@ struct LaunchBatchMatMul { auto gemv_trans_a = blas_transpose_a == se::blas::Transpose::kTranspose ? se::blas::Transpose::kNoTranspose : se::blas::Transpose::kTranspose; + // Cast pointers as a workaround for GEMV not supporting Eigen::half + // (this will never actually be executed for Eigen::half). + typedef se::DeviceMemory NonHalfDeviceMemoryType; + NonHalfDeviceMemoryType a_ptr(*(a_ptrs[0])); + NonHalfDeviceMemoryType b_ptr(*(b_ptrs[0])); + NonHalfDeviceMemoryType c_ptr(*(c_ptrs[0])); bool blas_launch_status = stream ->ThenBlasGemv(gemv_trans_a, adj_x || trans_x ? m : k, adj_x || trans_x ? k : m, - static_cast(1.0), *(a_ptrs[0]), - adj_x || trans_x ? m : k, *(b_ptrs[0]), 1, - static_cast(0.0), c_ptrs[0], 1) + static_cast(1.0), a_ptr, + adj_x || trans_x ? m : k, b_ptr, 1, + static_cast(0.0), &c_ptr, 1) .ok(); if (!blas_launch_status) { context->SetStatus(errors::Internal( @@ -459,154 +821,7 @@ struct LaunchBatchMatMul { ", k=", k, ", batch_size=", batch_size)); } } else { - BlasScratchAllocator scratch_allocator(context); - bool blas_launch_status = - stream - ->ThenBlasGemmBatchedWithScratch( - blas_transpose_b, blas_transpose_a, n, m, k, - static_cast(1.0), b_ptrs, - adj_y || trans_y ? k : n, a_ptrs, adj_x || trans_x ? 
m : k, - static_cast(0.0), c_ptrs, n, batch_size, - &scratch_allocator) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal( - "Blas xGEMMBatched launch failed : a.shape=", - in_x.shape().DebugString(), - ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n, - ", k=", k, ", batch_size=", batch_size)); - } - } - } -}; - -template <> -struct LaunchBatchMatMul { - static void Launch(OpKernelContext* context, const Tensor& in_x, - const Tensor& in_y, bool adj_x, bool adj_y, bool trans_x, - bool trans_y, const MatMulBCast& bcast, Tensor* out) { - typedef Eigen::half Scalar; - se::blas::Transpose trans[] = {se::blas::Transpose::kNoTranspose, - se::blas::Transpose::kTranspose, - se::blas::Transpose::kConjugateTranspose}; - const uint64 m = in_x.dim_size(adj_x || trans_x ? 2 : 1); - const uint64 k = in_x.dim_size(adj_x || trans_x ? 1 : 2); - const uint64 n = in_y.dim_size(adj_y || trans_y ? 1 : 2); - const uint64 batch_size = bcast.output_batch_size(); - auto blas_transpose_a = trans[adj_x ? 2 : (trans_x ? 1 : 0)]; - auto blas_transpose_b = trans[adj_y ? 2 : (trans_y ? 
1 : 0)]; - - auto* stream = context->op_device_context()->stream(); - OP_REQUIRES(context, stream, errors::Internal("No GPU stream available.")); - - typedef perftools::gputools::DeviceMemory DeviceMemoryType; - std::vector a_device_memory; - std::vector b_device_memory; - std::vector c_device_memory; - std::vector a_ptrs; - std::vector b_ptrs; - std::vector c_ptrs; - a_device_memory.reserve(bcast.x_batch_size()); - b_device_memory.reserve(bcast.y_batch_size()); - c_device_memory.reserve(batch_size); - a_ptrs.reserve(batch_size); - b_ptrs.reserve(batch_size); - c_ptrs.reserve(batch_size); - auto* a_base_ptr = in_x.template flat().data(); - auto* b_base_ptr = in_y.template flat().data(); - auto* c_base_ptr = out->template flat().data(); - - uint64 a_stride; - uint64 b_stride; - uint64 c_stride; - - bool is_full_broadcast = - std::min(bcast.x_batch_size(), bcast.y_batch_size()) == 1; - bool use_strided_batched = - (!bcast.IsBroadcastingRequired() || is_full_broadcast) && - batch_size > 1; - if (use_strided_batched) { - a_stride = bcast.x_batch_size() != 1 ? m * k : 0; - b_stride = bcast.y_batch_size() != 1 ? 
k * n : 0; - c_stride = m * n; - a_device_memory.push_back(AsDeviceMemory(a_base_ptr)); - b_device_memory.push_back(AsDeviceMemory(b_base_ptr)); - c_device_memory.push_back(AsDeviceMemory(c_base_ptr)); - a_ptrs.push_back(&a_device_memory.back()); - b_ptrs.push_back(&b_device_memory.back()); - c_ptrs.push_back(&c_device_memory.back()); - } else if (!bcast.IsBroadcastingRequired()) { - for (int64 i = 0; i < batch_size; ++i) { - a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k)); - b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n)); - c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n)); - a_ptrs.push_back(&a_device_memory.back()); - b_ptrs.push_back(&b_device_memory.back()); - c_ptrs.push_back(&c_device_memory.back()); - } - } else { - const std::vector& a_batch_indices = bcast.x_batch_indices(); - const std::vector& b_batch_indices = bcast.y_batch_indices(); - for (int64 i = 0; i < bcast.x_batch_size(); ++i) { - a_device_memory.push_back(AsDeviceMemory(a_base_ptr + i * m * k)); - } - for (int64 i = 0; i < bcast.y_batch_size(); ++i) { - b_device_memory.push_back(AsDeviceMemory(b_base_ptr + i * k * n)); - } - for (int64 i = 0; i < batch_size; ++i) { - c_device_memory.push_back(AsDeviceMemory(c_base_ptr + i * m * n)); - a_ptrs.push_back(&a_device_memory[a_batch_indices[i]]); - b_ptrs.push_back(&b_device_memory[b_batch_indices[i]]); - c_ptrs.push_back(&c_device_memory.back()); - } - } - - typedef float Coefficient; - - // Blas does - // C = A x B - // where A, B and C are assumed to be in column major. - // We want the output to be in row-major, so we can compute - // C' = B' x A', where ' stands for transpose (not adjoint). - // TODO(yangzihao): Choose the best of the three strategies using autotune. - if (batch_size == 1) { - // This is a regular matrix*matrix or matrix*vector multiply. Avoid the - // overhead of the scratch allocator and the batch interface. 
- // TODO(benbarsdell): Use fp16 Gemv if it becomes supported by CUBLAS - bool blas_launch_status = - stream - ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, - static_cast(1.0), *(b_ptrs[0]), - adj_y || trans_y ? k : n, *(a_ptrs[0]), - adj_x || trans_x ? m : k, - static_cast(0.0), c_ptrs[0], n) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal( - "Blas xGEMM launch failed : a.shape=", in_x.shape().DebugString(), - ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n, - ", k=", k)); - } - } else if (use_strided_batched) { - bool blas_launch_status = - stream - ->ThenBlasGemmStridedBatched( - blas_transpose_b, blas_transpose_a, n, m, k, - static_cast(1.0), *b_ptrs[0], - adj_y || trans_y ? k : n, b_stride, *a_ptrs[0], - adj_x || trans_x ? m : k, a_stride, - static_cast(0.0), c_ptrs[0], n, c_stride, - batch_size) - .ok(); - if (!blas_launch_status) { - context->SetStatus(errors::Internal( - "Blas xGEMMStridedBatched launch failed : a.shape=", - in_x.shape().DebugString(), - ", b.shape=", in_y.shape().DebugString(), ", m=", m, ", n=", n, - ", k=", k, ", batch_size=", batch_size)); - } - } else { - BlasScratchAllocator scratch_allocator(context); + BlasScratchAllocator scratch_allocator(max_scratch_size, context); bool blas_launch_status = stream ->ThenBlasGemmBatchedWithScratch( @@ -624,6 +839,7 @@ struct LaunchBatchMatMul { ", k=", k, ", batch_size=", batch_size)); } } +#endif // not GOOGLE_CUDA or CUDA_VERSION < 11000 } }; @@ -637,6 +853,7 @@ class BaseBatchMatMulOp : public OpKernel { : OpKernel(context) { OP_REQUIRES_OK(context, context->GetAttr("adj_x", &adj_x_)); OP_REQUIRES_OK(context, context->GetAttr("adj_y", &adj_y_)); + use_autotune_ = MatmulAutotuneEnable(); } ~BaseBatchMatMulOp() override {} @@ -698,7 +915,7 @@ class BaseBatchMatMulOp : public OpKernel { out->shape().DebugString())); LaunchBatchMatMul::Launch( ctx, in0_reshaped, in1_reshaped, adj_x_, adj_y_, /*trans_x=*/false, - /*trans_y=*/false, bcast, 
&out_reshaped); + /*trans_y=*/false, bcast, use_autotune_, &out_reshaped); } protected: @@ -708,6 +925,7 @@ class BaseBatchMatMulOp : public OpKernel { private: bool adj_x_; bool adj_y_; + bool use_autotune_; }; // BatchMatMul Op implementation which disallows broadcasting. diff --git a/tensorflow/core/kernels/gpu_utils.cc b/tensorflow/core/kernels/gpu_utils.cc index 7da1963c676..171a26e5b78 100644 --- a/tensorflow/core/kernels/gpu_utils.cc +++ b/tensorflow/core/kernels/gpu_utils.cc @@ -22,6 +22,7 @@ limitations under the License. #include "google/protobuf/any.pb.h" #include "absl/algorithm/container.h" #include "absl/base/call_once.h" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/platform/logger.h" #include "tensorflow/core/protobuf/autotuning.pb.h" #include "tensorflow/core/protobuf/conv_autotuning.pb.h" @@ -282,6 +283,62 @@ Status BestCudnnConvAlgorithm(absl::Span results, return Status::OK(); } +int64 GetWorkspaceLimit(const string& envvar_in_mb, + int64 default_value_in_bytes) { + const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str()); + if (workspace_limit_in_mb_str != nullptr && + strcmp(workspace_limit_in_mb_str, "") != 0) { + int64 scratch_limit_in_mb = -1; + if (strings::safe_strto64(workspace_limit_in_mb_str, + &scratch_limit_in_mb)) { + return scratch_limit_in_mb * (1 << 20); + } else { + LOG(WARNING) << "Invalid value for env-var " << envvar_in_mb << ": " + << workspace_limit_in_mb_str; + } + } + return default_value_in_bytes; +} + +GpuScratchAllocator::GpuScratchAllocator(int64 memory_limit, + OpKernelContext* context) + : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {} + +se::port::StatusOr> GpuScratchAllocator::AllocateBytes( + int64 byte_size) { + Tensor temporary_memory; + if (byte_size < 0) { + return se::port::Status{se::port::error::INVALID_ARGUMENT, + "Requested negative byte size!"}; + } + if (byte_size > memory_limit_) { + return se::port::Status{ + 
se::port::error::UNAVAILABLE, + absl::StrCat("Requested memory size (", byte_size, + ") exceeds the max memory limit (", memory_limit_, ").")}; + } + AllocationAttributes allocation_attr; + allocation_attr.retry_on_failure = false; + Status allocation_status(context_->allocate_temp( + DT_UINT8, TensorShape({byte_size}), &temporary_memory, + AllocatorAttributes(), allocation_attr)); + if (!allocation_status.ok()) { + return se::port::Status{ + se::port::error::UNAVAILABLE, + absl::StrCat("Failed to allocate the requested memory size (", + byte_size, ").")}; + } + // Hold the reference of the allocated tensors until the end of the + // allocator. + // NOTE: We expect tensors to be deallocated when this allocator goes out of + // scope when allocated_tensors is destructed. + allocated_tensors_.push_back(temporary_memory); + total_byte_size_ += byte_size; + return se::port::StatusOr>( + AsDeviceMemory(temporary_memory.flat().data(), + temporary_memory.flat().size())); +} + } // namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/gpu_utils.h b/tensorflow/core/kernels/gpu_utils.h index a1589db3b5b..f97aa182fbd 100644 --- a/tensorflow/core/kernels/gpu_utils.h +++ b/tensorflow/core/kernels/gpu_utils.h @@ -243,6 +243,42 @@ void LogFusedConvForwardAutotuneResults( Status BestCudnnConvAlgorithm(absl::Span results, se::dnn::AlgorithmConfig* algo); +// Get a workspace limit from the environment variable, which is in MB. +// Return the workspace memory limit in bytes. If no value is set, return the +// default value. +int64 GetWorkspaceLimit(const string& envvar_in_mb, + int64 default_value_in_bytes); + +// Get the Dnn workspace limit from the environment variable, which is in MB. +// Return the workspace memory limit in bytes. If no value is set, return the +// default value. 
+int64 GetDnnWorkspaceLimit(const string& envvar_in_mb, + int64 default_value_in_bytes); + +// A class to provide scratch-space allocator for Stream-Executor callbacks in +// CUDA libraries (CUDNN etc.). +// TensorFlow is responsible for releasing the temporary buffers after +// the kernel finishes. +class GpuScratchAllocator : public se::ScratchAllocator { + public: + virtual ~GpuScratchAllocator() {} + + GpuScratchAllocator(int64 memory_limit, OpKernelContext* context); + + int64 GetMemoryLimitInBytes() override { return memory_limit_; } + + se::port::StatusOr> AllocateBytes( + int64 byte_size) override; + + int64 TotalByteSize() { return total_byte_size_; } + + private: + int64 memory_limit_; + int64 total_byte_size_; + OpKernelContext* context_; + std::vector allocated_tensors_; +}; + } // namespace tensorflow #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/kernels/linalg/einsum_op_impl.h b/tensorflow/core/kernels/linalg/einsum_op_impl.h index b9b2d1f0eae..e10322a88e1 100644 --- a/tensorflow/core/kernels/linalg/einsum_op_impl.h +++ b/tensorflow/core/kernels/linalg/einsum_op_impl.h @@ -549,6 +549,7 @@ struct EinsumHelper { static Status ContractOperands(OpKernelContext* ctx, absl::Span inputs, absl::Span swap_free_and_contract, + bool use_autotune, Tensor* output) { if (inputs.size() == 1) return CopyFrom(inputs[0], inputs[0].shape(), output); @@ -583,7 +584,7 @@ struct EinsumHelper { ReshapeToRank3(*output, bcast.output_batch_size(), &output_reshaped)); LaunchBatchMatMul::Launch(ctx, lhs, rhs, /*adj_x=*/false, /*adj_y=*/false, trans_x, trans_y, - bcast, &output_reshaped); + bcast, use_autotune, &output_reshaped); return Status::OK(); } }; @@ -598,6 +599,7 @@ class EinsumOp : public OpKernel { equation_, &input_labels_, &output_labels_, &label_types_, &input_label_counts_, &output_label_counts_, &input_has_ellipsis_, &output_has_ellipsis_)); + use_autotune_ = MatmulAutotuneEnable(); } void Compute(OpKernelContext* ctx) override { @@ 
-640,7 +642,7 @@ class EinsumOp : public OpKernel { Tensor contraction_output_reshaped; OP_REQUIRES_OK(ctx, EinsumHelper::ContractOperands( ctx, inputs_reduced, swap_free_and_contract, - &contraction_output_reshaped)); + use_autotune_, &contraction_output_reshaped)); // Copy the batch labels from the contraction output. Recover the batch // shape, which may have been broadcasted. @@ -738,6 +740,7 @@ class EinsumOp : public OpKernel { LabelCounts output_label_counts_; gtl::InlinedVector input_has_ellipsis_; bool output_has_ellipsis_ = false; + bool use_autotune_; }; #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM diff --git a/tensorflow/core/util/matmul_autotune.cc b/tensorflow/core/util/matmul_autotune.cc index 741a78a193f..c30a5d930e7 100644 --- a/tensorflow/core/util/matmul_autotune.cc +++ b/tensorflow/core/util/matmul_autotune.cc @@ -48,4 +48,22 @@ bool MatmulDoFP32ComputationFP16Input() { return value; } +int MatmulMaxAutotuneAlgorithmCount() { + int64 value; + // In CUDA 11, cublasLtMatmulAlgoGetHeuristic typically returns <= 4 + // algorithms for a given configuration, so 10 seems like a reasonable default + // here. 
+ Status status = + ReadInt64FromEnvVar("TF_MATMUL_AUTOTUNE_MAX_ALGORITHMS", 10, &value); + if (!status.ok()) { + LOG(ERROR) << status.error_message(); + } + static constexpr const int kMaxValue = std::numeric_limits::max(); + if (value < 1 || value > kMaxValue) { + LOG(ERROR) << "Invalid value for TF_MATMUL_AUTOTUNE_MAX_ALGORITHMS: " + << value << " is not in range [1, " << kMaxValue << "]"; + } + return value; +} + } // namespace tensorflow diff --git a/tensorflow/core/util/matmul_autotune.h b/tensorflow/core/util/matmul_autotune.h index 5846cae2fc7..c77d274e781 100644 --- a/tensorflow/core/util/matmul_autotune.h +++ b/tensorflow/core/util/matmul_autotune.h @@ -22,6 +22,7 @@ namespace tensorflow { bool MatmulAutotuneEnable(); bool MatmulDoFP32ComputationFP16Input(); +int MatmulMaxAutotuneAlgorithmCount(); } // namespace tensorflow From f7d29c94dfda42b3531bddc911cf5b9fdd4bddab Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 6 Jul 2020 21:30:27 +1000 Subject: [PATCH 0030/1447] Remove duplicated code in conv and fft kernels --- tensorflow/core/kernels/conv_ops.cc | 14 +------- tensorflow/core/kernels/conv_ops_gpu.h | 47 +------------------------- tensorflow/core/kernels/fft_ops.cc | 16 ++------- 3 files changed, 4 insertions(+), 73 deletions(-) diff --git a/tensorflow/core/kernels/conv_ops.cc b/tensorflow/core/kernels/conv_ops.cc index b8c2671e7d2..fd8cad0d67c 100644 --- a/tensorflow/core/kernels/conv_ops.cc +++ b/tensorflow/core/kernels/conv_ops.cc @@ -619,19 +619,7 @@ template struct LaunchConv2DOp; int64 GetDnnWorkspaceLimit(const string& envvar_in_mb, int64 default_value_in_bytes) { - const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str()); - if (workspace_limit_in_mb_str != nullptr && - strcmp(workspace_limit_in_mb_str, "") != 0) { - int64 scratch_limit_in_mb = -1; - if (strings::safe_strto64(workspace_limit_in_mb_str, - &scratch_limit_in_mb)) { - return scratch_limit_in_mb * (1 << 20); - } else { - LOG(WARNING) << "Invalid value for env-var 
" << envvar_in_mb << ": " - << workspace_limit_in_mb_str; - } - } - return default_value_in_bytes; + return GetWorkspaceLimit(envvar_in_mb, default_value_in_bytes); } // A dummy type to group forward convolution autotune results together. diff --git a/tensorflow/core/kernels/conv_ops_gpu.h b/tensorflow/core/kernels/conv_ops_gpu.h index 2e97d486b54..8beab722a64 100644 --- a/tensorflow/core/kernels/conv_ops_gpu.h +++ b/tensorflow/core/kernels/conv_ops_gpu.h @@ -48,52 +48,7 @@ int64 GetDnnWorkspaceLimit(const string& envvar_in_mb, // A class to provide scratch-space allocator for Stream-Executor Cudnn // callback. TensorFlow is responsible for releasing the temporary buffers after // the kernel finishes. -class DnnScratchAllocator : public se::ScratchAllocator { - public: - virtual ~DnnScratchAllocator() {} - DnnScratchAllocator(int64 memory_limit, OpKernelContext* context) - : memory_limit_(memory_limit), total_byte_size_(0), context_(context) {} - int64 GetMemoryLimitInBytes() override { return memory_limit_; } - se::port::StatusOr> AllocateBytes( - int64 byte_size) override { - Tensor temporary_memory; - if (byte_size < 0) { - return se::port::Status{se::port::error::INVALID_ARGUMENT, - "Requested negative byte size!"}; - } - if (byte_size > memory_limit_) { - return se::port::Status{se::port::error::UNAVAILABLE, - absl::StrCat("Requested memory size (", byte_size, - ") exceeds the max memory limit (", - memory_limit_, ").")}; - } - AllocationAttributes allocation_attr; - allocation_attr.retry_on_failure = false; - Status allocation_status(context_->allocate_temp( - DT_UINT8, TensorShape({byte_size}), &temporary_memory, - AllocatorAttributes(), allocation_attr)); - if (!allocation_status.ok()) { - return se::port::Status{ - se::port::error::UNAVAILABLE, - absl::StrCat("Failed to allocate the requested memory size (", - byte_size, ").")}; - } - // Hold the reference of the allocated tensors until the end of the - // allocator. 
- allocated_tensors_.push_back(temporary_memory); - total_byte_size_ += byte_size; - return se::port::StatusOr>( - AsDeviceMemory(temporary_memory.flat().data(), - temporary_memory.flat().size())); - } - int64 TotalByteSize() { return total_byte_size_; } - - private: - int64 memory_limit_; - int64 total_byte_size_; - OpKernelContext* context_; - std::vector allocated_tensors_; -}; +using DnnScratchAllocator = GpuScratchAllocator; // Encapsulate all the shape information that is used in both forward and // backward conv operations. diff --git a/tensorflow/core/kernels/fft_ops.cc b/tensorflow/core/kernels/fft_ops.cc index 050b83980c6..f73cc29ecf0 100644 --- a/tensorflow/core/kernels/fft_ops.cc +++ b/tensorflow/core/kernels/fft_ops.cc @@ -31,6 +31,7 @@ limitations under the License. #if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \ (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM) +#include "tensorflow/core/kernels/gpu_utils.h" #include "tensorflow/core/platform/stream_executor.h" #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM @@ -400,20 +401,7 @@ class CufftScratchAllocator : public se::ScratchAllocator { int64 GetCufftWorkspaceLimit(const string& envvar_in_mb, int64 default_value_in_bytes) { - const char* workspace_limit_in_mb_str = getenv(envvar_in_mb.c_str()); - if (workspace_limit_in_mb_str != nullptr && - strcmp(workspace_limit_in_mb_str, "") != 0) { - int64 scratch_limit_in_mb = -1; - Status status = ReadInt64FromEnvVar(envvar_in_mb, default_value_in_bytes, - &scratch_limit_in_mb); - if (!status.ok()) { - LOG(WARNING) << "Invalid value for env-var " << envvar_in_mb << ": " - << workspace_limit_in_mb_str; - } else { - return scratch_limit_in_mb * (1 << 20); - } - } - return default_value_in_bytes; + return GetWorkspaceLimit(envvar_in_mb, default_value_in_bytes); } class FFTGPUBase : public FFTBase { From 8c0eb4b35b2b8e081c776e7a2ab73489c9f248ff Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Tue, 4 Aug 2020 22:26:57 +1000 Subject: [PATCH 0031/1447] Add 
workaround for cublasLt known issue - Avoids a heuristic alignment issue noted in the CUDA Release Notes. --- tensorflow/stream_executor/cuda/cuda_blas.cc | 102 +++++++++++++++---- 1 file changed, 84 insertions(+), 18 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 565a1c02fb4..ba833e562e2 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -488,6 +488,26 @@ cudaDataType_t GetCUDADataType(blas::DataType ty) { return CUDA_C_64F; } } + +int GetDataTypeSizeBytes(blas::DataType ty) { + switch (ty) { + case blas::DataType::kF16: + return 2; + case blas::DataType::kF32: + return 4; + case blas::DataType::kF64: + return 8; + case blas::DataType::kI8: + return 1; + case blas::DataType::kI32: + return 4; + case blas::DataType::kComplexF32: + return 8; + case blas::DataType::kComplexF64: + return 16; + } +} + } // namespace template @@ -3161,22 +3181,6 @@ UniqueLayoutDesc CreateCublasLtLayoutDesc(blas::DataType data_type, uint64 rows, return unique_desc; } -UniqueMatmulPreference CreateCublasLtMatmulPreference( - size_t max_workspace_bytes) { - cublasLtMatmulPreference_t preference; - cublasStatus_t status = cublasLtMatmulPreferenceCreate(&preference); - if (status != CUBLAS_STATUS_SUCCESS) { - VLOG(2) << "cublasLtMatmulPreferenceCreate failed: " << ToString(status); - return nullptr; - } - UniqueMatmulPreference unique_preference(preference); - if (!SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - max_workspace_bytes)) { - return nullptr; - } - return unique_preference; -} - // Helper function to allocate workspace. 
port::Status AllocateWorkspace(void** workspace, ScratchAllocator* scratch_allocator, @@ -3230,6 +3234,11 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { blas::DataType cd_type() const { return cd_type_; } blas::DataType scale_type() const { return scale_type_; } blas::PointerMode pointer_mode() const { return pointer_mode_; } + int batch_count() const { return batch_count_; } + int64 stride_a() const { return stride_a_; } + int64 stride_b() const { return stride_b_; } + int64 stride_c() const { return stride_c_; } + int64 stride_d() const { return stride_d_; } private: UniqueOpDesc op_desc_; @@ -3241,6 +3250,11 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { blas::DataType cd_type_; blas::DataType scale_type_; blas::PointerMode pointer_mode_; + int batch_count_; + int64 stride_a_; + int64 stride_b_; + int64 stride_c_; + int64 stride_d_; }; CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan( @@ -3261,7 +3275,12 @@ CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan( ab_type_(ab_type), cd_type_(cd_type), scale_type_(GetScaleType(cd_type, computation_type)), - pointer_mode_(pointer_mode) { + pointer_mode_(pointer_mode), + batch_count_(batch_count), + stride_a_(stride_a), + stride_b_(stride_b), + stride_c_(stride_c), + stride_d_(stride_d) { uint64 rows_a = transa == blas::Transpose::kNoTranspose ? m : k; uint64 cols_a = transa == blas::Transpose::kNoTranspose ? k : m; uint64 rows_b = transb == blas::Transpose::kNoTranspose ? 
k : n; @@ -3296,6 +3315,53 @@ class CUDABlasLtMatmulAlgorithm final : public blas::IBlasLtMatmulAlgorithm { size_t workspace_size_; }; +UniqueMatmulPreference CreateCublasLtMatmulPreference( + const blas::IBlasLtMatmulPlan* plan, + size_t max_workspace_bytes) { + cublasLtMatmulPreference_t preference; + cublasStatus_t status = cublasLtMatmulPreferenceCreate(&preference); + if (status != CUBLAS_STATUS_SUCCESS) { + VLOG(2) << "cublasLtMatmulPreferenceCreate failed: " << ToString(status); + return nullptr; + } + UniqueMatmulPreference unique_preference(preference); + if (!SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + max_workspace_bytes)) { + return nullptr; + } + + const auto& cuda_plan = *static_cast(plan); + if (cuda_plan.batch_count() == 0) { + return unique_preference; + } + // This is a workaround for a known issue in cuBlasLt where the heuristic may + // in rare cases select an algo that does not support the specified stride. + // Specifying the alignment requirements manually like this avoids the issue. 
+ auto get_alignment_bytes = [](int64 stride, blas::DataType dtype) { + return (stride & -stride) * GetDataTypeSizeBytes(dtype); + }; + if ((cuda_plan.stride_a() && + !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, + (uint32)get_alignment_bytes(cuda_plan.stride_a(), + cuda_plan.ab_type()))) || + (cuda_plan.stride_b() && + !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, + (uint32)get_alignment_bytes(cuda_plan.stride_b(), + cuda_plan.ab_type()))) || + (cuda_plan.stride_c() && + !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, + (uint32)get_alignment_bytes(cuda_plan.stride_c(), + cuda_plan.cd_type()))) || + (cuda_plan.stride_d() && + !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES, + (uint32)get_alignment_bytes(cuda_plan.stride_d(), + cuda_plan.cd_type())))) { + return nullptr; + } + + return unique_preference; +} + } // namespace #endif // CUDA_VERSION >= 11000 @@ -3327,7 +3393,7 @@ bool CUDABlas::GetBlasLtMatmulAlgorithms( out_algorithms) { #if CUDA_VERSION >= 11000 UniqueMatmulPreference preference = - CreateCublasLtMatmulPreference(max_workspace_size); + CreateCublasLtMatmulPreference(plan, max_workspace_size); if (!preference) return false; std::vector results(max_algorithm_count); From 39bf03f083bc78812eaef8dc7e9b274110b923ee Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Wed, 5 Aug 2020 09:36:23 +1000 Subject: [PATCH 0032/1447] Add support for blasLt epilogue fn and bias vector - Changes the backend APIs to allow an epilogue function (default, ReLU, bias, or bias then ReLU) to be specified and a bias vector to be provided. - This is expected to be useful for XLA to perform fusions. - This functionality is not currently tested, because the BatchMatMulOp does not expose relu/bias fusion. 
--- .../core/kernels/batch_matmul_op_impl.h | 7 +- tensorflow/stream_executor/blas.h | 58 ++++++--- tensorflow/stream_executor/cuda/cuda_blas.cc | 117 +++++++++++++----- tensorflow/stream_executor/cuda/cuda_blas.h | 3 +- tensorflow/stream_executor/stream.cc | 46 ++++--- tensorflow/stream_executor/stream.h | 6 + .../stream_executor/stream_executor_pimpl.cc | 19 +-- .../stream_executor/stream_executor_pimpl.h | 10 +- 8 files changed, 182 insertions(+), 84 deletions(-) diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h index 5ca85c00835..456b4beff1e 100644 --- a/tensorflow/core/kernels/batch_matmul_op_impl.h +++ b/tensorflow/core/kernels/batch_matmul_op_impl.h @@ -558,8 +558,8 @@ struct LaunchBatchMatMul { stream->parent()->CreateBlasLtMatmulPlanStridedBatched( /*ab_type=*/blas_dtype, /*cd_type=*/blas_dtype, computation_type, - se::blas::PointerMode::kHost, blas_transpose_b, - blas_transpose_a, n, m, k, batch_size, + se::blas::PointerMode::kHost, se::blas::Epilogue::kDefault, + blas_transpose_b, blas_transpose_a, n, m, k, batch_size, /*lda=*/in_y.dim_size(2), b_stride, /*ldb=*/in_x.dim_size(2), a_stride, /*ldc=*/n, c_stride); OP_REQUIRES( @@ -621,7 +621,8 @@ struct LaunchBatchMatMul { stream ->ThenBlasLtMatmul(plan.get(), alpha, *b_ptrs[0], *a_ptrs[0], beta, c_ptrs[0], &scratch_allocator, - profile_algorithm.get(), &profile_result) + profile_algorithm.get(), {}, + &profile_result) .ok(); VLOG(4) << " Autotune algorithm " << i diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index 583fba2a505..ae5b4853d05 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -107,6 +107,13 @@ enum class ComputationType { kF32FastBF16, // 32-bit floating-point with reduced (7-bit) mantissa }; +enum class Epilogue { + kDefault = 1, // No special postprocessing + kReLU = 2, // Apply ReLU func point-wise to the results + kBias = 4, // Add broadcasted bias vector 
to the results + kBiasThenReLU = kBias | kReLU, // Apply bias and then ReLU transform +}; + // Converts a ComputationType to a string. std::string ComputationTypeString(ComputationType ty); @@ -1462,11 +1469,11 @@ class BlasSupport { std::unique_ptr CreateBlasLtMatmulPlan( blas::DataType ab_type, blas::DataType c_type, blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, int64 lda, int64 ldb, int64 ldc) { + blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, int64 lda, int64 ldb, int64 ldc) { return CreateBlasLtMatmulPlanStridedBatched( - ab_type, c_type, computation_type, pointer_mode, transa, transb, m, n, - k, 1, lda, 0, ldb, 0, ldc, 0); + ab_type, c_type, computation_type, pointer_mode, epilogue, transa, + transb, m, n, k, 1, lda, 0, ldb, 0, ldc, 0); } // A more general version of CreateBlasLtMatmulPlan supporting @@ -1475,9 +1482,9 @@ class BlasSupport { CreateBlasLtMatmulPlanStridedBatched( blas::DataType ab_type, blas::DataType c_type, blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, int batch_count, int64 lda, int64 stride_a, int64 ldb, - int64 stride_b, int64 ldc, int64 stride_c) = 0; + blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, int batch_count, int64 lda, int64 stride_a, + int64 ldb, int64 stride_b, int64 ldc, int64 stride_c) = 0; // Gets a list of supported algorithms for DoBlasLtMatmul. The algorithms are // returned in the order of increasing estimated compute time according to an @@ -1492,13 +1499,18 @@ class BlasSupport { // Executes a blaslt matmul operation on the stream. 
If output_profile_result // is not nullptr, the operation is profiled, error messages are // suppressed, and output_profile_result->algorithm() is set to - // algorithm->index(). + // algorithm->index(). If epilogue was set to kBias or kBiasThenReLU when + // creating the plan, the bias argument here must refer to a valid device + // vector of length equal to the number of rows in matrix c. If epilogue was + // set to any other value then the bias argument here must be null. The bias + // vector is broadcast across the batch dimension. virtual bool DoBlasLtMatmul( Stream* stream, const blas::IBlasLtMatmulPlan* plan, const HostOrDeviceScalar& alpha, const DeviceMemory& a, const DeviceMemory& b, const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias = {}, blas::ProfileResult* output_profile_result = nullptr) = 0; virtual bool DoBlasLtMatmul( Stream* stream, const blas::IBlasLtMatmulPlan* plan, @@ -1507,6 +1519,7 @@ class BlasSupport { const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias = {}, blas::ProfileResult* output_profile_result = nullptr) = 0; virtual bool DoBlasLtMatmul( Stream* stream, const blas::IBlasLtMatmulPlan* plan, @@ -1514,6 +1527,7 @@ class BlasSupport { const DeviceMemory& b, const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias = {}, blas::ProfileResult* output_profile_result = nullptr) = 0; virtual bool DoBlasLtMatmul( Stream* stream, const blas::IBlasLtMatmulPlan* plan, @@ -1521,6 +1535,7 @@ class BlasSupport { const DeviceMemory& b, const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias = {}, blas::ProfileResult* 
output_profile_result = nullptr) = 0; virtual bool DoBlasLtMatmul( Stream* stream, const blas::IBlasLtMatmulPlan* plan, @@ -1530,6 +1545,7 @@ class BlasSupport { const HostOrDeviceScalar>& beta, DeviceMemory>* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory>& bias = {}, blas::ProfileResult* output_profile_result = nullptr) = 0; virtual bool DoBlasLtMatmul( Stream* stream, const blas::IBlasLtMatmulPlan* plan, @@ -1540,6 +1556,7 @@ class BlasSupport { DeviceMemory>* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory>& bias = {}, blas::ProfileResult* output_profile_result = nullptr) = 0; virtual port::Status GetVersion(std::string *version) = 0; @@ -2359,9 +2376,10 @@ class BlasSupport { CreateBlasLtMatmulPlanStridedBatched( \ blas::DataType ab_type, blas::DataType cd_type, \ blas::ComputationType computation_type, blas::PointerMode pointer_mode, \ - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, \ - uint64 k, int batch_count, int64 lda, int64 stride_a, int64 ldb, \ - int64 stride_b, int64 ldc, int64 stride_c) override; \ + blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, \ + uint64 m, uint64 n, uint64 k, int batch_count, int64 lda, \ + int64 stride_a, int64 ldb, int64 stride_b, int64 ldc, int64 stride_c) \ + override; \ bool GetBlasLtMatmulAlgorithms( \ const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, \ int max_algorithm_count, \ @@ -2373,6 +2391,7 @@ class BlasSupport { const DeviceMemory& b, const HostOrDeviceScalar& beta, \ DeviceMemory* c, ScratchAllocator* scratch_allocator, \ const blas::IBlasLtMatmulAlgorithm* algorithm, \ + const DeviceMemory& bias = {}, \ blas::ProfileResult* output_profile_result = nullptr) override; \ bool DoBlasLtMatmul( \ Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ @@ -2381,21 +2400,24 @@ class BlasSupport { const HostOrDeviceScalar& beta, \ 
DeviceMemory* c, ScratchAllocator* scratch_allocator, \ const blas::IBlasLtMatmulAlgorithm* algorithm, \ - blas::ProfileResult* output_profile_result) override; \ + const DeviceMemory& bias = {}, \ + blas::ProfileResult* output_profile_result = nullptr) override; \ bool DoBlasLtMatmul( \ Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ const HostOrDeviceScalar& alpha, const DeviceMemory& a, \ const DeviceMemory& b, const HostOrDeviceScalar& beta, \ DeviceMemory* c, ScratchAllocator* scratch_allocator, \ const blas::IBlasLtMatmulAlgorithm* algorithm, \ - blas::ProfileResult* output_profile_result) override; \ + const DeviceMemory& bias = {}, \ + blas::ProfileResult* output_profile_result = nullptr) override; \ bool DoBlasLtMatmul( \ Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ const HostOrDeviceScalar& alpha, const DeviceMemory& a, \ const DeviceMemory& b, const HostOrDeviceScalar& beta, \ DeviceMemory* c, ScratchAllocator* scratch_allocator, \ const blas::IBlasLtMatmulAlgorithm* algorithm, \ - blas::ProfileResult* output_profile_result) override; \ + const DeviceMemory& bias = {}, \ + blas::ProfileResult* output_profile_result = nullptr) override; \ bool DoBlasLtMatmul(Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ const HostOrDeviceScalar>& alpha, \ const DeviceMemory>& a, \ @@ -2404,7 +2426,9 @@ class BlasSupport { DeviceMemory>* c, \ ScratchAllocator* scratch_allocator, \ const blas::IBlasLtMatmulAlgorithm* algorithm, \ - blas::ProfileResult* output_profile_result) override; \ + const DeviceMemory>& bias = {}, \ + blas::ProfileResult* output_profile_result = nullptr) \ + override; \ bool DoBlasLtMatmul(Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ const HostOrDeviceScalar>& alpha, \ const DeviceMemory>& a, \ @@ -2413,7 +2437,9 @@ class BlasSupport { DeviceMemory>* c, \ ScratchAllocator* scratch_allocator, \ const blas::IBlasLtMatmulAlgorithm* algorithm, \ - blas::ProfileResult* output_profile_result) override; \ + const 
DeviceMemory>& bias = {}, \ + blas::ProfileResult* output_profile_result = nullptr) \ + override; \ port::Status GetVersion(std::string *version) override; } // namespace blas diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index ba833e562e2..1d95b00ce7e 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -468,6 +468,18 @@ cublasLtPointerMode_t CUBLASPointerMode(blas::PointerMode pointer_mode) { return CUBLASLT_POINTER_MODE_DEVICE; } } +cublasLtEpilogue_t CUBLASEpilogue(blas::Epilogue epilogue) { + switch (epilogue) { + case blas::Epilogue::kDefault: + return CUBLASLT_EPILOGUE_DEFAULT; + case blas::Epilogue::kReLU: + return CUBLASLT_EPILOGUE_RELU; + case blas::Epilogue::kBias: + return CUBLASLT_EPILOGUE_BIAS; + case blas::Epilogue::kBiasThenReLU: + return CUBLASLT_EPILOGUE_RELU_BIAS; + } +} #endif // CUDA_VERSION >= 11000 cudaDataType_t GetCUDADataType(blas::DataType ty) { @@ -3135,12 +3147,12 @@ using UniqueMatmulPreference = std::unique_ptr::type, MatmulPreferenceDestroyer>; -UniqueOpDesc CreateCublasLtOperationDesc( - blas::ComputationType computation_type, blas::DataType scale_type, - blas::PointerMode pointer_mode, blas::Transpose transa, - blas::Transpose transb) { - cublasOperation_t cuda_transa = CUDABlasTranspose(transa); - cublasOperation_t cuda_transb = CUDABlasTranspose(transb); +UniqueOpDesc CreateCublasLtOperationDesc(blas::ComputationType computation_type, + blas::DataType scale_type, + blas::PointerMode pointer_mode, + blas::Epilogue epilogue, + blas::Transpose transa, + blas::Transpose transb) { cublasLtMatmulDesc_t desc; cublasComputeType_t cublas_compute_type = CUBLASComputationType(computation_type); @@ -3154,9 +3166,13 @@ UniqueOpDesc CreateCublasLtOperationDesc( } UniqueOpDesc unique_desc(desc); if (!SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, - CUBLASPointerMode(pointer_mode)) || - !SetCublasLtAttr(desc, 
CUBLASLT_MATMUL_DESC_TRANSA, cuda_transa) || - !SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_TRANSB, cuda_transb)) { + CUBLASPointerMode(pointer_mode)) || + !SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + CUBLASEpilogue(epilogue)) || + !SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_TRANSA, + CUDABlasTranspose(transa)) || + !SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_TRANSB, + CUDABlasTranspose(transb))) { return nullptr; } return unique_desc; @@ -3217,11 +3233,11 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { public: CUDABlasLtMatmulPlan(blas::DataType ab_type, blas::DataType cd_type, blas::ComputationType compute_type, - blas::PointerMode pointer_mode, blas::Transpose transa, - blas::Transpose transb, uint64 m, uint64 n, uint64 k, - int batch_count, int64 lda, int64 stride_a, int64 ldb, - int64 stride_b, int64 ldc, int64 stride_c, int64 ldd, - int64 stride_d); + blas::PointerMode pointer_mode, blas::Epilogue epilogue, + blas::Transpose transa, blas::Transpose transb, uint64 m, + uint64 n, uint64 k, int batch_count, int64 lda, + int64 stride_a, int64 ldb, int64 stride_b, int64 ldc, + int64 stride_c, int64 ldd, int64 stride_d); cublasLtMatmulDesc_t op_desc() const { return op_desc_.get(); } cublasLtMatrixLayout_t a_desc() const { return a_desc_.get(); } @@ -3234,12 +3250,17 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { blas::DataType cd_type() const { return cd_type_; } blas::DataType scale_type() const { return scale_type_; } blas::PointerMode pointer_mode() const { return pointer_mode_; } + blas::Epilogue epilogue() const { return epilogue_; } int batch_count() const { return batch_count_; } int64 stride_a() const { return stride_a_; } int64 stride_b() const { return stride_b_; } int64 stride_c() const { return stride_c_; } int64 stride_d() const { return stride_d_; } + // Note: Must be const to satisfy API. 
This is always called before the plan + // is executed, so the state change is not observed in subsequent executions. + bool SetBiasPointer(const void* bias) const; + private: UniqueOpDesc op_desc_; UniqueLayoutDesc a_desc_; @@ -3250,6 +3271,7 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { blas::DataType cd_type_; blas::DataType scale_type_; blas::PointerMode pointer_mode_; + blas::Epilogue epilogue_; int batch_count_; int64 stride_a_; int64 stride_b_; @@ -3260,12 +3282,13 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan( blas::DataType ab_type, blas::DataType cd_type, blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, int batch_count, int64 lda, int64 stride_a, int64 ldb, - int64 stride_b, int64 ldc, int64 stride_c, int64 ldd, int64 stride_d) + blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, int batch_count, int64 lda, int64 stride_a, + int64 ldb, int64 stride_b, int64 ldc, int64 stride_c, int64 ldd, + int64 stride_d) : op_desc_(CreateCublasLtOperationDesc( computation_type, GetScaleType(cd_type, computation_type), - pointer_mode, transa, transb)), + pointer_mode, epilogue, transa, transb)), a_desc_(nullptr), b_desc_(nullptr), c_desc_( @@ -3276,6 +3299,7 @@ CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan( cd_type_(cd_type), scale_type_(GetScaleType(cd_type, computation_type)), pointer_mode_(pointer_mode), + epilogue_(epilogue), batch_count_(batch_count), stride_a_(stride_a), stride_b_(stride_b), @@ -3291,6 +3315,11 @@ CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan( batch_count); } +bool CUDABlasLtMatmulPlan::SetBiasPointer(const void* bias) const { + return SetCublasLtAttr(op_desc_.get(), CUBLASLT_MATMUL_DESC_BIAS_POINTER, + bias); +} + class CUDABlasLtMatmulAlgorithm final : public blas::IBlasLtMatmulAlgorithm { public: 
CUDABlasLtMatmulAlgorithm(blas::AlgorithmType index, @@ -3370,13 +3399,14 @@ std::unique_ptr CUDABlas::CreateBlasLtMatmulPlanStridedBatched( blas::DataType ab_type, blas::DataType cd_type, blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, int batch_count, int64 lda, int64 stride_a, int64 ldb, - int64 stride_b, int64 ldc, int64 stride_c) { + blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, int batch_count, int64 lda, int64 stride_a, + int64 ldb, int64 stride_b, int64 ldc, int64 stride_c) { #if CUDA_VERSION >= 11000 auto result = std::make_unique( - ab_type, cd_type, computation_type, pointer_mode, transa, transb, m, n, k, - batch_count, lda, stride_a, ldb, stride_b, ldc, stride_c, ldc, stride_c); + ab_type, cd_type, computation_type, pointer_mode, epilogue, transa, + transb, m, n, k, batch_count, lda, stride_a, ldb, stride_b, ldc, stride_c, + ldc, stride_c); if (!result->ok()) { result.reset(); } @@ -3436,7 +3466,8 @@ bool CUDABlas::DoBlasLtMatmulInternalImpl( const HostOrDeviceScalar& alpha, const ABType* a, const ABType* b, const HostOrDeviceScalar& beta, const CDType* c, CDType* d, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm) { + const blas::IBlasLtMatmulAlgorithm* algorithm, + const CDType* bias) { const auto& cuda_plan = *static_cast(plan); const auto& cuda_algo = *static_cast(algorithm); @@ -3474,6 +3505,20 @@ bool CUDABlas::DoBlasLtMatmulInternalImpl( "pointer_mode for the given alpha/beta."; return false; } + if ((cuda_plan.epilogue() == blas::Epilogue::kBias || + cuda_plan.epilogue() == blas::Epilogue::kBiasThenReLU) != + (bias != nullptr)) { + VLOG(2) << "DoBlasLtMatmul returning false because plan has wrong " + "epilogue for the given bias pointer."; + return false; + } + if (bias != nullptr) { + if (!cuda_plan.SetBiasPointer(bias)) { + VLOG(2) << 
"DoBlasLtMatmul returning false because setting the bias " + "pointer failed."; + return false; + } + } const ScaleType* alpha_ptr = alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(); const ScaleType* beta_ptr = @@ -3525,6 +3570,7 @@ bool CUDABlas::DoBlasLtMatmulInternal( const DeviceMemory& c, DeviceMemory* d, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result) { #if CUDA_VERSION >= 11000 std::unique_ptr timer; @@ -3538,7 +3584,8 @@ bool CUDABlas::DoBlasLtMatmulInternal( bool err_on_failure = timer != nullptr; bool result = DoBlasLtMatmulInternalImpl( stream, err_on_failure, plan, alpha, GpuMemory(a), GpuMemory(b), beta, - GpuMemory(c), GpuMemoryMutable(d), scratch_allocator, algorithm); + GpuMemory(c), GpuMemoryMutable(d), scratch_allocator, algorithm, + GpuMemory(bias)); if (timer && result) { // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error @@ -3563,9 +3610,10 @@ bool CUDABlas::DoBlasLtMatmul( const DeviceMemory& b, const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result) { return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, + scratch_allocator, algorithm, bias, output_profile_result); } @@ -3578,6 +3626,7 @@ bool CUDABlas::DoBlasLtMatmul(Stream* stream, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result) { #if CUDA_VERSION >= 11000 const auto& cuda_plan = *static_cast(plan); @@ -3591,11 +3640,11 @@ bool CUDABlas::DoBlasLtMatmul(Stream* stream, HostOrDeviceScalar float_alpha(static_cast(alpha.value())); HostOrDeviceScalar float_beta(static_cast(beta.value())); return DoBlasLtMatmulInternal(stream, plan, 
float_alpha, a, b, float_beta, - *c, c, scratch_allocator, algorithm, + *c, c, scratch_allocator, algorithm, bias, output_profile_result); } return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, + scratch_allocator, algorithm, bias, output_profile_result); #else // if CUDA_VERSION < 11000 return false; @@ -3608,9 +3657,10 @@ bool CUDABlas::DoBlasLtMatmul( const DeviceMemory& b, const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result) { return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, + scratch_allocator, algorithm, bias, output_profile_result); } @@ -3620,9 +3670,10 @@ bool CUDABlas::DoBlasLtMatmul( const DeviceMemory& b, const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result) { return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, + scratch_allocator, algorithm, bias, output_profile_result); } @@ -3634,9 +3685,10 @@ bool CUDABlas::DoBlasLtMatmul( const HostOrDeviceScalar>& beta, DeviceMemory>* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory>& bias, blas::ProfileResult* output_profile_result) { return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, + scratch_allocator, algorithm, bias, output_profile_result); } @@ -3648,9 +3700,10 @@ bool CUDABlas::DoBlasLtMatmul( const HostOrDeviceScalar>& beta, DeviceMemory>* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory>& bias, blas::ProfileResult* output_profile_result) { return DoBlasLtMatmulInternal(stream, plan, alpha, a, 
b, beta, *c, c, - scratch_allocator, algorithm, + scratch_allocator, algorithm, bias, output_profile_result); } diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 351a7778c01..3fdfcb0a50c 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -148,6 +148,7 @@ class CUDABlas : public blas::BlasSupport { const DeviceMemory& c, DeviceMemory* d, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result); // Helper function for implementing DoBlasLtMatmulInternal. @@ -157,7 +158,7 @@ class CUDABlas : public blas::BlasSupport { const HostOrDeviceScalar& alpha, const ABType* a, const ABType* b, const HostOrDeviceScalar& beta, const CDType* c, CDType* d, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm); + const blas::IBlasLtMatmulAlgorithm* algorithm, const CDType* bias); // Guards the cuBLAS handle for this device. 
absl::Mutex mu_; diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 144af92185c..66728c94821 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -4809,18 +4809,19 @@ Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result) { VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm)); + PARAM(c), PARAM(algorithm), PARAM(bias)); ThenBlasWithProfileImpl< const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, const DeviceMemory&, const DeviceMemory&, const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, - const blas::IBlasLtMatmulAlgorithm*> + const blas::IBlasLtMatmulAlgorithm*, const DeviceMemory&> impl; return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, - c, scratch_allocator, algorithm, output_profile_result); + c, scratch_allocator, algorithm, bias, output_profile_result); } Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, @@ -4831,18 +4832,20 @@ Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result) { VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm)); + PARAM(c), PARAM(algorithm), PARAM(bias)); ThenBlasWithProfileImpl< const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, const DeviceMemory&, const DeviceMemory&, const HostOrDeviceScalar&, DeviceMemory*, - ScratchAllocator*, const blas::IBlasLtMatmulAlgorithm*> + ScratchAllocator*, const blas::IBlasLtMatmulAlgorithm*, + const DeviceMemory&> impl; return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, 
alpha, a, b, beta, - c, scratch_allocator, algorithm, output_profile_result); + c, scratch_allocator, algorithm, bias, output_profile_result); } Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, @@ -4853,18 +4856,19 @@ Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result) { VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm)); + PARAM(c), PARAM(algorithm), PARAM(bias)); ThenBlasWithProfileImpl< const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, const DeviceMemory&, const DeviceMemory&, const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, - const blas::IBlasLtMatmulAlgorithm*> + const blas::IBlasLtMatmulAlgorithm*, const DeviceMemory&> impl; return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, - c, scratch_allocator, algorithm, output_profile_result); + c, scratch_allocator, algorithm, bias, output_profile_result); } Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, @@ -4875,18 +4879,20 @@ Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, blas::ProfileResult* output_profile_result) { VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm)); + PARAM(c), PARAM(algorithm), PARAM(bias)); ThenBlasWithProfileImpl< const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, const DeviceMemory&, const DeviceMemory&, const HostOrDeviceScalar&, DeviceMemory*, - ScratchAllocator*, const blas::IBlasLtMatmulAlgorithm*> + ScratchAllocator*, const blas::IBlasLtMatmulAlgorithm*, + const DeviceMemory&> impl; return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, 
beta, - c, scratch_allocator, algorithm, output_profile_result); + c, scratch_allocator, algorithm, bias, output_profile_result); } Stream& Stream::ThenBlasLtMatmul( @@ -4897,9 +4903,10 @@ Stream& Stream::ThenBlasLtMatmul( const HostOrDeviceScalar>& beta, DeviceMemory>* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory>& bias, blas::ProfileResult* output_profile_result) { VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm)); + PARAM(c), PARAM(algorithm), PARAM(bias)); ThenBlasWithProfileImpl>&, @@ -4907,10 +4914,11 @@ Stream& Stream::ThenBlasLtMatmul( const DeviceMemory>&, const HostOrDeviceScalar>&, DeviceMemory>*, ScratchAllocator*, - const blas::IBlasLtMatmulAlgorithm*> + const blas::IBlasLtMatmulAlgorithm*, + const DeviceMemory>&> impl; return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, - c, scratch_allocator, algorithm, output_profile_result); + c, scratch_allocator, algorithm, bias, output_profile_result); } Stream& Stream::ThenBlasLtMatmul( @@ -4921,9 +4929,10 @@ Stream& Stream::ThenBlasLtMatmul( const HostOrDeviceScalar>& beta, DeviceMemory>* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory>& bias, blas::ProfileResult* output_profile_result) { VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm)); + PARAM(c), PARAM(algorithm), PARAM(bias)); ThenBlasWithProfileImpl>&, @@ -4932,10 +4941,11 @@ Stream& Stream::ThenBlasLtMatmul( const HostOrDeviceScalar>&, DeviceMemory>*, ScratchAllocator*, - const blas::IBlasLtMatmulAlgorithm*> + const blas::IBlasLtMatmulAlgorithm*, + const DeviceMemory>&> impl; return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, - c, scratch_allocator, algorithm, output_profile_result); + c, scratch_allocator, algorithm, bias, output_profile_result); } Stream 
&Stream::ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes) { diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index 15f5dfc936f..91a80331f8e 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -1672,6 +1672,7 @@ class Stream { const DeviceMemory& b, const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias = {}, blas::ProfileResult* output_profile_result = nullptr); Stream& ThenBlasLtMatmul( const blas::IBlasLtMatmulPlan* plan, @@ -1680,6 +1681,7 @@ class Stream { const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias = {}, blas::ProfileResult* output_profile_result = nullptr); Stream& ThenBlasLtMatmul( const blas::IBlasLtMatmulPlan* plan, @@ -1687,6 +1689,7 @@ class Stream { const DeviceMemory& b, const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias = {}, blas::ProfileResult* output_profile_result = nullptr); Stream& ThenBlasLtMatmul( const blas::IBlasLtMatmulPlan* plan, @@ -1694,6 +1697,7 @@ class Stream { const DeviceMemory& b, const HostOrDeviceScalar& beta, DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias = {}, blas::ProfileResult* output_profile_result = nullptr); Stream& ThenBlasLtMatmul( const blas::IBlasLtMatmulPlan* plan, @@ -1703,6 +1707,7 @@ class Stream { const HostOrDeviceScalar>& beta, DeviceMemory>* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory>& bias = {}, blas::ProfileResult* output_profile_result = nullptr); Stream& ThenBlasLtMatmul( const blas::IBlasLtMatmulPlan* plan, @@ -1713,6 +1718,7 @@ class Stream { 
DeviceMemory>* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory>& bias = {}, blas::ProfileResult* output_profile_result = nullptr); // See FftSupport::DoFft. diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index 3fbbc3f2aac..d75c1bc65c5 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -339,31 +339,32 @@ bool StreamExecutor::GetBlasGemmAlgorithms( std::unique_ptr StreamExecutor::CreateBlasLtMatmulPlan( blas::DataType ab_type, blas::DataType cd_type, blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, int64 lda, int64 ldb, int64 ldc) { + blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, int64 lda, int64 ldb, int64 ldc) { blas::BlasSupport *blas_support = AsBlas(); if (!blas_support) { return nullptr; } return blas_support->CreateBlasLtMatmulPlan( - ab_type, cd_type, computation_type, pointer_mode, transa, transb, m, n, k, - lda, ldb, ldc); + ab_type, cd_type, computation_type, pointer_mode, epilogue, transa, + transb, m, n, k, lda, ldb, ldc); } std::unique_ptr StreamExecutor::CreateBlasLtMatmulPlanStridedBatched( blas::DataType ab_type, blas::DataType cd_type, blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, uint64 batch_count, int64 lda, int64 stride_a, int64 ldb, - int64 stride_b, int64 ldc, int64 stride_c) { + blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, uint64 batch_count, int64 lda, int64 stride_a, + int64 ldb, int64 stride_b, int64 ldc, int64 stride_c) { blas::BlasSupport *blas_support = AsBlas(); if (!blas_support) { return nullptr; } return 
blas_support->CreateBlasLtMatmulPlanStridedBatched( - ab_type, cd_type, computation_type, pointer_mode, transa, transb, m, n, k, - batch_count, lda, stride_a, ldb, stride_b, ldc, stride_c); + ab_type, cd_type, computation_type, pointer_mode, epilogue, transa, + transb, m, n, k, batch_count, lda, stride_a, ldb, stride_b, ldc, + stride_c); } bool StreamExecutor::GetBlasLtMatmulAlgorithms( diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index 90137417250..b40c0c23c05 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -401,17 +401,17 @@ class StreamExecutor { std::unique_ptr CreateBlasLtMatmulPlan( blas::DataType ab_type, blas::DataType cd_type, blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, int64 lda, int64 ldb, int64 ldc); + blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, int64 lda, int64 ldb, int64 ldc); // A more general version of CreateBlasLtMatmulPlan supporting // batched operations. std::unique_ptr CreateBlasLtMatmulPlanStridedBatched( blas::DataType ab_type, blas::DataType cd_type, blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Transpose transa, blas::Transpose transb, uint64 m, uint64 n, - uint64 k, uint64 batch_count, int64 lda, int64 stride_a, int64 ldb, - int64 stride_b, int64 ldc, int64 stride_c); + blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, + uint64 m, uint64 n, uint64 k, uint64 batch_count, int64 lda, + int64 stride_a, int64 ldb, int64 stride_b, int64 ldc, int64 stride_c); // Gets a list of supported algorithms for DoBlasLtMatmul. 
The algorithms are // returned in the order of increasing estimated compute time according to an From c5b15cbca8683aa5f7088111e1ee1662c4284780 Mon Sep 17 00:00:00 2001 From: Tzu-Wei Sung Date: Mon, 10 Aug 2020 09:41:27 -0700 Subject: [PATCH 0033/1447] Add SegmentProdGrad Fix typo --- tensorflow/python/ops/math_grad.py | 42 +++++++++++++++++++++++++ tensorflow/python/ops/math_grad_test.py | 37 ++++++++++++++++++++++ 2 files changed, 79 insertions(+) diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py index 463ad8337c7..3f91ce6729f 100644 --- a/tensorflow/python/ops/math_grad.py +++ b/tensorflow/python/ops/math_grad.py @@ -414,6 +414,48 @@ def _SegmentMaxGrad(op, grad): return _SegmentMinOrMaxGrad(op, grad) +@ops.RegisterGradient("SegmentProd") +def _SegmentProdGrad(op, grad): + """Gradient for SegmentProd. + + The gradient can be expressed for each segment by dividing the segment's + product by each element of the segment input tensor, but this approach can't + deal with zeros in the input. + Unlike reduce_prod we can't use cumsum here as individual segments may have + a different number of elements. Therefore we consider three cases: + 1) A segment input contains no zeros and we can safely divide by the input + tensor. + 2) A segment contains exactly one zero. Then the gradient of each input of + the segment is zero except for the 0-input, there the gradient is + the product of the remaining segment entries. + 3) A segment contains at least two zeros. The gradient is zero for all + segment inputs. 
+ """ + data = op.inputs[0] + segment_ids = op.inputs[1] + is_zero = math_ops.equal(data, 0) + num_zeros = gen_math_ops.segment_sum( + math_ops.cast(is_zero, dtype=dtypes.int32), segment_ids) + # handle case 3 and set the gradient to 0 for segments with more than one + # 0 as input + grad = array_ops.where_v2( + math_ops.greater(num_zeros, 1), array_ops.zeros_like(grad), grad) + # replace all zeros with ones and compute the segment_prod + non_zero_data = array_ops.where_v2(is_zero, + array_ops.ones_like(data), + data) + non_zero_prod = gen_math_ops.segment_prod(non_zero_data, segment_ids) + gathered_prod = array_ops.gather(op.outputs[0], segment_ids) + gathered_non_zero_prod = array_ops.gather(non_zero_prod, segment_ids) + prod_divided_by_el = gathered_prod / data # May contain nan/inf. + # Now fetch the individual results for segments containing 0 and those that + # don't. + partial_derivative = array_ops.where_v2(is_zero, gathered_non_zero_prod, + prod_divided_by_el) + gathered_grad = array_ops.gather(grad, segment_ids) + return gathered_grad * partial_derivative, None + + def _GatherDropNegatives(params, ids, zero_clipped_indices=None, diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py index e856749f885..bbd30ef5537 100644 --- a/tensorflow/python/ops/math_grad_test.py +++ b/tensorflow/python/ops/math_grad_test.py @@ -374,6 +374,43 @@ class SegmentMinOrMaxGradientTest(test.TestCase): self.assertLess(error, 1e-4) +@test_util.run_all_in_graph_and_eager_modes +class SegmentProdGradientTest(test.TestCase): + + def _run_gradient_check(self, data, segment_ids): + + def _segment_prod(x): + return math_ops.segment_prod(x, segment_ids) + + err = gradient_checker_v2.max_error( + *gradient_checker_v2.compute_gradient(_segment_prod, [data])) + self.assertLess(err, 2e-4) + + def testSegmentProdGradientWithoutOverlap(self): + data = constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1], [5, 6, 7, 8]], + dtype=dtypes.float32) + segment_ids 
= constant_op.constant([0, 1, 2], dtype=dtypes.int64) + self._run_gradient_check(data, segment_ids) + + def testSegmentProdGradientWithoutZeros(self): + data = constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1], [5, 6, 7, 8]], + dtype=dtypes.float32) + segment_ids = constant_op.constant([0, 0, 1], dtype=dtypes.int64) + self._run_gradient_check(data, segment_ids) + + def testSegmentProdGradientWithZeros(self): + data = constant_op.constant([[0, 2, 3, 4], [0, 0, 2, 0], [5, 0, 7, 0]], + dtype=dtypes.float32) + segment_ids = constant_op.constant([0, 0, 1], dtype=dtypes.int64) + self._run_gradient_check(data, segment_ids) + + def testSegmentProdGradientWithEmptySegment(self): + data = constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1], [5, 6, 7, 8]], + dtype=dtypes.float32) + segment_ids = constant_op.constant([0, 0, 2], dtype=dtypes.int64) + self._run_gradient_check(data, segment_ids) + + class FloorModGradientTest(test.TestCase): @test_util.run_deprecated_v1 From 552580beb1b5488128053506a03730e3d1ba02ad Mon Sep 17 00:00:00 2001 From: Tzu-Wei Sung Date: Thu, 17 Sep 2020 14:05:34 -0700 Subject: [PATCH 0034/1447] Divide by non zero data --- tensorflow/python/ops/math_grad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py index 3f91ce6729f..a66c7be4195 100644 --- a/tensorflow/python/ops/math_grad.py +++ b/tensorflow/python/ops/math_grad.py @@ -447,7 +447,7 @@ def _SegmentProdGrad(op, grad): non_zero_prod = gen_math_ops.segment_prod(non_zero_data, segment_ids) gathered_prod = array_ops.gather(op.outputs[0], segment_ids) gathered_non_zero_prod = array_ops.gather(non_zero_prod, segment_ids) - prod_divided_by_el = gathered_prod / data # May contain nan/inf. + prod_divided_by_el = gathered_prod / non_zero_data # Now fetch the individual results for segments containing 0 and those that # don't. 
partial_derivative = array_ops.where_v2(is_zero, gathered_non_zero_prod, From c1784bd109c1a6f4123934847d543c7c09dbfaa7 Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Fri, 18 Sep 2020 13:05:34 +0100 Subject: [PATCH 0035/1447] Update BATCH_MATMUL operator version supported by the reference kernel --- tensorflow/lite/kernels/register_ref.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/register_ref.cc b/tensorflow/lite/kernels/register_ref.cc index c8fb46adb96..b9a5b13b477 100644 --- a/tensorflow/lite/kernels/register_ref.cc +++ b/tensorflow/lite/kernels/register_ref.cc @@ -447,7 +447,7 @@ BuiltinRefOpResolver::BuiltinRefOpResolver() { AddBuiltin(BuiltinOperator_DENSIFY, Register_DENSIFY()); AddBuiltin(BuiltinOperator_BATCH_MATMUL, Register_BATCH_MATMUL_REF(), /* min_version = */ 1, - /* max_version = */ 2); + /* max_version = */ 3); AddCustom("NumericVerify", tflite::ops::custom::Register_NUMERIC_VERIFY_REF()); // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that From 4b660b49225b49d62d023506aad248ba50cfe997 Mon Sep 17 00:00:00 2001 From: Zhangqiang Date: Mon, 21 Sep 2020 14:16:47 +0800 Subject: [PATCH 0036/1447] add support for mips64 platform --- tensorflow/BUILD | 6 ++++++ third_party/cpuinfo/BUILD.bazel | 6 ++++++ third_party/remote_config/remote_platform_configure.bzl | 2 ++ 3 files changed, 14 insertions(+) diff --git a/tensorflow/BUILD b/tensorflow/BUILD index f95a0691c79..29b222209b4 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -238,6 +238,12 @@ config_setting( visibility = ["//visibility:public"], ) +config_setting( + name = "linux_mips64", + values = {"cpu": "mips64"}, + visibility = ["//visibility:public"], +) + config_setting( name = "debug", values = { diff --git a/third_party/cpuinfo/BUILD.bazel b/third_party/cpuinfo/BUILD.bazel index 15cfcd1c4ee..9b007cc0daa 100644 --- a/third_party/cpuinfo/BUILD.bazel +++ b/third_party/cpuinfo/BUILD.bazel @@ -102,6 +102,7 
@@ cc_library( ":linux_armv7a": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS, ":linux_armeabi": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS, ":linux_aarch64": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM64_SRCS, + ":linux_mips64": COMMON_SRCS + LINUX_SRCS, ":macos_x86_64": COMMON_SRCS + X86_SRCS + MACH_SRCS + MACH_X86_SRCS, ":windows_x86_64": COMMON_SRCS + X86_SRCS + WINDOWS_X86_SRCS, ":android_armv7": COMMON_SRCS + ARM_SRCS + LINUX_SRCS + LINUX_ARM32_SRCS + ANDROID_ARM_SRCS, @@ -208,6 +209,11 @@ config_setting( values = {"cpu": "aarch64"}, ) +config_setting( + name = "linux_mips64", + values = {"cpu": "mips64"}, +) + config_setting( name = "macos_x86_64", values = { diff --git a/third_party/remote_config/remote_platform_configure.bzl b/third_party/remote_config/remote_platform_configure.bzl index 386ad603950..29520396905 100644 --- a/third_party/remote_config/remote_platform_configure.bzl +++ b/third_party/remote_config/remote_platform_configure.bzl @@ -22,6 +22,8 @@ def _remote_platform_configure_impl(repository_ctx): cpu = "aarch64" elif machine_type.startswith("arm"): cpu = "arm" + elif machine_type.startswith("mips64"): + cpu = "mips64" exec_properties = repository_ctx.attr.platform_exec_properties From 2d9820e8f00a0332471ba37eea148fde4d885a57 Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Mon, 21 Sep 2020 16:22:45 +0100 Subject: [PATCH 0037/1447] Fix typo in comment --- tensorflow/lite/kernels/concatenation.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/concatenation.cc b/tensorflow/lite/kernels/concatenation.cc index 5137b7c0e52..41630541504 100644 --- a/tensorflow/lite/kernels/concatenation.cc +++ b/tensorflow/lite/kernels/concatenation.cc @@ -100,7 +100,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { } if (input_type == kTfLiteInt16) { - // Make sure there all Int16 inputs have a null zero-point. 
+ // Make sure that all Int16 inputs have a null zero-point. for (int i = 0; i < node->inputs->size; ++i) { const TfLiteTensor* t = GetInput(context, node, i); TF_LITE_ENSURE_EQ(context, t->params.zero_point, 0); From 97c07fb738931d71f124fc982d7cb81c2487a69a Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Wed, 23 Sep 2020 01:00:26 +0200 Subject: [PATCH 0038/1447] Add transform to fuse activations into TFL pooling ops --- .../compiler/mlir/lite/tests/optimize.mlir | 20 +++++++++++++++++++ .../mlir/lite/transforms/optimize_patterns.td | 20 +++++++++++++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 8d64bc6ed0a..f115af99f47 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -26,6 +26,26 @@ func @fusedDepthwiseConv2dRelu6(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16 // CHECK: return %0 } +// CHECK-LABEL: fusedMaxPool2dRelu +func @fusedMaxPool2dRelu(%arg0: tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> { + %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + %1 = "tfl.relu"(%0) : (tensor<1x73x73x16xf32>) -> tensor<1x73x73x16xf32> + return %1 : tensor<1x73x73x16xf32> + + // CHECK: %0 = "tfl.max_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "RELU", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + // CHECK: return %0 +} + +// CHECK-LABEL: fusedAvgPool2dRelu1 +func @fusedAvgPool2dRelu1(%arg0: tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> { + %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "NONE", padding = 
"VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + %1 = "tfl.relu_n1_to_1"(%0) : (tensor<1x73x73x16xf32>) -> tensor<1x73x73x16xf32> + return %1 : tensor<1x73x73x16xf32> + + // CHECK: %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 3 : i32, filter_width = 3 : i32, fused_activation_function = "RELU_N1_TO_1", padding = "VALID", stride_h = 2 : i32, stride_w = 2 : i32} : (tensor<1x147x147x16xf32>) -> tensor<1x73x73x16xf32> + // CHECK: return %0 +} + // CHECK-LABEL: fuseAddIntoConv2d func @fuseAddIntoConv2d(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf32>) -> tensor<256x30x30x16xf32> { %cst = constant dense<1.5> : tensor<16xf32> diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 8243ed2a620..79de9b0b424 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -57,15 +57,31 @@ multiclass FuseActFnIntoConvOpPat { [(HasOneUse $conv_out)]>; } +multiclass FuseActFnIntoPoolOpPat { + def FuseActivationFuncWithAvgPool#ActFnOp#ActFnAttr : Pat< + (ActFnOp (TFL_AveragePool2DOp:$pool_out $input, $filter_height, + $filter_width, $padding, $stride_h, $stride_w, TFL_AF_None)), + (TFL_AveragePool2DOp $input, $filter_height, $filter_width, $padding, + $stride_h, $stride_w, ActFnAttr), + [(HasOneUse $pool_out)]>; + def FuseActivationFuncWithMaxPool#ActFnOp#ActFnAttr : Pat< + (ActFnOp (TFL_MaxPool2DOp:$pool_out $input, $padding, $stride_w, $stride_h, + $filter_width, $filter_height, TFL_AF_None)), + (TFL_MaxPool2DOp $input, $padding, $stride_w, $stride_h, + $filter_width, $filter_height, ActFnAttr), + [(HasOneUse $pool_out)]>; +} + // TODO(hinsu): Also fuse ops corresponding to SIGN_BIT fused // activation functions. 
// Currently we're not fusing tanh, sigmoid, hard_swish and other activations // those cannot be simply translated into clamping. foreach actFnPair = [[TFL_ReluOp, TFL_AF_Relu], [TFL_Relu6Op, TFL_AF_Relu6], - [TFL_Relu1Op, TFL_AF_Relu1]] in + [TFL_Relu1Op, TFL_AF_Relu1]] in { defm : FuseActFnIntoConvOpPat; - + defm : FuseActFnIntoPoolOpPat; +} class CanFuseConvOrDepthwiseConv : Constraint< CPred<"TFL::CanFuseConvOrDepthwiseConv($0, $1, " # is_depthwise # ")">>; From dff42b15e264bf86ab1eb4d81394de9aa2846333 Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Wed, 23 Sep 2020 17:05:53 +0100 Subject: [PATCH 0039/1447] All the inputs of the BATCH_MATMUL should be quantized by the type of the activation --- tensorflow/lite/tools/optimize/operator_property.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/tools/optimize/operator_property.cc b/tensorflow/lite/tools/optimize/operator_property.cc index ce56b186216..f29935f6eca 100644 --- a/tensorflow/lite/tools/optimize/operator_property.cc +++ b/tensorflow/lite/tools/optimize/operator_property.cc @@ -97,6 +97,7 @@ OperatorProperty GetOperatorProperty(const ModelT* model, int subgraph_index, property.inputs = {{0, {}}, {1, {}}}; property.outputs = {{0, {}}}; property.version = 2; + property.quantize_input_as_activations = true; break; } case BuiltinOperator_BATCH_TO_SPACE_ND: From 673befc6d6cf37f1f547bc6c6a07404d519da191 Mon Sep 17 00:00:00 2001 From: Mahmoud Abuzaina Date: Wed, 23 Sep 2020 13:25:23 -0700 Subject: [PATCH 0040/1447] Enbaling max pooling with native format --- .../core/common_runtime/mkl_layout_pass.cc | 25 +++-- .../core/kernels/mkl/mkl_maxpooling_op.cc | 95 ++++++++++++------- tensorflow/core/ops/mkl_nn_ops.cc | 91 ++++++++++++++++++ 3 files changed, 171 insertions(+), 40 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index ccbe9b17686..2843afb3ed1 100644 --- 
a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -2442,7 +2442,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T)); for (auto ws : wsinfo_) { if (orig_node->type_string() == ws.fwd_op && - mkl_op_registry::IsMklLayoutDependentOp( + mkl_op_registry::IsMklOp( mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { // If this op is a fwd op, then we need to check if there is an // edge from this node's fwd_slot to bwdop's bwd_slot. If there is @@ -2469,7 +2469,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( nb->Attr("workspace_enabled", false); } } else if (orig_node->type_string() == ws.bwd_op && - mkl_op_registry::IsMklLayoutDependentOp( + mkl_op_registry::IsMklOp( mkl_op_registry::GetMklOpName(orig_node->type_string()), T)) { // If this op is a bwd op, then we need to add workspace edge and @@ -2493,10 +2493,14 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( CHECK_NOTNULL(ws_tensors); // Add workspace edge between fwd op and bwd op. ws_tensors->push_back(NodeBuilder::NodeOut(e->src(), ws.ws_fwd_slot)); - // Add Mkl tensor edge for workspace edge between fwd op and bwd op. - ws_tensors->push_back(NodeBuilder::NodeOut( - e->src(), DataIndexToMetaDataIndex(ws.ws_fwd_slot, - e->src()->num_outputs()))); + // Check if we are running in native format mode. If so, + // we don't need to have an Mkl metadata tensor for the workspace. + if (!NativeFormatEnabled()) { + // Add Mkl tensor edge for workspace edge between fwd op and bwd op. 
+ ws_tensors->push_back(NodeBuilder::NodeOut( + e->src(), DataIndexToMetaDataIndex(ws.ws_fwd_slot, + e->src()->num_outputs()))); + } *are_ws_tensors_added = true; // In terms of input ordering, we add these calls to add Input // here because workspace edge (and its Mkl tensor) is the last @@ -3647,6 +3651,15 @@ Status MklLayoutRewritePass::RewriteNodeForJustOpNameChange( return s; } + std::vector workspace_tensors; + bool are_workspace_tensors_available = false; + AddWorkSpaceEdgeIfNeeded(g, orig_node, &nb, &workspace_tensors, + &are_workspace_tensors_available); + if (are_workspace_tensors_available) { + CHECK_EQ(workspace_tensors.size(), 1); + nb.Input(workspace_tensors[0].node, workspace_tensors[0].index); + } + if (!NativeFormatEnabled()) { ri->copy_attrs(const_cast(orig_node), &nb, true); } else { diff --git a/tensorflow/core/kernels/mkl/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl/mkl_maxpooling_op.cc index ca7ebd7fd12..276027eb56d 100644 --- a/tensorflow/core/kernels/mkl/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl/mkl_maxpooling_op.cc @@ -44,7 +44,7 @@ namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; // An implementation of MaxPooling (forward). -template +template class MklMaxPoolingOp : public MklPoolingForwardOpBase { public: explicit MklMaxPoolingOp(OpKernelConstruction* context) @@ -52,6 +52,7 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // In Max Pooling, MKL-DNN does not allow passing workspace as nullptr. // So we set workspace_enabled_ to true. 
this->workspace_enabled_ = true; + this->native_format_ = native_format; } void Compute(OpKernelContext* context) override { @@ -59,7 +60,8 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { const Tensor& input_tensor = MklGetInput(context, this->kInputTensorIndexInput); MklDnnShape dnn_shape_input; - GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input); + GetMklShape(context, this->kInputTensorIndexInput, &dnn_shape_input, + this->native_format_); this->SanityCheckInput(context, input_tensor, dnn_shape_input); if (!context->status().ok()) return; @@ -229,7 +231,7 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { workspace_tf_shape.AddDim(workspace_bytes); AllocateOutputSetMklShape(context, kOutputTensorIndexWorkspace, &workspace_tensor, workspace_tf_shape, - workspace_mkl_shape); + workspace_mkl_shape, this->native_format_); DCHECK(workspace_tensor); dnn_data_wksp->SetUsrMem(workspace_pd, workspace_tensor); } @@ -241,11 +243,13 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase { // - The original output tensor // - Backprop tensor for output // It produces one output: backprop tensor for input. 
-template +template class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { public: explicit MklMaxPoolingGradOp(OpKernelConstruction* context) - : MklPoolingBackwardOpBase(context) {} + : MklPoolingBackwardOpBase(context) { + this->native_format_ = native_format; + } void Compute(OpKernelContext* context) override { try { const Tensor& orig_input_tensor = @@ -255,8 +259,10 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { const Tensor& workspace_tensor = MklGetInput(context, kInputTensorIndexWorkspace); MklDnnShape orig_input_mkl_shape, grad_mkl_shape; - GetMklShape(context, kInputTensorIndexOrigInput, &orig_input_mkl_shape); - GetMklShape(context, kInputTensorIndexGradient, &grad_mkl_shape); + GetMklShape(context, kInputTensorIndexOrigInput, &orig_input_mkl_shape, + this->native_format_); + GetMklShape(context, kInputTensorIndexGradient, &grad_mkl_shape, + this->native_format_); if (!context->status().ok()) return; MklDnnData grad_dnn_data(&cpu_engine_); @@ -336,7 +342,8 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { pooling_bwd->GetPoolingBwdPd(); T* diff_dst_data = nullptr; if (IS_DIFF_DST_REORDER_NEEDED(diff_dst_md, pooling_bwd_pd, - pooling_bwd)) { + pooling_bwd) && + !this->native_format_) { grad_dnn_data.SetUsrMem(diff_dst_md, &grad_tensor); grad_dnn_data.CheckReorderToOpMem( MEMORY_PD_WITHOUT_DATA(GET_DIFF_DST_DESC_FROM_OP_PD(pooling_bwd_pd), @@ -389,36 +396,56 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { engine cpu_engine_ = engine(ENGINE_CPU, 0); }; // MklMaxPoolingGradOp -#define REGISTER_MKL_MAXPOOL3D_KERNELS(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklMaxPool3D") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklMaxPoolingOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklMaxPool3DGrad") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklMaxPoolingGradOp); +#define 
REGISTER_MKL_MAXPOOL3D_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklMaxPool3D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklMaxPoolingOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklMaxPool3DGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklMaxPoolingGradOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklNativeMaxPool3D") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklNameChangeOpLabel), \ + MklMaxPoolingOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklNativeMaxPool3DGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklNameChangeOpLabel), \ + MklMaxPoolingGradOp); TF_CALL_float(REGISTER_MKL_MAXPOOL3D_KERNELS); TF_CALL_bfloat16(REGISTER_MKL_MAXPOOL3D_KERNELS); -#define REGISTER_MKL_MAXPOOL_KERNELS(T) \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklMaxPool") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklMaxPoolingOp); \ - REGISTER_KERNEL_BUILDER( \ - Name("_MklMaxPoolGrad") \ - .Device(DEVICE_CPU) \ - .TypeConstraint("T") \ - .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ - MklMaxPoolingGradOp); +#define REGISTER_MKL_MAXPOOL_KERNELS(T) \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklMaxPool") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklMaxPoolingOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("_MklMaxPoolGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklLayoutDependentOpLabel), \ + MklMaxPoolingGradOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklNativeMaxPool") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + .Label(mkl_op_registry::kMklNameChangeOpLabel), \ + MklMaxPoolingOp); \ + REGISTER_KERNEL_BUILDER(Name("_MklNativeMaxPoolGrad") \ + .Device(DEVICE_CPU) \ + .TypeConstraint("T") \ + 
.Label(mkl_op_registry::kMklNameChangeOpLabel), \ + MklMaxPoolingGradOp); TF_CALL_float(REGISTER_MKL_MAXPOOL_KERNELS); TF_CALL_bfloat16(REGISTER_MKL_MAXPOOL_KERNELS); diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc index 248cf1d0e8a..63e21896795 100644 --- a/tensorflow/core/ops/mkl_nn_ops.cc +++ b/tensorflow/core/ops/mkl_nn_ops.cc @@ -169,6 +169,97 @@ REGISTER_OP("_MklPadWithFusedConv2D") is expected to create these operators. )doc"); +REGISTER_OP("_MklNativeMaxPool") + .Attr("T: {float, half, bfloat16} = DT_FLOAT") + .Attr("ksize: list(int) >= 4") + .Attr("strides: list(int) >= 4") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnetDataFormatAttrString()) + .Attr(GetExplicitPaddingsAttrString()) + .Attr("workspace_enabled: bool = false") + .Input("input: T") + .Output("output: T") + .Output("workspace: uint8") + .SetShapeFn(shape_inference::MaxPoolShape) + .Doc(R"doc( +MKL version of MaxPool operator that does not depend +on layout propagation. Uses oneDNN APIs to perform max pooling +on the input. + +*NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); + +REGISTER_OP("_MklNativeMaxPoolGrad") + .Attr("T: {float, half, bfloat16} = DT_FLOAT") + .Attr("ksize: list(int) >= 4") + .Attr("strides: list(int) >= 4") + .Attr("workspace_enabled: bool = false") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnetDataFormatAttrString()) + .Attr(GetExplicitPaddingsAttrString()) + .Input("orig_input: T") + .Input("orig_output: T") + .Input("grad: T") + .Input("workspace: uint8") + .Output("output: T") + .SetShapeFn([](InferenceContext* c) { + return UnchangedShapeWithRank(c, 4); + }) + .Doc(R"doc( +MKL version of MaxPoolGrad that does not depend +on layout propagation. Uses oneDNN APIs to compute gradients of +MaxPool operator. + +*NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. 
+)doc"); + +REGISTER_OP("_MklNativeMaxPool3D") + .Input("input: T") + .Output("output: T") + .Output("workspace: uint8") + .Attr("ksize: list(int) >= 5") + .Attr("strides: list(int) >= 5") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnet3dDataFormatAttrString()) + .Attr("T: {half, bfloat16, float}") + .Attr("workspace_enabled: bool = false") + .SetShapeFn(shape_inference::Pool3DShape) + .Doc(R"doc( +MKL version of MaxPool3D operator that does not depend +on layout propagation. Uses oneDNN APIs to perform 3D max pooling +on the input. + +*NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. +)doc"); + +REGISTER_OP("_MklNativeMaxPool3DGrad") + .Input("orig_input: TInput") + .Input("orig_output: TInput") + .Input("grad: T") + .Input("workspace: uint8") + .Output("output: T") + .Attr("ksize: list(int) >= 5") + .Attr("strides: list(int) >= 5") + .Attr(GetPaddingAttrString()) + .Attr(GetConvnet3dDataFormatAttrString()) + .Attr("T: {half, bfloat16, float} = DT_FLOAT") + .Attr("TInput: {half, bfloat16, float} = DT_FLOAT") + .Attr("workspace_enabled: bool = false") + .SetShapeFn([](InferenceContext* c) { + return UnchangedShapeWithRank(c, 5); + }) + .Doc(R"doc( +MKL version of MaxPool3DGrad operator that does not depend +on layout propagation. Uses oneDNN APIs to compute gradients +of MaxPool3D function. + +*NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. 
+)doc"); + REGISTER_OP("_MklQuantizedMaxPool") .Input("input: T") .Input("min_input: float") From 7a91a85c7e8e70dde9dce5f44bd8a0a4fffb855e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Tue, 22 Sep 2020 16:04:24 +0200 Subject: [PATCH 0041/1447] TFLu: Allocate scratch tensors even though they are not in subgraph input Change-Id: Iebe41ed254eeaef8e4ef6b2a2626291326c2a307 --- tensorflow/lite/micro/micro_allocator.cc | 26 ++---- tensorflow/lite/micro/micro_allocator_test.cc | 81 +++++++++++++++++-- 2 files changed, 79 insertions(+), 28 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator.cc b/tensorflow/lite/micro/micro_allocator.cc index edac0d5ae5e..88d4a3ebd45 100644 --- a/tensorflow/lite/micro/micro_allocator.cc +++ b/tensorflow/lite/micro/micro_allocator.cc @@ -242,21 +242,6 @@ TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, for (size_t n = 0; n < op->inputs()->size(); ++n) { const int tensor_index = op->inputs()->Get(n); AllocationInfo* current = &info_[tensor_index]; - - // TODO(b/166484865): Figure out a more general solution. - // This workaround is needed to handle situations where subgraph input != - // operator input. - // In case operator input(s) are not in subgraph inputs initialize them. - if (current->first_created == 0) { - for (size_t op_input = 0; op_input < op->inputs()->size(); ++op_input) { - const int op_tensor_index = op->inputs()->Get(op_input); - AllocationInfo* op_current = &info_[op_tensor_index]; - if (op_current->needs_allocating && op_current->first_created == -1) { - op_current->first_created = i; - } - } - } - if (((current->last_used == -1) || (current->last_used < i))) { current->last_used = i; } @@ -270,16 +255,15 @@ TfLiteStatus AllocationInfoBuilder::AddTensors(const SubGraph* subgraph, } } - // Work out which tensors need to be allocated. + // Sanity check for valid tensor lifetime. 
for (size_t i = 0; i < tensor_count_; ++i) { AllocationInfo* current = &info_[i]; - const bool is_read_only = + // Even though tensor appears to be read only it may still need to be + // allocated. + const bool appears_read_only = (current->first_created == -1) && (current->last_used != -1); - if (is_read_only) { - current->needs_allocating = false; - } const bool has_partial_lifetime = - !is_read_only && + !appears_read_only && ((current->first_created == -1) || (current->last_used == -1)); if (has_partial_lifetime && current->needs_allocating) { TF_LITE_REPORT_ERROR( diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index 1ac68443f1a..87484d0dfb4 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -749,14 +749,14 @@ TF_LITE_MICRO_TEST(TestOperatorInputsNotInSubgraphInputs) { 48, // t3 -1}; // t4 - int t0 = 0; - int t1 = 1; - int t2 = 2; - int t3 = 3; - int t4 = 4; + constexpr int t0 = 0; + constexpr int t1 = 1; + constexpr int t2 = 2; + constexpr int t3 = 3; + constexpr int t4 = 4; - int num_conns = 2; - tflite::testing::NodeConnection node_list[2] = { + constexpr int num_conns = 2; + tflite::testing::NodeConnection node_list[num_conns] = { { {t0, t1, t2}, // t0: input (actual input part of subgraph inputs as // well as operator inputs) @@ -795,4 +795,71 @@ TF_LITE_MICRO_TEST(TestOperatorInputsNotInSubgraphInputs) { TF_LITE_MICRO_EXPECT_EQ(0, eval_tensors[4].data.uint8 - start); } +TF_LITE_MICRO_TEST(TestTypicalFirstOpAndSecondOpWithScratchTensors) { + constexpr int nbr_tensors = 6; + tflite::AllOpsResolver op_resolver = tflite::testing::GetOpResolver(); + tflite::NodeAndRegistration* node_and_registration; + const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + + nbr_tensors] = { + 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors + // memory offsets: + 0, // t0 + 0, // t1 + 0, // t2 + 0, // t3 + 48, // t4 + -1}; 
// t5 + + constexpr int t0 = 0; + constexpr int t1 = 1; + constexpr int t2 = 2; + constexpr int t3 = 3; + constexpr int t4 = 4; + constexpr int t5 = 5; + + constexpr int num_conns = 3; + tflite::testing::NodeConnection node_list[num_conns] = { + { + {t0}, // t0: input (actual input part of subgraph inputs as + // well as operator inputs) + {t1} // t1: output + }, + { + {t1, t2, t3}, // t1: input + // t2: scratch1 (only in operator inputs) + // t3: scratch2 (only in operator inputs) + {t4} // t4: output + }, + { + {t4}, // input + {t5} // output + }, + }; + + const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( + nbr_tensors, metadata_buffer, node_list, num_conns, + 1 /* only first tensor (t0) is in subgraph input list*/); + + TfLiteEvalTensor* eval_tensors = nullptr; + constexpr size_t arena_size = 4096; + uint8_t arena[arena_size]; + tflite::MicroAllocator* allocator = + tflite::MicroAllocator::Create(arena, arena_size, micro_test::reporter); + + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, + allocator->StartModelAllocation(model, op_resolver, + &node_and_registration, &eval_tensors)); + TF_LITE_MICRO_EXPECT_EQ( + kTfLiteOk, allocator->FinishModelAllocation(model, eval_tensors)); + + uint8_t* start = eval_tensors[0].data.uint8; + TF_LITE_MICRO_EXPECT_EQ(0, eval_tensors[0].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(0, eval_tensors[1].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(0, eval_tensors[2].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(0, eval_tensors[3].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(48, eval_tensors[4].data.uint8 - start); + TF_LITE_MICRO_EXPECT_EQ(0, eval_tensors[5].data.uint8 - start); +} + TF_LITE_MICRO_TESTS_END From e06c862dd1ce4cfe964ba17c67e0064350585812 Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Thu, 24 Sep 2020 16:36:37 +0100 Subject: [PATCH 0042/1447] Remove unnecessary versioning of kBatchMatMul in deprecated file --- tensorflow/lite/toco/tflite/op_version.cc | 1 - 1 file changed, 1 
deletion(-) diff --git a/tensorflow/lite/toco/tflite/op_version.cc b/tensorflow/lite/toco/tflite/op_version.cc index 053ee9cf02a..72fc4eea1e7 100644 --- a/tensorflow/lite/toco/tflite/op_version.cc +++ b/tensorflow/lite/toco/tflite/op_version.cc @@ -66,7 +66,6 @@ std::string GetMinimumRuntimeVersionForModel(const Model& model) { {{OperatorType::kBatchToSpaceND, 1}, "1.6.0"}, {{OperatorType::kBatchToSpaceND, 2}, "1.14.0"}, {{OperatorType::kBatchMatMul, 1}, kPendingReleaseOpVersion}, - {{OperatorType::kBatchMatMul, 3}, kPendingReleaseOpVersion}, {{OperatorType::kCast, 1}, "1.5.0"}, {{OperatorType::kConcatenation, 1}, "1.5.0"}, {{OperatorType::kConcatenation, 2}, "1.14.0"}, From d234e832a7c85bdd2d1a7baf649d0f74855d7579 Mon Sep 17 00:00:00 2001 From: "ag.ramesh" Date: Fri, 25 Sep 2020 21:23:10 -0700 Subject: [PATCH 0043/1447] Removed MKL binary blob and replaced openmp with opensource lib. --- tensorflow/tensorflow.bzl | 3 +- tensorflow/workspace.bzl | 22 ++--- third_party/llvm-openmp/BUILD | 124 ++++++++++++++++++++++++++ third_party/llvm/BUILD | 2 +- third_party/llvm/expand_cmake_vars.py | 3 +- third_party/llvm/llvm.bzl | 14 +-- third_party/mkl/BUILD | 13 ++- third_party/mkl/mkl.BUILD | 9 -- 8 files changed, 157 insertions(+), 33 deletions(-) create mode 100644 third_party/llvm-openmp/BUILD diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl index 8ed12136c55..a364143c935 100644 --- a/tensorflow/tensorflow.bzl +++ b/tensorflow/tensorflow.bzl @@ -331,8 +331,7 @@ def tf_copts( if_tpu(["-DLIBTFTPU"]) + if_xla_available(["-DTENSORFLOW_USE_XLA=1"]) + if_tensorrt(["-DGOOGLE_TENSORRT=1"]) + - if_mkl(["-DINTEL_MKL=1", "-DENABLE_MKLDNN_V1", "-DENABLE_INTEL_MKL_BFLOAT16"]) + - if_mkl_open_source_only(["-DINTEL_MKL_DNN_ONLY"]) + + if_mkl(["-DINTEL_MKL=1", "-DENABLE_MKLDNN_V1", "-DENABLE_INTEL_MKL_BFLOAT16","-DINTEL_MKL_DNN_ONLY"]) + if_mkldnn_threadpool(["-DENABLE_MKLDNN_THREADPOOL"]) + if_enable_mkl(["-DENABLE_MKL"]) + if_ngraph(["-DINTEL_NGRAPH=1"]) + diff --git 
a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index eafc52ad549..78dd4b4576d 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -125,16 +125,6 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): armhf_repo = "../armhf_linux_toolchain", ) - mkl_repository( - name = "mkl_linux", - build_file = clean_dep("//third_party/mkl:mkl.BUILD"), - sha256 = "a936d6b277a33d2a027a024ea8e65df62bd2e162c7ca52c48486ed9d5dc27160", - strip_prefix = "mklml_lnx_2019.0.5.20190502", - urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/intel/mkl-dnn/releases/download/v0.21/mklml_lnx_2019.0.5.20190502.tgz", - "https://github.com/intel/mkl-dnn/releases/download/v0.21/mklml_lnx_2019.0.5.20190502.tgz", - ], - ) mkl_repository( name = "mkl_windows", build_file = clean_dep("//third_party/mkl:mkl.BUILD"), @@ -730,6 +720,18 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): }, ) + # Intel openMP that is part of LLVM sources. + tf_http_archive( + name = "llvm-openmp", + build_file = clean_dep("//third_party/llvm-openmp:BUILD"), + sha256 = "d19f728c8e04fb1e94566c8d76aef50ec926cd2f95ef3bf1e0a5de4909b28b44", + strip_prefix = "openmp-10.0.1.src", + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/releases/download/llvmorg-10.0.1/openmp-10.0.1.src.tar.xz", + "https://github.com/llvm/llvm-project/releases/download/llvmorg-10.0.1/openmp-10.0.1.src.tar.xz", + ], + ) + tf_http_archive( name = "lmdb", build_file = clean_dep("//third_party:lmdb.BUILD"), diff --git a/third_party/llvm-openmp/BUILD b/third_party/llvm-openmp/BUILD new file mode 100644 index 00000000000..ee355768777 --- /dev/null +++ b/third_party/llvm-openmp/BUILD @@ -0,0 +1,124 @@ +# Build file for OpenMP library that is part of llvm + +exports_files(["LICENSE.txt"]) + +load( + "@org_tensorflow//third_party/llvm:llvm.bzl", + "cmake_var_string", + "dict_add", + "expand_cmake_vars", +) +load( + 
"@org_tensorflow//third_party:common.bzl", + "template_rule", +) + +genrule( + name = "il8n_id", + srcs = [ + "runtime/tools/message-converter.pl", + "runtime/src/i18n/en_US.txt", + ], + outs = ["include/kmp_i18n_id.inc"], + cmd = "$(location runtime/tools/message-converter.pl) --os=lin --prefix=kmp_i18n --enum=$@ $(location runtime/src/i18n/en_US.txt)", +) + +genrule( + name = "kmp_i18n_default", + srcs = [ + "runtime/tools/message-converter.pl", + "runtime/src/i18n/en_US.txt", + ], + outs = ["include/kmp_i18n_default.inc"], + cmd = "perl $(location runtime/tools/message-converter.pl) --os=lin --prefix=kmp_i18n --default=$@ $(location runtime/src/i18n/en_US.txt)", +) + +# Bazel doesn't accept .txt as an input, rename the ldscript to .inc to workaround. +genrule( + name = "ldscript", + srcs = ["runtime/src/exports_so.txt"], + outs = ["exports_so.inc"], + cmd = "cp $(location runtime/src/exports_so.txt) $@", +) + +# Cmake vars to replace. +omp_vars = { + "LIBOMP_USE_VERSION_SYMBOLS": 1, + "LIBOMP_HAVE_WEAK_ATTRIBUTE": 1, + "LIBOMP_USE_ADAPTIVE_LOCKS": 1, + "LIBOMP_ENABLE_ASSERTIONS": 1, + "LIBOMP_ENABLE_SHARED": 1, + "LIBOMP_LEGAL_ARCH": "Intel(R) 64", + "LIBOMP_LIB_FILE": "libiomp5.so", + "LIBOMP_VERSION_MAJOR": 5, + "LIBOMP_VERSION_MINOR": 0, +} + +omp_all_cmake_vars = cmake_var_string(dict_add(omp_vars)) + +expand_cmake_vars( + name = "config_kmp", + src = "runtime/src/kmp_config.h.cmake", + cmake_vars = omp_all_cmake_vars, + dst = "include/kmp_config.h", +) + +expand_cmake_vars( + name = "config_omp", + src = "runtime/src/include/omp.h.var", + cmake_vars = omp_all_cmake_vars, + dst = "include/omp.h", +) + +cc_binary( + name = "libiomp5.so", + srcs = glob([ + "runtime/src/*.h", + "runtime/src/kmp_alloc.cpp", + "runtime/src/kmp_atomic.cpp", + "runtime/src/kmp_csupport.cpp", + "runtime/src/kmp_debug.cpp", + "runtime/src/kmp_itt.cpp", + "runtime/src/kmp_environment.cpp", + "runtime/src/kmp_error.cpp", + "runtime/src/kmp_global.cpp", + "runtime/src/kmp_i18n.cpp", + 
"runtime/src/kmp_io.cpp", + "runtime/src/kmp_runtime.cpp", + "runtime/src/kmp_settings.cpp", + "runtime/src/kmp_str.cpp", + "runtime/src/kmp_tasking.cpp", + "runtime/src/kmp_threadprivate.cpp", + "runtime/src/kmp_utility.cpp", + "runtime/src/kmp_barrier.cpp", + "runtime/src/kmp_wait_release.cpp", + "runtime/src/kmp_affinity.cpp", + "runtime/src/kmp_dispatch.cpp", + "runtime/src/kmp_lock.cpp", + "runtime/src/kmp_sched.cpp", + "runtime/src/kmp_taskdeps.cpp", + "runtime/src/kmp_cancel.cpp", + "runtime/src/kmp_ftn_cdecl.cpp", + "runtime/src/kmp_ftn_extra.cpp", + "runtime/src/kmp_version.cpp", + + #linux specific files + "runtime/src/z_Linux_util.cpp", + "runtime/src/kmp_gsupport.cpp", + "runtime/src/z_Linux_asm.S", + ]) + [ + ":config_kmp", + ":config_omp", + ":il8n_id", + ":kmp_i18n_default", + ":ldscript", + ], + copts = ["-Domp_EXPORTS -D_GNU_SOURCE -D_REENTRANT"], + includes = [ + "include/", + "runtime/src/", + ], + linkopts = ["-lpthread -ldl -Wl,--version-script=$(location :ldscript)"], + linkshared = True, + visibility = ["//visibility:public"], +) diff --git a/third_party/llvm/BUILD b/third_party/llvm/BUILD index 1a5634a6285..f2e079aad76 100644 --- a/third_party/llvm/BUILD +++ b/third_party/llvm/BUILD @@ -2,5 +2,5 @@ py_binary( name = "expand_cmake_vars", srcs = ["expand_cmake_vars.py"], srcs_version = "PY2AND3", - visibility = ["@llvm-project//:__subpackages__"], + visibility = ["//visibility:public"], ) diff --git a/third_party/llvm/expand_cmake_vars.py b/third_party/llvm/expand_cmake_vars.py index ffc6a255fd1..73f071aa504 100644 --- a/third_party/llvm/expand_cmake_vars.py +++ b/third_party/llvm/expand_cmake_vars.py @@ -25,6 +25,7 @@ import sys _CMAKE_DEFINE_REGEX = re.compile(r"\s*#cmakedefine\s+([A-Za-z_0-9]*)(\s.*)?$") _CMAKE_DEFINE01_REGEX = re.compile(r"\s*#cmakedefine01\s+([A-Za-z_0-9]*)") _CMAKE_VAR_REGEX = re.compile(r"\${([A-Za-z_0-9]*)}") +_CMAKE_ATVAR_REGEX = re.compile(r"\@([A-Za-z_0-9]*)@") def _parse_args(argv): @@ -50,7 +51,7 @@ def 
_expand_variables(input_str, cmake_vars): if match.group(1) in cmake_vars: return cmake_vars[match.group(1)] return "" - return _CMAKE_VAR_REGEX.sub(replace, input_str) + return _CMAKE_ATVAR_REGEX.sub(replace,_CMAKE_VAR_REGEX.sub(replace, input_str)) def _expand_cmakedefines(line, cmake_vars): diff --git a/third_party/llvm/llvm.bzl b/third_party/llvm/llvm.bzl index dcbaab9edd4..c2be9dca302 100644 --- a/third_party/llvm/llvm.bzl +++ b/third_party/llvm/llvm.bzl @@ -7,7 +7,7 @@ TODO(chandlerc): Currently this expresses include-based dependencies as correctly understood by the build system. """ -def _dict_add(*dictionaries): +def dict_add(*dictionaries): """Returns a new `dict` that has all the entries of the given dictionaries. If the same key is present in more than one of the input dictionaries, the @@ -305,7 +305,7 @@ win32_cmake_vars = { # than hardcoding x86_64. llvm_all_cmake_vars = select({ "@org_tensorflow//tensorflow:macos": cmake_var_string( - _dict_add( + dict_add( cmake_vars, llvm_target_cmake_vars("X86", "x86_64-apple-darwin"), posix_cmake_vars, @@ -313,7 +313,7 @@ llvm_all_cmake_vars = select({ ), ), "@org_tensorflow//tensorflow:linux_ppc64le": cmake_var_string( - _dict_add( + dict_add( cmake_vars, llvm_target_cmake_vars("PowerPC", "powerpc64le-unknown-linux_gnu"), posix_cmake_vars, @@ -321,21 +321,21 @@ llvm_all_cmake_vars = select({ ), ), "@org_tensorflow//tensorflow:windows": cmake_var_string( - _dict_add( + dict_add( cmake_vars, llvm_target_cmake_vars("X86", "x86_64-pc-win32"), win32_cmake_vars, ), ), "@org_tensorflow//tensorflow:freebsd": cmake_var_string( - _dict_add( + dict_add( cmake_vars, llvm_target_cmake_vars("X86", "x86_64-unknown-freebsd"), posix_cmake_vars, ), ), "@org_tensorflow//tensorflow:linux_s390x": cmake_var_string( - _dict_add( + dict_add( cmake_vars, llvm_target_cmake_vars("SystemZ", "systemz-unknown-linux_gnu"), posix_cmake_vars, @@ -343,7 +343,7 @@ llvm_all_cmake_vars = select({ ), ), "//conditions:default": cmake_var_string( - 
_dict_add( + dict_add( cmake_vars, llvm_target_cmake_vars("X86", "x86_64-unknown-linux_gnu"), posix_cmake_vars, diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD index 66a2bf8ceb9..5cee0514a06 100644 --- a/third_party/mkl/BUILD +++ b/third_party/mkl/BUILD @@ -34,7 +34,7 @@ filegroup( name = "LICENSE", srcs = ["MKL_LICENSE"] + select({ "@org_tensorflow//tensorflow:linux_x86_64": [ - "@mkl_linux//:LICENSE", + "@llvm-openmp//:LICENSE.txt", ], "@org_tensorflow//tensorflow:macos": [ "@mkl_darwin//:LICENSE", @@ -47,13 +47,20 @@ filegroup( visibility = ["//visibility:public"], ) +cc_library( + name = "mkl_libs_linux", + srcs = [ + "@llvm-openmp//:libiomp5.so", + ], + visibility = ["//visibility:public"], +) + cc_library( name = "intel_binary_blob", visibility = ["//visibility:public"], deps = select({ "@org_tensorflow//tensorflow:linux_x86_64": [ - "@mkl_linux//:mkl_headers", - "@mkl_linux//:mkl_libs_linux", + ":mkl_libs_linux", ], "@org_tensorflow//tensorflow:macos": [ "@mkl_darwin//:mkl_headers", diff --git a/third_party/mkl/mkl.BUILD b/third_party/mkl/mkl.BUILD index 72370182c41..32d2965780f 100644 --- a/third_party/mkl/mkl.BUILD +++ b/third_party/mkl/mkl.BUILD @@ -17,15 +17,6 @@ cc_library( visibility = ["//visibility:public"], ) -cc_library( - name = "mkl_libs_linux", - srcs = [ - "lib/libiomp5.so", - "lib/libmklml_intel.so", - ], - visibility = ["//visibility:public"], -) - cc_library( name = "mkl_libs_darwin", srcs = [ From 286195f4221687a983887b496fde1c0ed58811a4 Mon Sep 17 00:00:00 2001 From: Thibaut Goetghebuer-Planchon Date: Mon, 28 Sep 2020 17:50:46 +0100 Subject: [PATCH 0044/1447] Fix compilation error, lhs should be used and not rhs --- tensorflow/lite/kernels/batch_matmul.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/batch_matmul.cc b/tensorflow/lite/kernels/batch_matmul.cc index a4a8fe00ffd..5f6afa3d14f 100644 --- a/tensorflow/lite/kernels/batch_matmul.cc +++ b/tensorflow/lite/kernels/batch_matmul.cc 
@@ -627,7 +627,7 @@ TfLiteTensor* GetTempLhs(TfLiteContext* context, TfLiteNode* node, return nullptr; } - if (lhs->type == kTfLiteInt8 || rhs->type == kTfLiteInt16) { + if (lhs->type == kTfLiteInt8 || lhs->type == kTfLiteInt16) { // Get the quantization params from the LHS tensor. transposed_lhs->params.scale = lhs->params.scale; transposed_lhs->params.zero_point = lhs->params.zero_point; From b03ae6de78599b6595f24cdf5bc7ae4c1e416aeb Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Tue, 29 Sep 2020 16:02:09 +1000 Subject: [PATCH 0045/1447] Replace CreateBlasLtMatmulPlan args with struct --- .../core/kernels/batch_matmul_op_impl.h | 30 ++++---- tensorflow/stream_executor/blas.h | 52 ++++++------- tensorflow/stream_executor/cuda/cuda_blas.cc | 77 +++++++------------ .../stream_executor/stream_executor_pimpl.cc | 26 +------ .../stream_executor/stream_executor_pimpl.h | 14 +--- 5 files changed, 72 insertions(+), 127 deletions(-) diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h index 456b4beff1e..ac5a45b99ba 100644 --- a/tensorflow/core/kernels/batch_matmul_op_impl.h +++ b/tensorflow/core/kernels/batch_matmul_op_impl.h @@ -555,23 +555,23 @@ struct LaunchBatchMatMul { GetBlasComputationType(dtype, allow_tf32, &computation_type), errors::Internal("Unsupported dtype for batched matmul")); std::unique_ptr plan = - stream->parent()->CreateBlasLtMatmulPlanStridedBatched( - /*ab_type=*/blas_dtype, - /*cd_type=*/blas_dtype, computation_type, - se::blas::PointerMode::kHost, se::blas::Epilogue::kDefault, - blas_transpose_b, blas_transpose_a, n, m, k, batch_size, - /*lda=*/in_y.dim_size(2), b_stride, - /*ldb=*/in_x.dim_size(2), a_stride, /*ldc=*/n, c_stride); + stream->parent()->CreateBlasLtMatmulPlan( + {/*ab_type=*/blas_dtype, + /*c_type=*/blas_dtype, computation_type, + se::blas::PointerMode::kHost, se::blas::Epilogue::kDefault, + blas_transpose_b, blas_transpose_a, n, m, k, + /*lda=*/in_y.dim_size(2), 
/*ldb=*/in_x.dim_size(2), /*ldc=*/n, + batch_size, b_stride, a_stride, c_stride}); OP_REQUIRES( context, plan, - errors::Internal( - "CreateBlasLtMatmulPlanStridedBatched failed : a.shape=(", - in_x.dim_size(0), ", ", in_x.dim_size(1), ", ", - in_x.dim_size(2), "), b.shape=(", in_y.dim_size(0), ", ", - in_y.dim_size(1), ", ", in_y.dim_size(2), "), m=", m, ", n=", n, - ", k=", k, ", batch_size=", batch_size, ", adjoint_a=", adj_x, - ", adjoint_b=", adj_x, ", dtype=", dtype, - ", computation_type=", computation_type)); + errors::Internal("CreateBlasLtMatmulPlan failed : a.shape=(", + in_x.dim_size(0), ", ", in_x.dim_size(1), ", ", + in_x.dim_size(2), "), b.shape=(", in_y.dim_size(0), + ", ", in_y.dim_size(1), ", ", in_y.dim_size(2), + "), m=", m, ", n=", n, ", k=", k, + ", batch_size=", batch_size, ", adjoint_a=", adj_x, + ", adjoint_b=", adj_x, ", dtype=", dtype, + ", computation_type=", computation_type)); std::vector> algorithms; OP_REQUIRES( diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index ae5b4853d05..411f6f11275 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -242,6 +242,27 @@ struct IBlasLtMatmulAlgorithm { virtual size_t workspace_size() const = 0; }; +// Parameters for the CreateBlasLtMatmulPlan method. +struct BlasLtMatmulPlanParams { + DataType ab_type; + DataType c_type; + ComputationType computation_type; + PointerMode pointer_mode; + Epilogue epilogue; + Transpose transa; + Transpose transb; + uint64 m; + uint64 n; + uint64 k; + int64 lda; + int64 ldb; + int64 ldc; + int batch_count = 1; + int64 stride_a = 0; + int64 stride_b = 0; + int64 stride_c = 0; +}; + // BLAS support interface -- this can be derived from a GPU executor when the // underlying platform has an BLAS library implementation available. See // StreamExecutor::AsBlas(). @@ -1466,25 +1487,8 @@ class BlasSupport { // can then be passed to DoBlasLtMatmul(). 
When possible, plans should be // created once and reused for multiple calls to DoBlasLtMatmul(). // Returns a null pointer on failure. - std::unique_ptr CreateBlasLtMatmulPlan( - blas::DataType ab_type, blas::DataType c_type, - blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, - uint64 m, uint64 n, uint64 k, int64 lda, int64 ldb, int64 ldc) { - return CreateBlasLtMatmulPlanStridedBatched( - ab_type, c_type, computation_type, pointer_mode, epilogue, transa, - transb, m, n, k, 1, lda, 0, ldb, 0, ldc, 0); - } - - // A more general version of CreateBlasLtMatmulPlan supporting - // batched operations. - virtual std::unique_ptr - CreateBlasLtMatmulPlanStridedBatched( - blas::DataType ab_type, blas::DataType c_type, - blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, - uint64 m, uint64 n, uint64 k, int batch_count, int64 lda, int64 stride_a, - int64 ldb, int64 stride_b, int64 ldc, int64 stride_c) = 0; + virtual std::unique_ptr CreateBlasLtMatmulPlan( + const blas::BlasLtMatmulPlanParams& params) = 0; // Gets a list of supported algorithms for DoBlasLtMatmul. 
The algorithms are // returned in the order of increasing estimated compute time according to an @@ -2372,14 +2376,8 @@ class BlasSupport { uint64 n, std::complex alpha, \ const DeviceMemory> &a, int lda, \ DeviceMemory> *b, int ldb) override; \ - std::unique_ptr \ - CreateBlasLtMatmulPlanStridedBatched( \ - blas::DataType ab_type, blas::DataType cd_type, \ - blas::ComputationType computation_type, blas::PointerMode pointer_mode, \ - blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, \ - uint64 m, uint64 n, uint64 k, int batch_count, int64 lda, \ - int64 stride_a, int64 ldb, int64 stride_b, int64 ldc, int64 stride_c) \ - override; \ + std::unique_ptr CreateBlasLtMatmulPlan( \ + const blas::BlasLtMatmulPlanParams& params) override; \ bool GetBlasLtMatmulAlgorithms( \ const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, \ int max_algorithm_count, \ diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 1d95b00ce7e..f2bc79e1c29 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -3231,13 +3231,7 @@ blas::ComputationType ToComputationType>() { class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { public: - CUDABlasLtMatmulPlan(blas::DataType ab_type, blas::DataType cd_type, - blas::ComputationType compute_type, - blas::PointerMode pointer_mode, blas::Epilogue epilogue, - blas::Transpose transa, blas::Transpose transb, uint64 m, - uint64 n, uint64 k, int batch_count, int64 lda, - int64 stride_a, int64 ldb, int64 stride_b, int64 ldc, - int64 stride_c, int64 ldd, int64 stride_d); + CUDABlasLtMatmulPlan(const blas::BlasLtMatmulPlanParams& params); cublasLtMatmulDesc_t op_desc() const { return op_desc_.get(); } cublasLtMatrixLayout_t a_desc() const { return a_desc_.get(); } @@ -3280,39 +3274,34 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { }; CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan( - 
blas::DataType ab_type, blas::DataType cd_type, - blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, - uint64 m, uint64 n, uint64 k, int batch_count, int64 lda, int64 stride_a, - int64 ldb, int64 stride_b, int64 ldc, int64 stride_c, int64 ldd, - int64 stride_d) + const blas::BlasLtMatmulPlanParams& p) : op_desc_(CreateCublasLtOperationDesc( - computation_type, GetScaleType(cd_type, computation_type), - pointer_mode, epilogue, transa, transb)), + p.computation_type, GetScaleType(p.c_type, p.computation_type), + p.pointer_mode, p.epilogue, p.transa, p.transb)), a_desc_(nullptr), b_desc_(nullptr), - c_desc_( - CreateCublasLtLayoutDesc(cd_type, m, n, ldc, stride_c, batch_count)), - d_desc_( - CreateCublasLtLayoutDesc(cd_type, m, n, ldd, stride_d, batch_count)), - ab_type_(ab_type), - cd_type_(cd_type), - scale_type_(GetScaleType(cd_type, computation_type)), - pointer_mode_(pointer_mode), - epilogue_(epilogue), - batch_count_(batch_count), - stride_a_(stride_a), - stride_b_(stride_b), - stride_c_(stride_c), - stride_d_(stride_d) { - uint64 rows_a = transa == blas::Transpose::kNoTranspose ? m : k; - uint64 cols_a = transa == blas::Transpose::kNoTranspose ? k : m; - uint64 rows_b = transb == blas::Transpose::kNoTranspose ? k : n; - uint64 cols_b = transb == blas::Transpose::kNoTranspose ? 
n : k; - a_desc_ = CreateCublasLtLayoutDesc(ab_type, rows_a, cols_a, lda, stride_a, - batch_count); - b_desc_ = CreateCublasLtLayoutDesc(ab_type, rows_b, cols_b, ldb, stride_b, - batch_count); + c_desc_(CreateCublasLtLayoutDesc(p.c_type, p.m, p.n, p.ldc, p.stride_c, + p.batch_count)), + d_desc_(CreateCublasLtLayoutDesc(p.c_type, p.m, p.n, p.ldc, p.stride_c, + p.batch_count)), + ab_type_(p.ab_type), + cd_type_(p.c_type), + scale_type_(GetScaleType(p.c_type, p.computation_type)), + pointer_mode_(p.pointer_mode), + epilogue_(p.epilogue), + batch_count_(p.batch_count), + stride_a_(p.stride_a), + stride_b_(p.stride_b), + stride_c_(p.stride_c), + stride_d_(p.stride_c) { + uint64 rows_a = p.transa == blas::Transpose::kNoTranspose ? p.m : p.k; + uint64 cols_a = p.transa == blas::Transpose::kNoTranspose ? p.k : p.m; + uint64 rows_b = p.transb == blas::Transpose::kNoTranspose ? p.k : p.n; + uint64 cols_b = p.transb == blas::Transpose::kNoTranspose ? p.n : p.k; + a_desc_ = CreateCublasLtLayoutDesc(p.ab_type, rows_a, cols_a, p.lda, + p.stride_a, p.batch_count); + b_desc_ = CreateCublasLtLayoutDesc(p.ab_type, rows_b, cols_b, p.ldb, + p.stride_b, p.batch_count); } bool CUDABlasLtMatmulPlan::SetBiasPointer(const void* bias) const { @@ -3395,18 +3384,10 @@ UniqueMatmulPreference CreateCublasLtMatmulPreference( #endif // CUDA_VERSION >= 11000 -std::unique_ptr -CUDABlas::CreateBlasLtMatmulPlanStridedBatched( - blas::DataType ab_type, blas::DataType cd_type, - blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, - uint64 m, uint64 n, uint64 k, int batch_count, int64 lda, int64 stride_a, - int64 ldb, int64 stride_b, int64 ldc, int64 stride_c) { +std::unique_ptr CUDABlas::CreateBlasLtMatmulPlan( + const blas::BlasLtMatmulPlanParams& params) { #if CUDA_VERSION >= 11000 - auto result = std::make_unique( - ab_type, cd_type, computation_type, pointer_mode, epilogue, transa, - transb, m, n, k, 
batch_count, lda, stride_a, ldb, stride_b, ldc, stride_c, - ldc, stride_c); + auto result = std::make_unique(params); if (!result->ok()) { result.reset(); } diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index d75c1bc65c5..d40b6adc285 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -337,34 +337,12 @@ bool StreamExecutor::GetBlasGemmAlgorithms( } std::unique_ptr StreamExecutor::CreateBlasLtMatmulPlan( - blas::DataType ab_type, blas::DataType cd_type, - blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, - uint64 m, uint64 n, uint64 k, int64 lda, int64 ldb, int64 ldc) { + const blas::BlasLtMatmulPlanParams& params) { blas::BlasSupport *blas_support = AsBlas(); if (!blas_support) { return nullptr; } - return blas_support->CreateBlasLtMatmulPlan( - ab_type, cd_type, computation_type, pointer_mode, epilogue, transa, - transb, m, n, k, lda, ldb, ldc); -} - -std::unique_ptr -StreamExecutor::CreateBlasLtMatmulPlanStridedBatched( - blas::DataType ab_type, blas::DataType cd_type, - blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, - uint64 m, uint64 n, uint64 k, uint64 batch_count, int64 lda, int64 stride_a, - int64 ldb, int64 stride_b, int64 ldc, int64 stride_c) { - blas::BlasSupport *blas_support = AsBlas(); - if (!blas_support) { - return nullptr; - } - return blas_support->CreateBlasLtMatmulPlanStridedBatched( - ab_type, cd_type, computation_type, pointer_mode, epilogue, transa, - transb, m, n, k, batch_count, lda, stride_a, ldb, stride_b, ldc, - stride_c); + return blas_support->CreateBlasLtMatmulPlan(params); } bool StreamExecutor::GetBlasLtMatmulAlgorithms( diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h 
b/tensorflow/stream_executor/stream_executor_pimpl.h index b40c0c23c05..ce801bf0f28 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -399,19 +399,7 @@ class StreamExecutor { // created once and reused for multiple calls to DoBlasLtMatmul(). // Returns a null pointer on failure. std::unique_ptr CreateBlasLtMatmulPlan( - blas::DataType ab_type, blas::DataType cd_type, - blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, - uint64 m, uint64 n, uint64 k, int64 lda, int64 ldb, int64 ldc); - - // A more general version of CreateBlasLtMatmulPlan supporting - // batched operations. - std::unique_ptr CreateBlasLtMatmulPlanStridedBatched( - blas::DataType ab_type, blas::DataType cd_type, - blas::ComputationType computation_type, blas::PointerMode pointer_mode, - blas::Epilogue epilogue, blas::Transpose transa, blas::Transpose transb, - uint64 m, uint64 n, uint64 k, uint64 batch_count, int64 lda, - int64 stride_a, int64 ldb, int64 stride_b, int64 ldc, int64 stride_c); + const blas::BlasLtMatmulPlanParams& params); // Gets a list of supported algorithms for DoBlasLtMatmul. 
The algorithms are // returned in the order of increasing estimated compute time according to an From 6186c0936704539e2d4ac4d2f216799be21ea997 Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Tue, 29 Sep 2020 16:07:57 +1000 Subject: [PATCH 0046/1447] Use (+generalize) existing dnn::DataType in blas:: --- tensorflow/stream_executor/blas.cc | 14 +++--- tensorflow/stream_executor/blas.h | 45 ++------------------ tensorflow/stream_executor/cuda/cuda_blas.cc | 42 +++++++++--------- tensorflow/stream_executor/dnn.h | 8 ++++ tensorflow/stream_executor/dnn.proto | 2 + 5 files changed, 41 insertions(+), 70 deletions(-) diff --git a/tensorflow/stream_executor/blas.cc b/tensorflow/stream_executor/blas.cc index f55e318e88b..ca597595969 100644 --- a/tensorflow/stream_executor/blas.cc +++ b/tensorflow/stream_executor/blas.cc @@ -97,19 +97,19 @@ std::ostream& operator<<(std::ostream& os, ComputationType ty) { string DataTypeString(DataType ty) { switch (ty) { - case DataType::kF16: + case DataType::kHalf: return "f16"; - case DataType::kF32: + case DataType::kFloat: return "f32"; - case DataType::kF64: + case DataType::kDouble: return "f64"; - case DataType::kI8: + case DataType::kInt8: return "i8"; - case DataType::kI32: + case DataType::kInt32: return "i32"; - case DataType::kComplexF32: + case DataType::kComplexFloat: return "complex f32"; - case DataType::kComplexF64: + case DataType::kComplexDouble: return "complex f64"; default: LOG(FATAL) << "Unknown DataType " << static_cast(ty); diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index 411f6f11275..29fa7dbc68e 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -44,6 +44,7 @@ limitations under the License. 
#include #include "tensorflow/stream_executor/host_or_device_scalar.h" +#include "tensorflow/stream_executor/dnn.h" // For DataType, ToDataType #include "tensorflow/stream_executor/lib/array_slice.h" #include "tensorflow/stream_executor/lib/statusor.h" #include "tensorflow/stream_executor/platform/port.h" @@ -119,16 +120,8 @@ std::string ComputationTypeString(ComputationType ty); std::ostream &operator<<(std::ostream &os, ComputationType ty); -// Type with which inputs and outputs of a blaslt routine are performed. -enum class DataType { - kF16, // 16-bit floating-point - kF32, // 32-bit floating-point - kF64, // 64-bit floating-point - kI8, // 8-bit integer - kI32, // 32-bit integer - kComplexF32, // Complex number comprised of two f32s - kComplexF64, // Complex number comprised of two f64s -}; +using dnn::DataType; +using dnn::ToDataType; // Describes the type of pointers for the scaling factors alpha and beta in // blaslt routines. @@ -142,38 +135,6 @@ string DataTypeString(DataType ty); std::ostream &operator<<(std::ostream &os, DataType ty); -// Converts a compile-time type to a DataType value. -template -struct ToDataType {}; -template <> -struct ToDataType { - static constexpr const DataType value = DataType::kF16; -}; -template <> -struct ToDataType { - static constexpr const DataType value = DataType::kF32; -}; -template <> -struct ToDataType { - static constexpr const DataType value = DataType::kF64; -}; -template <> -struct ToDataType { - static constexpr const DataType value = DataType::kI8; -}; -template <> -struct ToDataType { - static constexpr const DataType value = DataType::kI32; -}; -template <> -struct ToDataType> { - static constexpr const DataType value = DataType::kComplexF32; -}; -template <> -struct ToDataType> { - static constexpr const DataType value = DataType::kComplexF64; -}; - // Opaque identifier for an "algorithm" used by a blas routine. This functions // as a hint to the blas library. 
typedef int64 AlgorithmType; diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index f2bc79e1c29..1c97b6db6a3 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -441,21 +441,21 @@ cublasComputeType_t CUBLASComputationType(blas::ComputationType ty) { blas::DataType GetScaleType(blas::DataType data_type, blas::ComputationType compute_type) { - bool is_complex = data_type == blas::DataType::kComplexF32 || - data_type == blas::DataType::kComplexF64; + bool is_complex = data_type == blas::DataType::kComplexFloat || + data_type == blas::DataType::kComplexDouble; switch (compute_type) { case blas::ComputationType::kF16: - return blas::DataType::kF16; + return blas::DataType::kHalf; case blas::ComputationType::kF32: // fall-through case blas::ComputationType::kComplexF32: // fall-through case blas::ComputationType::kF32FastTF32: // fall-through case blas::ComputationType::kF32FastBF16: - return is_complex ? blas::DataType::kComplexF32 : blas::DataType::kF32; + return is_complex ? blas::DataType::kComplexFloat : blas::DataType::kFloat; case blas::ComputationType::kF64: // fall-through case blas::ComputationType::kComplexF64: - return is_complex ? blas::DataType::kComplexF64 : blas::DataType::kF64; + return is_complex ? 
blas::DataType::kComplexDouble : blas::DataType::kDouble; case blas::ComputationType::kI32: - return blas::DataType::kI32; + return blas::DataType::kInt32; } } @@ -484,38 +484,38 @@ cublasLtEpilogue_t CUBLASEpilogue(blas::Epilogue epilogue) { cudaDataType_t GetCUDADataType(blas::DataType ty) { switch (ty) { - case blas::DataType::kF16: + case blas::DataType::kHalf: return CUDA_R_16F; - case blas::DataType::kF32: + case blas::DataType::kFloat: return CUDA_R_32F; - case blas::DataType::kF64: + case blas::DataType::kDouble: return CUDA_R_64F; - case blas::DataType::kI8: + case blas::DataType::kInt8: return CUDA_R_8I; - case blas::DataType::kI32: + case blas::DataType::kInt32: return CUDA_R_32I; - case blas::DataType::kComplexF32: + case blas::DataType::kComplexFloat: return CUDA_C_32F; - case blas::DataType::kComplexF64: + case blas::DataType::kComplexDouble: return CUDA_C_64F; } } int GetDataTypeSizeBytes(blas::DataType ty) { switch (ty) { - case blas::DataType::kF16: + case blas::DataType::kHalf: return 2; - case blas::DataType::kF32: + case blas::DataType::kFloat: return 4; - case blas::DataType::kF64: + case blas::DataType::kDouble: return 8; - case blas::DataType::kI8: + case blas::DataType::kInt8: return 1; - case blas::DataType::kI32: + case blas::DataType::kInt32: return 4; - case blas::DataType::kComplexF32: + case blas::DataType::kComplexFloat: return 8; - case blas::DataType::kComplexF64: + case blas::DataType::kComplexDouble: return 16; } } @@ -3611,7 +3611,7 @@ bool CUDABlas::DoBlasLtMatmul(Stream* stream, blas::ProfileResult* output_profile_result) { #if CUDA_VERSION >= 11000 const auto& cuda_plan = *static_cast(plan); - if (cuda_plan.scale_type() == blas::DataType::kF32) { + if (cuda_plan.scale_type() == blas::DataType::kFloat) { // F32* computation types require F32 alpha/beta type, so we must cast them. 
if (alpha.is_pointer() || beta.is_pointer()) { // We cannot easily convert a pointer to f16 memory to a pointer to f32 diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 53cdff8cb7a..fd38efc2537 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -133,6 +133,14 @@ template <> struct ToDataType { static constexpr DataType value = DataType::kInt32; }; +template <> +struct ToDataType> { + static constexpr DataType value = DataType::kComplexFloat; +}; +template <> +struct ToDataType> { + static constexpr DataType value = DataType::kComplexDouble; +}; // Specifies the types of a RNN model. enum class RnnMode { diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto index 4d09e615e7d..f849b011eb3 100644 --- a/tensorflow/stream_executor/dnn.proto +++ b/tensorflow/stream_executor/dnn.proto @@ -12,6 +12,8 @@ enum DataType { kHalf = 2; kInt8 = 3; kInt32 = 4; + kComplexFloat = 5; + kComplexDouble = 6; } // Describes how a convolution input or output layer's data is formatted. From f5fd56c99bde8950146bbe7c739e77c03d615c74 Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Tue, 29 Sep 2020 21:14:17 +1000 Subject: [PATCH 0047/1447] Replace blasLt overloads with runtime types --- tensorflow/stream_executor/blas.h | 138 +++---------- tensorflow/stream_executor/cuda/cuda_blas.cc | 194 ++++-------------- tensorflow/stream_executor/cuda/cuda_blas.h | 28 +-- .../stream_executor/host_or_device_scalar.h | 135 ++++++++++++ 4 files changed, 221 insertions(+), 274 deletions(-) diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index 29fa7dbc68e..65b3dadfd27 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -43,7 +43,6 @@ limitations under the License. 
#include #include -#include "tensorflow/stream_executor/host_or_device_scalar.h" #include "tensorflow/stream_executor/dnn.h" // For DataType, ToDataType #include "tensorflow/stream_executor/lib/array_slice.h" #include "tensorflow/stream_executor/lib/statusor.h" @@ -61,6 +60,9 @@ class ScratchAllocator; template class DeviceMemory; +template +class HostOrDeviceScalar; + namespace blas { // Specifies whether the input matrix will be transposed or @@ -1469,60 +1471,32 @@ class BlasSupport { // vector of length equal to the number of rows in matrix c. If epilogue was // set to any other value then the bias argument here must be null. The bias // vector is broadcast across the batch dimension. + // Note that the data types of a and b (c and bias) must match the ab_type + // (c_type) with which the plan was created, and the data types of alpha and + // beta must match the data type of c. virtual bool DoBlasLtMatmul( Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - DeviceMemory* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias = {}, - blas::ProfileResult* output_profile_result = nullptr) = 0; - virtual bool DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, - const DeviceMemory& a, const DeviceMemory& b, - const HostOrDeviceScalar& beta, DeviceMemory* c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias = {}, - blas::ProfileResult* output_profile_result = nullptr) = 0; - virtual bool DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - DeviceMemory* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, 
- const DeviceMemory& bias = {}, - blas::ProfileResult* output_profile_result = nullptr) = 0; - virtual bool DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - DeviceMemory* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias = {}, - blas::ProfileResult* output_profile_result = nullptr) = 0; - virtual bool DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar>& alpha, - const DeviceMemory>& a, - const DeviceMemory>& b, - const HostOrDeviceScalar>& beta, - DeviceMemory>* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory>& bias = {}, - blas::ProfileResult* output_profile_result = nullptr) = 0; - virtual bool DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar>& alpha, - const DeviceMemory>& a, - const DeviceMemory>& b, - const HostOrDeviceScalar>& beta, - DeviceMemory>* c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory>& bias = {}, - blas::ProfileResult* output_profile_result = nullptr) = 0; + const HostOrDeviceScalar& alpha, DeviceMemoryBase a, + DeviceMemoryBase b, const HostOrDeviceScalar& beta, + DeviceMemoryBase c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, DeviceMemoryBase bias, + blas::ProfileResult* output_profile_result) = 0; + + template + bool DoBlasLtMatmul(Stream* stream, const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + const DeviceMemory& a, + const DeviceMemory& b, + const HostOrDeviceScalar& beta, + DeviceMemory* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias = {}, + blas::ProfileResult* output_profile_result 
= nullptr) { + return DoBlasLtMatmul(stream, plan, alpha, a, b, beta, *c, + scratch_allocator, algorithm, bias, + output_profile_result); + } virtual port::Status GetVersion(std::string *version) = 0; @@ -2346,59 +2320,11 @@ class BlasSupport { out_algorithms) override; \ bool DoBlasLtMatmul( \ Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ - const HostOrDeviceScalar& alpha, const DeviceMemory& a, \ - const DeviceMemory& b, const HostOrDeviceScalar& beta, \ - DeviceMemory* c, ScratchAllocator* scratch_allocator, \ - const blas::IBlasLtMatmulAlgorithm* algorithm, \ - const DeviceMemory& bias = {}, \ - blas::ProfileResult* output_profile_result = nullptr) override; \ - bool DoBlasLtMatmul( \ - Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ - const HostOrDeviceScalar& alpha, \ - const DeviceMemory& a, const DeviceMemory& b, \ - const HostOrDeviceScalar& beta, \ - DeviceMemory* c, ScratchAllocator* scratch_allocator, \ - const blas::IBlasLtMatmulAlgorithm* algorithm, \ - const DeviceMemory& bias = {}, \ - blas::ProfileResult* output_profile_result = nullptr) override; \ - bool DoBlasLtMatmul( \ - Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ - const HostOrDeviceScalar& alpha, const DeviceMemory& a, \ - const DeviceMemory& b, const HostOrDeviceScalar& beta, \ - DeviceMemory* c, ScratchAllocator* scratch_allocator, \ - const blas::IBlasLtMatmulAlgorithm* algorithm, \ - const DeviceMemory& bias = {}, \ - blas::ProfileResult* output_profile_result = nullptr) override; \ - bool DoBlasLtMatmul( \ - Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ - const HostOrDeviceScalar& alpha, const DeviceMemory& a, \ - const DeviceMemory& b, const HostOrDeviceScalar& beta, \ - DeviceMemory* c, ScratchAllocator* scratch_allocator, \ - const blas::IBlasLtMatmulAlgorithm* algorithm, \ - const DeviceMemory& bias = {}, \ - blas::ProfileResult* output_profile_result = nullptr) override; \ - bool DoBlasLtMatmul(Stream* stream, const blas::IBlasLtMatmulPlan* plan, 
\ - const HostOrDeviceScalar>& alpha, \ - const DeviceMemory>& a, \ - const DeviceMemory>& b, \ - const HostOrDeviceScalar>& beta, \ - DeviceMemory>* c, \ - ScratchAllocator* scratch_allocator, \ - const blas::IBlasLtMatmulAlgorithm* algorithm, \ - const DeviceMemory>& bias = {}, \ - blas::ProfileResult* output_profile_result = nullptr) \ - override; \ - bool DoBlasLtMatmul(Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ - const HostOrDeviceScalar>& alpha, \ - const DeviceMemory>& a, \ - const DeviceMemory>& b, \ - const HostOrDeviceScalar>& beta, \ - DeviceMemory>* c, \ - ScratchAllocator* scratch_allocator, \ - const blas::IBlasLtMatmulAlgorithm* algorithm, \ - const DeviceMemory>& bias = {}, \ - blas::ProfileResult* output_profile_result = nullptr) \ - override; \ + const HostOrDeviceScalar& alpha, DeviceMemoryBase a, \ + DeviceMemoryBase b, const HostOrDeviceScalar& beta, \ + DeviceMemoryBase c, ScratchAllocator* scratch_allocator, \ + const blas::IBlasLtMatmulAlgorithm* algorithm, DeviceMemoryBase bias, \ + blas::ProfileResult* output_profile_result) override; \ port::Status GetVersion(std::string *version) override; } // namespace blas diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 1c97b6db6a3..b285142276d 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -3441,37 +3441,22 @@ bool CUDABlas::GetBlasLtMatmulAlgorithms( } #if CUDA_VERSION >= 11000 -template -bool CUDABlas::DoBlasLtMatmulInternalImpl( +bool CUDABlas::DoBlasLtMatmulInternal( Stream* stream, bool err_on_failure, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const ABType* a, - const ABType* b, const HostOrDeviceScalar& beta, const CDType* c, - CDType* d, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const CDType* bias) { + const HostOrDeviceScalar& alpha, DeviceMemoryBase a, + DeviceMemoryBase b, const 
HostOrDeviceScalar& beta, + DeviceMemoryBase c, DeviceMemoryBase d, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, DeviceMemoryBase bias) { const auto& cuda_plan = *static_cast(plan); const auto& cuda_algo = *static_cast(algorithm); - if (cuda_plan.ab_type() != blas::ToDataType::value) { - VLOG(2) << "DoBlasLtMatmul returning false because plan has wrong ab_type: " - "expected " - << blas::ToDataType::value << ", got " - << cuda_plan.ab_type(); - return false; - } - if (cuda_plan.cd_type() != blas::ToDataType::value) { - VLOG(2) << "DoBlasLtMatmul returning false because plan has wrong cd_type: " - "expected " - << blas::ToDataType::value << ", got " - << cuda_plan.cd_type(); - return false; - } - if (cuda_plan.scale_type() != blas::ToDataType::value) { - VLOG(2) << "DoBlasLtMatmul returning false because plan has wrong " - "scale_type: expected " - << blas::ToDataType::value << ", got " - << cuda_plan.cd_type(); - return false; - } + if (alpha.data_type() != cuda_plan.scale_type() || + beta.data_type() != cuda_plan.scale_type()) { + VLOG(2) << "DoBlasLtMatmul returning false because alpha and beta types do " + "not match plan: expected " + << cuda_plan.scale_type() << ", got alpha=" << alpha.data_type() + << " beta=" << beta.data_type(); return false; } if (alpha.is_pointer() != beta.is_pointer()) { @@ -3494,16 +3479,16 @@ bool CUDABlas::DoBlasLtMatmulInternalImpl( return false; } if (bias != nullptr) { - if (!cuda_plan.SetBiasPointer(bias)) { + if (!cuda_plan.SetBiasPointer(bias.opaque())) { VLOG(2) << "DoBlasLtMatmul returning false because setting the bias " "pointer failed."; return false; } } - const ScaleType* alpha_ptr = - alpha.is_pointer() ? GpuMemory(alpha.pointer()) : &alpha.value(); - const ScaleType* beta_ptr = - beta.is_pointer() ? GpuMemory(beta.pointer()) : &beta.value(); + const void* alpha_ptr = alpha.is_pointer() ? alpha.opaque_pointer().opaque() + : alpha.opaque_value(); + const void* beta_ptr = + beta.is_pointer() ? 
beta.opaque_pointer().opaque() : beta.opaque_value(); void* workspace = nullptr; if (cuda_algo.workspace_size()) { @@ -3529,9 +3514,9 @@ bool CUDABlas::DoBlasLtMatmulInternalImpl( gpu::ScopedActivateExecutorContext sac{parent_}; cublasStatus_t ret = cublasLtMatmul( - blasLt_, cuda_plan.op_desc(), alpha_ptr, a, cuda_plan.a_desc(), b, - cuda_plan.b_desc(), beta_ptr, c, cuda_plan.c_desc(), d, - cuda_plan.d_desc(), cuda_algo.algo(), workspace, + blasLt_, cuda_plan.op_desc(), alpha_ptr, a.opaque(), cuda_plan.a_desc(), + b.opaque(), cuda_plan.b_desc(), beta_ptr, c.opaque(), cuda_plan.c_desc(), + d.opaque(), cuda_plan.d_desc(), cuda_algo.algo(), workspace, cuda_algo.workspace_size(), cuda_stream); if (ret != CUBLAS_STATUS_SUCCESS) { if (err_on_failure || VLOG_IS_ON(3)) { @@ -3543,17 +3528,32 @@ bool CUDABlas::DoBlasLtMatmulInternalImpl( } #endif // CUDA_VERSION >= 11000 -template -bool CUDABlas::DoBlasLtMatmulInternal( +bool CUDABlas::DoBlasLtMatmul( Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - const DeviceMemory& c, DeviceMemory* d, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, + const HostOrDeviceScalar& alpha, DeviceMemoryBase a, + DeviceMemoryBase b, const HostOrDeviceScalar& beta, + DeviceMemoryBase c, ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, DeviceMemoryBase bias, blas::ProfileResult* output_profile_result) { #if CUDA_VERSION >= 11000 + const auto& cuda_plan = *static_cast(plan); + HostOrDeviceScalar alpha_cast = alpha; + HostOrDeviceScalar beta_cast = beta; + if (cuda_plan.cd_type() == blas::DataType::kHalf && + cuda_plan.scale_type() == blas::DataType::kFloat) { + // The given alpha and beta types are F16 (they always match c), but F32* + // computation type requires that they be F32, so we must cast them. 
+ if (alpha.is_pointer() || beta.is_pointer()) { + // We cannot easily convert a pointer to f16 memory to a pointer to f32 + // memory from here, so we don't support this for now. + return false; + } + alpha_cast = HostOrDeviceScalar( + static_cast(alpha.value())); + beta_cast = + HostOrDeviceScalar(static_cast(beta.value())); + } + std::unique_ptr timer; if (output_profile_result) { timer.reset(new GpuTimer(parent_)); @@ -3563,10 +3563,9 @@ bool CUDABlas::DoBlasLtMatmulInternal( } bool err_on_failure = timer != nullptr; - bool result = DoBlasLtMatmulInternalImpl( - stream, err_on_failure, plan, alpha, GpuMemory(a), GpuMemory(b), beta, - GpuMemory(c), GpuMemoryMutable(d), scratch_allocator, algorithm, - GpuMemory(bias)); + bool result = DoBlasLtMatmulInternal(stream, err_on_failure, plan, alpha_cast, + a, b, beta_cast, c, c, scratch_allocator, + algorithm, bias); if (timer && result) { // GpuTimer will CHECK-fail if we Stop() it while the stream is in an error @@ -3585,109 +3584,6 @@ bool CUDABlas::DoBlasLtMatmulInternal( #endif } -bool CUDABlas::DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - DeviceMemory* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, - blas::ProfileResult* output_profile_result) { - return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, bias, - output_profile_result); -} - -bool CUDABlas::DoBlasLtMatmul(Stream* stream, - const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, - const DeviceMemory& a, - const DeviceMemory& b, - const HostOrDeviceScalar& beta, - DeviceMemory* c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, - blas::ProfileResult* output_profile_result) { -#if CUDA_VERSION >= 11000 - const auto& 
cuda_plan = *static_cast(plan); - if (cuda_plan.scale_type() == blas::DataType::kFloat) { - // F32* computation types require F32 alpha/beta type, so we must cast them. - if (alpha.is_pointer() || beta.is_pointer()) { - // We cannot easily convert a pointer to f16 memory to a pointer to f32 - // memory from here, so we don't support this for now. - return false; - } - HostOrDeviceScalar float_alpha(static_cast(alpha.value())); - HostOrDeviceScalar float_beta(static_cast(beta.value())); - return DoBlasLtMatmulInternal(stream, plan, float_alpha, a, b, float_beta, - *c, c, scratch_allocator, algorithm, bias, - output_profile_result); - } - return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, bias, - output_profile_result); -#else // if CUDA_VERSION < 11000 - return false; -#endif -} - -bool CUDABlas::DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - DeviceMemory* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, - blas::ProfileResult* output_profile_result) { - return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, bias, - output_profile_result); -} - -bool CUDABlas::DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - DeviceMemory* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, - blas::ProfileResult* output_profile_result) { - return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, bias, - output_profile_result); -} - -bool CUDABlas::DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const 
HostOrDeviceScalar>& alpha, - const DeviceMemory>& a, - const DeviceMemory>& b, - const HostOrDeviceScalar>& beta, - DeviceMemory>* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory>& bias, - blas::ProfileResult* output_profile_result) { - return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, bias, - output_profile_result); -} - -bool CUDABlas::DoBlasLtMatmul( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar>& alpha, - const DeviceMemory>& a, - const DeviceMemory>& b, - const HostOrDeviceScalar>& beta, - DeviceMemory>* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory>& bias, - blas::ProfileResult* output_profile_result) { - return DoBlasLtMatmulInternal(stream, plan, alpha, a, b, beta, *c, c, - scratch_allocator, algorithm, bias, - output_profile_result); -} - port::Status CUDABlas::GetVersion(std::string *version) { absl::MutexLock lock(&mu_); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.h b/tensorflow/stream_executor/cuda/cuda_blas.h index 3fdfcb0a50c..eb8e4f6a7dc 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.h +++ b/tensorflow/stream_executor/cuda/cuda_blas.h @@ -140,25 +140,15 @@ class CUDABlas : public blas::BlasSupport { blas::ProfileResult *output_profile_result); // Helper function for implementing DoBlasLtMatmul. - template - bool DoBlasLtMatmulInternal( - Stream* stream, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - const DeviceMemory& c, DeviceMemory* d, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, - blas::ProfileResult* output_profile_result); - - // Helper function for implementing DoBlasLtMatmulInternal. 
- template - bool DoBlasLtMatmulInternalImpl( - Stream* stream, bool err_on_failure, const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const ABType* a, - const ABType* b, const HostOrDeviceScalar& beta, - const CDType* c, CDType* d, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, const CDType* bias); + bool DoBlasLtMatmulInternal(Stream* stream, bool err_on_failure, + const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + DeviceMemoryBase a, DeviceMemoryBase b, + const HostOrDeviceScalar& beta, + DeviceMemoryBase c, DeviceMemoryBase d, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + DeviceMemoryBase bias); // Guards the cuBLAS handle for this device. absl::Mutex mu_; diff --git a/tensorflow/stream_executor/host_or_device_scalar.h b/tensorflow/stream_executor/host_or_device_scalar.h index 1f5d4b9260c..e5319e7d187 100644 --- a/tensorflow/stream_executor/host_or_device_scalar.h +++ b/tensorflow/stream_executor/host_or_device_scalar.h @@ -17,12 +17,14 @@ limitations under the License. #define TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_ #include "tensorflow/stream_executor/device_memory.h" +#include "tensorflow/stream_executor/dnn.h" // For DataType, ToDataType #include "tensorflow/stream_executor/platform/logging.h" namespace stream_executor { // Allows to represent a value that is either a host scalar or a scalar stored // on the GPU device. +// See also the specialization for ElemT=void below. template class HostOrDeviceScalar { public: @@ -52,5 +54,138 @@ class HostOrDeviceScalar { bool is_pointer_; }; +// Specialization for wrapping a dynamically-typed value (via type erasure). +template <> +class HostOrDeviceScalar { + public: + using DataType = dnn::DataType; + // Not marked as explicit because when using this constructor, we usually want + // to set this to a compile-time constant. 
+ HostOrDeviceScalar(float value) + : float_(value), is_pointer_(false), dtype_(DataType::kFloat) {} + HostOrDeviceScalar(double value) + : double_(value), is_pointer_(false), dtype_(DataType::kDouble) {} + HostOrDeviceScalar(Eigen::half value) + : half_(value), is_pointer_(false), dtype_(DataType::kHalf) {} + HostOrDeviceScalar(int8 value) + : int8_(value), is_pointer_(false), dtype_(DataType::kInt8) {} + HostOrDeviceScalar(int32 value) + : int32_(value), is_pointer_(false), dtype_(DataType::kInt32) {} + HostOrDeviceScalar(std::complex value) + : complex_float_(value), + is_pointer_(false), + dtype_(DataType::kComplexFloat) {} + HostOrDeviceScalar(std::complex value) + : complex_double_(value), + is_pointer_(false), + dtype_(DataType::kComplexDouble) {} + template + explicit HostOrDeviceScalar(const DeviceMemory& pointer) + : pointer_(pointer), + is_pointer_(true), + dtype_(dnn::ToDataType::value) { + CHECK_EQ(1, pointer.ElementCount()); + } + // Construct from statically-typed version. 
+ template ::value, + int>::type = 0> + HostOrDeviceScalar(const HostOrDeviceScalar& other) { + if (other.is_pointer()) { + *this = HostOrDeviceScalar(other.pointer()); + } else { + *this = HostOrDeviceScalar(other.value()); + } + } + + bool is_pointer() const { return is_pointer_; } + template + const DeviceMemory& pointer() const { + CHECK(is_pointer()); + CHECK(dtype_ == dnn::ToDataType::value); + return pointer_; + } + template + const T& value() const { + CHECK(!is_pointer()); + CHECK(dtype_ == dnn::ToDataType::value); + return value_impl(); + } + const DeviceMemoryBase& opaque_pointer() const { + CHECK(is_pointer()); + return pointer_; + } + const void* opaque_value() const { + CHECK(!is_pointer()); + switch (dtype_) { + case DataType::kFloat: + return &float_; + case DataType::kDouble: + return &double_; + case DataType::kHalf: + return &half_; + case DataType::kInt8: + return &int8_; + case DataType::kInt32: + return &int32_; + case DataType::kComplexFloat: + return &complex_float_; + case DataType::kComplexDouble: + return &complex_double_; + default: + return nullptr; + } + } + DataType data_type() const { return dtype_; } + + private: + template + const T& value_impl() const; + + union { + float float_; + double double_; + Eigen::half half_; + int8 int8_; + int32 int32_; + std::complex complex_float_; + std::complex complex_double_; + DeviceMemoryBase pointer_; + }; + bool is_pointer_; + DataType dtype_; +}; + +template <> +inline const float& HostOrDeviceScalar::value_impl() const { + return float_; +}; +template <> +inline const double& HostOrDeviceScalar::value_impl() const { + return double_; +}; +template <> +inline const Eigen::half& HostOrDeviceScalar::value_impl() + const { + return half_; +}; +template <> +inline const int8& HostOrDeviceScalar::value_impl() const { + return int8_; +}; +template <> +inline const int32& HostOrDeviceScalar::value_impl() const { + return int32_; +}; +template <> +inline const std::complex& 
+HostOrDeviceScalar::value_impl>() const { + return complex_float_; +}; +template <> +inline const std::complex& +HostOrDeviceScalar::value_impl>() const { + return complex_double_; +}; + } // namespace stream_executor #endif // TENSORFLOW_STREAM_EXECUTOR_HOST_OR_DEVICE_SCALAR_H_ From ec203dedad74bd06097e3faeaf60d06b0707341b Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Tue, 29 Sep 2020 21:17:57 +1000 Subject: [PATCH 0048/1447] Add more detailed comment for kF32FastTF32/BF16 --- tensorflow/stream_executor/blas.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index 65b3dadfd27..ac3a788aaef 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -105,7 +105,8 @@ enum class ComputationType { kComplexF32, // Complex number comprised of two f32s. kComplexF64, // Complex number comprised of two f64s. // The below values are only supported for BlasLt routines (both real and - // complex). + // complex). They use float32 for accumulation but round the input mantissas + // to a smaller number of bits. 
kF32FastTF32, // 32-bit floating-point with reduced (>=10-bit) mantissa kF32FastBF16, // 32-bit floating-point with reduced (7-bit) mantissa }; From d3c8909365246fac5066a1d69087aa9e3ab86990 Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Wed, 30 Sep 2020 15:59:12 +1000 Subject: [PATCH 0049/1447] Replace stream BlasLt overloads with runtime types --- .../stream_executor/host_or_device_scalar.h | 23 +++ tensorflow/stream_executor/stream.cc | 151 ++---------------- tensorflow/stream_executor/stream.h | 88 +++++----- 3 files changed, 77 insertions(+), 185 deletions(-) diff --git a/tensorflow/stream_executor/host_or_device_scalar.h b/tensorflow/stream_executor/host_or_device_scalar.h index e5319e7d187..5f06cf027a0 100644 --- a/tensorflow/stream_executor/host_or_device_scalar.h +++ b/tensorflow/stream_executor/host_or_device_scalar.h @@ -137,6 +137,29 @@ class HostOrDeviceScalar { } DataType data_type() const { return dtype_; } + template + ResultType CallWithValue(GenericUnaryFunc func) const { + CHECK(!is_pointer()); + switch (dtype_) { + case DataType::kFloat: + return func(float_); + case DataType::kDouble: + return func(double_); + case DataType::kHalf: + return func(half_); + case DataType::kInt8: + return func(int8_); + case DataType::kInt32: + return func(int32_); + case DataType::kComplexFloat: + return func(complex_float_); + case DataType::kComplexDouble: + return func(complex_double_); + default: + return {}; + } + } + private: template const T& value_impl() const; diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 66728c94821..7233056df0a 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -140,6 +140,14 @@ std::string ToVlogString(const HostOrDeviceScalar &memory_or_constant) { return ToVlogString(memory_or_constant.value()); } +std::string ToVlogString(const HostOrDeviceScalar& memory_or_constant) { + if (memory_or_constant.is_pointer()) { + return 
ToVlogString(memory_or_constant.opaque_pointer()); + } + return memory_or_constant.CallWithValue( + [](const auto& value) { return ToVlogString(value); }); +} + template std::string ToVlogString(port::ArraySlice elements) { std::string str = absl::StrCat( @@ -4802,147 +4810,22 @@ Stream &Stream::ThenBlasGemmStridedBatched( } Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, - const DeviceMemory& a, - const DeviceMemory& b, - const HostOrDeviceScalar& beta, - DeviceMemory* c, + const HostOrDeviceScalar& alpha, + DeviceMemoryBase a, DeviceMemoryBase b, + const HostOrDeviceScalar& beta, + DeviceMemoryBase c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, + DeviceMemoryBase bias, blas::ProfileResult* output_profile_result) { VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), PARAM(c), PARAM(algorithm), PARAM(bias)); - ThenBlasWithProfileImpl< - const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, - const DeviceMemory&, const DeviceMemory&, - const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, - const blas::IBlasLtMatmulAlgorithm*, const DeviceMemory&> - impl; - return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, - c, scratch_allocator, algorithm, bias, output_profile_result); -} - -Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, - const DeviceMemory& a, - const DeviceMemory& b, - const HostOrDeviceScalar& beta, - DeviceMemory* c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, - blas::ProfileResult* output_profile_result) { - VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm), PARAM(bias)); - - ThenBlasWithProfileImpl< - const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, - const DeviceMemory&, const DeviceMemory&, 
- const HostOrDeviceScalar&, DeviceMemory*, - ScratchAllocator*, const blas::IBlasLtMatmulAlgorithm*, - const DeviceMemory&> - impl; - return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, - c, scratch_allocator, algorithm, bias, output_profile_result); -} - -Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, - const DeviceMemory& a, - const DeviceMemory& b, - const HostOrDeviceScalar& beta, - DeviceMemory* c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, - blas::ProfileResult* output_profile_result) { - VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm), PARAM(bias)); - - ThenBlasWithProfileImpl< - const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, - const DeviceMemory&, const DeviceMemory&, - const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, - const blas::IBlasLtMatmulAlgorithm*, const DeviceMemory&> - impl; - return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, - c, scratch_allocator, algorithm, bias, output_profile_result); -} - -Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, - const DeviceMemory& a, - const DeviceMemory& b, - const HostOrDeviceScalar& beta, - DeviceMemory* c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias, - blas::ProfileResult* output_profile_result) { - VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm), PARAM(bias)); - - ThenBlasWithProfileImpl< - const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, - const DeviceMemory&, const DeviceMemory&, - const HostOrDeviceScalar&, DeviceMemory*, - ScratchAllocator*, const blas::IBlasLtMatmulAlgorithm*, - const DeviceMemory&> - impl; - return impl(this, 
&blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, - c, scratch_allocator, algorithm, bias, output_profile_result); -} - -Stream& Stream::ThenBlasLtMatmul( - const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar>& alpha, - const DeviceMemory>& a, - const DeviceMemory>& b, - const HostOrDeviceScalar>& beta, - DeviceMemory>* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory>& bias, - blas::ProfileResult* output_profile_result) { - VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm), PARAM(bias)); - ThenBlasWithProfileImpl>&, - const DeviceMemory>&, - const DeviceMemory>&, - const HostOrDeviceScalar>&, - DeviceMemory>*, ScratchAllocator*, - const blas::IBlasLtMatmulAlgorithm*, - const DeviceMemory>&> - impl; - return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, - c, scratch_allocator, algorithm, bias, output_profile_result); -} - -Stream& Stream::ThenBlasLtMatmul( - const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar>& alpha, - const DeviceMemory>& a, - const DeviceMemory>& b, - const HostOrDeviceScalar>& beta, - DeviceMemory>* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory>& bias, - blas::ProfileResult* output_profile_result) { - VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), - PARAM(c), PARAM(algorithm), PARAM(bias)); - - ThenBlasWithProfileImpl>&, - const DeviceMemory>&, - const DeviceMemory>&, - const HostOrDeviceScalar>&, - DeviceMemory>*, - ScratchAllocator*, - const blas::IBlasLtMatmulAlgorithm*, - const DeviceMemory>&> + const HostOrDeviceScalar&, DeviceMemoryBase, + DeviceMemoryBase, const HostOrDeviceScalar&, + DeviceMemoryBase, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*, DeviceMemoryBase> impl; return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, c, scratch_allocator, 
algorithm, bias, output_profile_result); diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index 91a80331f8e..b82c34b6c02 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -74,6 +74,19 @@ class AlgorithmDesc; class StreamExecutor; class ScratchAllocator; +namespace detail { + +// Helper class to prevent a template function argument from being deduced. This +// is identical to std::type_identity in C++20. +template +struct NonDeduced { + using type = T; +}; +template +using NonDeducedType = typename NonDeduced::type; + +} // namespace detail + // Convert a type to the corresponding QuantizedActivationMode. template struct Quantization; @@ -1666,60 +1679,33 @@ class Stream { DeviceMemory> *b, int ldb); // See BlasSupport::DoBlatLtMatmul. + Stream& ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + DeviceMemoryBase a, DeviceMemoryBase b, + const HostOrDeviceScalar& beta, + DeviceMemoryBase c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + DeviceMemoryBase bias, + blas::ProfileResult* output_profile_result); + + // Note that we prevent alpha and beta from being used to deduce CType so that + // they can be constructed implicitly from values of type CType. Without this, + // type deduction would fail when this function is called with a value of type + // CType for alpha or beta. 
+ template Stream& ThenBlasLtMatmul( const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - DeviceMemory* c, ScratchAllocator* scratch_allocator, + const detail::NonDeducedType>& alpha, + const DeviceMemory& a, const DeviceMemory& b, + const detail::NonDeducedType>& beta, + DeviceMemory* c, ScratchAllocator* scratch_allocator, const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias = {}, - blas::ProfileResult* output_profile_result = nullptr); - Stream& ThenBlasLtMatmul( - const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, - const DeviceMemory& a, const DeviceMemory& b, - const HostOrDeviceScalar& beta, DeviceMemory* c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias = {}, - blas::ProfileResult* output_profile_result = nullptr); - Stream& ThenBlasLtMatmul( - const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - DeviceMemory* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias = {}, - blas::ProfileResult* output_profile_result = nullptr); - Stream& ThenBlasLtMatmul( - const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, const DeviceMemory& a, - const DeviceMemory& b, const HostOrDeviceScalar& beta, - DeviceMemory* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory& bias = {}, - blas::ProfileResult* output_profile_result = nullptr); - Stream& ThenBlasLtMatmul( - const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar>& alpha, - const DeviceMemory>& a, - const DeviceMemory>& b, - const HostOrDeviceScalar>& beta, - DeviceMemory>* c, ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - 
const DeviceMemory>& bias = {}, - blas::ProfileResult* output_profile_result = nullptr); - Stream& ThenBlasLtMatmul( - const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar>& alpha, - const DeviceMemory>& a, - const DeviceMemory>& b, - const HostOrDeviceScalar>& beta, - DeviceMemory>* c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - const DeviceMemory>& bias = {}, - blas::ProfileResult* output_profile_result = nullptr); + const DeviceMemory& bias = {}, + blas::ProfileResult* output_profile_result = nullptr) { + return ThenBlasLtMatmul(plan, alpha, a, b, beta, *c, scratch_allocator, + algorithm, bias, output_profile_result); + } // See FftSupport::DoFft. Stream &ThenFft(fft::Plan *plan, From c352409cba79f0731aa6e0f1827ce52ba261a9b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Wed, 30 Sep 2020 15:21:48 +0200 Subject: [PATCH 0050/1447] TFLu: Add comment in ethosu.inc as per review --- tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc b/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc index acb4d32ab65..e29f5df1eea 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc @@ -1,4 +1,5 @@ ifneq ($(filter ethos-u,$(ALL_TAGS)),) + # Do not link Math library MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) ifneq (,$(filter $(TARGET_ARCH), x86_64)) From 9b2fabd8d87706487a5ea2998d85f8a88923df07 Mon Sep 17 00:00:00 2001 From: kushanam Date: Mon, 20 Jul 2020 21:53:32 -0700 Subject: [PATCH 0051/1447] Add file changes --- .../python/distribute/distribute_lib.py | 31 ++++++ tensorflow/python/distribute/input_lib.py | 105 ++++++++++++++++-- .../python/distribute/mirrored_strategy.py | 25 ++++- 3 files changed, 148 insertions(+), 13 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py 
b/tensorflow/python/distribute/distribute_lib.py index 14e2b6f3f02..4c335874436 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -439,8 +439,12 @@ class InputReplicationMode(enum.Enum): Replicas will dequeue from the local Dataset on their worker. `tf.distribute.Strategy` doesn't manage any state sharing between such separate input pipelines. + * `PER_REPLICA`: The input function will be called on each replica seperately. + `tf.distribute.Strategy` doesn't manage any state sharing between such + separate input pipelines. """ PER_WORKER = "PER_WORKER" + PER_REPLICA = "PER_REPLICA" @tf_export("distribute.InputContext") @@ -1071,6 +1075,7 @@ class StrategyBase(object): # pylint: disable=line-too-long """Distributes `tf.data.Dataset` instances created by calls to `dataset_fn`. +<<<<<<< HEAD The argument `dataset_fn` that users pass in is an input function that has a `tf.distribute.InputContext` argument and returns a `tf.data.Dataset` instance. It is expected that the returned dataset from `dataset_fn` is @@ -1088,6 +1093,14 @@ class StrategyBase(object): specify your own batching and sharding logic. (In contrast, `tf.distribute.experimental_distribute_dataset` does batching and sharding for you.)For example, where +======= + `dataset_fn` will be called once for each worker in the strategy. Each + replica on that worker will dequeue one batch of inputs from the local + `Dataset` (i.e. if a worker has two replicas, two batches will be dequeued + from the `Dataset` every step). + + This method can be used for several purposes. For example, where +>>>>>>> 87a7ad3be2... Adding per-replica dataset distribution `experimental_distribute_dataset` is unable to shard the input files, this method might be used to manually shard the dataset (avoiding the slow fallback behavior in `experimental_distribute_dataset`). 
In cases where the @@ -1118,6 +1131,7 @@ class StrategyBase(object): snippet](https://www.tensorflow.org/tutorials/distribute/input#caveats) for an example of how to order outputs. +<<<<<<< HEAD Note: Stateful dataset transformations are currently not supported with `tf.distribute.experimental_distribute_dataset` or `tf.distribute.distribute_datasets_from_function`. Any stateful @@ -1129,6 +1143,23 @@ class StrategyBase(object): For a tutorial on more usage and properties of this method, refer to the [tutorial on distributed input](https://www.tensorflow.org/tutorials/distribute/input#tfdistributestrategyexperimental_distribute_datasets_from_function)). If you are interested in last partial batch handling, read [this section](https://www.tensorflow.org/tutorials/distribute/input#partial_batches). +======= + In the case where you want to specify datasets `PER_REPLICA`, that is having + a separate dataset per each device, you can specify as follows. + + ```python + train_dist_dataset = strategy.experimental_distribute_datasets_from_function( + train_dataset_fn, + distribute_lib.InputReplicationMode.PER_REPLICA) + + train_dist_iterator = iter(train_dist_dataset) + for epoch in range(NUM_EPOCHS): + total_loss = 0.0 + for iteration in range(ITERATIONS): + data = next(train_dist_iterator) + total_loss += distributed_train_step(data) + ``` +>>>>>>> 9598cf19be... 
apply review changes Args: dataset_fn: A function taking a `tf.distribute.InputContext` instance and diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index df81fee3e37..ccfe4313b25 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -35,6 +35,7 @@ from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.distribute import input_ops from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import values +from tensorflow.python.distribute import distribute_lib from tensorflow.python.eager import context from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import constant_op @@ -106,7 +107,8 @@ def get_distributed_dataset(dataset, def get_distributed_datasets_from_function(dataset_fn, input_workers, input_contexts, - strategy): + strategy, + replication_mode=InputReplicationMode.PER_WORKER): """Returns a distributed dataset from the given input function. This is a common function that is used by all strategies to return a @@ -129,11 +131,18 @@ def get_distributed_datasets_from_function(dataset_fn, A distributed dataset instance. 
""" if tf2.enabled(): - return DistributedDatasetsFromFunction( - dataset_fn, - input_workers, - input_contexts, - strategy) + if strategy.extended.replication_mode == distribute_lib.InputReplicationMode.PER_WORKER: + return DistributedDatasetsFromFunction( + dataset_fn, + input_workers, + input_contexts, + strategy) + else: + return DistributedDatasetsFromFunctionForReplicas( + dataset_fn, + input_workers, + input_contexts, + strategy) else: return DistributedDatasetsFromFunctionV1( dataset_fn, @@ -141,7 +150,6 @@ def get_distributed_datasets_from_function(dataset_fn, input_contexts, strategy) - @tf_export("distribute.DistributedIterator", v1=[]) class DistributedIteratorInterface(collections_abc.Iterator, distribute_types.Iterator): @@ -669,6 +677,21 @@ class DistributedIteratorBase(DistributedIteratorInterface): return distribute_utils.regroup(replicas) +class DistributedIteratorForReplicas(DistributedIterator): + """Input Iterator for a distributed dataset on replicas.""" + def __init__(self, input_workers, iterators, strategy): + super(DistributedIteratorForReplicas, self).__init__(input_workers, iterators, strategy) + + def get_next(self, name=None): + """Returns the next input from the iterator for all replicas.""" + if not self._enable_get_next_as_optional: + replicas = [] + for iterator in self._iterators: + with ops.device(iterator._worker): + next_out = iterator.get_next_as_list_static_shapes(iterator._worker) + replicas.append(next_out) + return values.regroup(replicas) + class DistributedIteratorV1(DistributedIteratorBase): """Input Iterator for a distributed dataset.""" @@ -1200,6 +1223,47 @@ class DistributedDatasetsFromFunction(_IterableInput): return self._element_spec +class DistributedDatasetsFromFunctionForReplicas(_IterableInput): + """Inputs created from dataset function.""" + + def __init__(self, dataset_fn, input_workers, input_contexts, strategy): + super(DistributedDatasetsFromFunctionForReplicas, self).__init__( + 
input_workers=input_workers) + + self._dataset_fn = dataset_fn + self._input_workers = input_workers + self._input_contexts = input_contexts + self._strategy = strategy + self._element_spec = None + + + def __iter__(self): + if not (context.executing_eagerly() or + ops.get_default_graph().building_function): + raise RuntimeError("__iter__() is only supported inside of tf.function " + "or when eager execution is enabled.") + + iterators, element_spec = _create_iterators_per_replica_with_input_context( + self._input_contexts, self._input_workers, self._dataset_fn) + iterator = DistributedIteratorForReplicas(self._input_workers, iterators, self._strategy) + self._element_spec = _create_distributed_tensor_spec( + self._strategy, element_spec) + iterator._element_spec = self._element_spec # pylint: disable=protected-access + return iterator + + @property + def element_spec(self): + """The type specification of an element of this dataset.""" + if self._element_spec is None: + raise ValueError("You must create an iterator before calling " + "`element_spec` on the distributed dataset or iterator. 
" + "This is because the dataset function is not called " + "before an iterator is created.") + return self._element_spec + + + + class DistributedDatasetsFromFunctionV1(DistributedDatasetsFromFunction): """Inputs created from dataset function.""" @@ -1689,6 +1753,17 @@ class _SingleWorkerDatasetIterator(_SingleWorkerDatasetIteratorBase): return dataset_ops.get_legacy_output_types(self._iterator) +class _SingleReplicaDatasetIterator(_SingleWorkerDatasetIterator): + def __init__(self, dataset, device): + super(_SingleReplicaDatasetIterator, self).__init__(dataset, device, []) + + def _make_iterator(self): + """Make appropriate iterator on the dataset.""" + with ops.device(self._worker): + self._iterator = iter(self._dataset) + + + class _SingleWorkerCallableIterator(object): """Iterator for a single tensor-returning callable.""" @@ -1722,6 +1797,22 @@ class _SingleWorkerCallableIterator(object): return [] +def _create_iterators_per_replica_with_input_context(input_contexts, + input_workers, + dataset_fn): + """Create a multidevice iterator per workers given a dataset function.""" + iterators = [] + for i, ctx in enumerate(input_contexts): + devices = input_workers.compute_devices_for_worker(i) + with ops.device(devices[0]): + dataset = dataset_fn(ctx) + # Wrapping dataset here (ex. 
applying options) might result in moving it to the CPU + iterator = _SingleReplicaDatasetIterator(dataset, devices[0]) + iterators.append(iterator) + return iterators, dataset.element_spec + + + def _create_iterators_per_worker(worker_datasets, input_workers, enable_legacy_iterators): """Create a multidevice iterator on each of the workers.""" diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 523c71c4fb5..0291581e03f 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -267,9 +267,11 @@ class MirroredStrategy(distribute_lib.Strategy): the particular hardware is available. """ - def __init__(self, devices=None, cross_device_ops=None): - extended = MirroredExtended( - self, devices=devices, cross_device_ops=cross_device_ops) + def __init__(self, devices=None, cross_device_ops=None, + replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): + extended = MirroredExtended(self, devices=devices, + cross_device_ops=cross_device_ops, + replication_mode=replication_mode) super(MirroredStrategy, self).__init__(extended) distribute_lib.distribution_strategy_gauge.get_cell("V2").set( "MirroredStrategy") @@ -292,7 +294,8 @@ class MirroredStrategyV1(distribute_lib.StrategyV1): # pylint: disable=g-missin class MirroredExtended(distribute_lib.StrategyExtendedV1): """Implementation of MirroredStrategy.""" - def __init__(self, container_strategy, devices=None, cross_device_ops=None): + def __init__(self, container_strategy, devices=None, cross_device_ops=None, + replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): super(MirroredExtended, self).__init__(container_strategy) if context.executing_eagerly(): if devices and not _is_device_list_single_worker(devices): @@ -313,6 +316,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): assert devices, ("Got an empty `devices` list and unable to recognize " "any local 
devices.") self._cross_device_ops = cross_device_ops + self._replication_mode = replication_mode self._initialize_strategy(devices) # TODO(b/128995245): Enable last partial batch support in graph mode. @@ -337,8 +341,13 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): def _initialize_single_worker(self, devices): """Initializes the object for single-worker training.""" self._devices = tuple(device_util.canonicalize(d) for d in devices) - self._input_workers_devices = ( - (device_util.canonicalize("/device:CPU:0", devices[0]), devices),) + if self._replication_mode == distribute_lib.InputReplicationMode.PER_WORKER: + self._input_workers_devices = ( + (device_util.canonicalize("/device:CPU:0", devices[0]), devices),) + else: + self._input_workers_devices = ( + tuple((device_util.canonicalize("/device:CPU:0", d),(d,)) for d in devices)) + self._inferred_cross_device_ops = None if self._cross_device_ops else ( cross_device_ops_lib.choose_the_best(devices)) self._host_input_device = numpy_dataset.SingleDevice( @@ -713,6 +722,10 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): @property def worker_devices(self): return self._devices + + @property + def replication_mode(self): + return self._replication_mode @property def worker_devices_by_replica(self): From 83e4697ed4e4da016528575a451bf303eefbb714 Mon Sep 17 00:00:00 2001 From: kushanam Date: Mon, 27 Apr 2020 22:24:03 -0700 Subject: [PATCH 0052/1447] Adding per-replica dataset distribution --- .../python/distribute/distribute_lib.py | 2 + tensorflow/python/distribute/input_lib.py | 2 + .../python/distribute/input_lib_test.py | 62 +++++++++++++++++++ .../python/distribute/mirrored_strategy.py | 16 +++++ 4 files changed, 82 insertions(+) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 4c335874436..1f61603f1d9 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -1099,6 
+1099,8 @@ class StrategyBase(object): `Dataset` (i.e. if a worker has two replicas, two batches will be dequeued from the `Dataset` every step). + `replication_mode` determines how to replicate the dataset pipeline. + This method can be used for several purposes. For example, where >>>>>>> 87a7ad3be2... Adding per-replica dataset distribution `experimental_distribute_dataset` is unable to shard the input files, this diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index ccfe4313b25..9827e28626e 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -467,6 +467,8 @@ class DistributedDatasetInterface(collections_abc.Iterable, "DistributedDataset.reduce must be implemented in descendants.") + + class InputWorkers(object): """A 1-to-many mapping from input worker devices to compute devices.""" diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py index ec0b591d710..038dee2df44 100644 --- a/tensorflow/python/distribute/input_lib_test.py +++ b/tensorflow/python/distribute/input_lib_test.py @@ -1250,5 +1250,67 @@ class DistributedIteratorTensorTypeTest(DistributedIteratorTestBase, self.assertAllEqual(nest.flatten(sums), [expected_for_sum] * 3) +def dali_const_dataset(batch_size, sample_size, device_id): + import tensorflow as tf + import nvidia.dali.fn as fn + from nvidia.dali.pipeline import Pipeline + import nvidia.dali.plugin.tf as dali_tf + + pipeline = Pipeline(batch_size, 4, device_id) + const = fn.constant(device = 'gpu', fdata = sample_size * [1.]) + pipeline.set_outputs(const) + + dali_dataset = dali_tf.DALIDataset( + pipeline=pipeline, + batch_size=batch_size, + output_shapes=((batch_size, sample_size)), + output_dtypes=(tf.float32), + device_id=device_id) + + options = tf.data.Options() + options.experimental_optimization.apply_default_optimizations = False + options.experimental_optimization.autotune = 
False + + return dali_dataset.with_options(options) + + +class InputTypeSpecAndDevicePerReplicaTest(test.TestCase, parameterized.TestCase): + + @combinations.generate( + combinations.combine( + mode=["eager"], + distribution=[ + strategy_combinations.mirrored_strategy_with_two_gpus])) + def testInputSignatureForPerReplicaValues(self, distribution): + with distribution.scope(): + def dataset_fn(input_context): + return dali_const_dataset(4, 4, input_context.input_pipeline_id) + + ds = distribution.experimental_distribute_datasets_from_function( + dataset_fn, distribute_lib.InputReplicationMode.PER_REPLICA) + + iterator = iter(ds) + type_spec = iterator.element_spec + + @def_function.function(input_signature=[type_spec]) + def process_inputs(inputs): + distribution.run(lambda inputs: inputs, args=(inputs,)) + + for x in ds: + process_inputs(x) + self.assertEqual( + x[0].values[0].device, + distribution.extended.worker_devices[0]) + self.assertEqual( + x[0].values[0].backing_device, + distribution.extended.worker_devices[0]) + self.assertEqual( + x[0].values[1].device, + distribution.extended.worker_devices[1]) + self.assertEqual( + x[0].values[1].backing_device, + distribution.extended.worker_devices[1]) + break # DALI dataset is infinite + if __name__ == "__main__": combinations.main() diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 0291581e03f..cafbe91f2b1 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -532,6 +532,22 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): input_contexts, self._container_strategy()) + def _experimental_distribute_datasets_from_function_per_replica(self, dataset_fn): + input_contexts = [] + num_replicas = self.worker_devices + for i in range(len(num_replicas)): + input_contexts.append(distribute_lib.InputContext( + num_input_pipelines=len(num_replicas), + input_pipeline_id=i, + 
num_replicas_in_sync=self._num_replicas_in_sync)) + + return input_lib.get_distributed_datasets_from_function_per_replica( + dataset_fn, + self._input_workers, + input_contexts, + self._container_strategy()) + + def _experimental_distribute_values_from_function(self, value_fn): per_replica_values = [] for replica_id in range(self._num_replicas_in_sync): From 385e315f9535fc1a2506477567386091019a7324 Mon Sep 17 00:00:00 2001 From: Kushan Ahmadian Date: Wed, 17 Jun 2020 15:45:52 -0700 Subject: [PATCH 0053/1447] apply review changes --- .../python/distribute/distribute_lib.py | 2 - .../python/distribute/input_lib_test.py | 62 ------------------- .../python/distribute/mirrored_strategy.py | 16 ----- 3 files changed, 80 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 1f61603f1d9..4c335874436 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -1099,8 +1099,6 @@ class StrategyBase(object): `Dataset` (i.e. if a worker has two replicas, two batches will be dequeued from the `Dataset` every step). - `replication_mode` determines how to replicate the dataset pipeline. - This method can be used for several purposes. For example, where >>>>>>> 87a7ad3be2... 
Adding per-replica dataset distribution `experimental_distribute_dataset` is unable to shard the input files, this diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py index 038dee2df44..ec0b591d710 100644 --- a/tensorflow/python/distribute/input_lib_test.py +++ b/tensorflow/python/distribute/input_lib_test.py @@ -1250,67 +1250,5 @@ class DistributedIteratorTensorTypeTest(DistributedIteratorTestBase, self.assertAllEqual(nest.flatten(sums), [expected_for_sum] * 3) -def dali_const_dataset(batch_size, sample_size, device_id): - import tensorflow as tf - import nvidia.dali.fn as fn - from nvidia.dali.pipeline import Pipeline - import nvidia.dali.plugin.tf as dali_tf - - pipeline = Pipeline(batch_size, 4, device_id) - const = fn.constant(device = 'gpu', fdata = sample_size * [1.]) - pipeline.set_outputs(const) - - dali_dataset = dali_tf.DALIDataset( - pipeline=pipeline, - batch_size=batch_size, - output_shapes=((batch_size, sample_size)), - output_dtypes=(tf.float32), - device_id=device_id) - - options = tf.data.Options() - options.experimental_optimization.apply_default_optimizations = False - options.experimental_optimization.autotune = False - - return dali_dataset.with_options(options) - - -class InputTypeSpecAndDevicePerReplicaTest(test.TestCase, parameterized.TestCase): - - @combinations.generate( - combinations.combine( - mode=["eager"], - distribution=[ - strategy_combinations.mirrored_strategy_with_two_gpus])) - def testInputSignatureForPerReplicaValues(self, distribution): - with distribution.scope(): - def dataset_fn(input_context): - return dali_const_dataset(4, 4, input_context.input_pipeline_id) - - ds = distribution.experimental_distribute_datasets_from_function( - dataset_fn, distribute_lib.InputReplicationMode.PER_REPLICA) - - iterator = iter(ds) - type_spec = iterator.element_spec - - @def_function.function(input_signature=[type_spec]) - def process_inputs(inputs): - distribution.run(lambda 
inputs: inputs, args=(inputs,)) - - for x in ds: - process_inputs(x) - self.assertEqual( - x[0].values[0].device, - distribution.extended.worker_devices[0]) - self.assertEqual( - x[0].values[0].backing_device, - distribution.extended.worker_devices[0]) - self.assertEqual( - x[0].values[1].device, - distribution.extended.worker_devices[1]) - self.assertEqual( - x[0].values[1].backing_device, - distribution.extended.worker_devices[1]) - break # DALI dataset is infinite - if __name__ == "__main__": combinations.main() diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index cafbe91f2b1..0291581e03f 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -532,22 +532,6 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): input_contexts, self._container_strategy()) - def _experimental_distribute_datasets_from_function_per_replica(self, dataset_fn): - input_contexts = [] - num_replicas = self.worker_devices - for i in range(len(num_replicas)): - input_contexts.append(distribute_lib.InputContext( - num_input_pipelines=len(num_replicas), - input_pipeline_id=i, - num_replicas_in_sync=self._num_replicas_in_sync)) - - return input_lib.get_distributed_datasets_from_function_per_replica( - dataset_fn, - self._input_workers, - input_contexts, - self._container_strategy()) - - def _experimental_distribute_values_from_function(self, value_fn): per_replica_values = [] for replica_id in range(self._num_replicas_in_sync): From 895b39bf44d8a1f7e03d9290f0fc33246d69b6b6 Mon Sep 17 00:00:00 2001 From: kushanam Date: Sun, 26 Jul 2020 22:17:42 -0700 Subject: [PATCH 0054/1447] moving the replicaiton_mode to input_lib --- .../python/distribute/distribute_lib.py | 28 +--------- tensorflow/python/distribute/input_lib.py | 5 +- .../python/distribute/mirrored_strategy.py | 53 +++---------------- 3 files changed, 11 insertions(+), 75 deletions(-) diff --git 
a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 4c335874436..a7de8ae070b 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -1062,6 +1062,7 @@ class StrategyBase(object): Args: dataset: `tf.data.Dataset` that will be sharded across all replicas using the rules stated above. + replication_mode: Replication mode for the input function. options: `tf.distribute.InputOptions` used to control options on how this dataset is distributed. @@ -1075,7 +1076,6 @@ class StrategyBase(object): # pylint: disable=line-too-long """Distributes `tf.data.Dataset` instances created by calls to `dataset_fn`. -<<<<<<< HEAD The argument `dataset_fn` that users pass in is an input function that has a `tf.distribute.InputContext` argument and returns a `tf.data.Dataset` instance. It is expected that the returned dataset from `dataset_fn` is @@ -1093,14 +1093,6 @@ class StrategyBase(object): specify your own batching and sharding logic. (In contrast, `tf.distribute.experimental_distribute_dataset` does batching and sharding for you.)For example, where -======= - `dataset_fn` will be called once for each worker in the strategy. Each - replica on that worker will dequeue one batch of inputs from the local - `Dataset` (i.e. if a worker has two replicas, two batches will be dequeued - from the `Dataset` every step). - - This method can be used for several purposes. For example, where ->>>>>>> 87a7ad3be2... Adding per-replica dataset distribution `experimental_distribute_dataset` is unable to shard the input files, this method might be used to manually shard the dataset (avoiding the slow fallback behavior in `experimental_distribute_dataset`). In cases where the @@ -1131,7 +1123,6 @@ class StrategyBase(object): snippet](https://www.tensorflow.org/tutorials/distribute/input#caveats) for an example of how to order outputs. 
-<<<<<<< HEAD Note: Stateful dataset transformations are currently not supported with `tf.distribute.experimental_distribute_dataset` or `tf.distribute.distribute_datasets_from_function`. Any stateful @@ -1143,23 +1134,6 @@ class StrategyBase(object): For a tutorial on more usage and properties of this method, refer to the [tutorial on distributed input](https://www.tensorflow.org/tutorials/distribute/input#tfdistributestrategyexperimental_distribute_datasets_from_function)). If you are interested in last partial batch handling, read [this section](https://www.tensorflow.org/tutorials/distribute/input#partial_batches). -======= - In the case where you want to specify datasets `PER_REPLICA`, that is having - a separate dataset per each device, you can specify as follows. - - ```python - train_dist_dataset = strategy.experimental_distribute_datasets_from_function( - train_dataset_fn, - distribute_lib.InputReplicationMode.PER_REPLICA) - - train_dist_iterator = iter(train_dist_dataset) - for epoch in range(NUM_EPOCHS): - total_loss = 0.0 - for iteration in range(ITERATIONS): - data = next(train_dist_iterator) - total_loss += distributed_train_step(data) - ``` ->>>>>>> 9598cf19be... apply review changes Args: dataset_fn: A function taking a `tf.distribute.InputContext` instance and diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 9827e28626e..c8deb35e7fc 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -108,7 +108,7 @@ def get_distributed_datasets_from_function(dataset_fn, input_workers, input_contexts, strategy, - replication_mode=InputReplicationMode.PER_WORKER): + replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): """Returns a distributed dataset from the given input function. 
This is a common function that is used by all strategies to return a @@ -126,12 +126,13 @@ def get_distributed_datasets_from_function(dataset_fn, `worker_device_pairs`. strategy: a `tf.distribute.Strategy` object, used to run all-reduce to handle last partial batch. + replication_mode: Replication mode for the input function. Returns: A distributed dataset instance. """ if tf2.enabled(): - if strategy.extended.replication_mode == distribute_lib.InputReplicationMode.PER_WORKER: + if replication_mode == distribute_lib.InputReplicationMode.PER_WORKER: return DistributedDatasetsFromFunction( dataset_fn, input_workers, diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 0291581e03f..676e396735b 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -230,34 +230,6 @@ class MirroredStrategy(distribute_lib.Strategy): 1: } - `experimental_distribute_dataset` can be used to distribute the dataset across - the replicas when writing your own training loop. If you are using `.fit` and - `.compile` methods available in `tf.keras`, then `tf.keras` will handle the - distribution for you. - - For example: - - ```python - my_strategy = tf.distribute.MirroredStrategy() - with my_strategy.scope(): - @tf.function - def distribute_train_epoch(dataset): - def replica_fn(input): - # process input and return result - return result - - total_result = 0 - for x in dataset: - per_replica_result = my_strategy.run(replica_fn, args=(x,)) - total_result += my_strategy.reduce(tf.distribute.ReduceOp.SUM, - per_replica_result, axis=None) - return total_result - - dist_dataset = my_strategy.experimental_distribute_dataset(dataset) - for _ in range(EPOCHS): - train_result = distribute_train_epoch(dist_dataset) - ``` - Args: devices: a list of device strings such as `['/gpu:0', '/gpu:1']`. If `None`, all available GPUs are used. If no GPUs are found, CPU is used. 
@@ -267,11 +239,9 @@ class MirroredStrategy(distribute_lib.Strategy): the particular hardware is available. """ - def __init__(self, devices=None, cross_device_ops=None, - replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): + def __init__(self, devices=None, cross_device_ops=None): extended = MirroredExtended(self, devices=devices, - cross_device_ops=cross_device_ops, - replication_mode=replication_mode) + cross_device_ops=cross_device_ops) super(MirroredStrategy, self).__init__(extended) distribute_lib.distribution_strategy_gauge.get_cell("V2").set( "MirroredStrategy") @@ -294,8 +264,7 @@ class MirroredStrategyV1(distribute_lib.StrategyV1): # pylint: disable=g-missin class MirroredExtended(distribute_lib.StrategyExtendedV1): """Implementation of MirroredStrategy.""" - def __init__(self, container_strategy, devices=None, cross_device_ops=None, - replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): + def __init__(self, container_strategy, devices=None, cross_device_ops=None): super(MirroredExtended, self).__init__(container_strategy) if context.executing_eagerly(): if devices and not _is_device_list_single_worker(devices): @@ -316,7 +285,6 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): assert devices, ("Got an empty `devices` list and unable to recognize " "any local devices.") self._cross_device_ops = cross_device_ops - self._replication_mode = replication_mode self._initialize_strategy(devices) # TODO(b/128995245): Enable last partial batch support in graph mode. 
@@ -341,12 +309,8 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): def _initialize_single_worker(self, devices): """Initializes the object for single-worker training.""" self._devices = tuple(device_util.canonicalize(d) for d in devices) - if self._replication_mode == distribute_lib.InputReplicationMode.PER_WORKER: - self._input_workers_devices = ( - (device_util.canonicalize("/device:CPU:0", devices[0]), devices),) - else: - self._input_workers_devices = ( - tuple((device_util.canonicalize("/device:CPU:0", d),(d,)) for d in devices)) + self._input_workers_devices = ( + (device_util.canonicalize("/device:CPU:0", devices[0]), devices),) self._inferred_cross_device_ops = None if self._cross_device_ops else ( cross_device_ops_lib.choose_the_best(devices)) @@ -530,7 +494,8 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): dataset_fn, input_workers, input_contexts, - self._container_strategy()) + self._container_strategy(), + replication_mode) def _experimental_distribute_values_from_function(self, value_fn): per_replica_values = [] @@ -723,10 +688,6 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): def worker_devices(self): return self._devices - @property - def replication_mode(self): - return self._replication_mode - @property def worker_devices_by_replica(self): return [[d] for d in self._devices] From 920df6d33068abc4d4e2a35db2a61232c3b50ce7 Mon Sep 17 00:00:00 2001 From: kushanam Date: Mon, 27 Apr 2020 22:24:03 -0700 Subject: [PATCH 0055/1447] Adding per-replica dataset distribution --- tensorflow/python/distribute/mirrored_strategy.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 676e396735b..bad8d68f38c 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -264,7 +264,8 @@ class MirroredStrategyV1(distribute_lib.StrategyV1): # 
pylint: disable=g-missin class MirroredExtended(distribute_lib.StrategyExtendedV1): """Implementation of MirroredStrategy.""" - def __init__(self, container_strategy, devices=None, cross_device_ops=None): + def __init__(self, container_strategy, devices=None, cross_device_ops=None, + replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): super(MirroredExtended, self).__init__(container_strategy) if context.executing_eagerly(): if devices and not _is_device_list_single_worker(devices): @@ -285,6 +286,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): assert devices, ("Got an empty `devices` list and unable to recognize " "any local devices.") self._cross_device_ops = cross_device_ops + self._replication_mode = replication_mode self._initialize_strategy(devices) # TODO(b/128995245): Enable last partial batch support in graph mode. From 6dfba965a1f5ffdca7648fd15e392ec5dd54c0a3 Mon Sep 17 00:00:00 2001 From: kushanam Date: Sun, 26 Jul 2020 22:02:41 -0700 Subject: [PATCH 0056/1447] moving the replicaiton_mode to input_lib --- tensorflow/python/distribute/mirrored_strategy.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index bad8d68f38c..676e396735b 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -264,8 +264,7 @@ class MirroredStrategyV1(distribute_lib.StrategyV1): # pylint: disable=g-missin class MirroredExtended(distribute_lib.StrategyExtendedV1): """Implementation of MirroredStrategy.""" - def __init__(self, container_strategy, devices=None, cross_device_ops=None, - replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): + def __init__(self, container_strategy, devices=None, cross_device_ops=None): super(MirroredExtended, self).__init__(container_strategy) if context.executing_eagerly(): if devices and not 
_is_device_list_single_worker(devices): @@ -286,7 +285,6 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): assert devices, ("Got an empty `devices` list and unable to recognize " "any local devices.") self._cross_device_ops = cross_device_ops - self._replication_mode = replication_mode self._initialize_strategy(devices) # TODO(b/128995245): Enable last partial batch support in graph mode. From bc2e3ec92668f8cb5af5c492d1fc0d42f82b476e Mon Sep 17 00:00:00 2001 From: kushanam Date: Tue, 4 Aug 2020 12:46:16 -0700 Subject: [PATCH 0057/1447] addressing review for input worker --- tensorflow/python/distribute/distribute_lib.py | 9 ++++++--- tensorflow/python/distribute/input_lib.py | 6 +++--- tensorflow/python/distribute/mirrored_strategy.py | 13 ++++++++----- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index a7de8ae070b..66889d5e7e2 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -620,6 +620,7 @@ class RunOptions( class InputOptions( collections.namedtuple("InputOptions", [ "experimental_prefetch_to_device", + "replication_mode", ])): """Run options for `experimental_distribute_dataset(s_from_function)`. @@ -637,7 +638,8 @@ class InputOptions( strategy.experimental_distribute_dataset( dataset, tf.distribute.InputOptions( - experimental_prefetch_to_device=False))) + experimental_prefetch_to_device=False, + replication_mode=InputReplicationMode.PER_WORKER))) ``` Attributes: @@ -645,9 +647,11 @@ class InputOptions( elements will be prefetched to accelerator device memory. When False, dataset elements are prefetched to host device memory. Must be False when using TPUEmbedding API. + replication_mode: Replication mode for the input function. 
""" - def __new__(cls, experimental_prefetch_to_device=True): + def __new__(cls, experimental_prefetch_to_device=True, + replication_mode=InputReplicationMode.PER_WORKER): return super(InputOptions, cls).__new__(cls, experimental_prefetch_to_device) @@ -1062,7 +1066,6 @@ class StrategyBase(object): Args: dataset: `tf.data.Dataset` that will be sharded across all replicas using the rules stated above. - replication_mode: Replication mode for the input function. options: `tf.distribute.InputOptions` used to control options on how this dataset is distributed. diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index c8deb35e7fc..30d964af546 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -35,7 +35,7 @@ from tensorflow.python.distribute import distribution_strategy_context from tensorflow.python.distribute import input_ops from tensorflow.python.distribute import reduce_util from tensorflow.python.distribute import values -from tensorflow.python.distribute import distribute_lib +from tensorflow.python.distribute.distribute_lib import InputReplicationMode from tensorflow.python.eager import context from tensorflow.python.framework import composite_tensor from tensorflow.python.framework import constant_op @@ -108,7 +108,7 @@ def get_distributed_datasets_from_function(dataset_fn, input_workers, input_contexts, strategy, - replication_mode=distribute_lib.InputReplicationMode.PER_WORKER): + replication_mode=InputReplicationMode.PER_WORKER): """Returns a distributed dataset from the given input function. This is a common function that is used by all strategies to return a @@ -132,7 +132,7 @@ def get_distributed_datasets_from_function(dataset_fn, A distributed dataset instance. 
""" if tf2.enabled(): - if replication_mode == distribute_lib.InputReplicationMode.PER_WORKER: + if replication_mode == InputReplicationMode.PER_WORKER: return DistributedDatasetsFromFunction( dataset_fn, input_workers, diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 676e396735b..1e032613259 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -366,13 +366,15 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): logging.info("Using MirroredStrategy with remote devices %r", devices) - def _input_workers_with_options(self, options=None): + def _input_workers_with_options(self, options=None, input_workers_devices=None): + if not self._input_workers_devices: + input_workers_devices = self._input_workers_devices if not options or options.experimental_prefetch_to_device: - return input_lib.InputWorkers(self._input_workers_devices) + return input_lib.InputWorkers(input_workers_devices) else: return input_lib.InputWorkers( [(host_device, (host_device,) * len(compute_devices)) for - host_device, compute_devices in self._input_workers_devices]) + host_device, compute_devices in input_workers_devices]) @property def _input_workers(self): @@ -482,7 +484,8 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): def _distribute_datasets_from_function(self, dataset_fn, options): input_contexts = [] - input_workers = self._input_workers_with_options(options) + input_workers = self._input_workers_with_options( + options, input_workers_devices) num_workers = input_workers.num_workers for i in range(num_workers): input_contexts.append(distribute_lib.InputContext( @@ -495,7 +498,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): input_workers, input_contexts, self._container_strategy(), - replication_mode) + options.replication_mode) def _experimental_distribute_values_from_function(self, value_fn): per_replica_values 
= [] From e71480982b0e7db09ee27b4157bacb540dfdd8b9 Mon Sep 17 00:00:00 2001 From: kushanam Date: Sun, 9 Aug 2020 21:40:46 -0700 Subject: [PATCH 0058/1447] applying review changes --- tensorflow/python/distribute/distribute_lib.py | 4 +++- tensorflow/python/distribute/input_lib.py | 2 +- tensorflow/python/distribute/mirrored_strategy.py | 5 ++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 66889d5e7e2..e620773a5f0 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -647,7 +647,9 @@ class InputOptions( elements will be prefetched to accelerator device memory. When False, dataset elements are prefetched to host device memory. Must be False when using TPUEmbedding API. - replication_mode: Replication mode for the input function. + replication_mode: Replication mode for the input function. Currently, the + InputReplicationMode.PER_WORKER works only under mirrored_strategy for the + supported input functions. 
""" def __new__(cls, experimental_prefetch_to_device=True, diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 30d964af546..759f969d72b 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -139,7 +139,7 @@ def get_distributed_datasets_from_function(dataset_fn, input_contexts, strategy) else: - return DistributedDatasetsFromFunctionForReplicas( + return DistributedDatasetsFromFunctionForReplicas( dataset_fn, input_workers, input_contexts, diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 1e032613259..3d965d59671 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -367,7 +367,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): logging.info("Using MirroredStrategy with remote devices %r", devices) def _input_workers_with_options(self, options=None, input_workers_devices=None): - if not self._input_workers_devices: + if not input_workers_devices: input_workers_devices = self._input_workers_devices if not options or options.experimental_prefetch_to_device: return input_lib.InputWorkers(input_workers_devices) @@ -472,6 +472,9 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): self._container_strategy()) def _experimental_distribute_dataset(self, dataset, options): + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise RuntimeError("InputReplicationMode.PER_REPLICA " + "is only supported in `experimental_distribute_datasets_from_function`.") return input_lib.get_distributed_dataset( dataset, self._input_workers_with_options(options), From 259f11a194662745907fbf9797acb59a7e424044 Mon Sep 17 00:00:00 2001 From: kushanam Date: Mon, 17 Aug 2020 16:13:06 -0700 Subject: [PATCH 0059/1447] correcting InputOptions class desc --- 
tensorflow/python/distribute/distribute_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index e620773a5f0..85897b51553 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -648,7 +648,7 @@ class InputOptions( dataset elements are prefetched to host device memory. Must be False when using TPUEmbedding API. replication_mode: Replication mode for the input function. Currently, the - InputReplicationMode.PER_WORKER works only under mirrored_strategy for the + InputReplicationMode.PER_REPLICA works only under mirrored_strategy for the supported input functions. """ From 69d5b75f22d5186a5dfe33cc5a81d985e54c6552 Mon Sep 17 00:00:00 2001 From: kushanam Date: Thu, 20 Aug 2020 21:56:48 -0700 Subject: [PATCH 0060/1447] apply review changes p1 --- .../python/distribute/distribute_lib.py | 7 ++-- tensorflow/python/distribute/input_lib.py | 34 ++++++++++--------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 85897b51553..13583a01daf 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -647,9 +647,10 @@ class InputOptions( elements will be prefetched to accelerator device memory. When False, dataset elements are prefetched to host device memory. Must be False when using TPUEmbedding API. - replication_mode: Replication mode for the input function. Currently, the - InputReplicationMode.PER_REPLICA works only under mirrored_strategy for the - supported input functions. + replication_mode: Replication mode for the input function. Currently, the + InputReplicationMode.PER_REPLICA works only under + tf.distribute.MirroredStrategy for the supported input functions. + The default value is InputReplicationMode.PER_WORKER. 
""" def __new__(cls, experimental_prefetch_to_device=True, diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 759f969d72b..560b254ea82 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -680,21 +680,6 @@ class DistributedIteratorBase(DistributedIteratorInterface): return distribute_utils.regroup(replicas) -class DistributedIteratorForReplicas(DistributedIterator): - """Input Iterator for a distributed dataset on replicas.""" - def __init__(self, input_workers, iterators, strategy): - super(DistributedIteratorForReplicas, self).__init__(input_workers, iterators, strategy) - - def get_next(self, name=None): - """Returns the next input from the iterator for all replicas.""" - if not self._enable_get_next_as_optional: - replicas = [] - for iterator in self._iterators: - with ops.device(iterator._worker): - next_out = iterator.get_next_as_list_static_shapes(iterator._worker) - replicas.append(next_out) - return values.regroup(replicas) - class DistributedIteratorV1(DistributedIteratorBase): """Input Iterator for a distributed dataset.""" @@ -902,6 +887,23 @@ class DistributedIterator(DistributedIteratorBase, self._enable_get_next_as_optional) + +class DistributedIteratorForReplicas(DistributedIterator): + """Input Iterator for a distributed dataset on replicas.""" + def __init__(self, input_workers, iterators, strategy): + super(DistributedIteratorForReplicas, self).__init__(input_workers, iterators, strategy) + + def get_next(self, name=None): + """Returns the next input from the iterator for all replicas.""" + if not self._enable_get_next_as_optional: + replicas = [] + for iterator in self._iterators: + with ops.device(iterator._worker): + next_out = iterator.get_next_as_list_static_shapes(iterator._worker) + replicas.append(next_out) + return values.regroup(replicas) + + class _IterableInput(DistributedDatasetInterface): """Base class for iterable inputs for 
distribution strategies.""" @@ -1803,7 +1805,7 @@ class _SingleWorkerCallableIterator(object): def _create_iterators_per_replica_with_input_context(input_contexts, input_workers, dataset_fn): - """Create a multidevice iterator per workers given a dataset function.""" + """Create a multidevice iterator per workers given a dataset function.""" iterators = [] for i, ctx in enumerate(input_contexts): devices = input_workers.compute_devices_for_worker(i) From 080cff4b8c0e2a8ea2bd6c8ec5698b85157c508c Mon Sep 17 00:00:00 2001 From: kushanam Date: Fri, 21 Aug 2020 21:17:02 -0700 Subject: [PATCH 0061/1447] apply review changes - bug fixes --- tensorflow/python/distribute/distribute_lib.py | 3 ++- tensorflow/python/distribute/mirrored_strategy.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 13583a01daf..0511a757aab 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -656,7 +656,8 @@ class InputOptions( def __new__(cls, experimental_prefetch_to_device=True, replication_mode=InputReplicationMode.PER_WORKER): return super(InputOptions, cls).__new__(cls, - experimental_prefetch_to_device) + experimental_prefetch_to_device, + replication_mode) # ------------------------------------------------------------------------------ # Base classes for all distribution strategies. 
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 3d965d59671..77bc8bb542c 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -488,7 +488,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): def _distribute_datasets_from_function(self, dataset_fn, options): input_contexts = [] input_workers = self._input_workers_with_options( - options, input_workers_devices) + options, self._input_workers_devices) num_workers = input_workers.num_workers for i in range(num_workers): input_contexts.append(distribute_lib.InputContext( From ff6404544ce0463bcbd388554f9a0366e98f9fd8 Mon Sep 17 00:00:00 2001 From: kushanam Date: Wed, 26 Aug 2020 11:55:32 -0700 Subject: [PATCH 0062/1447] fix get_next for DistributedIteratorForReplicas --- tensorflow/python/distribute/input_lib.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 560b254ea82..4673db3dc72 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -898,10 +898,9 @@ class DistributedIteratorForReplicas(DistributedIterator): if not self._enable_get_next_as_optional: replicas = [] for iterator in self._iterators: - with ops.device(iterator._worker): - next_out = iterator.get_next_as_list_static_shapes(iterator._worker) - replicas.append(next_out) - return values.regroup(replicas) + next_out = iterator.get_next_as_list_static_shapes(iterator._worker) + replicas.append(next_out) + return distribute_utils.regroup(replicas) class _IterableInput(DistributedDatasetInterface): From 6ff64653b59e8ae6691e06b0c441343287bba7a0 Mon Sep 17 00:00:00 2001 From: kushanam Date: Sun, 6 Sep 2020 21:54:32 -0700 Subject: [PATCH 0063/1447] add per_replica input_lib test --- .../python/distribute/input_lib_test.py | 40 +++++++++++++++++++ 1 
file changed, 40 insertions(+) diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py index ec0b591d710..e65a724c3d8 100644 --- a/tensorflow/python/distribute/input_lib_test.py +++ b/tensorflow/python/distribute/input_lib_test.py @@ -1250,5 +1250,45 @@ class DistributedIteratorTensorTypeTest(DistributedIteratorTestBase, self.assertAllEqual(nest.flatten(sums), [expected_for_sum] * 3) +class DistributedIteratorPerReplicaTest(test.TestCase, parameterized.TestCase): + + @combinations.generate( + combinations.combine( + mode=["eager"], + distribution=[ + strategy_combinations.mirrored_strategy_with_two_gpus])) + def testInputSignatureForPerReplicaValues(self, distribution): + with distribution.scope(): + def dataset_fn(input_context): + return tf.data.Dataset.from_tensor_slices(np.zeros([4, 4])).apply(tf.data.experimental.prefetch_to_device('gpu')) + + input_options = tf.distribute.InputOptions(replication_mode = tf.distribute.InputReplicationMode.PER_REPLICA) + + ds = distribution.experimental_distribute_datasets_from_function( + dataset_fn, input_options) + + iterator = iter(ds) + type_spec = iterator.element_spec + + @def_function.function(input_signature=[type_spec]) + def process_inputs(inputs): + distribution.run(lambda inputs: inputs, args=(inputs,)) + + for x in ds: + process_inputs(x) + self.assertEqual( + x.values[0].device, + distribution.extended.worker_devices[0]) + self.assertEqual( + x.values[0].backing_device, + distribution.extended.worker_devices[0]) + self.assertEqual( + x.values[1].device, + distribution.extended.worker_devices[1]) + self.assertEqual( + x.values[1].backing_device, + distribution.extended.worker_devices[1]) + break + if __name__ == "__main__": combinations.main() From 0496fe469afb473961a6085536100ab63d344931 Mon Sep 17 00:00:00 2001 From: kushanam Date: Sun, 6 Sep 2020 22:07:26 -0700 Subject: [PATCH 0064/1447] remove extra spaces --- tensorflow/python/distribute/input_lib.py | 2 -- 
1 file changed, 2 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 4673db3dc72..26cee30272c 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -468,8 +468,6 @@ class DistributedDatasetInterface(collections_abc.Iterable, "DistributedDataset.reduce must be implemented in descendants.") - - class InputWorkers(object): """A 1-to-many mapping from input worker devices to compute devices.""" From 9939c59b1b06a00c6f8400e33c38ad3cf8698e67 Mon Sep 17 00:00:00 2001 From: kushanam Date: Sun, 13 Sep 2020 22:36:02 -0700 Subject: [PATCH 0065/1447] removing DistributedDatasetsFromFunctionForReplicas --- tensorflow/python/distribute/input_lib.py | 96 ++++++------------- .../python/distribute/mirrored_strategy.py | 13 ++- 2 files changed, 37 insertions(+), 72 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 26cee30272c..baac3e79d62 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -132,18 +132,12 @@ def get_distributed_datasets_from_function(dataset_fn, A distributed dataset instance. 
""" if tf2.enabled(): - if replication_mode == InputReplicationMode.PER_WORKER: - return DistributedDatasetsFromFunction( - dataset_fn, - input_workers, - input_contexts, - strategy) - else: - return DistributedDatasetsFromFunctionForReplicas( - dataset_fn, - input_workers, - input_contexts, - strategy) + return DistributedDatasetsFromFunction( + dataset_fn, + input_workers, + input_contexts, + strategy, + replication_mode) else: return DistributedDatasetsFromFunctionV1( dataset_fn, @@ -1147,7 +1141,7 @@ class DistributedDatasetV1(DistributedDataset): class DistributedDatasetsFromFunction(_IterableInput): """Inputs created from dataset function.""" - def __init__(self, dataset_fn, input_workers, input_contexts, strategy): + def __init__(self, dataset_fn, input_workers, input_contexts, strategy, replication_mode): """Makes an iterable from datasets created by the given function. Args: @@ -1158,6 +1152,7 @@ class DistributedDatasetsFromFunction(_IterableInput): `worker_device_pairs`. strategy: a `tf.distribute.Strategy` object, used to run all-reduce to handle last partial batch. + replication_mode: Replication mode for the input function. """ super(DistributedDatasetsFromFunction, self).__init__( input_workers=input_workers) @@ -1168,9 +1163,11 @@ class DistributedDatasetsFromFunction(_IterableInput): "input_contexts (%d)" % (input_workers.num_workers, len(input_contexts))) + self._dataset_fn = dataset_fn self._input_workers = input_workers self._input_contexts = input_contexts self._strategy = strategy + self._replication_mode = replication_mode self._datasets, element_spec = ( _create_datasets_per_worker_with_input_context(self._input_contexts, self._input_workers, @@ -1189,23 +1186,25 @@ class DistributedDatasetsFromFunction(_IterableInput): # out this change. 
enable_legacy_iterators = getattr(self._strategy, "_enable_legacy_iterators", False) + if self._replication_mode == InputReplicationMode.PER_WORKER: + iterators = _create_iterators_per_worker(self._datasets, + self._input_workers, + enable_legacy_iterators) - iterators = _create_iterators_per_worker(self._datasets, - self._input_workers, - enable_legacy_iterators) - - if enable_legacy_iterators: - iterator = DistributedIteratorV1( - self._input_workers, - iterators, - self._strategy, - enable_get_next_as_optional=self._enable_get_next_as_optional) + if enable_legacy_iterators: + iterator = DistributedIteratorV1(self._input_workers, iterators, + self._strategy) + else: + iterator = DistributedIterator(self._input_workers, iterators, + self._strategy) else: - iterator = DistributedIterator( - self._input_workers, - iterators, - self._strategy, - enable_get_next_as_optional=self._enable_get_next_as_optional) + iterators, element_spec = _create_iterators_per_replica_with_input_context( + self._input_contexts, self._input_workers, self._dataset_fn) + iterator = DistributedIteratorForReplicas( + self._input_workers, iterators, self._strategy) + self._element_spec = _create_distributed_tensor_spec( + self._strategy, element_spec) + iterator._element_spec = self._element_spec # pylint: disable=protected-access # When async eager is enabled, sometimes the iterator may not finish @@ -1225,47 +1224,6 @@ class DistributedDatasetsFromFunction(_IterableInput): return self._element_spec -class DistributedDatasetsFromFunctionForReplicas(_IterableInput): - """Inputs created from dataset function.""" - - def __init__(self, dataset_fn, input_workers, input_contexts, strategy): - super(DistributedDatasetsFromFunctionForReplicas, self).__init__( - input_workers=input_workers) - - self._dataset_fn = dataset_fn - self._input_workers = input_workers - self._input_contexts = input_contexts - self._strategy = strategy - self._element_spec = None - - - def __iter__(self): - if not 
(context.executing_eagerly() or - ops.get_default_graph().building_function): - raise RuntimeError("__iter__() is only supported inside of tf.function " - "or when eager execution is enabled.") - - iterators, element_spec = _create_iterators_per_replica_with_input_context( - self._input_contexts, self._input_workers, self._dataset_fn) - iterator = DistributedIteratorForReplicas(self._input_workers, iterators, self._strategy) - self._element_spec = _create_distributed_tensor_spec( - self._strategy, element_spec) - iterator._element_spec = self._element_spec # pylint: disable=protected-access - return iterator - - @property - def element_spec(self): - """The type specification of an element of this dataset.""" - if self._element_spec is None: - raise ValueError("You must create an iterator before calling " - "`element_spec` on the distributed dataset or iterator. " - "This is because the dataset function is not called " - "before an iterator is created.") - return self._element_spec - - - - class DistributedDatasetsFromFunctionV1(DistributedDatasetsFromFunction): """Inputs created from dataset function.""" diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 77bc8bb542c..b3cc98cd59b 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -485,10 +485,17 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): return numpy_dataset.one_host_numpy_dataset( numpy_input, self._host_input_device, session) - def _distribute_datasets_from_function(self, dataset_fn, options): + def _distribute_datasets_from_function(self, dataset_fn, + options): + if options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + self._input_workers_devices = ( + tuple((device_util.canonicalize("/device:CPU:0", d), (d,)) for d in self._devices)) + input_workers = self._input_workers_with_options( + None, self._input_workers_devices) + else: 
+ input_workers = self._input_workers_with_options( + options, self._input_workers_devices) input_contexts = [] - input_workers = self._input_workers_with_options( - options, self._input_workers_devices) num_workers = input_workers.num_workers for i in range(num_workers): input_contexts.append(distribute_lib.InputContext( From bf704ce82b3a98d03d28d17087d8c175588ed007 Mon Sep 17 00:00:00 2001 From: kushanam Date: Tue, 15 Sep 2020 20:19:39 -0700 Subject: [PATCH 0066/1447] addressing input_lib reviews --- tensorflow/python/distribute/distribute_lib.py | 2 +- tensorflow/python/distribute/input_lib.py | 14 +++++--------- tensorflow/python/distribute/mirrored_strategy.py | 2 +- 3 files changed, 7 insertions(+), 11 deletions(-) diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index 0511a757aab..f45a3f9b601 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -649,7 +649,7 @@ class InputOptions( using TPUEmbedding API. replication_mode: Replication mode for the input function. Currently, the InputReplicationMode.PER_REPLICA works only under - tf.distribute.MirroredStrategy for the supported input functions. + tf.distribute.MirroredStrategy for the input functions. The default value is InputReplicationMode.PER_WORKER. 
""" diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index baac3e79d62..f1c404a3c77 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -1190,7 +1190,6 @@ class DistributedDatasetsFromFunction(_IterableInput): iterators = _create_iterators_per_worker(self._datasets, self._input_workers, enable_legacy_iterators) - if enable_legacy_iterators: iterator = DistributedIteratorV1(self._input_workers, iterators, self._strategy) @@ -1198,12 +1197,10 @@ class DistributedDatasetsFromFunction(_IterableInput): iterator = DistributedIterator(self._input_workers, iterators, self._strategy) else: - iterators, element_spec = _create_iterators_per_replica_with_input_context( + iterators = _create_iterators_per_replica( self._input_contexts, self._input_workers, self._dataset_fn) iterator = DistributedIteratorForReplicas( self._input_workers, iterators, self._strategy) - self._element_spec = _create_distributed_tensor_spec( - self._strategy, element_spec) iterator._element_spec = self._element_spec # pylint: disable=protected-access @@ -1713,7 +1710,7 @@ class _SingleWorkerDatasetIterator(_SingleWorkerDatasetIteratorBase): return dataset_ops.get_legacy_output_types(self._iterator) -class _SingleReplicaDatasetIterator(_SingleWorkerDatasetIterator): +class _SingleReplicaDatasetIterator(_SingleWorkerOwnedDatasetIterator): def __init__(self, dataset, device): super(_SingleReplicaDatasetIterator, self).__init__(dataset, device, []) @@ -1757,9 +1754,8 @@ class _SingleWorkerCallableIterator(object): return [] -def _create_iterators_per_replica_with_input_context(input_contexts, - input_workers, - dataset_fn): +def _create_iterators_per_replica(input_contexts,input_workers, + dataset_fn): """Create a multidevice iterator per workers given a dataset function.""" iterators = [] for i, ctx in enumerate(input_contexts): @@ -1769,7 +1765,7 @@ def 
_create_iterators_per_replica_with_input_context(input_contexts, # Wrapping dataset here (ex. applying options) might result in moving it to the CPU iterator = _SingleReplicaDatasetIterator(dataset, devices[0]) iterators.append(iterator) - return iterators, dataset.element_spec + return iterators diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index b3cc98cd59b..66a7e75cd02 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -473,7 +473,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): def _experimental_distribute_dataset(self, dataset, options): if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: - raise RuntimeError("InputReplicationMode.PER_REPLICA " + raise NotImplementedError("InputReplicationMode.PER_REPLICA " "is only supported in `experimental_distribute_datasets_from_function`.") return input_lib.get_distributed_dataset( dataset, From 0edc2f70e2ecba161ca092874a732343bcfaa1ab Mon Sep 17 00:00:00 2001 From: kushanam Date: Tue, 15 Sep 2020 21:39:53 -0700 Subject: [PATCH 0067/1447] apply test input_lib_test reviews --- .../python/distribute/input_lib_test.py | 62 +++++++++++-------- 1 file changed, 35 insertions(+), 27 deletions(-) diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py index e65a724c3d8..6edac457c88 100644 --- a/tensorflow/python/distribute/input_lib_test.py +++ b/tensorflow/python/distribute/input_lib_test.py @@ -1256,39 +1256,47 @@ class DistributedIteratorPerReplicaTest(test.TestCase, parameterized.TestCase): combinations.combine( mode=["eager"], distribution=[ - strategy_combinations.mirrored_strategy_with_two_gpus])) + strategy_combinations.mirrored_strategy_with_two_gpus, + strategy_combinations.mirrored_strategy_with_cpu_1_and_2, + strategy_combinations.mirrored_strategy_with_gpu_and_cpu, + ])) + 
+ def setUp(self): + super(DistributedIteratorPerReplicaTest, self).setUp() + strategy_combinations.set_virtual_cpus_to_at_least(3) + def testInputSignatureForPerReplicaValues(self, distribution): - with distribution.scope(): - def dataset_fn(input_context): - return tf.data.Dataset.from_tensor_slices(np.zeros([4, 4])).apply(tf.data.experimental.prefetch_to_device('gpu')) + def dataset_fn(input_context): + return tf.data.Dataset.from_tensor_slices( + np.array(np.arange(0, 16)).reshape(4, 4)) - input_options = tf.distribute.InputOptions(replication_mode = tf.distribute.InputReplicationMode.PER_REPLICA) + input_options = tf.distribute.InputOptions(replication_mode = tf.distribute.InputReplicationMode.PER_REPLICA) - ds = distribution.experimental_distribute_datasets_from_function( - dataset_fn, input_options) + ds = distribution.experimental_distribute_datasets_from_function( + dataset_fn, input_options) - iterator = iter(ds) - type_spec = iterator.element_spec + iterator = iter(ds) + type_spec = iterator.element_spec - @def_function.function(input_signature=[type_spec]) - def process_inputs(inputs): - distribution.run(lambda inputs: inputs, args=(inputs,)) + @def_function.function(input_signature=[type_spec]) + def process_inputs(inputs): + distribution.run(lambda inputs: inputs, args=(inputs,)) - for x in ds: - process_inputs(x) - self.assertEqual( - x.values[0].device, - distribution.extended.worker_devices[0]) - self.assertEqual( - x.values[0].backing_device, - distribution.extended.worker_devices[0]) - self.assertEqual( - x.values[1].device, - distribution.extended.worker_devices[1]) - self.assertEqual( - x.values[1].backing_device, - distribution.extended.worker_devices[1]) - break + for x in ds: + process_inputs(x) + self.assertEqual( + x.values[0].device, + distribution.extended.worker_devices[0]) + self.assertEqual( + x.values[0].backing_device, + distribution.extended.worker_devices[0]) + self.assertEqual( + x.values[1].device, + 
distribution.extended.worker_devices[1]) + self.assertEqual( + x.values[1].backing_device, + distribution.extended.worker_devices[1]) + break if __name__ == "__main__": combinations.main() From 8911c89eede18881270dda5fb429777ff954f00a Mon Sep 17 00:00:00 2001 From: kushanam Date: Tue, 15 Sep 2020 21:42:27 -0700 Subject: [PATCH 0068/1447] fix layout --- .../python/distribute/input_lib_test.py | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/tensorflow/python/distribute/input_lib_test.py b/tensorflow/python/distribute/input_lib_test.py index 6edac457c88..c69c26500ef 100644 --- a/tensorflow/python/distribute/input_lib_test.py +++ b/tensorflow/python/distribute/input_lib_test.py @@ -1270,7 +1270,8 @@ class DistributedIteratorPerReplicaTest(test.TestCase, parameterized.TestCase): return tf.data.Dataset.from_tensor_slices( np.array(np.arange(0, 16)).reshape(4, 4)) - input_options = tf.distribute.InputOptions(replication_mode = tf.distribute.InputReplicationMode.PER_REPLICA) + input_options = tf.distribute.InputOptions( + replication_mode=tf.distribute.InputReplicationMode.PER_REPLICA) ds = distribution.experimental_distribute_datasets_from_function( dataset_fn, input_options) @@ -1285,18 +1286,19 @@ class DistributedIteratorPerReplicaTest(test.TestCase, parameterized.TestCase): for x in ds: process_inputs(x) self.assertEqual( - x.values[0].device, - distribution.extended.worker_devices[0]) + x.values[0].device, + distribution.extended.worker_devices[0]) self.assertEqual( - x.values[0].backing_device, - distribution.extended.worker_devices[0]) + x.values[0].backing_device, + distribution.extended.worker_devices[0]) self.assertEqual( - x.values[1].device, - distribution.extended.worker_devices[1]) + x.values[1].device, + distribution.extended.worker_devices[1]) self.assertEqual( - x.values[1].backing_device, - distribution.extended.worker_devices[1]) + x.values[1].backing_device, + distribution.extended.worker_devices[1]) break + if 
__name__ == "__main__": combinations.main() From 28327c4681c3db98b764ce36ffae95925a2ef2ff Mon Sep 17 00:00:00 2001 From: kushanam Date: Tue, 15 Sep 2020 22:42:58 -0700 Subject: [PATCH 0069/1447] extending NI error to other strategies --- tensorflow/python/distribute/central_storage_strategy.py | 3 +++ tensorflow/python/distribute/collective_all_reduce_strategy.py | 3 +++ tensorflow/python/distribute/one_device_strategy.py | 3 +++ tensorflow/python/distribute/parameter_server_strategy.py | 3 +++ 4 files changed, 12 insertions(+) diff --git a/tensorflow/python/distribute/central_storage_strategy.py b/tensorflow/python/distribute/central_storage_strategy.py index e61570dd6bd..3264f2dd587 100644 --- a/tensorflow/python/distribute/central_storage_strategy.py +++ b/tensorflow/python/distribute/central_storage_strategy.py @@ -102,6 +102,9 @@ class CentralStorageStrategy(distribute_lib.Strategy): Returns: A "distributed `Dataset`" that the caller can iterate over. """ + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise NotImplementedError("InputReplicationMode.PER_REPLICA " + "is only supported in `experimental_distribute_datasets_from_function`.") return super(CentralStorageStrategy, self).experimental_distribute_dataset( dataset, options) diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py index a3e63e8a6f1..964c3f06bd7 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py @@ -471,6 +471,9 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): return input_context def _experimental_distribute_dataset(self, dataset, options): + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise NotImplementedError("InputReplicationMode.PER_REPLICA " + "is only supported in 
`experimental_distribute_datasets_from_function`.") input_context = self._make_input_context() return input_lib.get_distributed_dataset( dataset, diff --git a/tensorflow/python/distribute/one_device_strategy.py b/tensorflow/python/distribute/one_device_strategy.py index c256c2df78f..d08ea624930 100644 --- a/tensorflow/python/distribute/one_device_strategy.py +++ b/tensorflow/python/distribute/one_device_strategy.py @@ -312,6 +312,9 @@ class OneDeviceExtended(distribute_lib.StrategyExtendedV1): def _experimental_distribute_dataset(self, dataset, options): # Note that split_batch_by argument is not passed because it is always 1 in # this strategy, and adding it adds unnecessary overhead to the dataset. + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise NotImplementedError("InputReplicationMode.PER_REPLICA " + "is only supported in `experimental_distribute_datasets_from_function`.") return input_lib.get_distributed_dataset( dataset, self._input_workers_with_options(options), diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index b60ea74dd04..e7ccdce6b5a 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -123,6 +123,9 @@ class ParameterServerStrategy(distribute_lib.Strategy): len(self.extended.parameter_devices)) def experimental_distribute_dataset(self, dataset, options=None): + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise NotImplementedError("InputReplicationMode.PER_REPLICA " + "is only supported in `experimental_distribute_datasets_from_function`.") self._raise_pss_error_if_eager() super(ParameterServerStrategy, self).experimental_distribute_dataset(dataset=dataset, From a66976034153967f01740dc06a667cb437da9bf7 Mon Sep 17 00:00:00 2001 From: kushanam Date: Thu, 17 Sep 2020 15:11:19 -0700 
Subject: [PATCH 0070/1447] refactor iterators per replica --- tensorflow/python/distribute/input_lib.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index f1c404a3c77..f14fdcdb5ec 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -1754,17 +1754,16 @@ class _SingleWorkerCallableIterator(object): return [] -def _create_iterators_per_replica(input_contexts,input_workers, +def _create_iterators_per_replica(input_contexts, input_workers, dataset_fn): """Create a multidevice iterator per workers given a dataset function.""" iterators = [] for i, ctx in enumerate(input_contexts): devices = input_workers.compute_devices_for_worker(i) - with ops.device(devices[0]): - dataset = dataset_fn(ctx) - # Wrapping dataset here (ex. applying options) might result in moving it to the CPU - iterator = _SingleReplicaDatasetIterator(dataset, devices[0]) - iterators.append(iterator) + dataset = dataset_fn(ctx) + # Wrapping dataset here (ex. 
applying options) might result in moving it to the CPU + iterator = _SingleReplicaDatasetIterator(dataset, devices[0]) + iterators.append(iterator) return iterators From ebb4c6746ffdf8df89654dfbd58dd9d6e96dc39a Mon Sep 17 00:00:00 2001 From: kushanam Date: Mon, 21 Sep 2020 21:42:58 -0700 Subject: [PATCH 0071/1447] apply review changes --- .../collective_all_reduce_strategy.py | 7 ++++- .../python/distribute/distribute_lib.py | 4 +-- .../python/distribute/mirrored_strategy.py | 26 ++++++++++++++++++- .../python/distribute/one_device_strategy.py | 7 ++++- .../distribute/parameter_server_strategy.py | 7 ++++- tensorflow/python/distribute/tpu_strategy.py | 10 ++++++- 6 files changed, 54 insertions(+), 7 deletions(-) diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py index 964c3f06bd7..7bca1810bfd 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py @@ -482,7 +482,12 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): split_batch_by=self._num_replicas_in_sync, input_context=input_context) - def _distribute_datasets_from_function(self, dataset_fn, options): + def _distribute_datasets_from_function(self, dataset_fn, + options): + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise NotImplementedError("InputReplicationMode.PER_REPLICA " + "is only supported in `experimental_distribute_datasets_from_function` " + "of mirrored_strategy") input_context = self._make_input_context() return input_lib.get_distributed_datasets_from_function( dataset_fn=dataset_fn, diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py index f45a3f9b601..68a739c9cf5 100644 --- a/tensorflow/python/distribute/distribute_lib.py +++ b/tensorflow/python/distribute/distribute_lib.py @@ -648,8 +648,8 @@ 
class InputOptions( dataset elements are prefetched to host device memory. Must be False when using TPUEmbedding API. replication_mode: Replication mode for the input function. Currently, the - InputReplicationMode.PER_REPLICA works only under - tf.distribute.MirroredStrategy for the input functions. + InputReplicationMode.PER_REPLICA is only supported with + tf.distribute.MirroredStrategy.experimental_distribute_datasets_from_function. The default value is InputReplicationMode.PER_WORKER. """ diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index 66a7e75cd02..f75d0ddc3e9 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -230,6 +230,30 @@ class MirroredStrategy(distribute_lib.Strategy): 1: } + `experimental_distribute_dataset` can be used to distribute the dataset across + the replicas when writing your own training loop. If you are using `.fit` and + `.compile` methods available in `tf.keras`, then `tf.keras` will handle the + distribution for you. + For example: + ```python + my_strategy = tf.distribute.MirroredStrategy() + with my_strategy.scope(): + @tf.function + def distribute_train_epoch(dataset): + def replica_fn(input): + # process input and return result + return result + total_result = 0 + for x in dataset: + per_replica_result = my_strategy.run(replica_fn, args=(x,)) + total_result += my_strategy.reduce(tf.distribute.ReduceOp.SUM, + per_replica_result, axis=None) + return total_result + dist_dataset = my_strategy.experimental_distribute_dataset(dataset) + for _ in range(EPOCHS): + train_result = distribute_train_epoch(dist_dataset) + ``` + Args: devices: a list of device strings such as `['/gpu:0', '/gpu:1']`. If `None`, all available GPUs are used. If no GPUs are found, CPU is used. 
@@ -700,7 +724,7 @@ class MirroredExtended(distribute_lib.StrategyExtendedV1): @property def worker_devices(self): return self._devices - + @property def worker_devices_by_replica(self): return [[d] for d in self._devices] diff --git a/tensorflow/python/distribute/one_device_strategy.py b/tensorflow/python/distribute/one_device_strategy.py index d08ea624930..003a24c2b6e 100644 --- a/tensorflow/python/distribute/one_device_strategy.py +++ b/tensorflow/python/distribute/one_device_strategy.py @@ -320,7 +320,12 @@ class OneDeviceExtended(distribute_lib.StrategyExtendedV1): self._input_workers_with_options(options), self._container_strategy()) - def _distribute_datasets_from_function(self, dataset_fn, options): + def _distribute_datasets_from_function(self, dataset_fn, + options): + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise NotImplementedError("InputReplicationMode.PER_REPLICA " + "is only supported in `experimental_distribute_datasets_from_function` " + "of mirrored_strategy") return input_lib.get_distributed_datasets_from_function( dataset_fn, self._input_workers_with_options(options), diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index e7ccdce6b5a..4b07980e8ec 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -131,7 +131,12 @@ class ParameterServerStrategy(distribute_lib.Strategy): self).experimental_distribute_dataset(dataset=dataset, options=options) - def distribute_datasets_from_function(self, dataset_fn, options=None): + def distribute_datasets_from_function(self, dataset_fn, + options=None): + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise NotImplementedError("InputReplicationMode.PER_REPLICA " + "is only supported in `experimental_distribute_datasets_from_function` " + "of 
mirrored_strategy") self._raise_pss_error_if_eager() super(ParameterServerStrategy, self).distribute_datasets_from_function( dataset_fn=dataset_fn, options=options) diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index 1a3d49a2032..a2cf20e5c39 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -802,6 +802,9 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): "distribution function.".format(path, type(spec))) def _experimental_distribute_dataset(self, dataset, options): + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise NotImplementedError("InputReplicationMode.PER_REPLICA " + "is only supported in `experimental_distribute_datasets_from_function`.") if options is None or options.experimental_prefetch_to_device: self._check_spec(dataset.element_spec) @@ -811,7 +814,12 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): self._container_strategy(), split_batch_by=self._num_replicas_in_sync) - def _distribute_datasets_from_function(self, dataset_fn, options): + def _distribute_datasets_from_function(self, dataset_fn, + options): + if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: + raise NotImplementedError("InputReplicationMode.PER_REPLICA " + "is only supported in `experimental_distribute_datasets_from_function` " + "of mirrored_strategy") input_workers = self._get_input_workers(options) input_contexts = [] num_workers = input_workers.num_workers From 3e0688d8c81a5aa217af320296e2f41bd5a59048 Mon Sep 17 00:00:00 2001 From: kushanam Date: Sun, 27 Sep 2020 19:08:11 -0700 Subject: [PATCH 0072/1447] unifying the DistributedDatasetsFromFunction --- tensorflow/python/distribute/input_lib.py | 76 ++++++++++------------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py 
b/tensorflow/python/distribute/input_lib.py index f14fdcdb5ec..7e044227552 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -879,22 +879,6 @@ class DistributedIterator(DistributedIteratorBase, self._enable_get_next_as_optional) - -class DistributedIteratorForReplicas(DistributedIterator): - """Input Iterator for a distributed dataset on replicas.""" - def __init__(self, input_workers, iterators, strategy): - super(DistributedIteratorForReplicas, self).__init__(input_workers, iterators, strategy) - - def get_next(self, name=None): - """Returns the next input from the iterator for all replicas.""" - if not self._enable_get_next_as_optional: - replicas = [] - for iterator in self._iterators: - next_out = iterator.get_next_as_list_static_shapes(iterator._worker) - replicas.append(next_out) - return distribute_utils.regroup(replicas) - - class _IterableInput(DistributedDatasetInterface): """Base class for iterable inputs for distribution strategies.""" @@ -1179,29 +1163,23 @@ class DistributedDatasetsFromFunction(_IterableInput): def __iter__(self): if (ops.executing_eagerly_outside_functions() or - ops.get_default_graph().building_function): + ops.get_default_graph().building_function): # This is an optional flag that can be used to turn off using # OwnedMultiDeviceIterators and instead use the legacy # MultiDeviceIterators as a stop gap solution that will allow us to roll # out this change. 
enable_legacy_iterators = getattr(self._strategy, "_enable_legacy_iterators", False) - if self._replication_mode == InputReplicationMode.PER_WORKER: - iterators = _create_iterators_per_worker(self._datasets, - self._input_workers, - enable_legacy_iterators) - if enable_legacy_iterators: - iterator = DistributedIteratorV1(self._input_workers, iterators, - self._strategy) - else: - iterator = DistributedIterator(self._input_workers, iterators, + iterators = _create_iterators_per_worker(self._datasets, + self._input_workers, + enable_legacy_iterators, + self._replication_mode) + if enable_legacy_iterators: + iterator = DistributedIteratorV1(self._input_workers, iterators, self._strategy) else: - iterators = _create_iterators_per_replica( - self._input_contexts, self._input_workers, self._dataset_fn) - iterator = DistributedIteratorForReplicas( - self._input_workers, iterators, self._strategy) - + iterator = DistributedIterator(self._input_workers, iterators, + self._strategy) iterator._element_spec = self._element_spec # pylint: disable=protected-access # When async eager is enabled, sometimes the iterator may not finish @@ -1436,7 +1414,8 @@ def _recover_shape_fn(data, value_structure): class _SingleWorkerDatasetIteratorBase(object): """Iterator for a single `tf.data.Dataset`.""" - def __init__(self, dataset, worker, devices): + def __init__(self, dataset, worker, devices, + replication_mode=InputReplicationMode.PER_WORKER): """Create iterator for the `dataset` to fetch data to worker's `devices` . 
A `MultiDeviceIterator` or `OwnedMultiDeviceIterator` is used to prefetch @@ -1451,6 +1430,7 @@ class _SingleWorkerDatasetIteratorBase(object): self._worker = worker self._devices = devices self._element_spec = dataset.element_spec + self._replication_mode = replication_mode self._make_iterator() def _make_iterator(self): @@ -1578,7 +1558,7 @@ class _SingleWorkerOwnedDatasetIterator(_SingleWorkerDatasetIteratorBase, """Iterator for a DistributedDataset instance.""" def __init__(self, dataset=None, worker=None, devices=None, components=None, - element_spec=None): + element_spec=None, replication_mode=InputReplicationMode.PER_WORKER): """Create iterator for the `dataset` to fetch data to worker's `devices` . `OwnedMultiDeviceIterator` is used to prefetch input to the devices on the @@ -1608,21 +1588,28 @@ class _SingleWorkerOwnedDatasetIterator(_SingleWorkerDatasetIteratorBase, self._worker = worker self._devices = devices self._iterator = components[0] + self._replication_mode = replication_mode else: if (components is not None or element_spec is not None): raise ValueError(error_message) - super(_SingleWorkerOwnedDatasetIterator, self).__init__(dataset, worker, - devices) + super(_SingleWorkerOwnedDatasetIterator, self).__init__(dataset=dataset, + worker=worker, + devices=devices, + replication_mode=replication_mode) def _make_iterator(self): """Make appropriate iterator on the dataset.""" if not self._worker: raise ValueError("Worked device must be specified when creating an " "owned iterator.") - host_device = device_util.get_host_for_device(self._worker) - with ops.device(self._worker): - self._iterator = multi_device_iterator_ops.OwnedMultiDeviceIterator( - self._dataset, self._devices, source_device=host_device) + if self._replication_mode == InputReplicationMode.PER_WORKER: + host_device = device_util.get_host_for_device(self._worker) + with ops.device(self._worker): + self._iterator = multi_device_iterator_ops.OwnedMultiDeviceIterator( + self._dataset, 
self._devices, source_device=host_device) + else: + with ops.device(self._devices[0]): + self._iterator = iter(self._dataset) @property def element_spec(self): @@ -1767,20 +1754,21 @@ def _create_iterators_per_replica(input_contexts, input_workers, return iterators - def _create_iterators_per_worker(worker_datasets, input_workers, - enable_legacy_iterators): + enable_legacy_iterators, + replication_mode=InputReplicationMode.PER_WORKER): """Create a multidevice iterator on each of the workers.""" assert isinstance(input_workers, InputWorkers) - assert len(worker_datasets) == len(input_workers.worker_devices) iterators = [] for i, worker in enumerate(input_workers.worker_devices): with ops.device(worker): worker_devices = input_workers.compute_devices_for_worker(i) if tf2.enabled() and not enable_legacy_iterators: - iterator = _SingleWorkerOwnedDatasetIterator(worker_datasets[i], worker, - worker_devices) + iterator = _SingleWorkerOwnedDatasetIterator(dataset=worker_datasets[i], + worker=worker, + devices=worker_devices, + replication_mode=replication_mode) else: iterator = _SingleWorkerDatasetIterator(worker_datasets[i], worker, worker_devices) From fe52a4dac4440cbce9df7c3a54badb799286791f Mon Sep 17 00:00:00 2001 From: "ag.ramesh" Date: Wed, 30 Sep 2020 19:50:57 -0700 Subject: [PATCH 0073/1447] Removed redundant \ --- third_party/llvm/expand_cmake_vars.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/third_party/llvm/expand_cmake_vars.py b/third_party/llvm/expand_cmake_vars.py index 73f071aa504..a3a1e524691 100644 --- a/third_party/llvm/expand_cmake_vars.py +++ b/third_party/llvm/expand_cmake_vars.py @@ -25,7 +25,7 @@ import sys _CMAKE_DEFINE_REGEX = re.compile(r"\s*#cmakedefine\s+([A-Za-z_0-9]*)(\s.*)?$") _CMAKE_DEFINE01_REGEX = re.compile(r"\s*#cmakedefine01\s+([A-Za-z_0-9]*)") _CMAKE_VAR_REGEX = re.compile(r"\${([A-Za-z_0-9]*)}") -_CMAKE_ATVAR_REGEX = re.compile(r"\@([A-Za-z_0-9]*)@") +_CMAKE_ATVAR_REGEX = re.compile(r"@([A-Za-z_0-9]*)@") def 
_parse_args(argv): From b905066f7e946f440d84f7b8667a47fb261563f1 Mon Sep 17 00:00:00 2001 From: "ag.ramesh" Date: Thu, 1 Oct 2020 22:07:31 -0700 Subject: [PATCH 0074/1447] Updated comments --- third_party/llvm/expand_cmake_vars.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/llvm/expand_cmake_vars.py b/third_party/llvm/expand_cmake_vars.py index a3a1e524691..067e4f88d5a 100644 --- a/third_party/llvm/expand_cmake_vars.py +++ b/third_party/llvm/expand_cmake_vars.py @@ -38,10 +38,10 @@ def _parse_args(argv): def _expand_variables(input_str, cmake_vars): - """Expands ${VARIABLE}s in 'input_str', using dictionary 'cmake_vars'. + """Expands ${VARIABLE} and @VARIABLE@s in 'input_str', using dictionary 'cmake_vars'. Args: - input_str: the string containing ${VARIABLE} expressions to expand. + input_str: the string containing ${VARIABLE} and @VARIABLE@ expressions to expand. cmake_vars: a dictionary mapping variable names to their values. Returns: From e65436218d85570e599948e18168a69136a6ee10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Fri, 2 Oct 2020 09:47:38 +0200 Subject: [PATCH 0075/1447] TFlu: Add Cortex-M generic target makefile --- .../lite/micro/cortex-m-generic/debug_log.cc | 26 ++++ .../micro/tools/make/download_and_extract.sh | 8 ++ .../make/ext_libs/{cmsis.inc => cmsis_nn.inc} | 18 ++- .../targets/cortex_m_generic_makefile.inc | 125 ++++++++++++++++++ 4 files changed, 174 insertions(+), 3 deletions(-) create mode 100644 tensorflow/lite/micro/cortex-m-generic/debug_log.cc rename tensorflow/lite/micro/tools/make/ext_libs/{cmsis.inc => cmsis_nn.inc} (92%) create mode 100644 tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc diff --git a/tensorflow/lite/micro/cortex-m-generic/debug_log.cc b/tensorflow/lite/micro/cortex-m-generic/debug_log.cc new file mode 100644 index 00000000000..baebe1f5964 --- /dev/null +++ b/tensorflow/lite/micro/cortex-m-generic/debug_log.cc @@ -0,0 +1,26 @@ +/* 
Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/lite/micro/debug_log.h" + +#ifdef DEBUG +#include +#endif + +extern "C" void DebugLog(const char* s) { +#ifdef DEBUG + fprintf(stderr, "%s", s); +#endif +} diff --git a/tensorflow/lite/micro/tools/make/download_and_extract.sh b/tensorflow/lite/micro/tools/make/download_and_extract.sh index f69c2bf19ed..29f06997568 100755 --- a/tensorflow/lite/micro/tools/make/download_and_extract.sh +++ b/tensorflow/lite/micro/tools/make/download_and_extract.sh @@ -185,6 +185,14 @@ patch_cmsis() { -iname '*.*' -exec \ sed -i -E $'s@#include "dsp/matrix_functions.h"@#include "cmsis/CMSIS/DSP/Include/dsp/matrix_functions.h"@g' {} \; + find tensorflow/lite/micro/tools/make/downloads/cmsis \ + -iname '*.*' -exec \ + sed -i -E $'s@#include "cmsis_compiler.h"@#include "cmsis/CMSIS/Core/Include/cmsis_compiler.h"@g' {} \; + + find tensorflow/lite/micro/tools/make/downloads/cmsis \ + -iname '*.*' -exec \ + sed -i -E $'s@#include "arm_helium_utils.h"@#include "cmsis/CMSIS/DSP/Include/arm_helium_utils.h"@g' {} \; + # Until the fix for https://github.com/ARMmbed/mbed-os/issues/12568 is # rolled into Mbed version used on the Arduino IDE, we have to replace # one intrinsic with a patched equivalent. 
diff --git a/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc b/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn.inc similarity index 92% rename from tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc rename to tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn.inc index 448bc8a8536..7cfaa9c4bfe 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/cmsis.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/cmsis_nn.inc @@ -8,7 +8,17 @@ ifneq ($(filter cmsis-nn,$(ALL_TAGS)),) THIRD_PARTY_DOWNLOADS += \ $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,patch_cmsis)) - CMSIS_PATH = $(MAKEFILE_DIR)/downloads/cmsis/ + ifeq ($(CMSIS_PATH),) + CMSIS_PATH = $(MAKEFILE_DIR)/downloads/cmsis/ + else + # Currently needed when jacking in an unpatched CMSIS-NN. + # TODO: Remove when CMSIS-NN no longer need to be patched. + INCLUDES += \ + -I$(CMSIS_PATH) \ + -I$(CMSIS_PATH)/CMSIS/Core/Include \ + -I$(CMSIS_PATH)/CMSIS/DSP/Include/ \ + -I$(CMSIS_PATH)/CMSIS/NN/Include/ + endif # List of files generated with: # find tensorflow/lite/micro/tools/make/downloads/cmsis/CMSIS/NN/Source/ -iname "*.c" @@ -91,6 +101,7 @@ ifneq ($(filter cmsis-nn,$(ALL_TAGS)),) # optimized kernels. We don't include all the possible CMSIS headers because # of their large number. See the RFC document for more details: # https://docs.google.com/document/d/14GRxeVEgSKgKBKAijO7oxnI49nLoTYBFQmPok-rG0cw + # Note: If you add a .h here, you must update patch_cmsis() in download_and_extract.sh as well. 
THIRD_PARTY_CC_HDRS += \ $(CMSIS_PATH)CMSIS/NN/Include/arm_nnfunctions.h \ $(CMSIS_PATH)CMSIS/NN/Include/arm_nnsupportfunctions.h \ @@ -115,7 +126,8 @@ ifneq ($(filter cmsis-nn,$(ALL_TAGS)),) $(CMSIS_PATH)CMSIS/DSP/Include/dsp/fast_math_functions.h \ $(CMSIS_PATH)CMSIS/DSP/Include/dsp/filtering_functions.h \ $(CMSIS_PATH)CMSIS/DSP/Include/dsp/interpolation_functions.h \ - $(CMSIS_PATH)CMSIS/DSP/Include/dsp/matrix_functions.h - + $(CMSIS_PATH)CMSIS/DSP/Include/dsp/matrix_functions.h \ + $(CMSIS_PATH)CMSIS/Core/Include/cmsis_compiler.h \ + $(CMSIS_PATH)CMSIS/DSP/Include/arm_helium_utils.h endif diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc new file mode 100644 index 00000000000..6b19f50eea2 --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc @@ -0,0 +1,125 @@ +# Generic Makefile target for ARM Cortex M builds. +# REQUIRED: +# - TAGS: armclang or armgcc +# For Cortex-M55, ARM Compiler 6.14 or later is required. +# Example: +# make -f tensorflow/lite/micro/tools/make/Makefile TAGS="cmsis-nn armclang" TARGET=cortex-m55-generic microlite + +ifneq ($(filter cortex-%-generic,$(TARGET)),) + + # Pick up cortex-m-generic debug log instead of default. 
+ ALL_TAGS := $(filter-out $(TARGET),$(ALL_TAGS)) + ALL_TAGS += cortex-m-generic + + FLOAT := soft + + ifeq ($(TARGET),$(filter $(TARGET),cortex-m55-generic)) + CORE=M55 + ARM_LDFLAGS := -Wl,--cpu=8.1-M.Main.mve.fp + TARGET_SPECIFIC_FLAGS += -D__DSP_PRESENT=1 -D__FPU_PRESENT=1 + FLOAT=hard + TARGET_ARCH := cortex-m55 + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m55+nodsp+nofp-generic)) + CORE=M55 + ARM_LDFLAGS := -Wl,--cpu=8.1-M.Main.mve.no_dsp.no_fp + TARGET_SPECIFIC_FLAGS += + TARGET_ARCH := cortex-m55+nodsp+nofp + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m55+nofp-generic)) + CORE=M55 + ARM_LDFLAGS := -Wl,--cpu=8.1-M.Main.mve.no_fp + TARGET_SPECIFIC_FLAGS += -D__DSP_PRESENT=1 + TARGET_ARCH := cortex-m55+nofp + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m33+nodsp-generic)) + CORE=M33 + ARM_LDFLAGS := -Wl,--cpu=Cortex-M33.no_dsp.no_fp + TARGET_SPECIFIC_FLAGS += + TARGET_ARCH := cortex-m33+nodsp + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m33-generic)) + CORE=M33 + ARM_LDFLAGS := -Wl,--cpu=Cortex-M33 + TARGET_SPECIFIC_FLAGS += -D__DSP_PRESENT=1 -D__FPU_PRESENT=1 -D__VTOR_PRESENT=1 -D__FPU_USED=1 + FLOAT=hard + TARGET_ARCH := cortex-m33 + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m0-generic)) + CORE=M0 + ARM_LDFLAGS := -Wl,--cpu=Cortex-M0 + TARGET_SPECIFIC_FLAGS += + TARGET_ARCH := cortex-m0 + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m3-generic)) + CORE=M3 + ARM_LDFLAGS := -Wl,--cpu=Cortex-M3 + TARGET_SPECIFIC_FLAGS += + TARGET_ARCH := cortex-m3 + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m4-generic)) + CORE=M4 + ARM_LDFLAGS := -Wl,--cpu=Cortex-M4.no_fp + TARGET_SPECIFIC_FLAGS+= + TARGET_ARCH := cortex-m4 + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m7+fp-generic)) + CORE=M7 + ARM_LDFLAGS := -Wl,--cpu=Cortex-M7.fp + TARGET_SPECIFIC_FLAGS += + FLOAT=hard + TARGET_ARCH := cortex-m7+fp + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m7-generic)) + CORE=M7 + ARM_LDFLAGS := -Wl,--cpu=Cortex-M7.no_fp + TARGET_SPECIFIC_FLAGS += + 
TARGET_ARCH := cortex-m7 + else + $(error "$(TARGET) not supported") + endif + + # Toolchain specfic flags + ifeq ($(filter armclang,$(ALL_TAGS)),armclang) + CXX_TOOL := armclang + CC_TOOL := armclang + AR_TOOL := armar + LD := armlink + FLAGS_ARMC = \ + --target=arm-arm-none-eabi \ + -mcpu=$(TARGET_ARCH) + CXXFLAGS += $(FLAGS_ARMC) + CCFLAGS += $(FLAGS_ARMC) + LDFLAGS += $(ARM_LDFLAGS) + + # Arm Compiler will not link the Math library (see below), therefore we're filtering it out. + # See Fatal error: L6450U: Cannot find library m: + # "Arm Compiler is designed to run in a bare metal environment, + # and automatically includes implementations of these functions, + # and so no such flag is necessary." + # https://developer.arm.com/documentation/100891/0611/troubleshooting/general-troubleshooting-advice + MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) + else ifeq ($(filter armgcc,$(ALL_TAGS)),armgcc) + CXX_TOOL := arm-none-eabi-gcc + CC_TOOL := arm-none-eabi-gcc + AR_TOOL := arm-none-eabi-gcc-ar + LD := arm-none-eabi-ld + ifneq ($(filter cortex-m55%,$(TARGET_ARCH)),) + $(error Micro architecure support is not available yet for $(TARGET_ARCH)) + else + FLAGS_GCC = -mcpu=$(TARGET_ARCH) + endif + CXXFLAGS += $(FLAGS_GCC) + CCFLAGS += $(FLAGS_GCC) + LDFLAGS += -Wl,--gc-sections + endif + + PLATFORM_FLAGS = \ + -DTF_LITE_STATIC_MEMORY \ + -mthumb \ + -mfloat-abi=$(FLOAT) \ + -funsigned-char \ + -mlittle-endian \ + -fno-function-sections \ + -MD \ + -DCPU_$(CORE)=1 \ + $(TARGET_SPECIFIC_FLAGS) + + # Common + C/C++ flags + CXXFLAGS += $(PLATFORM_FLAGS) -fno-rtti + CCFLAGS += $(PLATFORM_FLAGS) + + TEST_SCRIPT := + +endif From 819c4dc8133f138738e0f366b3a2c490c268eb60 Mon Sep 17 00:00:00 2001 From: mdfaijul Date: Fri, 2 Oct 2020 10:16:36 -0700 Subject: [PATCH 0076/1447] Enabled DNNL support for BatchMatMul with broadcast. 
--- .../core/kernels/mkl/mkl_batch_matmul_op.cc | 246 +++++++----------- .../core/kernels/mkl/mkl_matmul_ops_common.h | 67 ----- tensorflow/workspace.bzl | 8 +- third_party/mkl_dnn/mkldnn_v1.BUILD | 4 +- 4 files changed, 101 insertions(+), 224 deletions(-) diff --git a/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc index da5a239c224..66903d8ff7a 100644 --- a/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc +++ b/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc @@ -15,27 +15,16 @@ limitations under the License. // See docs in ../ops/math_ops.cc. -// This file uses both oneDNN and MKL CBLAS batched xGEMM for acceleration of -// Batch Matrix-Matrix Multiplication (MatMul) operations. -// We currently register this kernel only for oneDNN supported data -// types (float, bfloat16). This file can be built with and without the use of -// the binary MKL CBLAS calls, controlled by the macro INTEL_MKL_DNN_ONLY. -// If INTEL_MKL_DNN_ONLY is defined, only oneDNN is used. For cases not -// supported by oneDNN (ex. Batchmatmul with broadcasting) we fall back to the -// default CPU implementation. -// if INTEL_MKL_DNN_ONLY is not defined, both oneDNN and MKL CBLAS -// implementations are used. This is only temporary, once we are able handle all -// cases with oneDNN, CBLAS calls will be removed. +// This file uses oneDNN library for acceleration of Batch Matrix-Matrix +// Multiplication (MatMul) operations. We currently register this kernel only +// for oneDNN supported data types (float, bfloat16). The maximum number of +// dimensions (rank) for output tensor is 12 in oneDNN. If output tensor rank +// exceeds 12, we fallback to Eigen library based kernel. 
#define EIGEN_USE_THREADS #if defined(INTEL_MKL) -#include -#if !defined(INTEL_MKL_DNN_ONLY) -#include "mkl_cblas.h" -#endif // !INTEL_MKL_DNN_ONLY -#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/register_types.h" @@ -50,6 +39,7 @@ limitations under the License. #include "tensorflow/core/platform/types.h" #include "tensorflow/core/util/matmul_bcast.h" #include "tensorflow/core/util/mkl_util.h" +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" namespace tensorflow { @@ -100,8 +90,8 @@ class BatchMatMulMkl : public OpKernel { } // lhs and rhs can have different dimensions - const int ndims_lhs = lhs.dims(); - const int ndims_rhs = rhs.dims(); + const auto ndims_lhs = lhs.dims(); + const auto ndims_rhs = rhs.dims(); // Get broadcast info MatMulBCast bcast(lhs.shape().dim_sizes(), rhs.shape().dim_sizes()); @@ -111,16 +101,7 @@ class BatchMatMulMkl : public OpKernel { "In[0] and In[1] must have compatible batch dimensions: ", lhs.shape().DebugString(), " vs. ", rhs.shape().DebugString())); -#if defined(INTEL_MKL_DNN_ONLY) - if (bcast.IsBroadcastingRequired()) { - // Calling Eigen Kernel for broadcasting case and return. Eigen does - // not have BF16 support, so we have to fail graciously in that case. - eigen_batch_mm_v2_.Compute(ctx); - return; - } -#endif // INTEL_MKL_DNN_ONLY TensorShape out_shape = bcast.output_batch_shape(); - auto batch_size = bcast.output_batch_size(); auto lhs_rows = lhs.dim_size(ndims_lhs - 2); auto lhs_cols = lhs.dim_size(ndims_lhs - 1); @@ -137,6 +118,12 @@ class BatchMatMulMkl : public OpKernel { out_shape.AddDim(lhs_rows); out_shape.AddDim(rhs_cols); + // The maximum number of dimensions for a tensor in DNNL is 12. + OP_REQUIRES(ctx, out_shape.dims() <= 12, + errors::InvalidArgument( + "Rank of output tensor is required as <= 12, ", "but is ", + out_shape.dims(), ". 
Current implementation supports upto ", + "rank 12 tensors.")); Tensor* out = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); @@ -149,75 +136,17 @@ class BatchMatMulMkl : public OpKernel { return; } - auto rhs_reshaped = rhs.template flat_inner_dims(); - auto lhs_reshaped = lhs.template flat_inner_dims(); - auto out_reshaped = out->template flat_inner_dims(); - const uint64 M = lhs_reshaped.dimension(adj_x_ ? 2 : 1); - const uint64 K = lhs_reshaped.dimension(adj_x_ ? 1 : 2); - const uint64 N = rhs_reshaped.dimension(adj_y_ ? 1 : 2); - - std::vector m_array(batch_size, M); - std::vector n_array(batch_size, N); - std::vector k_array(batch_size, K); - std::vector lda_array(batch_size, adj_x_ ? M : K); - std::vector ldb_array(batch_size, adj_y_ ? K : N); - std::vector ldc_array(batch_size, N); - std::vector group_size(1, batch_size); - - bool bcast_not_supported = false; -#if defined(INTEL_MKL_DNN_ONLY) - bcast_not_supported = true; -#endif // INTEL_MKL_DNN_ONLY - if (std::is_same::value || bcast_not_supported) { - // DNNL bfloat16 API requires a, b, and c as pointers to tensors - // represented as flat-byte array. - const Scalar* a = nullptr; - const Scalar* b = nullptr; - Scalar* c = nullptr; - a = &lhs_reshaped(0, 0, 0); - b = &rhs_reshaped(0, 0, 0); - OP_REQUIRES(ctx, !bcast.IsBroadcastingRequired(), - errors::Unimplemented("Broadcasting is not supported for " - "_MklBatchMatMul yet.")); - c = &out_reshaped(0, 0, 0); - // TODO(nhasabni): Use appropriate cast instead of passing addresses of - // a,b and c. 
- MklCblasGemmBatch(CblasRowMajor, adj_x_, adj_y_, m_array, n_array, - k_array, &a, lda_array, &b, ldb_array, &c, ldc_array, 1, - group_size, ctx); - } else { - std::vector a_array; - std::vector b_array; - std::vector c_array; - a_array.reserve(batch_size); - b_array.reserve(batch_size); - c_array.reserve(batch_size); - - if (!bcast.IsBroadcastingRequired()) { - for (int64 i = 0; i < batch_size; i++) { - a_array.push_back(&lhs_reshaped(i, 0, 0)); - b_array.push_back(&rhs_reshaped(i, 0, 0)); - c_array.push_back(&out_reshaped(i, 0, 0)); - } - } else { - // Broadcasting is needed, so get the mapping from flattened output - // batch indices to x's and y's flattened batch indices. - const std::vector& a_batch_indices = bcast.x_batch_indices(); - const std::vector& b_batch_indices = bcast.y_batch_indices(); - - for (int64 i = 0; i < batch_size; i++) { - a_array.push_back(&lhs_reshaped(a_batch_indices[i], 0, 0)); - b_array.push_back(&rhs_reshaped(b_batch_indices[i], 0, 0)); - c_array.push_back(&out_reshaped(i, 0, 0)); - } - } - - // MKL CBLAS API requires a, b, and c as array of pointers, where each - // pointer is to 2D matrix. - MklCblasGemmBatch(CblasRowMajor, adj_x_, adj_y_, m_array, n_array, - k_array, &a_array[0], lda_array, &b_array[0], ldb_array, - &c_array[0], ldc_array, 1, group_size, ctx); - } + // Compute parameters for DNNL matmul primitive. + auto params = CreateMatMulParams(lhs.shape(), rhs.shape(), out_shape); + // Create or retrieve matmul primitive from cache. + MklMatMulPrimitive* matmul_prim = + MklMatMulPrimitiveFactory::Get( + *params, false /* value for do_not_cache */); + // Execute matmul primitive. 
+ std::shared_ptr cpu_stream; + cpu_stream.reset(CreateStream(ctx, matmul_prim->GetEngine())); + matmul_prim->Execute(lhs.flat().data(), rhs.flat().data(), + out->flat().data(), cpu_stream); } private: @@ -225,60 +154,78 @@ class BatchMatMulMkl : public OpKernel { bool adj_y_; BatchMatMulV2Op eigen_batch_mm_v2_; - void MklCblasGemmBatch( - const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, - const std::vector& M_Array, const std::vector& N_Array, - const std::vector& K_Array, const float** A_Array, - const std::vector& lda_Array, const float** B_Array, - const std::vector& ldb_Array, float** C_Array, - const std::vector& ldc_Array, const MKL_INT group_count, - const std::vector& group_size, OpKernelContext* ctx) { -#if !defined(INTEL_MKL_DNN_ONLY) - std::vector TransA_Array( - group_size[0], TransA ? CblasTrans : CblasNoTrans); - std::vector TransB_Array( - group_size[0], TransB ? CblasTrans : CblasNoTrans); - std::vector alpha_Array(group_size[0], 1.0); - std::vector beta_Array(group_size[0], 0.0); - cblas_sgemm_batch(Layout, &TransA_Array[0], &TransB_Array[0], &M_Array[0], - &N_Array[0], &K_Array[0], &alpha_Array[0], - reinterpret_cast(A_Array), &lda_Array[0], - reinterpret_cast(B_Array), &ldb_Array[0], - &beta_Array[0], reinterpret_cast(C_Array), - &ldc_Array[0], group_count, &group_size[0]); -#else - DCHECK(Layout == CblasRowMajor); - std::vector TransA_Array(group_size[0], TransA); - std::vector TransB_Array(group_size[0], TransB); - std::vector alpha_Array(group_size[0], 1.0); - std::vector beta_Array(group_size[0], 0.0); - dnnl_gemm_batch(TransA_Array, TransB_Array, M_Array, N_Array, - K_Array, alpha_Array, *A_Array, *B_Array, beta_Array, - *C_Array, group_count, group_size, ctx); -#endif // !INTEL_MKL_DNN_ONLY + using dims = dnnl::memory::dims; + + // This method makes the rank (ndims) of input same as the output by creating + // new axes to the input. 
For example, if input shape is [a, b, c, d] and + // output shape is [e, f, g, h, i, j], then the reshaped input would have a + // shape of [1, 1, a, b, c, d]. + void ExpandInputDimsToOutputShape(const TensorShape& input_shape, + const TensorShape& output_shape, + dims* reshaped_dims) { + auto ndims_input = input_shape.dims(); + auto ndims_output = output_shape.dims(); + auto dim_offset = ndims_output - ndims_input; + DCHECK(dim_offset > 0); + reshaped_dims->clear(); + reshaped_dims->resize(ndims_output, 1); + auto input_dims = input_shape.dim_sizes(); + for (int dim_idx = 0; dim_idx < ndims_input; ++dim_idx) + reshaped_dims->at(dim_idx + dim_offset) = input_dims[dim_idx]; } -// BatchMatMul BFloat16 support only exists in DNNL 1.2 onwards. -#if defined(ENABLE_MKLDNN_V1) && defined(ENABLE_INTEL_MKL_BFLOAT16) - void MklCblasGemmBatch( - const CBLAS_LAYOUT Layout, const bool TransA, const bool TransB, - const std::vector& M_Array, const std::vector& N_Array, - const std::vector& K_Array, const bfloat16** A_Array, - const std::vector& lda_Array, const bfloat16** B_Array, - const std::vector& ldb_Array, bfloat16** C_Array, - const std::vector& ldc_Array, const MKL_INT group_count, - const std::vector& group_size, OpKernelContext* ctx) { - DCHECK(Layout == CblasRowMajor); - std::vector TransA_Array(group_size[0], TransA); - std::vector TransB_Array(group_size[0], TransB); - std::vector alpha_Array(group_size[0], 1.0); - std::vector beta_Array(group_size[0], 0.0); - // TODO(nhasabni): Remove *A when we pass a, b, and c correctly. - // MKLDNN API does not require lda, ldb, and ldc. 
- dnnl_gemm_batch( - TransA_Array, TransB_Array, M_Array, N_Array, K_Array, alpha_Array, - *A_Array, *B_Array, beta_Array, *C_Array, group_count, group_size, ctx); + + std::unique_ptr CreateMatMulParams( + const TensorShape& lhs_shape, const TensorShape& rhs_shape, + const TensorShape& out_shape) { + const auto ndims_lhs = lhs_shape.dims(); + const auto ndims_rhs = rhs_shape.dims(); + const auto ndims_out = out_shape.dims(); + auto lhs_dims = TFShapeToMklDnnDims(lhs_shape); + auto rhs_dims = TFShapeToMklDnnDims(rhs_shape); + auto out_dims = TFShapeToMklDnnDims(out_shape); + + // DNNL matmul_primitive requires ranks of inputs and output to be same. + // Create dnnl::memory::dims for inputs and output of same rank. + // It is assumed here that MatMulBCast object creates output_batch_shape as + // a conforming superset of input batch shapes, i.e., ndims_out >= + // ndims_lhs and ndims_out >= ndims_lhs. + if (ndims_lhs < ndims_out) { + ExpandInputDimsToOutputShape(lhs_shape, out_shape, &lhs_dims); + } + if (ndims_rhs < ndims_out) { + ExpandInputDimsToOutputShape(rhs_shape, out_shape, &rhs_dims); + } + + using dim = dnnl::memory::dim; + dim m; // number of rows in x + dim k; // number of columns in x + dim n; // number of columns in y + auto lhs_strides = CalculateTFStrides(lhs_dims); + auto rhs_strides = CalculateTFStrides(rhs_dims); + auto out_strides = CalculateTFStrides(out_dims); + + if (adj_x_) { + int m_idx = ndims_out - 1; + int k_idx = ndims_out - 2; + m = lhs_dims[m_idx]; + k = lhs_dims[k_idx]; + std::swap(lhs_dims[m_idx], lhs_dims[k_idx]); + lhs_strides[m_idx] = m; + lhs_strides[k_idx] = 1; + } + + if (adj_y_) { + int k_idx = ndims_out - 1; + int n_idx = ndims_out - 2; + k = rhs_dims[k_idx]; + n = rhs_dims[n_idx]; + std::swap(rhs_dims[k_idx], rhs_dims[n_idx]); + rhs_strides[k_idx] = k; + rhs_strides[n_idx] = 1; + } + return std::make_unique( + lhs_dims, rhs_dims, out_dims, lhs_strides, rhs_strides, out_strides); } -#endif // ENABLE_MKLDNN_V1 && 
ENABLE_INTEL_MKL_BFLOAT16 }; #define REGISTER_BATCH_MATMUL_MKL(TYPE) \ @@ -294,14 +241,11 @@ class BatchMatMulMkl : public OpKernel { .TypeConstraint("T") \ .Label(mkl_op_registry::kMklNameChangeOpLabel), \ BatchMatMulMkl) - #ifdef ENABLE_MKL TF_CALL_float(REGISTER_BATCH_MATMUL_MKL); TF_CALL_float(REGISTER_BATCH_MATMUL_MKL_V2); -#if defined(ENABLE_MKLDNN_V1) && defined(ENABLE_INTEL_MKL_BFLOAT16) TF_CALL_bfloat16(REGISTER_BATCH_MATMUL_MKL); TF_CALL_bfloat16(REGISTER_BATCH_MATMUL_MKL_V2); -#endif // ENABLE_MKLDNN_V1 && ENABLE_INTEL_MKL_BFLOAT16 #endif // ENABLE_MKL } // end namespace tensorflow diff --git a/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h b/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h index e084b25f737..b77d033c9de 100644 --- a/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h +++ b/tensorflow/core/kernels/mkl/mkl_matmul_ops_common.h @@ -35,12 +35,6 @@ using mkldnn::stream; namespace tensorflow { typedef Eigen::ThreadPoolDevice CPUDevice; -#ifdef INTEL_MKL_DNN_ONLY -// Temporarily copying some definitions from mkl_cblas.h so the same code can -// be used when calling oneDNN or CBLAS batchmatmul in mkl_batch_matmul_op.cc. -typedef enum { CblasRowMajor, CblasColumnMajor } CBLAS_LAYOUT; -#define MKL_INT int -#endif // This structure aggregates multiple inputs to MklDnnMatMul* methods. struct MklDnnMatMulFwdParams { @@ -729,67 +723,6 @@ class MklMatMulPrimitiveFactory : public MklPrimitiveFactory { } }; -template -void dnnl_gemm_batch(const std::vector& transa, - const std::vector& transb, const std::vector& m, - const std::vector& n, const std::vector& k, - const std::vector& alpha, const T* a, const T* b, - const std::vector& beta, T* c, - const int group_count, const std::vector& group_size, - OpKernelContext* ctx = nullptr) { - // Current BatchMatMul support in Tensorflow is narrower than the one offered - // by MKL and MKL-DNN. 
Current BatchMatMul support in Tensorflow uses only 1 - // group of size equal to batch_size, and all MatMul parameters (m, n, k, - // alpha, beta) within that group are same. - DCHECK(group_size.size() == 1); - DCHECK(transa.size() == group_size[0]); - DCHECK(transb.size() == group_size[0]); - DCHECK(alpha.size() == group_size[0]); - DCHECK(beta.size() == group_size[0]); - DCHECK(m.size() == group_size[0]); - DCHECK(n.size() == group_size[0]); - DCHECK(k.size() == group_size[0]); - for (int64_t idx = 0; idx < group_size[0]; idx++) - DCHECK(transa[0] == transa[idx]); - for (int64_t idx = 0; idx < group_size[0]; idx++) - DCHECK(transb[0] == transb[idx]); - for (int64_t idx = 0; idx < group_size[0]; idx++) - DCHECK(alpha[0] == alpha[idx]); - for (int64_t idx = 0; idx < group_size[0]; idx++) - DCHECK(beta[0] == beta[idx]); - for (int64_t idx = 0; idx < group_size[0]; idx++) DCHECK(m[0] == m[idx]); - for (int64_t idx = 0; idx < group_size[0]; idx++) DCHECK(n[0] == n[idx]); - for (int64_t idx = 0; idx < group_size[0]; idx++) DCHECK(k[0] == k[idx]); - - using dims = mkldnn::memory::dims; - // Prepare strides based on the transa and transb flags: transposed - // matrices have strides swapped BatchMatMul in MKL-DNN supports 3D metrices - // so far. That is why strides are 3D also. - dims a_sizes = dims{group_size[0], m[0], k[0]}; - dims b_sizes = dims{group_size[0], k[0], n[0]}; - dims c_sizes = dims{group_size[0], m[0], n[0]}; - dims a_strides = - !transa[0] ? dims{m[0] * k[0], k[0], 1} : dims{k[0] * m[0], 1, m[0]}; - dims b_strides = - !transb[0] ? dims{k[0] * n[0], n[0], 1} : dims{n[0] * k[0], 1, k[0]}; - dims c_strides = dims{m[0] * n[0], n[0], 1}; - - // MklMatMul uses const alpha and beta, make guarantee here to ensure - // they are never changed. 
- DCHECK_EQ(alpha, 1.0f); - DCHECK_EQ(beta, 0.f); - - MklMatMulParams params(a_sizes, b_sizes, c_sizes, a_strides, b_strides, - c_strides); - MklMatMulPrimitive* matmul_prim = - MklMatMulPrimitiveFactory::Get(params, 0); - - // Execute matmul primitive. - std::shared_ptr cpu_stream; - cpu_stream.reset(CreateStream(ctx, matmul_prim->GetEngine())); - matmul_prim->Execute(a, b, c, cpu_stream); -} - template void dnnl_gemm(char transa, char transb, int64_t m, int64_t n, int64_t k, float alpha, const T* a, int64_t lda, const T* b, int64_t ldb, diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index cee2c29a0b0..9560353a3ae 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -209,11 +209,11 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): tf_http_archive( name = "mkl_dnn_v1", build_file = clean_dep("//third_party/mkl_dnn:mkldnn_v1.BUILD"), - sha256 = "aef4d2a726f76f5b98902491a1a4ac69954039aa8e5a1d67ef6ce58ed00e23a6", - strip_prefix = "oneDNN-1.5.1", + sha256 = "5369f7b2f0b52b40890da50c0632c3a5d1082d98325d0f2bff125d19d0dcaa1d", + strip_prefix = "oneDNN-1.6.4", urls = [ - "https://storage.googleapis.com/mirror.tensorflow.org/github.com/oneapi-src/oneDNN/archive/v1.5.1.tar.gz", - "https://github.com/oneapi-src/oneDNN/archive/v1.5.1.tar.gz", + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/oneapi-src/oneDNN/archive/v1.6.4.tar.gz", + "https://github.com/oneapi-src/oneDNN/archive/v1.6.4.tar.gz", ], ) diff --git a/third_party/mkl_dnn/mkldnn_v1.BUILD b/third_party/mkl_dnn/mkldnn_v1.BUILD index 0e6acc2fadd..32a3fa7351b 100644 --- a/third_party/mkl_dnn/mkldnn_v1.BUILD +++ b/third_party/mkl_dnn/mkldnn_v1.BUILD @@ -58,8 +58,8 @@ template_rule( out = "include/dnnl_version.h", substitutions = { "@DNNL_VERSION_MAJOR@": "1", - "@DNNL_VERSION_MINOR@": "5", - "@DNNL_VERSION_PATCH@": "1", + "@DNNL_VERSION_MINOR@": "6", + "@DNNL_VERSION_PATCH@": "4", "@DNNL_VERSION_HASH@": "N/A", }, ) From 
68401de6f246b494b14d4b1eecb0f9b5d1850ffd Mon Sep 17 00:00:00 2001 From: Vishakha Agrawal Date: Tue, 4 Aug 2020 15:28:53 -0700 Subject: [PATCH 0077/1447] Removes unnecessary omp_set_num() calls Signed-off-by: Vishakha Agrawal --- tensorflow/core/common_runtime/threadpool_device.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index 44fa5bf2d3a..4e308cd09b5 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -60,13 +60,8 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, // Default to available physical cores const int mkl_intra_op = port::NumSchedulableCPUs(); const int ht = port::NumHyperthreadsPerCore(); - omp_set_num_threads((mkl_intra_op + ht - 1) / ht); - } else { - uint64 user_val = 0; - if (strings::safe_strtou64(user_omp_threads, &user_val)) { - // Superflous but triggers OpenMP loading - omp_set_num_threads(user_val); - } + std::call_once(omp_setting_flag, omp_set_num_threads, + (mkl_intra_op + ht - 1) / ht); } #endif // _OPENMP #endif // !defined(ENABLE_MKLDNN_THREADPOOL) && defined(INTEL_MKL) From d2dfb6850d73ff63543314b6af09f10732bdbb69 Mon Sep 17 00:00:00 2001 From: Vishakha Agrawal Date: Wed, 5 Aug 2020 09:42:39 -0700 Subject: [PATCH 0078/1447] Apply code format Signed-off-by: Vishakha Agrawal --- tensorflow/core/common_runtime/threadpool_device.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index 4e308cd09b5..0b955d63aa6 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -61,7 +61,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, const int mkl_intra_op = port::NumSchedulableCPUs(); const int ht = 
port::NumHyperthreadsPerCore(); std::call_once(omp_setting_flag, omp_set_num_threads, - (mkl_intra_op + ht - 1) / ht); + (mkl_intra_op + ht - 1) / ht); } #endif // _OPENMP #endif // !defined(ENABLE_MKLDNN_THREADPOOL) && defined(INTEL_MKL) From 769fdabfa4d3f90e914fe8a8b710fc39543ea268 Mon Sep 17 00:00:00 2001 From: Vishakha Agrawal Date: Wed, 5 Aug 2020 11:58:27 -0700 Subject: [PATCH 0079/1447] Added call once flag, by got deleted by mistake Signed-off-by: Vishakha Agrawal --- tensorflow/core/common_runtime/threadpool_device.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index 0b955d63aa6..f580aadd84f 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -41,6 +41,7 @@ limitations under the License. #endif namespace tensorflow { +std::once_flag omp_setting_flag; ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, const string& name, Bytes memory_limit, From 9835aaae5c70bfbc864f09563bf5ab1736eba086 Mon Sep 17 00:00:00 2001 From: Vishakha Agrawal Date: Thu, 6 Aug 2020 10:32:41 -0700 Subject: [PATCH 0080/1447] Removing unit test that depended on pre seting of OMP threads Signed-off-by: Vishakha Agrawal --- .../core/common_runtime/mkl_threadpool_device_test.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc index c29752d3c2c..1b64060bb02 100644 --- a/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc +++ b/tensorflow/core/common_runtime/mkl_threadpool_device_test.cc @@ -37,15 +37,6 @@ TEST(MKLThreadPoolDeviceTest, TestOmpDefaults) { EXPECT_EQ(omp_get_max_threads(), (port::NumSchedulableCPUs() + ht - 1) / ht); } -TEST(MKLThreadPoolDeviceTest, TestOmpPreSets) { - SessionOptions options; - setenv("OMP_NUM_THREADS", "314", 1); - - 
ThreadPoolDevice* tp = new ThreadPoolDevice( - options, "/device:CPU:0", Bytes(256), DeviceLocality(), cpu_allocator()); - - EXPECT_EQ(omp_get_max_threads(), 314); -} #endif // defined(_OPENMP) && !defined(ENABLE_MKLDNN_THREADPOOL) } // namespace tensorflow From 508fddb3af9824680d1c594115f5b90e9a99fac9 Mon Sep 17 00:00:00 2001 From: Vishakha Agrawal Date: Mon, 10 Aug 2020 14:43:27 -0700 Subject: [PATCH 0081/1447] replaced std with absl Signed-off-by: Vishakha Agrawal --- tensorflow/core/common_runtime/threadpool_device.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index f580aadd84f..d709883441d 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "absl/base/call_once.h" #include "tensorflow/core/common_runtime/threadpool_device.h" @@ -41,7 +42,7 @@ limitations under the License. 
#endif namespace tensorflow { -std::once_flag omp_setting_flag; +absl::once_flag omp_setting_flag; ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, const string& name, Bytes memory_limit, @@ -61,8 +62,8 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, // Default to available physical cores const int mkl_intra_op = port::NumSchedulableCPUs(); const int ht = port::NumHyperthreadsPerCore(); - std::call_once(omp_setting_flag, omp_set_num_threads, - (mkl_intra_op + ht - 1) / ht); + absl::call_once(omp_setting_flag, omp_set_num_threads, + (mkl_intra_op + ht - 1) / ht); } #endif // _OPENMP #endif // !defined(ENABLE_MKLDNN_THREADPOOL) && defined(INTEL_MKL) From f63da8dfc0a40765829e795d2327eb20aaf5203c Mon Sep 17 00:00:00 2001 From: Vishakha Agrawal Date: Tue, 11 Aug 2020 16:27:54 -0700 Subject: [PATCH 0082/1447] Moving once_flag as requested by reviewer Signed-off-by: Vishakha Agrawal --- tensorflow/core/common_runtime/threadpool_device.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index d709883441d..b60d622c346 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -42,7 +42,6 @@ limitations under the License. 
#endif namespace tensorflow { -absl::once_flag omp_setting_flag; ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, const string& name, Bytes memory_limit, @@ -57,6 +56,7 @@ ThreadPoolDevice::ThreadPoolDevice(const SessionOptions& options, if (DisableMKL()) return; #ifdef _OPENMP const char* user_omp_threads = getenv("OMP_NUM_THREADS"); + static absl::once_flag omp_setting_flag; if (user_omp_threads == nullptr) { // OMP_NUM_THREADS controls MKL's intra-op parallelization // Default to available physical cores From 88c613add6e6fa4fd4e0358c7f6a31c8773ea3d9 Mon Sep 17 00:00:00 2001 From: Vishakha Agrawal Date: Wed, 12 Aug 2020 14:14:11 -0700 Subject: [PATCH 0083/1447] Removed lines as requested Signed-off-by: Vishakha Agrawal --- tensorflow/core/common_runtime/threadpool_device.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc index b60d622c346..0470f4975b4 100644 --- a/tensorflow/core/common_runtime/threadpool_device.cc +++ b/tensorflow/core/common_runtime/threadpool_device.cc @@ -13,9 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #include "absl/base/call_once.h" - #include "tensorflow/core/common_runtime/threadpool_device.h" - #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/common_runtime/scoped_allocator.h" #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h" From 49c4c4cb1f5ffa8d66503e827aa052fefd6249f0 Mon Sep 17 00:00:00 2001 From: Kasra Bigdeli Date: Sat, 3 Oct 2020 19:54:36 -0400 Subject: [PATCH 0084/1447] Minor grammar fix --- tensorflow/python/keras/metrics.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py index c4bc03aed8c..1bf08197e2d 100644 --- a/tensorflow/python/keras/metrics.py +++ b/tensorflow/python/keras/metrics.py @@ -731,7 +731,7 @@ class BinaryAccuracy(MeanMetricWrapper): @keras_export('keras.metrics.CategoricalAccuracy') class CategoricalAccuracy(MeanMetricWrapper): - """Calculates how often predictions matches one-hot labels. + """Calculates how often predictions match one-hot labels. You can provide logits of classes as `y_pred`, since argmax of logits and probabilities are same. @@ -783,7 +783,7 @@ class CategoricalAccuracy(MeanMetricWrapper): @keras_export('keras.metrics.SparseCategoricalAccuracy') class SparseCategoricalAccuracy(MeanMetricWrapper): - """Calculates how often predictions matches integer labels. + """Calculates how often predictions match integer labels. ```python acc = np.dot(sample_weight, np.equal(y_true, np.argmax(y_pred, axis=1)) @@ -3220,7 +3220,7 @@ def accuracy(y_true, y_pred): @keras_export('keras.metrics.binary_accuracy') @dispatch.add_dispatch_support def binary_accuracy(y_true, y_pred, threshold=0.5): - """Calculates how often predictions matches binary labels. + """Calculates how often predictions match binary labels. 
Standalone usage: >>> y_true = [[1], [1], [0], [0]] @@ -3248,7 +3248,7 @@ def binary_accuracy(y_true, y_pred, threshold=0.5): @keras_export('keras.metrics.categorical_accuracy') @dispatch.add_dispatch_support def categorical_accuracy(y_true, y_pred): - """Calculates how often predictions matches one-hot labels. + """Calculates how often predictions match one-hot labels. Standalone usage: >>> y_true = [[0, 0, 1], [0, 1, 0]] @@ -3277,7 +3277,7 @@ def categorical_accuracy(y_true, y_pred): @keras_export('keras.metrics.sparse_categorical_accuracy') @dispatch.add_dispatch_support def sparse_categorical_accuracy(y_true, y_pred): - """Calculates how often predictions matches integer labels. + """Calculates how often predictions match integer labels. Standalone usage: >>> y_true = [2, 1] From d6b5f296d2d92e2a025170352d2da41e570ba48b Mon Sep 17 00:00:00 2001 From: kushanam Date: Sun, 4 Oct 2020 22:31:56 -0700 Subject: [PATCH 0085/1447] add context to create dataset func --- tensorflow/python/distribute/input_lib.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 7e044227552..36e42bf2a6b 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -1155,7 +1155,8 @@ class DistributedDatasetsFromFunction(_IterableInput): self._datasets, element_spec = ( _create_datasets_per_worker_with_input_context(self._input_contexts, self._input_workers, - dataset_fn)) + dataset_fn, + self._replication_mode)) self._enable_get_next_as_optional = _enable_get_next_as_optional( self._strategy, element_spec) self._element_spec = _create_distributed_tensor_spec( @@ -1777,11 +1778,16 @@ def _create_iterators_per_worker(worker_datasets, input_workers, def _create_datasets_per_worker_with_input_context(input_contexts, - input_workers, dataset_fn): + input_workers, + dataset_fn, + replication_mode): """Create device datasets per 
worker given a dataset function.""" datasets = [] for i, ctx in enumerate(input_contexts): - worker = input_workers.worker_devices[i] + if replication_mode == InputReplicationMode.PER_WORKER: + worker = input_workers.worker_devices[i] + else: + worker = input_workers._worker_device_pairs[i][1][0] with ops.device(worker): dataset = dataset_fn(ctx) datasets.append(dataset) From 37333a49b58844651f420dc8cdffb46b28ad1b5f Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 5 Oct 2020 21:04:13 +1100 Subject: [PATCH 0086/1447] Fix formatting issue in cuda_blas.cc --- tensorflow/stream_executor/cuda/cuda_blas.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index b285142276d..63d9b24368a 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -3224,7 +3224,8 @@ blas::ComputationType ToComputationType() { template <> blas::ComputationType ToComputationType>() { return blas::ComputationType::kComplexF32; -}template <> +} +template <> blas::ComputationType ToComputationType>() { return blas::ComputationType::kComplexF64; } From 2a34bf9ccef3e56c1ba3d4efccccb6378559567d Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 5 Oct 2020 21:07:15 +1100 Subject: [PATCH 0087/1447] Rename kF32FastTF32/BF32 to kTF32/BF16AsF32 --- .../core/kernels/batch_matmul_op_impl.h | 2 +- tensorflow/stream_executor/blas.h | 4 ++-- tensorflow/stream_executor/cuda/cuda_blas.cc | 22 ++++++++++--------- 3 files changed, 15 insertions(+), 13 deletions(-) diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h index ac5a45b99ba..8786573a312 100644 --- a/tensorflow/core/kernels/batch_matmul_op_impl.h +++ b/tensorflow/core/kernels/batch_matmul_op_impl.h @@ -378,7 +378,7 @@ bool GetBlasComputationType(const DataType& dtype, bool allow_tf32, using se::blas::ComputationType; static bool 
use_f32_for_f16_computation = MatmulDoFP32ComputationFP16Input(); ComputationType f32_type = - allow_tf32 ? ComputationType::kF32FastTF32 : ComputationType::kF32; + allow_tf32 ? ComputationType::kTF32AsF32 : ComputationType::kF32; switch (dtype) { case DT_HALF: case DT_BFLOAT16: diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index ac3a788aaef..4b7e6e86bc8 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -107,8 +107,8 @@ enum class ComputationType { // The below values are only supported for BlasLt routines (both real and // complex). They use float32 for accumulation but round the input mantissas // to a smaller number of bits. - kF32FastTF32, // 32-bit floating-point with reduced (>=10-bit) mantissa - kF32FastBF16, // 32-bit floating-point with reduced (7-bit) mantissa + kTF32AsF32, // 32-bit floating-point with reduced (>=10-bit) mantissa + kBF16AsF32, // 32-bit floating-point with reduced (7-bit) mantissa }; enum class Epilogue { diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 63d9b24368a..26f4730f9e6 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -410,8 +410,8 @@ cudaDataType_t CUDAComputationType(blas::ComputationType ty) { return CUDA_C_32F; case blas::ComputationType::kComplexF64: return CUDA_C_64F; - case blas::ComputationType::kF32FastTF32: // fall-through - case blas::ComputationType::kF32FastBF16: + case blas::ComputationType::kTF32AsF32: // fall-through + case blas::ComputationType::kBF16AsF32: // These cases are currently only supported in the blasLt routines, which // use CUBLASComputationType() instead. 
LOG(FATAL) << "Invalid value of blas::ComputationType."; @@ -431,9 +431,9 @@ cublasComputeType_t CUBLASComputationType(blas::ComputationType ty) { return CUBLAS_COMPUTE_64F; case blas::ComputationType::kI32: return CUBLAS_COMPUTE_32I; - case blas::ComputationType::kF32FastTF32: + case blas::ComputationType::kTF32AsF32: return CUBLAS_COMPUTE_32F_FAST_TF32; - case blas::ComputationType::kF32FastBF16: + case blas::ComputationType::kBF16AsF32: return CUBLAS_COMPUTE_32F_FAST_16BF; } } @@ -446,14 +446,16 @@ blas::DataType GetScaleType(blas::DataType data_type, switch (compute_type) { case blas::ComputationType::kF16: return blas::DataType::kHalf; - case blas::ComputationType::kF32: // fall-through - case blas::ComputationType::kComplexF32: // fall-through - case blas::ComputationType::kF32FastTF32: // fall-through - case blas::ComputationType::kF32FastBF16: - return is_complex ? blas::DataType::kComplexFloat : blas::DataType::kFloat; + case blas::ComputationType::kF32: // fall-through + case blas::ComputationType::kComplexF32: // fall-through + case blas::ComputationType::kTF32AsF32: // fall-through + case blas::ComputationType::kBF16AsF32: + return is_complex ? blas::DataType::kComplexFloat + : blas::DataType::kFloat; case blas::ComputationType::kF64: // fall-through case blas::ComputationType::kComplexF64: - return is_complex ? blas::DataType::kComplexDouble : blas::DataType::kDouble; + return is_complex ? 
blas::DataType::kComplexDouble + : blas::DataType::kDouble; case blas::ComputationType::kI32: return blas::DataType::kInt32; } From a87442f8456224f3b6cdba969c202dd41ac29723 Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 5 Oct 2020 21:09:51 +1100 Subject: [PATCH 0088/1447] Add type checks in DoBlasLtMatmul --- tensorflow/stream_executor/blas.h | 18 ++++++++++++++++++ tensorflow/stream_executor/cuda/cuda_blas.cc | 10 +++++----- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index 4b7e6e86bc8..884231cd67c 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -194,6 +194,10 @@ class AlgorithmConfig { }; struct IBlasLtMatmulPlan { + // Returns the data type of the A and B (input) matrices. + virtual DataType ab_type() const = 0; + // Returns the data type of the C (input/output) matrix. + virtual DataType c_type() const = 0; virtual ~IBlasLtMatmulPlan() {} }; @@ -1494,6 +1498,20 @@ class BlasSupport { const blas::IBlasLtMatmulAlgorithm* algorithm, const DeviceMemory& bias = {}, blas::ProfileResult* output_profile_result = nullptr) { + constexpr blas::DataType ab_type = blas::ToDataType::value; + if (ab_type != plan->ab_type()) { + VLOG(2) << "DoBlasLtMatmul returning false because a and b type does " + "not match plan: expected " + << plan->ab_type() << ", got " << ab_type; + return false; + } + constexpr blas::DataType c_type = blas::ToDataType::value; + if (c_type != plan->c_type()) { + VLOG(2) << "DoBlasLtMatmul returning false because c type does " + "not match plan: expected " + << plan->c_type() << ", got " << c_type; + return false; + } return DoBlasLtMatmul(stream, plan, alpha, a, b, beta, *c, scratch_allocator, algorithm, bias, output_profile_result); diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 26f4730f9e6..d8095f7d8d9 100644 --- 
a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -3243,8 +3243,8 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { cublasLtMatrixLayout_t d_desc() const { return d_desc_.get(); } bool ok() { return op_desc_ && a_desc_ && b_desc_ && c_desc_ && d_desc_; } - blas::DataType ab_type() const { return ab_type_; } - blas::DataType cd_type() const { return cd_type_; } + blas::DataType ab_type() const override { return ab_type_; } + blas::DataType c_type() const override { return c_type_; } blas::DataType scale_type() const { return scale_type_; } blas::PointerMode pointer_mode() const { return pointer_mode_; } blas::Epilogue epilogue() const { return epilogue_; } @@ -3265,7 +3265,7 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { UniqueLayoutDesc c_desc_; UniqueLayoutDesc d_desc_; blas::DataType ab_type_; - blas::DataType cd_type_; + blas::DataType c_type_; blas::DataType scale_type_; blas::PointerMode pointer_mode_; blas::Epilogue epilogue_; @@ -3458,7 +3458,7 @@ bool CUDABlas::DoBlasLtMatmulInternal( beta.data_type() != cuda_plan.scale_type()) { VLOG(2) << "DoBlasLtMatmul returning false because alpha and beta types do " "not match plan: expected " - << cuda_plan.cd_type() << ", got alpha=" << alpha.data_type() + << cuda_plan.c_type() << ", got alpha=" << alpha.data_type() << " beta=" << beta.data_type(); return false; } @@ -3542,7 +3542,7 @@ bool CUDABlas::DoBlasLtMatmul( const auto& cuda_plan = *static_cast(plan); HostOrDeviceScalar alpha_cast = alpha; HostOrDeviceScalar beta_cast = beta; - if (cuda_plan.cd_type() == blas::DataType::kHalf && + if (cuda_plan.c_type() == blas::DataType::kHalf && cuda_plan.scale_type() == blas::DataType::kFloat) { // The given alpha and beta types are F16 (they always match c), but F32* // computation type requires that they be F32, so we must cast them. 
From a3dfb6f36692c5a887c2ae10713f408772b00d2f Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 5 Oct 2020 21:13:50 +1100 Subject: [PATCH 0089/1447] Change ThenBlasLtMatmul implem to a template - This is required to ensure that the template version of DoBlasLtMatmul is called (which is important because it performs additional type checks). --- tensorflow/stream_executor/stream.cc | 79 +++++++++++++++++++++++----- tensorflow/stream_executor/stream.h | 27 +++++----- 2 files changed, 80 insertions(+), 26 deletions(-) diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 7233056df0a..536ee87d219 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -4809,28 +4809,79 @@ Stream &Stream::ThenBlasGemmStridedBatched( c, ldc, stride_c, batch_count); } -Stream& Stream::ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, - DeviceMemoryBase a, DeviceMemoryBase b, - const HostOrDeviceScalar& beta, - DeviceMemoryBase c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - DeviceMemoryBase bias, - blas::ProfileResult* output_profile_result) { +template +Stream& Stream::ThenBlasLtMatmulImpl( + const blas::IBlasLtMatmulPlan* plan, const HostOrDeviceScalar& alpha, + const DeviceMemory& a, const DeviceMemory& b, + const HostOrDeviceScalar& beta, DeviceMemory* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, + blas::ProfileResult* output_profile_result) { VLOG_CALL(PARAM(plan), PARAM(alpha), PARAM(a), PARAM(b), PARAM(beta), PARAM(c), PARAM(algorithm), PARAM(bias)); - ThenBlasWithProfileImpl&, DeviceMemoryBase, - DeviceMemoryBase, const HostOrDeviceScalar&, - DeviceMemoryBase, ScratchAllocator*, - const blas::IBlasLtMatmulAlgorithm*, DeviceMemoryBase> + ThenBlasWithProfileImpl< + const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, + const 
DeviceMemory&, const DeviceMemory&, + const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*, const DeviceMemory&> impl; return impl(this, &blas::BlasSupport::DoBlasLtMatmul, plan, alpha, a, b, beta, c, scratch_allocator, algorithm, bias, output_profile_result); } +// Explicit template instantiations for each supported type combination. +template Stream& Stream::ThenBlasLtMatmulImpl( + const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, + const DeviceMemory&, const DeviceMemory&, + const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*, const DeviceMemory&, + blas::ProfileResult*); + +template Stream& Stream::ThenBlasLtMatmulImpl( + const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, + const DeviceMemory&, const DeviceMemory&, + const HostOrDeviceScalar&, DeviceMemory*, + ScratchAllocator*, const blas::IBlasLtMatmulAlgorithm*, + const DeviceMemory&, blas::ProfileResult*); + +template Stream& Stream::ThenBlasLtMatmulImpl( + const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, + const DeviceMemory&, const DeviceMemory&, + const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*, const DeviceMemory&, + blas::ProfileResult*); + +template Stream& Stream::ThenBlasLtMatmulImpl( + const blas::IBlasLtMatmulPlan*, const HostOrDeviceScalar&, + const DeviceMemory&, const DeviceMemory&, + const HostOrDeviceScalar&, DeviceMemory*, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*, const DeviceMemory&, + blas::ProfileResult*); + +template Stream& +Stream::ThenBlasLtMatmulImpl, std::complex>( + const blas::IBlasLtMatmulPlan*, + const HostOrDeviceScalar>&, + const DeviceMemory>&, + const DeviceMemory>&, + const HostOrDeviceScalar>&, + DeviceMemory>*, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*, + const DeviceMemory>&, blas::ProfileResult*); + +template Stream& +Stream::ThenBlasLtMatmulImpl, std::complex>( + const 
blas::IBlasLtMatmulPlan*, + const HostOrDeviceScalar>&, + const DeviceMemory>&, + const DeviceMemory>&, + const HostOrDeviceScalar>&, + DeviceMemory>*, ScratchAllocator*, + const blas::IBlasLtMatmulAlgorithm*, + const DeviceMemory>&, blas::ProfileResult*); + Stream &Stream::ThenSetRngSeed(const uint8 *seed, uint64 seed_bytes) { VLOG_CALL(PARAM(seed), PARAM(seed_bytes)); diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h index b82c34b6c02..fdb8c6bb86e 100644 --- a/tensorflow/stream_executor/stream.h +++ b/tensorflow/stream_executor/stream.h @@ -1679,16 +1679,6 @@ class Stream { DeviceMemory> *b, int ldb); // See BlasSupport::DoBlatLtMatmul. - Stream& ThenBlasLtMatmul(const blas::IBlasLtMatmulPlan* plan, - const HostOrDeviceScalar& alpha, - DeviceMemoryBase a, DeviceMemoryBase b, - const HostOrDeviceScalar& beta, - DeviceMemoryBase c, - ScratchAllocator* scratch_allocator, - const blas::IBlasLtMatmulAlgorithm* algorithm, - DeviceMemoryBase bias, - blas::ProfileResult* output_profile_result); - // Note that we prevent alpha and beta from being used to deduce CType so that // they can be constructed implicitly from values of type CType. Without this, // type deduction would fail when this function is called with a value of type @@ -1703,8 +1693,8 @@ class Stream { const blas::IBlasLtMatmulAlgorithm* algorithm, const DeviceMemory& bias = {}, blas::ProfileResult* output_profile_result = nullptr) { - return ThenBlasLtMatmul(plan, alpha, a, b, beta, *c, scratch_allocator, - algorithm, bias, output_profile_result); + return ThenBlasLtMatmulImpl(plan, alpha, a, b, beta, c, scratch_allocator, + algorithm, bias, output_profile_result); } // See FftSupport::DoFft. @@ -2139,6 +2129,19 @@ class Stream { const dnn::BatchDescriptor &bias_descriptor, DeviceMemory *backward_bias_data); + // Implementation of ThenBlasLtMatmul that is shared by all types. 
+ template + Stream& ThenBlasLtMatmulImpl(const blas::IBlasLtMatmulPlan* plan, + const HostOrDeviceScalar& alpha, + const DeviceMemory& a, + const DeviceMemory& b, + const HostOrDeviceScalar& beta, + DeviceMemory* c, + ScratchAllocator* scratch_allocator, + const blas::IBlasLtMatmulAlgorithm* algorithm, + const DeviceMemory& bias, + blas::ProfileResult* output_profile_result); + SE_DISALLOW_COPY_AND_ASSIGN(Stream); }; From c491ca455c1a242771651174def060242370e5d8 Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 5 Oct 2020 21:17:18 +1100 Subject: [PATCH 0090/1447] Refactor blasLt APIs to return Status, not bool - This does not include DoBlasLtMatmul because the helpers in stream.cc require it to return bool. --- .../core/kernels/batch_matmul_op_impl.h | 60 ++-- tensorflow/stream_executor/blas.h | 28 +- tensorflow/stream_executor/cuda/cuda_blas.cc | 302 ++++++++++-------- .../stream_executor/stream_executor_pimpl.cc | 26 +- .../stream_executor/stream_executor_pimpl.h | 12 +- 5 files changed, 227 insertions(+), 201 deletions(-) diff --git a/tensorflow/core/kernels/batch_matmul_op_impl.h b/tensorflow/core/kernels/batch_matmul_op_impl.h index 8786573a312..53a1f560e86 100644 --- a/tensorflow/core/kernels/batch_matmul_op_impl.h +++ b/tensorflow/core/kernels/batch_matmul_op_impl.h @@ -469,6 +469,17 @@ struct CoefficientType { typedef float type; }; +inline Status FromExecutorStatus(const se::port::Status& s) { + return s.ok() ? 
Status::OK() + : Status(static_cast(static_cast(s.code())), + s.error_message()); +} + +template +inline Status FromExecutorStatus(const se::port::StatusOr& s) { + return FromExecutorStatus(s.status()); +} + } // namespace template @@ -554,38 +565,25 @@ struct LaunchBatchMatMul { context, GetBlasComputationType(dtype, allow_tf32, &computation_type), errors::Internal("Unsupported dtype for batched matmul")); + + auto status_or_plan = stream->parent()->CreateBlasLtMatmulPlan( + {/*ab_type=*/blas_dtype, + /*c_type=*/blas_dtype, computation_type, + se::blas::PointerMode::kHost, se::blas::Epilogue::kDefault, + blas_transpose_b, blas_transpose_a, n, m, k, + /*lda=*/in_y.dim_size(2), /*ldb=*/in_x.dim_size(2), /*ldc=*/n, + batch_size, b_stride, a_stride, c_stride}); + OP_REQUIRES(context, status_or_plan.ok(), + FromExecutorStatus(status_or_plan)); std::unique_ptr plan = - stream->parent()->CreateBlasLtMatmulPlan( - {/*ab_type=*/blas_dtype, - /*c_type=*/blas_dtype, computation_type, - se::blas::PointerMode::kHost, se::blas::Epilogue::kDefault, - blas_transpose_b, blas_transpose_a, n, m, k, - /*lda=*/in_y.dim_size(2), /*ldb=*/in_x.dim_size(2), /*ldc=*/n, - batch_size, b_stride, a_stride, c_stride}); - OP_REQUIRES( - context, plan, - errors::Internal("CreateBlasLtMatmulPlan failed : a.shape=(", - in_x.dim_size(0), ", ", in_x.dim_size(1), ", ", - in_x.dim_size(2), "), b.shape=(", in_y.dim_size(0), - ", ", in_y.dim_size(1), ", ", in_y.dim_size(2), - "), m=", m, ", n=", n, ", k=", k, - ", batch_size=", batch_size, ", adjoint_a=", adj_x, - ", adjoint_b=", adj_x, ", dtype=", dtype, - ", computation_type=", computation_type)); - std::vector> - algorithms; - OP_REQUIRES( - context, - stream->parent()->GetBlasLtMatmulAlgorithms( - plan.get(), max_scratch_size, max_algorithm_count, &algorithms), - errors::Internal("GetBlasLtMatmulAlgorithms failed: a.shape=(", - in_x.dim_size(0), ", ", in_x.dim_size(1), ", ", - in_x.dim_size(2), "), b.shape=(", in_y.dim_size(0), - ", ", 
in_y.dim_size(1), ", ", in_y.dim_size(2), - "), m=", m, ", n=", n, ", k=", k, - ", batch_size=", batch_size, ", adjoint_a=", adj_x, - ", adjoint_b=", adj_x, ", dtype=", dtype, - ", computation_type=", computation_type)); + status_or_plan.ConsumeValueOrDie(); + + auto status_or_algorithms = stream->parent()->GetBlasLtMatmulAlgorithms( + plan.get(), max_scratch_size, max_algorithm_count); + OP_REQUIRES(context, status_or_algorithms.ok(), + FromExecutorStatus(status_or_algorithms)); + auto algorithms = status_or_algorithms.ConsumeValueOrDie(); + plan_and_algorithms = BatchMatmulPlanMapSingleton::GetInstance()->Insert( matmul_parameters, {std::move(plan), std::move(algorithms)}); diff --git a/tensorflow/stream_executor/blas.h b/tensorflow/stream_executor/blas.h index 884231cd67c..15019267b3e 100644 --- a/tensorflow/stream_executor/blas.h +++ b/tensorflow/stream_executor/blas.h @@ -1454,19 +1454,18 @@ class BlasSupport { // Creates a backend-specific plan object for a blaslt matmul operation, which // can then be passed to DoBlasLtMatmul(). When possible, plans should be // created once and reused for multiple calls to DoBlasLtMatmul(). - // Returns a null pointer on failure. - virtual std::unique_ptr CreateBlasLtMatmulPlan( - const blas::BlasLtMatmulPlanParams& params) = 0; + virtual port::StatusOr> + CreateBlasLtMatmulPlan(const blas::BlasLtMatmulPlanParams& params) = 0; // Gets a list of supported algorithms for DoBlasLtMatmul. The algorithms are // returned in the order of increasing estimated compute time according to an // internal heuristic. The first returned algorithm can be used as the default // algorithm if no autotuning is to be performed. 
- virtual bool GetBlasLtMatmulAlgorithms( - const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, - int max_algorithm_count, - std::vector>* - out_algorithms) = 0; + virtual port::StatusOr< + std::vector>> + GetBlasLtMatmulAlgorithms(const blas::IBlasLtMatmulPlan* plan, + size_t max_workspace_size, + int max_algorithm_count) = 0; // Executes a blaslt matmul operation on the stream. If output_profile_result // is not nullptr, the operation is profiled, error messages are @@ -2330,13 +2329,12 @@ class BlasSupport { uint64 n, std::complex alpha, \ const DeviceMemory> &a, int lda, \ DeviceMemory> *b, int ldb) override; \ - std::unique_ptr CreateBlasLtMatmulPlan( \ - const blas::BlasLtMatmulPlanParams& params) override; \ - bool GetBlasLtMatmulAlgorithms( \ - const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, \ - int max_algorithm_count, \ - std::vector>* \ - out_algorithms) override; \ + port::StatusOr> \ + CreateBlasLtMatmulPlan(const blas::BlasLtMatmulPlanParams& params) override; \ + port::StatusOr>> \ + GetBlasLtMatmulAlgorithms(const blas::IBlasLtMatmulPlan* plan, \ + size_t max_workspace_size, \ + int max_algorithm_count) override; \ bool DoBlasLtMatmul( \ Stream* stream, const blas::IBlasLtMatmulPlan* plan, \ const HostOrDeviceScalar& alpha, DeviceMemoryBase a, \ diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index d8095f7d8d9..6ec1a82a905 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -3057,45 +3057,48 @@ bool CUDABlas::DoBlasTrsm(Stream *stream, blas::Side side, namespace { template -inline bool SetCublasLtAttr(cublasLtMatrixLayout_t handle, - cublasLtMatrixLayoutAttribute_t attr, - const T& value) { +inline port::Status SetCublasLtAttr(cublasLtMatrixLayout_t handle, + cublasLtMatrixLayoutAttribute_t attr, + const T& value) { cublasStatus_t status = cublasLtMatrixLayoutSetAttribute(handle, attr, &value, sizeof(T)); 
if (status != CUBLAS_STATUS_SUCCESS) { - VLOG(2) << "cublasLtMatrixLayoutSetAttribute(attr=" << attr - << ", value=" << value << ") failed: " << ToString(status); - return false; + return port::Status( + port::error::INTERNAL, + absl::StrCat("cublasLtMatrixLayoutSetAttribute(attr=", attr, + ", value=", value, ") failed: ", ToString(status))); } - return true; + return port::Status::OK(); } template -inline bool SetCublasLtAttr(cublasLtMatmulAlgo_t* handle, - cublasLtMatmulAlgoConfigAttributes_t attr, - const T& value) { +inline port::Status SetCublasLtAttr(cublasLtMatmulAlgo_t* handle, + cublasLtMatmulAlgoConfigAttributes_t attr, + const T& value) { cublasStatus_t status = cublasLtMatmulAlgoConfigSetAttribute(handle, attr, &value, sizeof(T)); if (status != CUBLAS_STATUS_SUCCESS) { - VLOG(2) << "cublasLtMatmulAlgoConfigSetAttribute(attr=" << attr - << ", value=" << value << ") failed: " << ToString(status); - return false; + return port::Status( + port::error::INTERNAL, + absl::StrCat("cublasLtMatmulAlgoConfigSetAttribute(attr=", attr, + ", value=", value, ") failed: ", ToString(status))); } - return true; + return port::Status::OK(); } template -inline bool SetCublasLtAttr(cublasLtMatmulPreference_t handle, - cublasLtMatmulPreferenceAttributes_t attr, - const T& value) { +inline port::Status SetCublasLtAttr(cublasLtMatmulPreference_t handle, + cublasLtMatmulPreferenceAttributes_t attr, + const T& value) { cublasStatus_t status = cublasLtMatmulPreferenceSetAttribute(handle, attr, &value, sizeof(value)); if (status != CUBLAS_STATUS_SUCCESS) { - VLOG(2) << "cublasLtMatmulPreferenceSetAttribute(attr=" << attr - << ", value=" << value << ") failed: " << ToString(status); - return false; + return port::Status( + port::error::INTERNAL, + absl::StrCat("cublasLtMatmulPreferenceSetAttribute(attr=", attr, + ", value=", value, ") failed: ", ToString(status))); } - return true; + return port::Status::OK(); } template @@ -3111,17 +3114,27 @@ inline bool GetCublasLtAttr(const 
cublasLtMatmulAlgo_t* handle, } template -inline bool SetCublasLtAttr(cublasLtMatmulDesc_t handle, - cublasLtMatmulDescAttributes_t attr, - const T& value) { +inline const T& ValueForStrCat(const T& value) { + return value; +} +template +inline absl::Hex ValueForStrCat(T* ptr) { + return absl::Hex(reinterpret_cast(ptr)); +} + +template +inline port::Status SetCublasLtAttr(cublasLtMatmulDesc_t handle, + cublasLtMatmulDescAttributes_t attr, + const T& value) { cublasStatus_t status = cublasLtMatmulDescSetAttribute(handle, attr, &value, sizeof(value)); if (status != CUBLAS_STATUS_SUCCESS) { - VLOG(2) << "cublasLtMatmulDescSetAttribute(attr=" << attr - << ", value=" << value << ") failed: " << ToString(status); - return false; + return port::Status( + port::error::INTERNAL, + absl::StrCat("cublasLtMatmulDescSetAttribute(attr=", attr, ", value=", + ValueForStrCat(value), ") failed: ", ToString(status))); } - return true; + return port::Status::OK(); } struct MatmulDescDestroyer { @@ -3149,12 +3162,10 @@ using UniqueMatmulPreference = std::unique_ptr::type, MatmulPreferenceDestroyer>; -UniqueOpDesc CreateCublasLtOperationDesc(blas::ComputationType computation_type, - blas::DataType scale_type, - blas::PointerMode pointer_mode, - blas::Epilogue epilogue, - blas::Transpose transa, - blas::Transpose transb) { +port::StatusOr CreateCublasLtOperationDesc( + blas::ComputationType computation_type, blas::DataType scale_type, + blas::PointerMode pointer_mode, blas::Epilogue epilogue, + blas::Transpose transa, blas::Transpose transb) { cublasLtMatmulDesc_t desc; cublasComputeType_t cublas_compute_type = CUBLASComputationType(computation_type); @@ -3162,40 +3173,39 @@ UniqueOpDesc CreateCublasLtOperationDesc(blas::ComputationType computation_type, cublasStatus_t status = cublasLtMatmulDescCreate(&desc, cublas_compute_type, cuda_scale_type); if (status != CUBLAS_STATUS_SUCCESS) { - VLOG(2) << "cublasLtMatmulDescCreate(computation_type=" << computation_type - << ") failed: " << 
ToString(status); - return nullptr; + return port::Status( + port::error::INTERNAL, + absl::StrCat("cublasLtMatmulDescCreate(computation_type=", + computation_type, ") failed: ", ToString(status))); } UniqueOpDesc unique_desc(desc); - if (!SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, - CUBLASPointerMode(pointer_mode)) || - !SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_EPILOGUE, - CUBLASEpilogue(epilogue)) || - !SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_TRANSA, - CUDABlasTranspose(transa)) || - !SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_TRANSB, - CUDABlasTranspose(transb))) { - return nullptr; - } + SE_RETURN_IF_ERROR(SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_POINTER_MODE, + CUBLASPointerMode(pointer_mode))); + SE_RETURN_IF_ERROR(SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_EPILOGUE, + CUBLASEpilogue(epilogue))); + SE_RETURN_IF_ERROR(SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_TRANSA, + CUDABlasTranspose(transa))); + SE_RETURN_IF_ERROR(SetCublasLtAttr(desc, CUBLASLT_MATMUL_DESC_TRANSB, + CUDABlasTranspose(transb))); return unique_desc; } -UniqueLayoutDesc CreateCublasLtLayoutDesc(blas::DataType data_type, uint64 rows, - uint64 cols, int64 ld, int64 stride, - int batch_count) { +port::StatusOr CreateCublasLtLayoutDesc( + blas::DataType data_type, uint64 rows, uint64 cols, int64 ld, int64 stride, + int batch_count) { cublasLtMatrixLayout_t desc; cublasStatus_t status = cublasLtMatrixLayoutCreate( &desc, GetCUDADataType(data_type), rows, cols, ld); if (status != CUBLAS_STATUS_SUCCESS) { - VLOG(2) << "cublasLtMatrixLayoutCreate failed: " << ToString(status); - return nullptr; + return port::Status( + port::error::INTERNAL, + absl::StrCat("cublasLtMatrixLayoutCreate failed: ", ToString(status))); } UniqueLayoutDesc unique_desc(desc); - if (!SetCublasLtAttr(desc, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, batch_count) || - !SetCublasLtAttr(desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, - stride)) { - return nullptr; - } + SE_RETURN_IF_ERROR( + SetCublasLtAttr(desc, 
CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, batch_count)); + SE_RETURN_IF_ERROR(SetCublasLtAttr( + desc, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, stride)); return unique_desc; } @@ -3234,7 +3244,28 @@ blas::ComputationType ToComputationType>() { class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { public: - CUDABlasLtMatmulPlan(const blas::BlasLtMatmulPlanParams& params); + CUDABlasLtMatmulPlan(UniqueOpDesc op_desc, UniqueLayoutDesc a_desc, + UniqueLayoutDesc b_desc, UniqueLayoutDesc c_desc, + UniqueLayoutDesc d_desc, blas::DataType ab_type, + blas::DataType c_type, blas::DataType scale_type, + blas::PointerMode pointer_mode, blas::Epilogue epilogue, + int batch_count, int64 stride_a, int64 stride_b, + int64 stride_c, int64 stride_d) + : op_desc_(std::move(op_desc)), + a_desc_(std::move(a_desc)), + b_desc_(std::move(b_desc)), + c_desc_(std::move(c_desc)), + d_desc_(std::move(d_desc)), + ab_type_(ab_type), + c_type_(c_type), + scale_type_(scale_type), + pointer_mode_(pointer_mode), + epilogue_(epilogue), + batch_count_(batch_count), + stride_a_(stride_a), + stride_b_(stride_b), + stride_c_(stride_c), + stride_d_(stride_d) {} cublasLtMatmulDesc_t op_desc() const { return op_desc_.get(); } cublasLtMatrixLayout_t a_desc() const { return a_desc_.get(); } @@ -3276,40 +3307,9 @@ class CUDABlasLtMatmulPlan final : public blas::IBlasLtMatmulPlan { int64 stride_d_; }; -CUDABlasLtMatmulPlan::CUDABlasLtMatmulPlan( - const blas::BlasLtMatmulPlanParams& p) - : op_desc_(CreateCublasLtOperationDesc( - p.computation_type, GetScaleType(p.c_type, p.computation_type), - p.pointer_mode, p.epilogue, p.transa, p.transb)), - a_desc_(nullptr), - b_desc_(nullptr), - c_desc_(CreateCublasLtLayoutDesc(p.c_type, p.m, p.n, p.ldc, p.stride_c, - p.batch_count)), - d_desc_(CreateCublasLtLayoutDesc(p.c_type, p.m, p.n, p.ldc, p.stride_c, - p.batch_count)), - ab_type_(p.ab_type), - cd_type_(p.c_type), - scale_type_(GetScaleType(p.c_type, p.computation_type)), - 
pointer_mode_(p.pointer_mode), - epilogue_(p.epilogue), - batch_count_(p.batch_count), - stride_a_(p.stride_a), - stride_b_(p.stride_b), - stride_c_(p.stride_c), - stride_d_(p.stride_c) { - uint64 rows_a = p.transa == blas::Transpose::kNoTranspose ? p.m : p.k; - uint64 cols_a = p.transa == blas::Transpose::kNoTranspose ? p.k : p.m; - uint64 rows_b = p.transb == blas::Transpose::kNoTranspose ? p.k : p.n; - uint64 cols_b = p.transb == blas::Transpose::kNoTranspose ? p.n : p.k; - a_desc_ = CreateCublasLtLayoutDesc(p.ab_type, rows_a, cols_a, p.lda, - p.stride_a, p.batch_count); - b_desc_ = CreateCublasLtLayoutDesc(p.ab_type, rows_b, cols_b, p.ldb, - p.stride_b, p.batch_count); -} - bool CUDABlasLtMatmulPlan::SetBiasPointer(const void* bias) const { return SetCublasLtAttr(op_desc_.get(), CUBLASLT_MATMUL_DESC_BIAS_POINTER, - bias); + bias).ok(); } class CUDABlasLtMatmulAlgorithm final : public blas::IBlasLtMatmulAlgorithm { @@ -3336,20 +3336,19 @@ class CUDABlasLtMatmulAlgorithm final : public blas::IBlasLtMatmulAlgorithm { size_t workspace_size_; }; -UniqueMatmulPreference CreateCublasLtMatmulPreference( - const blas::IBlasLtMatmulPlan* plan, - size_t max_workspace_bytes) { +port::StatusOr CreateCublasLtMatmulPreference( + const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_bytes) { cublasLtMatmulPreference_t preference; cublasStatus_t status = cublasLtMatmulPreferenceCreate(&preference); if (status != CUBLAS_STATUS_SUCCESS) { - VLOG(2) << "cublasLtMatmulPreferenceCreate failed: " << ToString(status); - return nullptr; + return port::Status(port::error::INTERNAL, + absl::StrCat("cublasLtMatmulPreferenceCreate failed: ", + ToString(status))); } UniqueMatmulPreference unique_preference(preference); - if (!SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - max_workspace_bytes)) { - return nullptr; - } + SE_RETURN_IF_ERROR(SetCublasLtAttr(preference, + CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + max_workspace_bytes)); const auto& cuda_plan = 
*static_cast(plan); if (cuda_plan.batch_count() == 0) { @@ -3361,25 +3360,28 @@ UniqueMatmulPreference CreateCublasLtMatmulPreference( auto get_alignment_bytes = [](int64 stride, blas::DataType dtype) { return (stride & -stride) * GetDataTypeSizeBytes(dtype); }; - if ((cuda_plan.stride_a() && - !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, + if (cuda_plan.stride_a()) { + SE_RETURN_IF_ERROR( + SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, (uint32)get_alignment_bytes(cuda_plan.stride_a(), - cuda_plan.ab_type()))) || - (cuda_plan.stride_b() && - !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, - (uint32)get_alignment_bytes(cuda_plan.stride_b(), - cuda_plan.ab_type()))) || - (cuda_plan.stride_c() && - !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, - (uint32)get_alignment_bytes(cuda_plan.stride_c(), - cuda_plan.cd_type()))) || - (cuda_plan.stride_d() && - !SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES, - (uint32)get_alignment_bytes(cuda_plan.stride_d(), - cuda_plan.cd_type())))) { - return nullptr; + cuda_plan.ab_type()))); + } + if (cuda_plan.stride_b()) { + SE_RETURN_IF_ERROR( + SetCublasLtAttr(preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, + (uint32)get_alignment_bytes(cuda_plan.stride_b(), + cuda_plan.ab_type()))); + } + if (cuda_plan.stride_c()) { + SE_RETURN_IF_ERROR(SetCublasLtAttr( + preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, + (uint32)get_alignment_bytes(cuda_plan.stride_c(), cuda_plan.c_type()))); + } + if (cuda_plan.stride_d()) { + SE_RETURN_IF_ERROR(SetCublasLtAttr( + preference, CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES, + (uint32)get_alignment_bytes(cuda_plan.stride_d(), cuda_plan.c_type()))); } - return unique_preference; } @@ -3387,28 +3389,50 @@ UniqueMatmulPreference CreateCublasLtMatmulPreference( #endif // CUDA_VERSION >= 11000 -std::unique_ptr CUDABlas::CreateBlasLtMatmulPlan( - const 
blas::BlasLtMatmulPlanParams& params) { +port::StatusOr> +CUDABlas::CreateBlasLtMatmulPlan(const blas::BlasLtMatmulPlanParams& p) { #if CUDA_VERSION >= 11000 - auto result = std::make_unique(params); - if (!result->ok()) { - result.reset(); - } - return result; + SE_ASSIGN_OR_RETURN( + auto op_desc, + CreateCublasLtOperationDesc( + p.computation_type, GetScaleType(p.c_type, p.computation_type), + p.pointer_mode, p.epilogue, p.transa, p.transb)); + uint64 rows_a = p.transa == blas::Transpose::kNoTranspose ? p.m : p.k; + uint64 cols_a = p.transa == blas::Transpose::kNoTranspose ? p.k : p.m; + uint64 rows_b = p.transb == blas::Transpose::kNoTranspose ? p.k : p.n; + uint64 cols_b = p.transb == blas::Transpose::kNoTranspose ? p.n : p.k; + SE_ASSIGN_OR_RETURN(auto a_desc, + CreateCublasLtLayoutDesc(p.ab_type, rows_a, cols_a, p.lda, + p.stride_a, p.batch_count)); + SE_ASSIGN_OR_RETURN(auto b_desc, + CreateCublasLtLayoutDesc(p.ab_type, rows_b, cols_b, p.ldb, + p.stride_b, p.batch_count)); + SE_ASSIGN_OR_RETURN(auto c_desc, + CreateCublasLtLayoutDesc(p.c_type, p.m, p.n, p.ldc, + p.stride_c, p.batch_count)); + SE_ASSIGN_OR_RETURN(auto d_desc, + CreateCublasLtLayoutDesc(p.c_type, p.m, p.n, p.ldc, + p.stride_c, p.batch_count)); + blas::DataType scale_type = GetScaleType(p.c_type, p.computation_type); + + return static_cast>( + std::make_unique( + std::move(op_desc), std::move(a_desc), std::move(b_desc), + std::move(c_desc), std::move(d_desc), p.ab_type, p.c_type, scale_type, + p.pointer_mode, p.epilogue, p.batch_count, p.stride_a, p.stride_b, + p.stride_c, p.stride_c)); #else return nullptr; #endif } -bool CUDABlas::GetBlasLtMatmulAlgorithms( - const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, - int max_algorithm_count, - std::vector>* - out_algorithms) { +port::StatusOr>> +CUDABlas::GetBlasLtMatmulAlgorithms(const blas::IBlasLtMatmulPlan* plan, + size_t max_workspace_size, + int max_algorithm_count) { #if CUDA_VERSION >= 11000 - UniqueMatmulPreference preference 
= - CreateCublasLtMatmulPreference(plan, max_workspace_size); - if (!preference) return false; + SE_ASSIGN_OR_RETURN(UniqueMatmulPreference preference, + CreateCublasLtMatmulPreference(plan, max_workspace_size)); std::vector results(max_algorithm_count); { @@ -3425,21 +3449,27 @@ bool CUDABlas::GetBlasLtMatmulAlgorithms( cuda_plan.c_desc(), cuda_plan.d_desc(), preference.get(), max_algorithm_count, results.data(), &found_algorithm_count); if (status != CUBLAS_STATUS_SUCCESS) { - VLOG(2) << "cublasLtMatmulAlgoGetHeuristic failed: " << ToString(status); - return false; + return port::Status( + port::error::INTERNAL, + absl::StrCat("cublasLtMatmulAlgoGetHeuristic failed: ", + ToString(status))); } results.resize(found_algorithm_count); } + std::vector> out_algorithms; + out_algorithms.reserve(results.size()); for (size_t i = 0; i < results.size(); ++i) { const auto& result = results[i]; if (result.state != CUBLAS_STATUS_SUCCESS) continue; // Skip failed algos - out_algorithms->emplace_back(std::make_unique( + out_algorithms.emplace_back(std::make_unique( i, result.algo, result.workspaceSize)); } - return true; + return out_algorithms; #else // if CUDA_VERSION < 11000 - return false; + return port::Status( + port::error::UNIMPLEMENTED, + "GetBlasLtMatmulAlgorithms is not supported with this version of CUDA"); #endif } diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index d40b6adc285..cf215b35d76 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -336,26 +336,28 @@ bool StreamExecutor::GetBlasGemmAlgorithms( return blas_support->GetBlasGemmAlgorithms(out_algorithms); } -std::unique_ptr StreamExecutor::CreateBlasLtMatmulPlan( +port::StatusOr> +StreamExecutor::CreateBlasLtMatmulPlan( const blas::BlasLtMatmulPlanParams& params) { - blas::BlasSupport *blas_support = AsBlas(); + blas::BlasSupport* blas_support = AsBlas(); if 
(!blas_support) { - return nullptr; + return port::Status(port::error::UNKNOWN, + "Fail to find the blas implementation."); } return blas_support->CreateBlasLtMatmulPlan(params); } -bool StreamExecutor::GetBlasLtMatmulAlgorithms( - const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, - int max_algorithm_count, - std::vector>* - out_algorithms) { - blas::BlasSupport *blas_support = AsBlas(); +port::StatusOr>> +StreamExecutor::GetBlasLtMatmulAlgorithms(const blas::IBlasLtMatmulPlan* plan, + size_t max_workspace_size, + int max_algorithm_count) { + blas::BlasSupport* blas_support = AsBlas(); if (!blas_support) { - return false; + return port::Status(port::error::UNKNOWN, + "Fail to find the blas implementation."); } - return blas_support->GetBlasLtMatmulAlgorithms( - plan, max_workspace_size, max_algorithm_count, out_algorithms); + return blas_support->GetBlasLtMatmulAlgorithms(plan, max_workspace_size, + max_algorithm_count); } port::StatusOr> diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index ce801bf0f28..d9a1096ee52 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -398,18 +398,16 @@ class StreamExecutor { // can then be passed to DoBlasLtMatmul(). When possible, plans should be // created once and reused for multiple calls to DoBlasLtMatmul(). // Returns a null pointer on failure. - std::unique_ptr CreateBlasLtMatmulPlan( - const blas::BlasLtMatmulPlanParams& params); + port::StatusOr> + CreateBlasLtMatmulPlan(const blas::BlasLtMatmulPlanParams& params); // Gets a list of supported algorithms for DoBlasLtMatmul. The algorithms are // returned in the order of increasing estimated compute time according to an // internal heuristic. The first returned algorithm can be used as the default // algorithm if no autotuning is to be performed. 
- bool GetBlasLtMatmulAlgorithms( - const blas::IBlasLtMatmulPlan* plan, size_t max_workspace_size, - int max_algorithm_count, - std::vector>* - out_algorithms); + port::StatusOr>> + GetBlasLtMatmulAlgorithms(const blas::IBlasLtMatmulPlan* plan, + size_t max_workspace_size, int max_algorithm_count); // Create an RNN descriptor based on model shapes and configurations. // The caller retains the ownership of the descriptor. From fef68d4b601c6f4c145157e24c3cc02846348940 Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 5 Oct 2020 21:30:05 +1100 Subject: [PATCH 0091/1447] Remove unneeded code for HostOrDeviceScalar --- .../stream_executor/host_or_device_scalar.h | 23 ------------------- tensorflow/stream_executor/stream.cc | 8 ------- 2 files changed, 31 deletions(-) diff --git a/tensorflow/stream_executor/host_or_device_scalar.h b/tensorflow/stream_executor/host_or_device_scalar.h index 5f06cf027a0..e5319e7d187 100644 --- a/tensorflow/stream_executor/host_or_device_scalar.h +++ b/tensorflow/stream_executor/host_or_device_scalar.h @@ -137,29 +137,6 @@ class HostOrDeviceScalar { } DataType data_type() const { return dtype_; } - template - ResultType CallWithValue(GenericUnaryFunc func) const { - CHECK(!is_pointer()); - switch (dtype_) { - case DataType::kFloat: - return func(float_); - case DataType::kDouble: - return func(double_); - case DataType::kHalf: - return func(half_); - case DataType::kInt8: - return func(int8_); - case DataType::kInt32: - return func(int32_); - case DataType::kComplexFloat: - return func(complex_float_); - case DataType::kComplexDouble: - return func(complex_double_); - default: - return {}; - } - } - private: template const T& value_impl() const; diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc index 536ee87d219..d497181209c 100644 --- a/tensorflow/stream_executor/stream.cc +++ b/tensorflow/stream_executor/stream.cc @@ -140,14 +140,6 @@ std::string ToVlogString(const HostOrDeviceScalar 
&memory_or_constant) { return ToVlogString(memory_or_constant.value()); } -std::string ToVlogString(const HostOrDeviceScalar& memory_or_constant) { - if (memory_or_constant.is_pointer()) { - return ToVlogString(memory_or_constant.opaque_pointer()); - } - return memory_or_constant.CallWithValue( - [](const auto& value) { return ToVlogString(value); }); -} - template std::string ToVlogString(port::ArraySlice elements) { std::string str = absl::StrCat( From 273cfeea17f15bcfa4ca31938da092647e5a0536 Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Mon, 5 Oct 2020 22:48:11 +1100 Subject: [PATCH 0092/1447] Fix cuda_blas.cc compilation for old CUDA versions --- tensorflow/stream_executor/cuda/cuda_blas.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc index 6ec1a82a905..58c63ed396d 100644 --- a/tensorflow/stream_executor/cuda/cuda_blas.cc +++ b/tensorflow/stream_executor/cuda/cuda_blas.cc @@ -3422,7 +3422,9 @@ CUDABlas::CreateBlasLtMatmulPlan(const blas::BlasLtMatmulPlanParams& p) { p.pointer_mode, p.epilogue, p.batch_count, p.stride_a, p.stride_b, p.stride_c, p.stride_c)); #else - return nullptr; + return port::Status( + port::error::UNIMPLEMENTED, + "CreateBlasLtMatmulPlan is not supported with this version of CUDA"); #endif } From 6db06a91c8b065cb546dc923e4c4daa45bb2bbf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Mon, 5 Oct 2020 15:27:29 +0200 Subject: [PATCH 0093/1447] TFLu: Update review comment in ethosu.inc --- tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc b/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc index e29f5df1eea..c136b3e7d1f 100644 --- a/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc +++ b/tensorflow/lite/micro/tools/make/ext_libs/ethosu.inc @@ -1,5 +1,10 @@ ifneq ($(filter 
ethos-u,$(ALL_TAGS)),) - # Do not link Math library + # Arm Compiler will not link the Math library (see below), therefore we're filtering it out. + # See Fatal error: L6450U: Cannot find library m: + # "Arm Compiler is designed to run in a bare metal environment, + # and automatically includes implementations of these functions, + # and so no such flag is necessary." + # https://developer.arm.com/documentation/100891/0611/troubleshooting/general-troubleshooting-advice MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) ifneq (,$(filter $(TARGET_ARCH), x86_64)) From 6710acef648d5b8215ef058fd46251ed75c5a5c3 Mon Sep 17 00:00:00 2001 From: Yoav Ramon Date: Mon, 5 Oct 2020 23:45:00 +0300 Subject: [PATCH 0094/1447] Updated documentation of TimeDistributed It is not clear that the weights are shared at the current documentation. I saw this question repeats at stack overflow (https://stackoverflow.com/questions/43265084/keras-timedistributed-are-weights-shared), and I thought it is worth writing that explicitly. --- tensorflow/python/keras/layers/wrappers.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py index 6798e5c8fff..5f8dc86462b 100644 --- a/tensorflow/python/keras/layers/wrappers.py +++ b/tensorflow/python/keras/layers/wrappers.py @@ -102,6 +102,9 @@ class TimeDistributed(Wrapper): >>> outputs.shape TensorShape([None, 10, 126, 126, 64]) + Because `TimeDistributed` applies an instance of `Conv2D` to each of the + timestamps, the weights used at each timestamp are shared. + Arguments: layer: a `tf.keras.layers.Layer` instance. 
From 9fa0e727bb42a13d9b5a0eb5ae37c62749623d71 Mon Sep 17 00:00:00 2001 From: xiaohong1031 Date: Mon, 5 Oct 2020 13:46:56 -0700 Subject: [PATCH 0095/1447] fix control_flow_ops_test unit test failure --- .../core/common_runtime/mkl_layout_pass.cc | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 176670c8aa5..977ce9d06da 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -527,7 +527,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { CopyAttrsAll, NonDepthBatchWisePoolRewrite, GetRewriteCause()}); rinfo_.push_back({csinfo_.max_pool3d_grad, mkl_op_registry::GetMklOpName(csinfo_.max_pool3d_grad), - CopyAttrsAll, AlwaysRewrite, GetRewriteCause()}); + CopyAttrsAll, Maxpool3DGradRewrite, GetRewriteCause()}); rinfo_.push_back( {csinfo_.maximum, mkl_op_registry::GetMklOpName(csinfo_.maximum), CopyAttrsAll, RewriteIfAtleastOneMklInput, GetRewriteCause()}); @@ -1694,6 +1694,24 @@ class MklLayoutRewritePass : public GraphOptimizationPass { return do_rewrite; } + static bool Maxpool3DGradRewrite(const Node* n) { + CHECK_NOTNULL(n); + bool do_rewrite = false; + for (const Edge* e : n->in_edges()) { + // Rewrite only if there is corresponding Maxpool3D, i.e., workspace is + // available + if (e->dst()->type_string() == csinfo_.max_pool3d_grad && + e->dst_input() == 1 && + e->src()->type_string() == + mkl_op_registry::GetMklOpName(csinfo_.max_pool3d) && + e->src_output() == 0) { + do_rewrite = true; + break; + } + } + return do_rewrite; + } + static bool FusedBatchNormExRewrite(const Node* n) { DCHECK(n); From 152a9384c52ede9ba47ba8ef2743c810a591da57 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Tue, 6 Oct 2020 11:40:35 +0200 Subject: [PATCH 0096/1447] TFlu: Fix bug in micro allocator test and update review comments --- 
tensorflow/lite/micro/micro_allocator_test.cc | 236 ++++++++---------- tensorflow/lite/micro/test_helpers.cc | 2 +- 2 files changed, 109 insertions(+), 129 deletions(-) diff --git a/tensorflow/lite/micro/micro_allocator_test.cc b/tensorflow/lite/micro/micro_allocator_test.cc index 87484d0dfb4..a0048d68150 100644 --- a/tensorflow/lite/micro/micro_allocator_test.cc +++ b/tensorflow/lite/micro/micro_allocator_test.cc @@ -28,6 +28,12 @@ namespace testing { namespace { constexpr int kExpectedAlignment = 4; +constexpr int t0 = 0; +constexpr int t1 = 1; +constexpr int t2 = 2; +constexpr int t3 = 3; +constexpr int t4 = 4; +constexpr int t5 = 5; void VerifyMockTfLiteTensor(TfLiteTensor* tensor, bool is_variable = false) { TF_LITE_MICRO_EXPECT_EQ(kTfLiteInt32, tensor->type); @@ -434,18 +440,18 @@ TF_LITE_MICRO_TEST(TestAllocationForComplexModelAllocation) { TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { int version = 1; int subgraph = 0; - constexpr int nbr_tensors = 4; + constexpr int number_tensors = 4; tflite::AllOpsResolver op_resolver = tflite::testing::GetOpResolver(); tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + - nbr_tensors] = {version, subgraph, - nbr_tensors, // header + number_tensors] = {version, subgraph, + number_tensors, // header // memory offsets: -1, -1, -1, -1}; // The structure is identical to the one in // TestAllocationForModelsWithBranches - int num_conns = 3; + int number_connections = 3; tflite::testing::NodeConnection node_list[3] = {{ {0}, // input {1} // output @@ -460,7 +466,7 @@ TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { }}; const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list, num_conns); + number_tensors, metadata_buffer, node_list, number_connections); TfLiteEvalTensor* eval_tensors = nullptr; constexpr size_t arena_size = 4096; @@ -491,37 +497,33 @@ 
TF_LITE_MICRO_TEST(OfflinePlannerBranchesAllOnline) { } TF_LITE_MICRO_TEST(OfflinePlannerBasic) { - constexpr int nbr_tensors = 4; + constexpr int number_tensors = 4; tflite::AllOpsResolver op_resolver = tflite::testing::GetOpResolver(); tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + - nbr_tensors] = {1, 0, nbr_tensors, - 0, // t0 - 48, // t1 - 0, // t2 - 48}; // t3 - - int t0 = 0; - int t1 = 1; - int t2 = 2; - int t3 = 3; - - int num_conns = 3; - tflite::testing::NodeConnection node_list[3] = {{ - {t0}, // input - {t1} // output - }, - { - {t1}, // input - {t2} // output - }, - { - {t2}, // input - {t3} // output - }}; + number_tensors] = {1, 0, number_tensors, + /*t0=*/0, + /*t1=*/48, + /*t2=*/0, + /*t3=*/48}; + constexpr int number_connections = 3; + tflite::testing::NodeConnection node_list[number_connections] = { + { + /*input=*/{tflite::testing::t0}, + /*output=*/{tflite::testing::t1} + }, + { + /*input=*/{tflite::testing::t1}, + /*output=*/{tflite::testing::t2} + }, + { + /*input=*/{tflite::testing::t2}, + /*output=*/{tflite::testing::t3} + } + }; const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list, num_conns); + number_tensors, metadata_buffer, node_list, number_connections); TfLiteEvalTensor* eval_tensors = nullptr; constexpr size_t arena_size = 4096; @@ -544,37 +546,32 @@ TF_LITE_MICRO_TEST(OfflinePlannerBasic) { } TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { - constexpr int nbr_tensors = 4; + constexpr int number_tensors = 4; tflite::AllOpsResolver op_resolver = tflite::testing::GetOpResolver(); tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + - nbr_tensors] = { - 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors - // memory offsets: - 0, // t0 - 0, // t1 - 48, // t2 - -1}; // t3 + number_tensors] = 
{/*version=*/1, + /*subgraph=*/0, + number_tensors, + /*t0=*/0, + /*t1=*/0, + /*t2=*/48, + /*t3=*/-1}; - int t0 = 0; - int t1 = 1; - int t2 = 2; - int t3 = 3; - - int num_conns = 2; + int number_connections = 2; tflite::testing::NodeConnection node_list[2] = { { - {t0, t1}, // input, scratch - {t2} // output + /*input, scratch=*/{tflite::testing::t0, tflite::testing::t1}, + /*output=*/{tflite::testing::t2} }, { - {t2}, // input - {t3} // output + /*input=*/{tflite::testing::t2}, + /*output=*/{tflite::testing::t3} }, }; const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list, num_conns); + number_tensors, metadata_buffer, node_list, number_connections); TfLiteEvalTensor* eval_tensors = nullptr; constexpr size_t arena_size = 4096; @@ -598,39 +595,33 @@ TF_LITE_MICRO_TEST(OfflinePlannerOverlappingAllocation) { } TF_LITE_MICRO_TEST(OfflinePlannerOfflineOnline) { - constexpr int nbr_tensors = 5; + constexpr int number_tensors = 5; tflite::AllOpsResolver op_resolver = tflite::testing::GetOpResolver(); tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + - nbr_tensors] = { - 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors - // memory offsets: - 0, // t0 - 48, // t1 - -1, // t2 - 0, // t3 - -1}; // t4 + number_tensors] = {/*version=*/1, + /*subgraph=*/0, + number_tensors, + /*t0=*/0, + /*t1=*/48, + /*t2=*/-1, + /*t3=*/0, + /*t4=*/-1}; - int t0 = 0; - int t1 = 1; - int t2 = 2; - int t3 = 3; - int t4 = 4; - - int num_conns = 2; - tflite::testing::NodeConnection node_list[2] = { + constexpr int number_connections = 2; + tflite::testing::NodeConnection node_list[number_connections] = { { - {t0, t1}, // input, scratch - {t2}, // output + /*input, scratch=*/{tflite::testing::t0, tflite::testing::t1}, + /*output=*/{tflite::testing::t2}, }, { - {t2}, // input - {t3, t4}, // output1, output2 + /*input=*/{tflite::testing::t2}, + 
/*output1, output2=*/{tflite::testing::t3, tflite::testing::t4}, }, }; const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list, num_conns); + number_tensors, metadata_buffer, node_list, number_connections); TfLiteEvalTensor* eval_tensors = nullptr; constexpr size_t arena_size = 4096; @@ -736,43 +727,38 @@ TF_LITE_MICRO_TEST(TestAllocateTfLiteTensorWithReset) { } TF_LITE_MICRO_TEST(TestOperatorInputsNotInSubgraphInputs) { - constexpr int nbr_tensors = 5; + constexpr int number_tensors = 5; tflite::AllOpsResolver op_resolver = tflite::testing::GetOpResolver(); tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + - nbr_tensors] = { - 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors - // memory offsets: - 0, // t0 - 0, // t1 - 0, // t2 - 48, // t3 - -1}; // t4 + number_tensors] = {/*version=*/1, + /*subgraph=*/0, + number_tensors, + /*t0=*/0, + /*t1=*/0, + /*t2=*/0, + /*t3=*/48, + /*t4=*/-1}; - constexpr int t0 = 0; - constexpr int t1 = 1; - constexpr int t2 = 2; - constexpr int t3 = 3; - constexpr int t4 = 4; - - constexpr int num_conns = 2; - tflite::testing::NodeConnection node_list[num_conns] = { + constexpr int number_connections = 2; + tflite::testing::NodeConnection node_list[number_connections] = { { - {t0, t1, t2}, // t0: input (actual input part of subgraph inputs as - // well as operator inputs) - // t1: scratch1 (only in operator inputs) - // t2: scratch2 (only in operator inputs) - {t3} // output + // t0: input (actual input part of subgraph inputs as + // well as operator inputs) + // t1: scratch1 (only in operator inputs) + // t2: scratch2 (only in operator inputs) + {tflite::testing::t0, tflite::testing::t1, tflite::testing::t2}, + /*t3: output=*/{tflite::testing::t3} }, { - {t3}, // input - {t4} // output + /*t3: input=*/{tflite::testing::t3}, + /*t4: output=*/{tflite::testing::t4} }, }; const tflite::Model* 
model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list, num_conns, - 1 /* only first tensor (t0) is in subgraph input list*/); + number_tensors, metadata_buffer, node_list, number_connections, + /*Only first tensor (t0) is in subgraph input list=*/1); TfLiteEvalTensor* eval_tensors = nullptr; constexpr size_t arena_size = 4096; @@ -796,49 +782,43 @@ TF_LITE_MICRO_TEST(TestOperatorInputsNotInSubgraphInputs) { } TF_LITE_MICRO_TEST(TestTypicalFirstOpAndSecondOpWithScratchTensors) { - constexpr int nbr_tensors = 6; + constexpr int number_tensors = 6; tflite::AllOpsResolver op_resolver = tflite::testing::GetOpResolver(); tflite::NodeAndRegistration* node_and_registration; const int32_t metadata_buffer[tflite::testing::kOfflinePlannerHeaderSize + - nbr_tensors] = { - 1, 0, nbr_tensors, // header: version, subgraph, nbr tensors - // memory offsets: - 0, // t0 - 0, // t1 - 0, // t2 - 0, // t3 - 48, // t4 - -1}; // t5 + number_tensors] = {/*version=*/1, + /*subgraph=*/0, + number_tensors, + /*t0=*/0, + /*t1=*/0, + /*t2=*/0, + /*t3=*/0, + /*t4=*/48, + /*t5=*/-1}; - constexpr int t0 = 0; - constexpr int t1 = 1; - constexpr int t2 = 2; - constexpr int t3 = 3; - constexpr int t4 = 4; - constexpr int t5 = 5; - - constexpr int num_conns = 3; - tflite::testing::NodeConnection node_list[num_conns] = { + constexpr int number_connections = 3; + tflite::testing::NodeConnection node_list[number_connections] = { { - {t0}, // t0: input (actual input part of subgraph inputs as - // well as operator inputs) - {t1} // t1: output + /*t0: input (subgraph and operator input)=*/{tflite::testing::t0}, + /*t1: output=*/{tflite::testing::t1} }, { - {t1, t2, t3}, // t1: input - // t2: scratch1 (only in operator inputs) - // t3: scratch2 (only in operator inputs) - {t4} // t4: output + // t1: input + // t2: scratch1 (only in operator inputs) + // t3: scratch2 (only in operator inputs) + {tflite::testing::t1, tflite::testing::t2, tflite::testing::t3}, + + 
/*t4: output=*/{tflite::testing::t4} }, { - {t4}, // input - {t5} // output + /*t4: input=*/{tflite::testing::t4}, + /*t5: output=*/{tflite::testing::t5} }, }; const tflite::Model* model = tflite::testing::GetModelWithOfflinePlanning( - nbr_tensors, metadata_buffer, node_list, num_conns, - 1 /* only first tensor (t0) is in subgraph input list*/); + number_tensors, metadata_buffer, node_list, number_connections, + /*Only first tensor (t0) is in subgraph input list=*/1); TfLiteEvalTensor* eval_tensors = nullptr; constexpr size_t arena_size = 4096; diff --git a/tensorflow/lite/micro/test_helpers.cc b/tensorflow/lite/micro/test_helpers.cc index 26575a4d98d..6a7665a9048 100644 --- a/tensorflow/lite/micro/test_helpers.cc +++ b/tensorflow/lite/micro/test_helpers.cc @@ -205,7 +205,7 @@ const Model* ModelBuilder::BuildModel( } else { // A non-zero value of num_subgraph_inputs means that some of // the operator input tensors are not subgraph inputs. - TFLITE_DCHECK(num_subgraph_inputs < inputs.size()); + TFLITE_DCHECK(num_subgraph_inputs <= inputs.size()); } const flatbuffers::Offset subgraphs[subgraphs_size] = { From 670762556b0a8779efec4d9d6908df4f8b68e48d Mon Sep 17 00:00:00 2001 From: Mahmoud Abuzaina Date: Tue, 6 Oct 2020 08:56:57 -0700 Subject: [PATCH 0097/1447] Addressed review comments --- .../core/common_runtime/mkl_layout_pass.cc | 2 +- tensorflow/core/framework/common_shape_fns.cc | 8 +++++ tensorflow/core/framework/common_shape_fns.h | 6 ++++ .../core/kernels/mkl/mkl_maxpooling_op.cc | 6 ++-- tensorflow/core/ops/mkl_nn_ops.cc | 28 ++++++++--------- tensorflow/core/ops/nn_ops.cc | 30 +++++++------------ 6 files changed, 40 insertions(+), 40 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 1f42e3814f2..b0ebb2a1963 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -3669,7 +3669,7 @@ Status 
MklLayoutRewritePass::RewriteNodeForJustOpNameChange( AddWorkSpaceEdgeIfNeeded(g, orig_node, &nb, &workspace_tensors, &are_workspace_tensors_available); if (are_workspace_tensors_available) { - CHECK_EQ(workspace_tensors.size(), 1); + DCHECK_EQ(workspace_tensors.size(), 1); nb.Input(workspace_tensors[0].node, workspace_tensors[0].index); } diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc index 2d30d41c7a6..1ace8cc0113 100644 --- a/tensorflow/core/framework/common_shape_fns.cc +++ b/tensorflow/core/framework/common_shape_fns.cc @@ -1583,6 +1583,10 @@ Status MaxPoolShape(shape_inference::InferenceContext* c) { return MaxPoolShapeImpl(c, /*supports_explicit_padding=*/false); } +Status MaxPoolGradShape(shape_inference::InferenceContext* c) { + return UnchangedShapeWithRank(c, 4); +} + Status MaxPoolShapeWithExplicitPadding(shape_inference::InferenceContext* c) { return MaxPoolShapeImpl(c, /*supports_explicit_padding=*/true); } @@ -1771,6 +1775,10 @@ Status Pool3DShape(shape_inference::InferenceContext* c) { return Status::OK(); } +Status MaxPool3DGradShape(shape_inference::InferenceContext* c) { + return UnchangedShapeWithRank(c, 5); +} + Status UnknownShape(shape_inference::InferenceContext* c) { for (int i = 0; i < c->num_outputs(); ++i) { c->set_output(i, c->UnknownShape()); diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h index 3b14666305e..306802f133b 100644 --- a/tensorflow/core/framework/common_shape_fns.h +++ b/tensorflow/core/framework/common_shape_fns.h @@ -178,9 +178,15 @@ Status MaxPoolShape(shape_inference::InferenceContext* c); // Shape function for MaxPoolV2-like operations. Status MaxPoolV2Shape(shape_inference::InferenceContext* c, int num_inputs); +// Shape function for MaxPoolGrad-like operations. +Status MaxPoolGradShape(shape_inference::InferenceContext* c); + // Shape function for 3D Pooling operations. 
Status Pool3DShape(shape_inference::InferenceContext* c); +// Shape function for MaxPool3DGrad-like operations. +Status MaxPool3DGradShape(shape_inference::InferenceContext* c); + // Shape function for use with ops whose output shapes are unknown. Status UnknownShape(shape_inference::InferenceContext* c); diff --git a/tensorflow/core/kernels/mkl/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl/mkl_maxpooling_op.cc index 276027eb56d..cce9739bc08 100644 --- a/tensorflow/core/kernels/mkl/mkl_maxpooling_op.cc +++ b/tensorflow/core/kernels/mkl/mkl_maxpooling_op.cc @@ -341,9 +341,9 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase { std::shared_ptr pooling_bwd_pd = pooling_bwd->GetPoolingBwdPd(); T* diff_dst_data = nullptr; - if (IS_DIFF_DST_REORDER_NEEDED(diff_dst_md, pooling_bwd_pd, - pooling_bwd) && - !this->native_format_) { + if (!this->native_format_ && + IS_DIFF_DST_REORDER_NEEDED(diff_dst_md, pooling_bwd_pd, + pooling_bwd)) { grad_dnn_data.SetUsrMem(diff_dst_md, &grad_tensor); grad_dnn_data.CheckReorderToOpMem( MEMORY_PD_WITHOUT_DATA(GET_DIFF_DST_DESC_FROM_OP_PD(pooling_bwd_pd), diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc index 99a6e22a5aa..413eed45338 100644 --- a/tensorflow/core/ops/mkl_nn_ops.cc +++ b/tensorflow/core/ops/mkl_nn_ops.cc @@ -428,7 +428,7 @@ REGISTER_OP("_MklNativeMaxPool") .Output("workspace: uint8") .SetShapeFn(shape_inference::MaxPoolShape) .Doc(R"doc( -MKL version of MaxPool operator that does not depend +oneDNN version of MaxPool operator that does not depend on layout propagation. Uses oneDNN APIs to perform max pooling on the input. @@ -449,13 +449,10 @@ REGISTER_OP("_MklNativeMaxPoolGrad") .Input("grad: T") .Input("workspace: uint8") .Output("output: T") - .SetShapeFn([](InferenceContext* c) { - return UnchangedShapeWithRank(c, 4); - }) + .SetShapeFn(shape_inference::MaxPoolGradShape) .Doc(R"doc( -MKL version of MaxPoolGrad that does not depend -on layout propagation. 
Uses oneDNN APIs to compute gradients of -MaxPool operator. +oneDNN version of MaxPoolGrad that does not depend on layout propagation. +Uses oneDNN APIs to compute gradients of MaxPool operator. *NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. @@ -473,9 +470,8 @@ REGISTER_OP("_MklNativeMaxPool3D") .Attr("workspace_enabled: bool = false") .SetShapeFn(shape_inference::Pool3DShape) .Doc(R"doc( -MKL version of MaxPool3D operator that does not depend -on layout propagation. Uses oneDNN APIs to perform 3D max pooling -on the input. +oneDNN version of MaxPoolGrad that does not depend on layout propagation. +Uses oneDNN APIs to compute gradients of MaxPool operator. *NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. @@ -494,13 +490,13 @@ REGISTER_OP("_MklNativeMaxPool3DGrad") .Attr("T: {half, bfloat16, float} = DT_FLOAT") .Attr("TInput: {half, bfloat16, float} = DT_FLOAT") .Attr("workspace_enabled: bool = false") - .SetShapeFn([](InferenceContext* c) { - return UnchangedShapeWithRank(c, 5); - }) + .SetShapeFn(shape_inference::MaxPool3DGradShape) .Doc(R"doc( -MKL version of MaxPool3DGrad operator that does not depend -on layout propagation. Uses oneDNN APIs to compute gradients -of MaxPool3D function. +oneDNN version of MaxPool3DGrad operator that does not depend on layout +propagation. Uses oneDNN APIs to compute gradients of MaxPool3D function. + +*NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is +expected to invoke these operators. 
)doc"); REGISTER_OP("_MklQuantizedMaxPool") diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc index 759bf0f0ddf..01cf75fe0ba 100644 --- a/tensorflow/core/ops/nn_ops.cc +++ b/tensorflow/core/ops/nn_ops.cc @@ -773,9 +773,7 @@ REGISTER_OP("MaxPool3DGrad") .Attr(GetConvnet3dDataFormatAttrString()) .Attr("T: {half, bfloat16, float} = DT_FLOAT") .Attr("TInput: {half, bfloat16, float} = DT_FLOAT") - .SetShapeFn([](InferenceContext* c) { - return UnchangedShapeWithRank(c, 5); - }); + .SetShapeFn(shape_inference::MaxPool3DGradShape); REGISTER_OP("MaxPool3DGradGrad") .Input("orig_input: T") @@ -879,9 +877,7 @@ REGISTER_OP("MaxPoolGrad") .Input("grad: T") .Output("output: T") .Attr("T: realnumbertype = DT_FLOAT") - .SetShapeFn([](InferenceContext* c) { - return UnchangedShapeWithRank(c, 4); - }); + .SetShapeFn(shape_inference::MaxPoolGradShape); REGISTER_OP("MaxPoolGradV2") .Attr(GetPaddingAttrString()) @@ -893,9 +889,7 @@ REGISTER_OP("MaxPoolGradV2") .Input("strides: int32") .Output("output: T") .Attr("T: realnumbertype = DT_FLOAT") - .SetShapeFn([](InferenceContext* c) { - return UnchangedShapeWithRank(c, 4); - }); + .SetShapeFn(shape_inference::MaxPoolGradShape); // TODO(b/150813181): Implement explicit padding. REGISTER_OP("MaxPoolGradGrad") @@ -2343,14 +2337,12 @@ REGISTER_OP("_MklMaxPoolGrad") .Input("mkl_workspace: uint8") .Output("output: T") .Output("mkl_output: uint8") - .SetShapeFn([](InferenceContext* c) { - return UnchangedShapeWithRank(c, 4); - }) + .SetShapeFn(shape_inference::MaxPoolGradShape) .Doc(R"doc( -MKL version of MaxPoolGrad. Uses MKL DNN APIs to compute gradients of +oneDNN version of MaxPoolGrad. Uses oneDNN APIs to compute gradients of MaxPool operator. -NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +*NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. 
)doc"); @@ -2486,14 +2478,12 @@ REGISTER_OP("_MklMaxPool3DGrad") .Attr("T: {half, bfloat16, float} = DT_FLOAT") .Attr("TInput: {half, bfloat16, float} = DT_FLOAT") .Attr("workspace_enabled: bool = false") - .SetShapeFn([](InferenceContext* c) { - return UnchangedShapeWithRank(c, 5); - }) + .SetShapeFn(shape_inference::MaxPool3DGradShape) .Doc(R"doc( -MKL version of MklPool3DGrad operator. Uses MKL DNN APIs to compute gradients -of MklPool function. +oneDNN version of MaxPool3DGrad operator. Uses oneDNN APIs to compute gradients +of MaxPool3D function. -NOTE Do not invoke this operator directly in Python. Graph rewrite pass is +*NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. )doc"); From 13ee2a50166ecc0453f194be26abf055d8d7c43f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Wed, 7 Oct 2020 17:21:12 +0200 Subject: [PATCH 0098/1447] TFLu: Replace target cortex_m_gcc_generic with cortex_m_generic --- .../README.md | 0 .../lite/micro/cortex-m-generic/debug_log.cc | 29 ++++++++++--- .../debug_log_callback.h | 8 ++-- .../micro/cortex_m_gcc_generic/debug_log.cc | 43 ------------------- .../lite/micro/tools/ci_build/test_all.sh | 4 +- ...cc_generic.sh => test_cortex_m_generic.sh} | 14 +++--- tensorflow/lite/micro/tools/make/Makefile | 2 + .../targets/cortex_m_gcc_generic_makefile.inc | 31 ------------- .../targets/cortex_m_generic_makefile.inc | 38 +++++++++++++--- 9 files changed, 69 insertions(+), 100 deletions(-) rename tensorflow/lite/micro/{cortex_m_gcc_generic => cortex-m-generic}/README.md (100%) rename tensorflow/lite/micro/{cortex_m_gcc_generic => cortex-m-generic}/debug_log_callback.h (83%) delete mode 100644 tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc rename tensorflow/lite/micro/tools/ci_build/{test_cortex_m_gcc_generic.sh => test_cortex_m_generic.sh} (70%) delete mode 100644 tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc diff --git 
a/tensorflow/lite/micro/cortex_m_gcc_generic/README.md b/tensorflow/lite/micro/cortex-m-generic/README.md similarity index 100% rename from tensorflow/lite/micro/cortex_m_gcc_generic/README.md rename to tensorflow/lite/micro/cortex-m-generic/README.md diff --git a/tensorflow/lite/micro/cortex-m-generic/debug_log.cc b/tensorflow/lite/micro/cortex-m-generic/debug_log.cc index baebe1f5964..6a3bd4781ca 100644 --- a/tensorflow/lite/micro/cortex-m-generic/debug_log.cc +++ b/tensorflow/lite/micro/cortex-m-generic/debug_log.cc @@ -13,14 +13,31 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +// Implementation for the DebugLog() function that prints to the debug logger on +// an generic Cortex-M device. + +#ifdef __cplusplus +extern "C" { +#endif // __cplusplus + #include "tensorflow/lite/micro/debug_log.h" -#ifdef DEBUG -#include -#endif +#include "tensorflow/lite/micro/cortex_m_gcc_generic/debug_log_callback.h" -extern "C" void DebugLog(const char* s) { -#ifdef DEBUG - fprintf(stderr, "%s", s); +static DebugLogCallback debug_log_callback = nullptr; + +void RegisterDebugLogCallback(void (*cb)(const char* s)) { + debug_log_callback = cb; +} + +void DebugLog(const char* s) { +#ifndef TF_LITE_STRIP_ERROR_STRINGS + if (debug_log_callback != nullptr) { + debug_log_callback(s); + } #endif } + +#ifdef __cplusplus +} // extern "C" +#endif // __cplusplus diff --git a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log_callback.h b/tensorflow/lite/micro/cortex-m-generic/debug_log_callback.h similarity index 83% rename from tensorflow/lite/micro/cortex_m_gcc_generic/debug_log_callback.h rename to tensorflow/lite/micro/cortex-m-generic/debug_log_callback.h index d462c8db368..c1afd19a578 100644 --- a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log_callback.h +++ b/tensorflow/lite/micro/cortex-m-generic/debug_log_callback.h @@ -1,4 +1,4 @@ -/* 
Copyright 2018 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_CALLBACK_H_ -#define TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_CALLBACK_H_ +#ifndef TENSORFLOW_LITE_MICRO_CORTEX_M_GENERIC_DEBUG_LOG_CALLBACK_H_ +#define TENSORFLOW_LITE_MICRO_CORTEX_M_GENERIC_DEBUG_LOG_CALLBACK_H_ // The application layer must implement and register a callback before calling // the network in a way similar to @@ -46,4 +46,4 @@ void RegisterDebugLogCallback(DebugLogCallback callback); } // extern "C" #endif // __cplusplus -#endif // TENSORFLOW_LITE_MICRO_CORTEX_M_GCC_GENERIC_DEBUG_LOG_CALLBACK_H_ +#endif // TENSORFLOW_LITE_MICRO_CORTEX_M_GENERIC_DEBUG_LOG_CALLBACK_H_ diff --git a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc b/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc deleted file mode 100644 index fce512e199b..00000000000 --- a/tensorflow/lite/micro/cortex_m_gcc_generic/debug_log.cc +++ /dev/null @@ -1,43 +0,0 @@ -/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// Implementation for the DebugLog() function that prints to the debug logger on -// an generic cortex-m device. - -#ifdef __cplusplus -extern "C" { -#endif // __cplusplus - -#include "tensorflow/lite/micro/debug_log.h" - -#include "tensorflow/lite/micro/cortex_m_gcc_generic/debug_log_callback.h" - -static DebugLogCallback debug_log_callback = nullptr; - -void RegisterDebugLogCallback(void (*cb)(const char* s)) { - debug_log_callback = cb; -} - -void DebugLog(const char* s) { -#ifndef TF_LITE_STRIP_ERROR_STRINGS - if (debug_log_callback != nullptr) { - debug_log_callback(s); - } -#endif -} - -#ifdef __cplusplus -} // extern "C" -#endif // __cplusplus diff --git a/tensorflow/lite/micro/tools/ci_build/test_all.sh b/tensorflow/lite/micro/tools/ci_build/test_all.sh index 354d26d9102..a31b5d1382f 100755 --- a/tensorflow/lite/micro/tools/ci_build/test_all.sh +++ b/tensorflow/lite/micro/tools/ci_build/test_all.sh @@ -52,7 +52,7 @@ tensorflow/lite/micro/tools/ci_build/test_stm32f4.sh PRESUBMIT echo "Running Arduino tests at `date`" tensorflow/lite/micro/tools/ci_build/test_arduino.sh -echo "Running cortex_m_gcc_generic tests at `date`" -tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh +echo "Running cortex_m_generic tests at `date`" +tensorflow/lite/micro/tools/ci_build/test_cortex_m_generic.sh echo "Finished all micro tests at `date`" diff --git a/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh b/tensorflow/lite/micro/tools/ci_build/test_cortex_m_generic.sh similarity index 70% rename from tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh rename to tensorflow/lite/micro/tools/ci_build/test_cortex_m_generic.sh index 596c88965e7..054fb5b915c 100755 --- a/tensorflow/lite/micro/tools/ci_build/test_cortex_m_gcc_generic.sh +++ 
b/tensorflow/lite/micro/tools/ci_build/test_cortex_m_generic.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Copyright 2019 The TensorFlow Authors. All Rights Reserved. +# Copyright 2020 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -24,23 +24,21 @@ cd "${ROOT_DIR}" source tensorflow/lite/micro/tools/ci_build/helper_functions.sh -TARGET=cortex_m_gcc_generic - # TODO(b/143715361): downloading first to allow for parallel builds. -readable_run make -f tensorflow/lite/micro/tools/make/Makefile TAGS=cmsis-nn TARGET=${TARGET} CORTEX_M_CORE=M4F third_party_downloads +readable_run make -f tensorflow/lite/micro/tools/make/Makefile TAGS="cmsis-nn armgcc" TARGET=cortex-m4-generic microlite # Build for Cortex-M4 (no FPU) without CMSIS readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean -readable_run make -j8 -f tensorflow/lite/micro/tools/make/Makefile TARGET=${TARGET} CORTEX_M_CORE=M4 microlite +readable_run make -j$(nproc) -f tensorflow/lite/micro/tools/make/Makefile TAGS=armgcc TARGET=cortex-m4-generic microlite # Build for Cortex-M4F (FPU present) without CMSIS readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean -readable_run make -j8 -f tensorflow/lite/micro/tools/make/Makefile TARGET=${TARGET} CORTEX_M_CORE=M4F microlite +readable_run make -j$(nproc) -f tensorflow/lite/micro/tools/make/Makefile TAGS=armgcc TARGET=cortex-m4+fp-generic microlite # Build for Cortex-M4 (no FPU) with CMSIS readable_run make -f tensorflow/lite/micro/tools/make/Makefile clean -readable_run make -j8 -f tensorflow/lite/micro/tools/make/Makefile TAGS=cmsis-nn TARGET=${TARGET} CORTEX_M_CORE=M4 microlite +readable_run make -j$(nproc) -f tensorflow/lite/micro/tools/make/Makefile TAGS="cmsis-nn armgcc" TARGET=cortex-m4-generic microlite # Build for Cortex-M4 (FPU present) with CMSIS readable_run make -f 
tensorflow/lite/micro/tools/make/Makefile clean -readable_run make -j8 -f tensorflow/lite/micro/tools/make/Makefile TAGS=cmsis-nn TARGET=${TARGET} CORTEX_M_CORE=M4F microlite +readable_run make -j$(nproc) -f tensorflow/lite/micro/tools/make/Makefile TAGS="cmsis-nn armgcc" TARGET=cortex-m4+fp-generic microlite diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index f95a9e10580..87c5404e55f 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -128,9 +128,11 @@ CCFLAGS := \ ARFLAGS := -r +ifeq ($(filter armclang,$(ALL_TAGS)),) LDFLAGS += \ -Wl,--fatal-warnings \ -Wl,--gc-sections +endif # override these in the makefile.inc for specific compiler targets TARGET_TOOLCHAIN_PREFIX := diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc deleted file mode 100644 index dd7ccca7ba5..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/cortex_m_gcc_generic_makefile.inc +++ /dev/null @@ -1,31 +0,0 @@ -# Generic Makefile target for ARM Cortex Mx gcc builds. 
-ifeq ($(TARGET), cortex_m_gcc_generic) - TARGET_ARCH := arm - TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- - export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) - - $(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) - - PLATFORM_FLAGS = \ - -DTF_LITE_MCU_DEBUG_LOG \ - -Wno-type-limits \ - -funsigned-char \ - -mcpu=cortex-m4 \ - -mfpu=fpv4-sp-d16 \ - -mthumb \ - -fomit-frame-pointer - -ifeq ($(CORTEX_M_CORE), M4F) - PLATFORM_FLAGS += -mfloat-abi=hard -else ifeq ($(CORTEX_M_CORE), M4) - PLATFORM_FLAGS += -mfloat-abi=softfp -else ifeq ($(CORTEX_M_CORE), ) - $(error CORTEX_M_CORE=[M4|M4F] not defined on the command line) -else - $(error invalid target defined in command line option CORTEX_M_CORE=[M4|M4F]) -endif - - CXXFLAGS += $(PLATFORM_FLAGS) - CCFLAGS += $(PLATFORM_FLAGS) - -endif diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc index 6b19f50eea2..d56ae722b9f 100644 --- a/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc @@ -19,53 +19,70 @@ ifneq ($(filter cortex-%-generic,$(TARGET)),) TARGET_SPECIFIC_FLAGS += -D__DSP_PRESENT=1 -D__FPU_PRESENT=1 FLOAT=hard TARGET_ARCH := cortex-m55 + GCC_TARGET_ARCH := $(TARGET_ARCH) else ifeq ($(TARGET),$(filter $(TARGET),cortex-m55+nodsp+nofp-generic)) CORE=M55 ARM_LDFLAGS := -Wl,--cpu=8.1-M.Main.mve.no_dsp.no_fp TARGET_SPECIFIC_FLAGS += TARGET_ARCH := cortex-m55+nodsp+nofp + GCC_TARGET_ARCH := $(TARGET_ARCH) else ifeq ($(TARGET),$(filter $(TARGET),cortex-m55+nofp-generic)) CORE=M55 ARM_LDFLAGS := -Wl,--cpu=8.1-M.Main.mve.no_fp TARGET_SPECIFIC_FLAGS += -D__DSP_PRESENT=1 TARGET_ARCH := cortex-m55+nofp + GCC_TARGET_ARCH := $(TARGET_ARCH) else ifeq ($(TARGET),$(filter $(TARGET),cortex-m33+nodsp-generic)) CORE=M33 ARM_LDFLAGS := -Wl,--cpu=Cortex-M33.no_dsp.no_fp 
TARGET_SPECIFIC_FLAGS += TARGET_ARCH := cortex-m33+nodsp + GCC_TARGET_ARCH := $(TARGET_ARCH) else ifeq ($(TARGET),$(filter $(TARGET),cortex-m33-generic)) CORE=M33 ARM_LDFLAGS := -Wl,--cpu=Cortex-M33 TARGET_SPECIFIC_FLAGS += -D__DSP_PRESENT=1 -D__FPU_PRESENT=1 -D__VTOR_PRESENT=1 -D__FPU_USED=1 FLOAT=hard TARGET_ARCH := cortex-m33 + GCC_TARGET_ARCH := $(TARGET_ARCH) else ifeq ($(TARGET),$(filter $(TARGET),cortex-m0-generic)) CORE=M0 ARM_LDFLAGS := -Wl,--cpu=Cortex-M0 TARGET_SPECIFIC_FLAGS += TARGET_ARCH := cortex-m0 + GCC_TARGET_ARCH := $(TARGET_ARCH) else ifeq ($(TARGET),$(filter $(TARGET),cortex-m3-generic)) CORE=M3 ARM_LDFLAGS := -Wl,--cpu=Cortex-M3 TARGET_SPECIFIC_FLAGS += TARGET_ARCH := cortex-m3 + GCC_TARGET_ARCH := $(TARGET_ARCH) else ifeq ($(TARGET),$(filter $(TARGET),cortex-m4-generic)) CORE=M4 ARM_LDFLAGS := -Wl,--cpu=Cortex-M4.no_fp TARGET_SPECIFIC_FLAGS+= TARGET_ARCH := cortex-m4 + GCC_TARGET_ARCH := $(TARGET_ARCH) + else ifeq ($(TARGET),$(filter $(TARGET),cortex-m4+fp-generic)) + CORE=M4 + ARM_LDFLAGS := -Wl,--cpu=Cortex-M4 + TARGET_SPECIFIC_FLAGS+= + FLOAT=hard + TARGET_ARCH := cortex-m4+fp + GCC_TARGET_ARCH := cortex-m4 else ifeq ($(TARGET),$(filter $(TARGET),cortex-m7+fp-generic)) CORE=M7 - ARM_LDFLAGS := -Wl,--cpu=Cortex-M7.fp + ARM_LDFLAGS := -Wl,--cpu=Cortex-M7 TARGET_SPECIFIC_FLAGS += FLOAT=hard TARGET_ARCH := cortex-m7+fp + GCC_TARGET_ARCH := cortex-m7 else ifeq ($(TARGET),$(filter $(TARGET),cortex-m7-generic)) CORE=M7 ARM_LDFLAGS := -Wl,--cpu=Cortex-M7.no_fp TARGET_SPECIFIC_FLAGS += TARGET_ARCH := cortex-m7 + GCC_TARGET_ARCH := $(TARGET_ARCH) else $(error "$(TARGET) not supported") endif @@ -79,6 +96,10 @@ ifneq ($(filter cortex-%-generic,$(TARGET)),) FLAGS_ARMC = \ --target=arm-arm-none-eabi \ -mcpu=$(TARGET_ARCH) + ifeq ($(BUILD_TYPE), release) + FLAGS_ARMC += -Wno-unused-private-field + endif + CXXFLAGS += $(FLAGS_ARMC) CCFLAGS += $(FLAGS_ARMC) LDFLAGS += $(ARM_LDFLAGS) @@ -95,14 +116,14 @@ ifneq ($(filter cortex-%-generic,$(TARGET)),) 
CC_TOOL := arm-none-eabi-gcc AR_TOOL := arm-none-eabi-gcc-ar LD := arm-none-eabi-ld - ifneq ($(filter cortex-m55%,$(TARGET_ARCH)),) - $(error Micro architecure support is not available yet for $(TARGET_ARCH)) + ifneq ($(filter cortex-m55%,$(GCC_TARGET_ARCH)),) + $(error Micro architecure support is not available yet for $(GCC_TARGET_ARCH)) else - FLAGS_GCC = -mcpu=$(TARGET_ARCH) + FLAGS_GCC = -mcpu=$(GCC_TARGET_ARCH) endif CXXFLAGS += $(FLAGS_GCC) CCFLAGS += $(FLAGS_GCC) - LDFLAGS += -Wl,--gc-sections + LDFLAGS += endif PLATFORM_FLAGS = \ @@ -111,13 +132,18 @@ ifneq ($(filter cortex-%-generic,$(TARGET)),) -mfloat-abi=$(FLOAT) \ -funsigned-char \ -mlittle-endian \ + -Wno-type-limits \ -fno-function-sections \ -MD \ -DCPU_$(CORE)=1 \ $(TARGET_SPECIFIC_FLAGS) + ifneq ($(BUILD_TYPE), release) + PLATFORM_FLAGS += -DTF_LITE_MCU_DEBUG_LOG + endif + # Common + C/C++ flags - CXXFLAGS += $(PLATFORM_FLAGS) -fno-rtti + CXXFLAGS += $(PLATFORM_FLAGS) CCFLAGS += $(PLATFORM_FLAGS) TEST_SCRIPT := From 06799686074a1bbf0d0bbf7c36a7fe53a8cd5b7d Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 7 Oct 2020 18:54:44 +0000 Subject: [PATCH 0099/1447] [ROCm] Fix for ROCM CSB breakage - 201007 The following commit introduces a new subtest that is failing on the ROCm platform https://github.com/tensorflow/tensorflow/commit/f2ebefba658fb4f424842f03d15f694cb917110f The new subtest ( `NcclManagerTest.Abort` ) requires simulating a multi-node environment on a single-node with multiple GPUs. This functionality is currently not available on the ROCm platform, and other subtests that require this functionality have already been disabled on the ROCm platform. Doing the same for the newly added subtests as well.
--- tensorflow/core/nccl/nccl_manager_test.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc index d16eefa6f72..ff967175091 100644 --- a/tensorflow/core/nccl/nccl_manager_test.cc +++ b/tensorflow/core/nccl/nccl_manager_test.cc @@ -640,6 +640,10 @@ TEST(NcclManagerTest, CommunicatorKey) { } #if !TENSORFLOW_USE_ROCM +// ROCm platform currently does not support simulating a mutli-node +// environment, on a single node with multiple GPUS. So tests that rely +// upon such simulation need to be skipped on the ROCm platform + // This test creates `num_nodes` NcclManagers to simulate a multi-node // environment. It works on a single node with multiple GPUs. It enqueues NCCL // kernels on separate stream per rank. @@ -661,6 +665,10 @@ TYPED_TEST(NcclManagerTest, MultiNodeSingle) { } #if !TENSORFLOW_USE_ROCM +// ROCm platform currently does not support simulating a mutli-node +// environment, on a single node with multiple GPUS. So tests that rely +// upon such simulation need to be skipped on the ROCm platform + // Multi-node broadcast. TYPED_TEST(NcclManagerTest, MultiNodeBroadcast) { int num_nodes; @@ -850,6 +858,11 @@ TYPED_TEST(NcclManagerTest, BroadcastInconsistentSource) { this->VerifyError(test_case.get()); } +#if !TENSORFLOW_USE_ROCM +// ROCm platform currently does not support simulating a mutli-node +// environment, on a single node with multiple GPUS. So tests that rely +// upon such simulation need to be skipped on the ROCm platform + TYPED_TEST(NcclManagerTest, Abort) { using NodeState = typename TestFixture::NodeState; using TestCase = typename TestFixture::TestCase; @@ -911,6 +924,7 @@ TYPED_TEST(NcclManagerTest, Abort) { } } } +#endif } // namespace tensorflow From f6ed328c634f7900d839d730e088d2b8901c9cd9 Mon Sep 17 00:00:00 2001 From: "Brent M. 
Spell" Date: Wed, 7 Oct 2020 16:41:49 -0400 Subject: [PATCH 0100/1447] use sysctl for cpu frequency on macos This change uses the sysctl system call instead of popen to retrieve the maximum CPU frequency on MacOS. On some platforms (Erlang in our case) using popen to query a value from the shell can deadlock the VM. Using the sysctl system call to retrieve this value avoids the deadlock in Erlang on MacOS. --- .../core/platform/profile_utils/cpu_utils.cc | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/platform/profile_utils/cpu_utils.cc b/tensorflow/core/platform/profile_utils/cpu_utils.cc index 7cd1c4de88f..b76b3377397 100644 --- a/tensorflow/core/platform/profile_utils/cpu_utils.cc +++ b/tensorflow/core/platform/profile_utils/cpu_utils.cc @@ -23,6 +23,10 @@ limitations under the License. #include #endif +#if defined(__APPLE__) +#include <sys/sysctl.h> +#endif + #include "absl/base/call_once.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/profile_utils/android_armv7a_cpu_utils_helper.h" @@ -114,17 +118,11 @@ static ICpuUtilsHelper* cpu_utils_helper_instance_ = nullptr; "CPU frequency"; return INVALID_FREQUENCY; #elif defined(__APPLE__) - int64 freq_hz; - FILE* fp = - popen("sysctl hw | grep hw.cpufrequency_max: | cut -d' ' -f 2", "r"); - if (fp == nullptr) { - return INVALID_FREQUENCY; - } - if (fscanf(fp, "%lld", &freq_hz) != 1) { - return INVALID_FREQUENCY; - } - pclose(fp); - if (freq_hz < 1e6) { + int64 freq_hz = 0; + size_t freq_size = sizeof(freq_hz); + int retval = + sysctlbyname("hw.cpufrequency_max", &freq_hz, &freq_size, NULL, 0); + if (retval != 0 || freq_hz < 1e6) { LOG(WARNING) << "Failed to get CPU frequency: " << freq_hz << " Hz"; return INVALID_FREQUENCY; } From 34dcd396d9b26a01fb16837c7f7c2dc7149bf2bf Mon Sep 17 00:00:00 2001 From: Trent Lo Date: Tue, 6 Oct 2020 15:00:23 -0700 Subject: [PATCH 0101/1447] [XLA/GPU] Size-constrained buffer allocation.
This change provides XLA with the capability to generate multiple heaps (i.e., temp buffers) with a size constraint on each heap to avoid Out-of-Memory due to memory fragmentation. Note that larger allocations are more subject to the effect of fragmentation. --- .../compiler/xla/debug_options_flags.cc | 11 ++ .../compiler/xla/service/buffer_assignment.cc | 62 ++++--- .../compiler/xla/service/buffer_assignment.h | 12 +- .../xla/service/gpu/ir_emitter_unnested.cc | 13 +- .../xla/service/gpu/ir_emitter_unnested.h | 3 +- .../compiler/xla/service/heap_simulator.cc | 65 ++++++- .../compiler/xla/service/heap_simulator.h | 37 +++- .../xla/service/heap_simulator_test.cc | 159 ++++++++++++++++-- .../xla/service/memory_space_assignment.cc | 5 +- tensorflow/compiler/xla/xla.proto | 6 +- 10 files changed, 330 insertions(+), 43 deletions(-) diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index 2dd7acb2f67..201ac346bad 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -73,6 +73,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_deterministic_reductions(false); opts.set_xla_cpu_enable_xprof_traceme(false); opts.set_xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found(false); + opts.set_xla_multiheap_size_constraint_per_heap(-1); return opts; } @@ -571,6 +572,16 @@ static void AllocateFlags() { "that falling back to the driver can have drawbacks like using more " "memory and/or other bugs during compilation, so we recommend setting " "this flag to false.")); + flag_objects->push_back(tensorflow::Flag( + "xla_multiheap_size_constraint_per_heap", + int32_setter_for( + &DebugOptions::set_xla_multiheap_size_constraint_per_heap), + flag_values->xla_multiheap_size_constraint_per_heap(), + "Generates multiple heaps (i.e., temp buffers) with a size " + "constraint on each heap to avoid Out-of-Memory due to memory " + "fragmentation.
The constraint is soft, so it works with tensors " + "larger than the given constraint size.")); + ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects); } diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index db34f054f35..1744bf57748 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -1330,12 +1330,23 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( auto get_heap_algorithm = [&](int64 alignment) { auto algorithms = absl::make_unique< std::vector>>>(); - algorithms->push_back( - absl::make_unique>( - alignment, GlobalDecreasingSizeBestFitHeap::kSpatial)); - algorithms->push_back( - absl::make_unique>( - alignment, GlobalDecreasingSizeBestFitHeap::kTemporal)); + if (assignment->multiheap_size_constraint_per_heap() == -1) { + algorithms->push_back( + absl::make_unique>( + alignment, GlobalDecreasingSizeBestFitHeap::kSpatial)); + algorithms->push_back( + absl::make_unique>( + alignment, GlobalDecreasingSizeBestFitHeap::kTemporal)); + } else { + algorithms->push_back( + absl::make_unique( + assignment->multiheap_size_constraint_per_heap(), alignment, + GlobalDecreasingSizeBestFitHeap::kSpatial)); + algorithms->push_back( + absl::make_unique( + assignment->multiheap_size_constraint_per_heap(), alignment, + GlobalDecreasingSizeBestFitHeap::kTemporal)); + } return absl::make_unique>( std::move(algorithms)); }; @@ -1500,20 +1511,25 @@ void BufferAssigner::AssignBuffersFromHeapSimulator( } VLOG(1) << "Result size from heap simulator: " << result.heap_size; - BufferAllocation* allocation = - assignment->NewEmptyAllocation(result.heap_size, color); - for (const auto& buffer_chunk : result.chunk_map) { - const HloValue& value = *buffer_chunk.first; - const HeapSimulator::Chunk& chunk = buffer_chunk.second; - assignment->AddAssignment(allocation, value, chunk.offset, chunk.size); + for (auto& heap_result : 
result.heap_results) { + BufferAllocation* allocation = + assignment->NewEmptyAllocation(heap_result.heap_size, color); + for (const auto& buffer_chunk : heap_result.chunk_map) { + const HloValue& value = *buffer_chunk.first; + const HeapSimulator::Chunk& chunk = buffer_chunk.second; + assignment->AddAssignment(allocation, value, chunk.offset, chunk.size); + } + // Compute peak_buffers only when the multiheap mode is off. Simply return + // an empty vector in the multiheap mode. + if (assignment->multiheap_size_constraint_per_heap() == -1) { + allocation->peak_buffers_ = + ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace); + } + + XLA_VLOG_LINES(2, allocation->ToString()); + + allocation->AddHeapTrace(result.debug_trace); } - allocation->peak_buffers_ = - ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace); - - VLOG(1) << "Ran heap simulation for allocation: "; - XLA_VLOG_LINES(2, allocation->ToString()); - - allocation->AddHeapTrace(result.debug_trace); } StatusOr> BufferAssigner::CreateAssignment( @@ -1580,6 +1596,10 @@ StatusOr> BufferAssigner::CreateAssignment( buffers_to_assign_sequentially.size() == global_computations.size(); VLOG(2) << "Running whole module heap simulation: " << run_whole_module_heap_simulation; + const int32 multiheap_size_constraint_per_heap = + module->config().debug_options().xla_multiheap_size_constraint_per_heap(); + VLOG(2) << "Multiheap per heap size limit: " + << multiheap_size_constraint_per_heap; TF_RETURN_IF_ERROR(AssignBuffersWithSequentialOrdering( buffers_to_assign_sequentially, run_whole_module_heap_simulation, assignment.get())); @@ -1618,7 +1638,9 @@ StatusOr> BufferAssigner::CreateAssignment( // This can only be performed after all buffers have been assigned, and // after maybe_live_out is marked, since it is used to determine whether an // allocation contains temporary buffers or not. 
- assignment->CombineTempAllocations(); + if (multiheap_size_constraint_per_heap == -1) { + assignment->CombineTempAllocations(); + } XLA_VLOG_LINES(2, assignment->ToString()); TF_RETURN_IF_ERROR(assignment->ComputeSummaryStats()); diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index dfde46ca4b1..c8c430988f5 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -363,6 +363,10 @@ class BufferAssignment { return temp_allocation_total_size_; } + int32 multiheap_size_constraint_per_heap() const { + return multiheap_size_constraint_per_heap_; + } + // Returns whether the given buffer has been assigned an allocation. bool HasAllocation(const HloValue& value) const; @@ -491,7 +495,11 @@ class BufferAssignment { buffer_size_(std::move(buffer_size)), color_alignment_(std::move(color_alignment)), alias_analysis_(std::move(alias_analysis)), - hlo_live_range_(std::move(hlo_live_range)) {} + hlo_live_range_(std::move(hlo_live_range)), + multiheap_size_constraint_per_heap_( + module->config() + .debug_options() + .xla_multiheap_size_constraint_per_heap()) {} // Creates and returns a new BufferAllocation, with no assigned // LogicalBuffers. Ownership is maintained internally. @@ -535,6 +543,8 @@ class BufferAssignment { // The total size of all temporary buffers. int64 temp_allocation_total_size_ = 0; + int32 multiheap_size_constraint_per_heap_; + // Maps Buffers to the index of the BufferAllocation which holds the buffer. 
absl::flat_hash_map allocation_index_for_value_; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index 8f01d7e3c41..b382ff8b708 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -1857,7 +1857,8 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices( absl::string_view name, Thunk::ThunkInfo thunk_info, absl::Span slices, std::function - bind_slice_to_ir_value) { + bind_slice_to_ir_value, + bool insist_single_temp_buffer) { const auto& buffer_assn = ir_emitter_context_->buffer_assignment(); // Figure out which buffer allocations need to be passed as arguments to our @@ -1874,7 +1875,7 @@ IrEmitterUnnested::BuildKernelThunkFromBufferSlices( if (alloc.IsPreallocatedTempBuffer()) { if (!temp_buffer.has_value()) { temp_buffer = &alloc; - } else { + } else if (insist_single_temp_buffer) { LOG(FATAL) << "Multiple temp buffers found, but only one is allowed!"; } } @@ -1995,7 +1996,13 @@ std::unique_ptr IrEmitterUnnested::BuildKernelThunk( << hlo_buffer_slice->gte_index.ToString(); bindings_.BindHloToIrValue(*instr, value, index); - }); + }, + // Check temp buffer numbers only when the multiheap mode is off. 
+ /*insist_single_temp_buffer=*/inst->parent() + ->parent() + ->config() + .debug_options() + .xla_multiheap_size_constraint_per_heap() == -1); } std::unique_ptr IrEmitterUnnested::BuildKernelThunkForMlir( diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index b83af8799d3..1ad12840c9e 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -547,7 +547,8 @@ class IrEmitterUnnested : public IrEmitter, absl::string_view name, Thunk::ThunkInfo thunk_info, absl::Span slices, std::function - bind_slice_to_ir_value); + bind_slice_to_ir_value, + bool insist_single_temp_buffer = true); // Returns a KernelThunk that invokes the kernel emitted for `inst`. The // caller needs to make sure `inst` outlives the lifetime of the returned diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 2e2b668eba7..7b0220753fa 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -409,11 +409,15 @@ HeapSimulator::Result HeapSimulator::Finish() { // Post-process the result to add chunks for shared buffers. An empty chunk // map means that either no buffers were allocated, or the heap was only // collecting statistics, e.g. NoFragmentationStatsHeap. - if (!result.chunk_map.empty()) { + size_t total_chunk_count = 0; + absl::c_for_each(result.heap_results, [&](const HeapResult& hr) { + total_chunk_count += hr.chunk_map.size(); + }); + if (total_chunk_count != 0) { // If we were told to assign specific buffers, make sure we've assigned // exactly that many buffers. 
if (options_.buffers_to_assign != nullptr) { - CHECK_EQ(options_.buffers_to_assign->size(), result.chunk_map.size()); + CHECK_EQ(options_.buffers_to_assign->size(), total_chunk_count); } } @@ -825,7 +829,10 @@ GlobalDecreasingSizeBestFitHeap::Finish() { CommitChunk(buffer_interval, chunk_candidate); } VLOG(1) << "result heap_size: " << result_.heap_size; - return result_; + Result result; + result.heap_size = result_.heap_size; + result.heap_results.emplace_back(result_); + return result; } template @@ -968,6 +975,58 @@ void GlobalDecreasingSizeBestFitHeap::AddToChunkMap( DCHECK(emplace_result.second); } +HeapSimulator::Result +ConstrainedGlobalDecreasingSizeBestFitHeap::Finish() { + std::vector sorted_buffer_vec = GetSortedBufferIntervals(); + // Convert into std::list so that erase() is O(1). + std::list sorted_buffer_intervals(sorted_buffer_vec.begin(), + sorted_buffer_vec.end()); + + // Use do-while here, because we need to create 1 heap in `multi_heap_result` + // even if `sorted_buffer_intervals` is empty. + Result multi_heap_result; + do { + // Place buffers into the currently processed heap as many as possible. + for (auto it = sorted_buffer_intervals.begin(); + it != sorted_buffer_intervals.end();) { + BufferInterval buffer_interval = *it; + if (!buffer_interval.need_allocation) { + it = sorted_buffer_intervals.erase(it); + continue; + } + if (buffer_interval.size > size_limit_per_heap_) { + LOG(WARNING) << "Alloc buffer size " << buffer_interval.size + << " larger than the per-heap size limit " + << size_limit_per_heap_; + } + + ChunkCandidate chunk_candidate = FindChunkCandidate(buffer_interval); + if (chunk_candidate.heap_size <= size_limit_per_heap_ || + // Commit the chunk as long as the heap is empty. We do this because + // we want the size constraint to be soft, meaning that results are + // successfully generated even if there are some buffer sizes larger + // than the given constraint size. 
+ result_.heap_size == 0) { + CommitChunk(buffer_interval, chunk_candidate); + it = sorted_buffer_intervals.erase(it); + continue; + } + + ++it; + } + // Collect the result from the currently processed heap and reset the heap + // states. + multi_heap_result.heap_size += result_.heap_size; + multi_heap_result.heap_results.push_back(std::move(result_)); + result_ = {}; + interval_tree_ = {}; + } while (!sorted_buffer_intervals.empty()); + + VLOG(1) << "Number of heaps produced = " + << multi_heap_result.heap_results.size(); + return multi_heap_result; +} + template HeapSimulator::Result ChooseBestHeapAlgorithm::Finish() { diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index b47ff685139..0c7425d8edf 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -67,14 +67,23 @@ class HeapSimulator { } }; - // Result represents the result of the heap simulation. template - struct Result { + struct HeapResult { // The assignment of buffers to chunks. absl::flat_hash_map chunk_map; // The total size in bytes of the heap, containing all assigned chunks. int64 heap_size = 0; + }; + // Result represents the result of the heap simulation. + template + struct Result { + // Heap results. + std::vector> heap_results; + + // The total size in bytes of the heaps. + // heap_size == sum([hr.heap_size for hr in heap_results]). + int64 heap_size = 0; // The total size in bytes of heap fragmentation. 
int64 fragmentation_size = 0; @@ -229,6 +238,7 @@ class HeapAlgorithm { public: using Chunk = HeapSimulator::Chunk; using Result = HeapSimulator::Result; + using HeapResult = HeapSimulator::HeapResult; virtual ~HeapAlgorithm() = default; @@ -347,6 +357,7 @@ class BufferIntervalTree { template class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { public: + using HeapResult = HeapSimulator::HeapResult; using Result = HeapSimulator::Result; using Chunk = HeapSimulator::Chunk; @@ -415,6 +426,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { int64 preferred_offset = -1) const; void CommitChunk(const BufferInterval& buffer_interval, ChunkCandidate chunk_candidate); + // Adds the buffer and the chunk to the result chunk map. virtual void AddToChunkMap(const BufferType* buffer, Chunk chunk); @@ -426,7 +438,7 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { BufferIntervalCompare GetTemporalBufferIntervalCompare() const; absl::flat_hash_map buffer_intervals_; - Result result_; + HeapResult result_; BufferIntervalCompare buffer_interval_compare_; BufferIntervalTree interval_tree_; @@ -444,6 +456,25 @@ class GlobalDecreasingSizeBestFitHeap : public HeapAlgorithm { const BufferInterval& interval) const; }; +// This class implements an algorithm that will output multiple heaps. Each heap +// size is constrained by a given limit. Note that the constraint is soft, +// meaning that valid heap results are generated even if there are some buffer +// sizes larger than the given constraint size. 
+class ConstrainedGlobalDecreasingSizeBestFitHeap + : public GlobalDecreasingSizeBestFitHeap { + public: + explicit ConstrainedGlobalDecreasingSizeBestFitHeap( + size_t size_limit_per_heap, int64 alignment, Type type = kSpatial) + : size_limit_per_heap_(size_limit_per_heap), + GlobalDecreasingSizeBestFitHeap(alignment, type) {} + ~ConstrainedGlobalDecreasingSizeBestFitHeap() override {} + + Result Finish() override; + + private: + size_t size_limit_per_heap_; +}; + // A heap algorithm that chooses the best results from other algorithms added to // it. template diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index 8f7668b4965..26305eebb0d 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -256,12 +256,15 @@ class HeapCallRecorder : public HeapAlgorithm { } Result Finish() override { calls_->emplace_back(kFinish, nullptr); - return result_; + HeapSimulator::Result result; + result.heap_size = result_.heap_size; + result.heap_results.emplace_back(std::move(result_)); + return result; } private: CallSequence* calls_; - Result result_; + HeapSimulator::HeapResult result_; }; // HeapSimulatorTracker runs the heap simulator, recording the sequence of calls @@ -335,7 +338,8 @@ class HeapSimulatorTracker { int64 OffsetAt(const HloInstruction* instruction, const ShapeIndex& index) { const HloValue* buffer = BufferAt(instruction, index); - return result_.chunk_map.at(buffer).offset; + CHECK_EQ(1, result_.heap_results.size()); + return result_.heap_results.at(0).chunk_map.at(buffer).offset; } // Ensures the expected sequence of Alloc/Free/Finish calls was performed. 
@@ -1051,7 +1055,8 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, Empty) { GlobalDecreasingSizeBestFitHeap heap(/*alignment=*/1); const HeapSimulator::Result result = heap.Finish(); EXPECT_EQ(0, result.heap_size); - EXPECT_EQ(0, result.chunk_map.size()); + EXPECT_EQ(1, result.heap_results.size()); + EXPECT_EQ(0, result.heap_results.at(0).chunk_map.size()); } TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) { @@ -1078,7 +1083,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSize) { heap.Free(buffer_c_, 20); heap.Free(buffer_d_, 40); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result results = heap.Finish(); + EXPECT_EQ(1, results.heap_results.size()); + const HeapSimulator::HeapResult& result = + results.heap_results.at(0); EXPECT_EQ(100, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size); @@ -1117,7 +1125,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, DecreasingSizeWithAlignment) { heap.Free(buffer_c_, 50); heap.Free(buffer_d_, 40); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result results = heap.Finish(); + EXPECT_EQ(1, results.heap_results.size()); + const HeapSimulator::HeapResult& result = + results.heap_results.at(0); EXPECT_EQ(120, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1160,7 +1171,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, BestFit) { heap.Free(buffer_d_, 30); heap.Free(buffer_e_, 50); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result results = heap.Finish(); + EXPECT_EQ(1, results.heap_results.size()); + const HeapSimulator::HeapResult& result = + results.heap_results.at(0); EXPECT_EQ(140, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1192,7 +1206,10 @@ 
TEST_F(GlobalDecreasingSizeBestFitHeapTest, Colocated) { heap.ShareWith(buffer_c_, buffer_a_, 40); heap.Free(buffer_c_, 40); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result results = heap.Finish(); + EXPECT_EQ(1, results.heap_results.size()); + const HeapSimulator::HeapResult& result = + results.heap_results.at(0); EXPECT_EQ(40, result.heap_size); EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1221,7 +1238,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedII) { heap.Free(buffer_c_, 40); heap.Free(buffer_b_, 20); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result results = heap.Finish(); + EXPECT_EQ(1, results.heap_results.size()); + const HeapSimulator::HeapResult& result = + results.heap_results.at(0); EXPECT_EQ(60, result.heap_size); EXPECT_EQ(40, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(20, result.chunk_map.at(buffer_b_).size); @@ -1251,7 +1271,10 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ColocatedIII) { heap.Free(buffer_c_, 10); heap.Free(buffer_b_, 30); - const HeapSimulator::Result result = heap.Finish(); + const HeapSimulator::Result results = heap.Finish(); + EXPECT_EQ(1, results.heap_results.size()); + const HeapSimulator::HeapResult& result = + results.heap_results.at(0); EXPECT_EQ(40, result.heap_size); EXPECT_EQ(10, result.chunk_map.at(buffer_a_).size); EXPECT_EQ(30, result.chunk_map.at(buffer_b_).size); @@ -1311,6 +1334,122 @@ TEST_F(GlobalDecreasingSizeBestFitHeapTest, ChunkCandidate) { // Preferred offset 15 could not be given because it is occupied. } +class ConstrainedGlobalDecreasingSizeBestFitHeapTest + : public HeapAlgorithmTestBase {}; + +TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, DecreasingSize) { + // space + // ^ + // | +-------+ + // | +---c---+ + // | +-------+ + // | | b | + // | +-------+ + // | ................ // split into two allocations. 
+ // | +---a---+ + // | +-------+ + // | | | + // | | d | + // | +-------+ + // -----------------> time + ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50, + /*alignment=*/1); + heap.Alloc(buffer_a_, 10); + heap.Alloc(buffer_b_, 30); + heap.Alloc(buffer_c_, 20); + heap.Alloc(buffer_d_, 40); + heap.Free(buffer_a_, 10); + heap.Free(buffer_b_, 30); + heap.Free(buffer_c_, 20); + heap.Free(buffer_d_, 40); + + const HeapSimulator::Result result = heap.Finish(); + EXPECT_EQ(100, result.heap_size); + EXPECT_EQ(2, result.heap_results.size()); + + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_a_)); + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_d_)); + EXPECT_EQ(10, result.heap_results[0].chunk_map.at(buffer_a_).size); + EXPECT_EQ(40, result.heap_results[0].chunk_map.at(buffer_d_).size); + EXPECT_EQ(40, result.heap_results[0].chunk_map.at(buffer_a_).offset); + EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_d_).offset); +} + +TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, + DecreasingSizeWithAlignment) { + // space + // ^ + // | +-------+ + // | +---b---+ + // | +-------+ + // | | | + // | | d | + // | +-------+ + // | ................... 
+ // | +---a---+ + // | + // | +-------+ + // | | | + // | | c | + // | | | + // | +-------+ + // ---------------------> time + ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/70, + /*alignment=*/20); + heap.Alloc(buffer_a_, 10); + heap.Alloc(buffer_b_, 20); + heap.Alloc(buffer_c_, 50); + heap.Free(buffer_a_, 10); + heap.Alloc(buffer_d_, 40); + heap.Free(buffer_b_, 20); + heap.Free(buffer_c_, 50); + heap.Free(buffer_d_, 40); + + const HeapSimulator::Result result = heap.Finish(); + EXPECT_EQ(130, result.heap_size); // 70 + 60 + EXPECT_EQ(2, result.heap_results.size()); + + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_a_)); + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_c_)); + EXPECT_EQ(10, result.heap_results[0].chunk_map.at(buffer_a_).size); + EXPECT_EQ(50, result.heap_results[0].chunk_map.at(buffer_c_).size); + EXPECT_EQ(60, result.heap_results[0].chunk_map.at(buffer_a_).offset); + EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_c_).offset); +} + +TEST_F(ConstrainedGlobalDecreasingSizeBestFitHeapTest, ColocatedII) { + // space + // ^ + // | +---------------+ + // | +-------b-------+ + // | .................... 
+ // |+------+ +-------+ + // || | | | + // || | | | <--- colocate with a + // |+--a---+ +---c---+ + // ---------------------> time + ConstrainedGlobalDecreasingSizeBestFitHeap heap(/*size_limit_per_heap=*/50, + /*alignment=*/20); + heap.Alloc(buffer_a_, 30); + heap.Free(buffer_a_, 30); + heap.Alloc(buffer_b_, 20); + + heap.ShareWith(buffer_c_, buffer_a_, 40); + heap.Free(buffer_c_, 40); + heap.Free(buffer_b_, 20); + + const HeapSimulator::Result result = heap.Finish(); + EXPECT_EQ(50, result.heap_size); + EXPECT_EQ(2, result.heap_results.size()); + + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_a_)); + EXPECT_TRUE(result.heap_results[0].chunk_map.contains(buffer_c_)); + EXPECT_EQ(30, result.heap_results[0].chunk_map.at(buffer_a_).size); + EXPECT_EQ(30, result.heap_results[0].chunk_map.at(buffer_c_).size); + EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_a_).offset); + EXPECT_EQ(0, result.heap_results[0].chunk_map.at(buffer_c_).offset); +} + class IntervalTreeTest : public ::testing::Test {}; TEST_F(IntervalTreeTest, InsertAndRemove) { diff --git a/tensorflow/compiler/xla/service/memory_space_assignment.cc b/tensorflow/compiler/xla/service/memory_space_assignment.cc index 6d4b0e65010..604146e3981 100644 --- a/tensorflow/compiler/xla/service/memory_space_assignment.cc +++ b/tensorflow/compiler/xla/service/memory_space_assignment.cc @@ -1127,7 +1127,10 @@ HeapSimulator::Result AlternateMemoryBestFitHeap::Finish() { VLOG(3) << allocation_info_str_; DumpDebugStringsIfEnabled(); - return result_; + HeapSimulator::Result result; + result.heap_size = result_.heap_size; + result.heap_results.emplace_back(std::move(result_)); + return std::move(result); } void AlternateMemoryBestFitHeap::CreateAllocationValuesFromColocatedIntervals( diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index 1cf30b10373..f2488c39504 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -290,7 +290,11 
@@ message DebugOptions { // Extra parameters to pass the GPU assembler. string xla_gpu_asm_extra_flags = 141; - // Next id: 142 + // Per-heap size constraint. New heaps will be created if per-heap max size is + // reached. + int32 xla_multiheap_size_constraint_per_heap = 142; + + // Next id: 143 // Extra options to pass to the compilation backend (e.g. LLVM); specific // interpretation of these values is left to the backend. From 525a82482481c9e07321c8524863683807725c88 Mon Sep 17 00:00:00 2001 From: Ben Barsdell Date: Thu, 8 Oct 2020 17:10:24 +1100 Subject: [PATCH 0102/1447] Refactor SparseApplyFtrl CPU kernel into class - This is in preparation for adding a GPU implementation. --- tensorflow/core/kernels/training_ops.cc | 351 ++++++++++++------------ tensorflow/core/kernels/training_ops.h | 15 + 2 files changed, 197 insertions(+), 169 deletions(-) diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc index bdb07470c07..733a7a0f80f 100644 --- a/tensorflow/core/kernels/training_ops.cc +++ b/tensorflow/core/kernels/training_ops.cc @@ -350,6 +350,171 @@ struct ApplyFtrlMultiplyLinearByLr { } }; +namespace { + +template +inline T FtrlCompute(const T& accum, const T& linear, const T& lr, const T& l1, + const T& l2, const T& lr_power, + const bool multiply_linear_by_lr) { + T quadratic; + if (multiply_linear_by_lr) { + if (lr_power == static_cast(-0.5)) { + quadratic = Eigen::numext::sqrt(accum) + static_cast(2) * l2 * lr; + } else { + quadratic = + Eigen::numext::pow(accum, -lr_power) + static_cast(2) * l2 * lr; + } + auto l1_reg_adjust = std::max(std::min(linear, l1 * lr), -l1 * lr); + return (l1_reg_adjust - linear) / quadratic; + } else { + if (lr_power == static_cast(-0.5)) { + quadratic = Eigen::numext::sqrt(accum) / lr + static_cast(2) * l2; + } else { + quadratic = + Eigen::numext::pow(accum, -lr_power) / lr + static_cast(2) * l2; + } + auto l1_reg_adjust = std::max(std::min(linear, l1), -l1); + return (l1_reg_adjust - 
linear) / quadratic; + } +} + +} // namespace + +template +struct SparseApplyFtrl { + Tindex operator()(const CPUDevice& d, typename TTypes::Matrix var_flat, + typename TTypes::Matrix accum_flat, + typename TTypes::Matrix linear_flat, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstScalar l2_shrinkage, + typename TTypes::ConstScalar lr_power, + typename TTypes::ConstMatrix grad_flat, + typename TTypes::ConstVec indices_vec, + int64 inner_dim, bool multiply_linear_by_lr) { + const Tindex N = static_cast(indices_vec.dimension(0)); + if (N > 0) { + T lr_scalar = lr(); + T l1_scalar = l1(); + T l2_scalar = l2(); + T l2_shrinkage_scalar; + if (has_l2_shrinkage) { + l2_shrinkage_scalar = l2_shrinkage(); + } + T lr_power_scalar = lr_power(); + if (inner_dim > 1) { + const Tindex first_dim_size = + static_cast(var_flat.dimension(0)); + + for (Tindex i = 0; i < N; i++) { + const Tindex index = internal::SubtleMustCopy(indices_vec(i)); + if (!FastBoundsCheck(index, first_dim_size)) return i; + auto accum = accum_flat.template chip<0>(index); + auto linear = linear_flat.template chip<0>(index); + auto grad = grad_flat.template chip<0>(i); + auto var = var_flat.template chip<0>(index); + +// TODO(sanjoy): Remove this macro. +// Use a macro to implement the computation here due to the templating of the +// eigen tensor library. 
+#define COMPUTE_FTRL(grad, grad_maybe_with_shrinkage) \ + auto new_accum = accum + grad.square(); \ + if (multiply_linear_by_lr) { \ + if (lr_power_scalar == static_cast(-0.5)) { \ + linear += grad_maybe_with_shrinkage * lr_scalar - \ + (new_accum.sqrt() - accum.sqrt()) * var; \ + } else { \ + linear += \ + grad_maybe_with_shrinkage * lr_scalar - \ + (new_accum.pow(-lr_power_scalar) - accum.pow(-lr_power_scalar)) * \ + var; \ + } \ + } else { \ + if (lr_power_scalar == static_cast(-0.5)) { \ + linear += grad_maybe_with_shrinkage - \ + (new_accum.sqrt() - accum.sqrt()) / lr_scalar * var; \ + } else { \ + linear += grad_maybe_with_shrinkage - (new_accum.pow(-lr_power_scalar) - \ + accum.pow(-lr_power_scalar)) / \ + lr_scalar * var; \ + } \ + } \ + auto l1_reg_adjust = \ + (multiply_linear_by_lr \ + ? linear.cwiseMin(l1_scalar * lr_scalar) \ + .cwiseMax(-l1_scalar * lr_scalar) \ + : linear.cwiseMin(l1_scalar).cwiseMax(-l1_scalar)); \ + auto x = l1_reg_adjust - linear; \ + if (multiply_linear_by_lr) { \ + if (lr_power_scalar == static_cast(-0.5)) { \ + auto y = new_accum.sqrt() + \ + linear.constant(static_cast(2) * l2_scalar * lr_scalar); \ + var = x / y; \ + } else { \ + auto y = new_accum.pow(-lr_power_scalar) + \ + linear.constant(static_cast(2) * l2_scalar * lr_scalar); \ + var = x / y; \ + } \ + } else { \ + if (lr_power_scalar == static_cast(-0.5)) { \ + auto y = new_accum.sqrt() / new_accum.constant(lr_scalar) + \ + linear.constant(static_cast(2) * l2_scalar); \ + var = x / y; \ + } else { \ + auto y = \ + new_accum.pow(-lr_power_scalar) / new_accum.constant(lr_scalar) + \ + linear.constant(static_cast(2) * l2_scalar); \ + var = x / y; \ + } \ + } \ + accum += grad.square(); + + if (has_l2_shrinkage) { + auto grad_with_shrinkage = + grad + static_cast(2) * l2_shrinkage_scalar * var; + COMPUTE_FTRL(grad, grad_with_shrinkage); + } else { + COMPUTE_FTRL(grad, grad); + } + } +#undef COMPUTE_FTRL + } else { + const Tindex first_dim_size = accum_flat.size(); + + for 
(Tindex i = 0; i < N; i++) { + const Tindex index = internal::SubtleMustCopy(indices_vec(i)); + if (!FastBoundsCheck(index, first_dim_size)) return i; + T& a = accum_flat(index); + T& l = linear_flat(index); + T& v = var_flat(index); + T g; + if (has_l2_shrinkage) { + g = grad_flat(i) + + (static_cast(2) * l2_shrinkage_scalar * var_flat(index)); + } else { + g = grad_flat(i); + } + + T updated_a = a + grad_flat(i) * grad_flat(i); + using Eigen::numext::pow; + T sigma = pow(updated_a, -lr_power_scalar) - pow(a, -lr_power_scalar); + if (!multiply_linear_by_lr) { + sigma /= lr_scalar; + } + T updated_l = (multiply_linear_by_lr ? l + g * lr_scalar - sigma * v + : l + g - sigma * v); + v = FtrlCompute(updated_a, updated_l, lr_scalar, l1_scalar, l2_scalar, + lr_power_scalar, multiply_linear_by_lr); + a = updated_a; + l = updated_l; + } + } + } + return static_cast(-1); + } +}; + template struct ApplyMomentum { void operator()(const CPUDevice& d, typename TTypes::Flat var, @@ -1512,35 +1677,6 @@ REGISTER_KERNELS(CPU, float); REGISTER_KERNELS(CPU, double); #undef REGISTER_KERNELS -namespace { - -template -inline T FtrlCompute(const T& accum, const T& linear, const T& lr, const T& l1, - const T& l2, const T& lr_power, - const bool multiply_linear_by_lr) { - T quadratic; - if (multiply_linear_by_lr) { - if (lr_power == static_cast(-0.5)) { - quadratic = Eigen::numext::sqrt(accum) + static_cast(2) * l2 * lr; - } else { - quadratic = - Eigen::numext::pow(accum, -lr_power) + static_cast(2) * l2 * lr; - } - auto l1_reg_adjust = std::max(std::min(linear, l1 * lr), -l1 * lr); - return (l1_reg_adjust - linear) / quadratic; - } else { - if (lr_power == static_cast(-0.5)) { - quadratic = Eigen::numext::sqrt(accum) / lr + static_cast(2) * l2; - } else { - quadratic = - Eigen::numext::pow(accum, -lr_power) / lr + static_cast(2) * l2; - } - auto l1_reg_adjust = std::max(std::min(linear, l1), -l1); - return (l1_reg_adjust - linear) / quadratic; - } -} -} // namespace - // Note, this op 
works on cpu only. template class SparseApplyAdagradOp : public OpKernel { @@ -2672,146 +2808,23 @@ class SparseApplyFtrlOp : public OpKernel { l2_shrinkage->shape().DebugString())); } - if (N > 0) { - if (inner_dim > 1) { - const Tindex first_dim_size = var.dim_size(0); - auto indices_vec = indices.vec(); - auto var_flat = var.flat_outer_dims(); - auto accum_flat = accum.flat_outer_dims(); - auto linear_flat = linear.flat_outer_dims(); - auto grad_flat = grad.flat_outer_dims(); - T lr_scalar = lr.scalar()(); - T l1_scalar = l1.scalar()(); - T l2_scalar = l2.scalar()(); - T l2_shrinkage_scalar; - if (has_l2_shrinkage) { - l2_shrinkage_scalar = l2_shrinkage->scalar()(); - } - T lr_power_scalar = lr_power.scalar()(); - - for (Tindex i = 0; i < N; i++) { - const Tindex index = internal::SubtleMustCopy(indices_vec(i)); - OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), - errors::InvalidArgument( - strings::StrCat("Index ", index, " at offset ", i, - " in indices is out of range"))); - auto accum = accum_flat.template chip<0>(index); - auto linear = linear_flat.template chip<0>(index); - auto grad = grad_flat.template chip<0>(i); - auto var = var_flat.template chip<0>(index); - -// Use a macro to implement the computation here due to the templating of the -// eigen tensor library. 
-#define COMPUTE_FTRL(grad, grad_maybe_with_shrinkage) \ - auto new_accum = accum + grad.square(); \ - if (multiply_linear_by_lr_) { \ - if (lr_power_scalar == static_cast(-0.5)) { \ - linear += grad_maybe_with_shrinkage * lr_scalar - \ - (new_accum.sqrt() - accum.sqrt()) * var; \ - } else { \ - linear += \ - grad_maybe_with_shrinkage * lr_scalar - \ - (new_accum.pow(-lr_power_scalar) - accum.pow(-lr_power_scalar)) * \ - var; \ - } \ - } else { \ - if (lr_power_scalar == static_cast(-0.5)) { \ - linear += grad_maybe_with_shrinkage - \ - (new_accum.sqrt() - accum.sqrt()) / lr_scalar * var; \ - } else { \ - linear += grad_maybe_with_shrinkage - (new_accum.pow(-lr_power_scalar) - \ - accum.pow(-lr_power_scalar)) / \ - lr_scalar * var; \ - } \ - } \ - auto l1_reg_adjust = \ - (multiply_linear_by_lr_ \ - ? linear.cwiseMin(l1_scalar * lr_scalar) \ - .cwiseMax(-l1_scalar * lr_scalar) \ - : linear.cwiseMin(l1_scalar).cwiseMax(-l1_scalar)); \ - auto x = l1_reg_adjust - linear; \ - if (multiply_linear_by_lr_) { \ - if (lr_power_scalar == static_cast(-0.5)) { \ - auto y = new_accum.sqrt() + \ - linear.constant(static_cast(2) * l2_scalar * lr_scalar); \ - var = x / y; \ - } else { \ - auto y = new_accum.pow(-lr_power_scalar) + \ - linear.constant(static_cast(2) * l2_scalar * lr_scalar); \ - var = x / y; \ - } \ - } else { \ - if (lr_power_scalar == static_cast(-0.5)) { \ - auto y = new_accum.sqrt() / new_accum.constant(lr_scalar) + \ - linear.constant(static_cast(2) * l2_scalar); \ - var = x / y; \ - } else { \ - auto y = \ - new_accum.pow(-lr_power_scalar) / new_accum.constant(lr_scalar) + \ - linear.constant(static_cast(2) * l2_scalar); \ - var = x / y; \ - } \ - } \ - accum += grad.square(); - - if (has_l2_shrinkage) { - auto grad_with_shrinkage = - grad + static_cast(2) * l2_shrinkage_scalar * var; - COMPUTE_FTRL(grad, grad_with_shrinkage); - } else { - COMPUTE_FTRL(grad, grad); - } - } -#undef COMPUTE_FTRL - } else { - T lr_scalar = lr.scalar()(); - T l1_scalar = 
l1.scalar()(); - T l2_scalar = l2.scalar()(); - T lr_power_scalar = lr_power.scalar()(); - T l2_shrinkage_scalar; - if (has_l2_shrinkage) { - l2_shrinkage_scalar = l2_shrinkage->scalar()(); - } - - auto indices_vec = indices.vec(); - auto var_flat = var.flat(); - auto accum_flat = accum.flat(); - auto linear_flat = linear.flat(); - auto grad_flat = grad.flat(); - const Tindex first_dim_size = accum_flat.size(); - - for (Tindex i = 0; i < N; i++) { - const Tindex index = internal::SubtleMustCopy(indices_vec(i)); - OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size), - errors::InvalidArgument( - strings::StrCat("Index ", index, " at offset ", i, - " in indices is out of range"))); - T& a = accum_flat(index); - T& l = linear_flat(index); - T& v = var_flat(index); - T g; - if (has_l2_shrinkage) { - g = grad_flat(i) + - (static_cast(2) * l2_shrinkage_scalar * var_flat(index)); - } else { - g = grad_flat(i); - } - - T updated_a = a + grad_flat(i) * grad_flat(i); - using Eigen::numext::pow; - T sigma = pow(updated_a, -lr_power_scalar) - pow(a, -lr_power_scalar); - if (!multiply_linear_by_lr_) { - sigma /= lr_scalar; - } - T updated_l = (multiply_linear_by_lr_ ? l + g * lr_scalar - sigma * v - : l + g - sigma * v); - v = FtrlCompute(updated_a, updated_l, lr_scalar, l1_scalar, l2_scalar, - lr_power_scalar, multiply_linear_by_lr_); - a = updated_a; - l = updated_l; - } - } - } + const Device& device = ctx->template eigen_device(); + auto indices_vec = indices.vec(); + const Tindex bad_i = + functor::SparseApplyFtrl()( + device, var.flat_outer_dims(), accum.flat_outer_dims(), + linear.flat_outer_dims(), lr.scalar(), l1.scalar(), + l2.scalar(), + // Note: Passing l2 as a placeholder when not has_l2_shrinkage (it + // will not be used). + has_l2_shrinkage ? 
l2_shrinkage->scalar() : l2.scalar(), + lr_power.scalar(), grad.flat_outer_dims(), indices_vec, + inner_dim, multiply_linear_by_lr_); + OP_REQUIRES( + ctx, bad_i < 0, + errors::InvalidArgument( + "indices", SliceDebugString(indices.shape(), bad_i), " = ", + indices_vec(bad_i), " is not in [0, ", var.dim_size(0), ")")); MaybeForwardRefInputToRefOutput(ctx, 0, 0); } diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h index ef44b5f9659..df48a37fcb8 100644 --- a/tensorflow/core/kernels/training_ops.h +++ b/tensorflow/core/kernels/training_ops.h @@ -151,6 +151,21 @@ struct ApplyFtrlV2MultiplyLinearByLr { typename TTypes::ConstScalar lr_power); }; +template +struct SparseApplyFtrl { + Tindex operator()(const Device& d, typename TTypes::Matrix var_flat, + typename TTypes::Matrix accum_flat, + typename TTypes::Matrix linear_flat, + typename TTypes::ConstScalar lr, + typename TTypes::ConstScalar l1, + typename TTypes::ConstScalar l2, + typename TTypes::ConstScalar l2_shrinkage, + typename TTypes::ConstScalar lr_power, + typename TTypes::ConstMatrix grad_flat, + typename TTypes::ConstVec indices_vec, + int64 inner_dim, bool multiply_linear_by_lr); +}; + template struct ApplyMomentum { void operator()(const Device& d, typename TTypes::Flat var, From 6fb1e4e4259333f315094d3ceb92ea3cde7c4174 Mon Sep 17 00:00:00 2001 From: kushanam Date: Thu, 8 Oct 2020 07:52:49 -0700 Subject: [PATCH 0103/1447] adding device to base iterator --- tensorflow/python/distribute/input_lib.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 36e42bf2a6b..991b3d269ee 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -589,7 +589,8 @@ class DistributedIteratorBase(DistributedIteratorInterface): # pylint: disable=super-init-not-called def __init__(self, input_workers, iterators, 
strategy, - enable_get_next_as_optional): + enable_get_next_as_optional, + replication_mode=InputReplicationMode.PER_WORKER): assert isinstance(input_workers, InputWorkers) if not input_workers.worker_devices: raise ValueError("Should have at least one worker for input iterator.") @@ -598,6 +599,7 @@ class DistributedIteratorBase(DistributedIteratorInterface): self._input_workers = input_workers self._strategy = strategy self._enable_get_next_as_optional = enable_get_next_as_optional + self._replication_mode = replication_mode def next(self): return self.__next__() @@ -627,12 +629,16 @@ class DistributedIteratorBase(DistributedIteratorInterface): if not self._enable_get_next_as_optional: replicas = [] for i, worker in enumerate(self._input_workers.worker_devices): + if self._replication_mode == InputReplicationMode.PER_WORKER: + worker_device = worker + else: + worker_device = self._input_workers._worker_device_pairs[i][1][0] if name is not None: - d = tf_device.DeviceSpec.from_string(worker) + d = tf_device.DeviceSpec.from_string(worker_device) new_name = "%s_%s_%d" % (name, d.job, d.task) else: new_name = None - with ops.device(worker): + with ops.device(worker_device): # Make `replicas` a flat list of values across all replicas. 
replicas.extend( self._iterators[i].get_next_as_list_static_shapes(new_name)) @@ -843,7 +849,8 @@ class DistributedIterator(DistributedIteratorBase, strategy=None, components=None, element_spec=None, - enable_get_next_as_optional=False): + enable_get_next_as_optional=False, + replication_mode=InputReplicationMode.PER_WORKER): if input_workers is None: raise ValueError("`input_workers` should be " "provided.") @@ -860,13 +867,14 @@ class DistributedIterator(DistributedIteratorBase, self._iterators = components self._strategy = strategy self._enable_get_next_as_optional = enable_get_next_as_optional + self._replication_mode = replication_mode else: if (components is not None and element_spec is not None): raise ValueError(error_message) super(DistributedIterator, self).__init__(input_workers, iterators, strategy, - enable_get_next_as_optional) + enable_get_next_as_optional, replication_mode) @property def element_spec(self): @@ -1180,7 +1188,7 @@ class DistributedDatasetsFromFunction(_IterableInput): self._strategy) else: iterator = DistributedIterator(self._input_workers, iterators, - self._strategy) + self._strategy, self._replication_mode) iterator._element_spec = self._element_spec # pylint: disable=protected-access # When async eager is enabled, sometimes the iterator may not finish @@ -1575,6 +1583,7 @@ class _SingleWorkerOwnedDatasetIterator(_SingleWorkerDatasetIteratorBase, _SingleWorkerOwnedDatasetIterator from. element_spec: A nested structure of `TypeSpec` objects that represents the type specification of elements of the iterator. + replication_mode: an enum value of `tf.distribute.InputReplicationMode`. """ if worker is None or devices is None: raise ValueError("Both `worker` and `devices` should be provided") From a93e36e78ea82af7a770c082a9ad4f93fc0dd7e8 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Thu, 8 Oct 2020 14:08:34 -0700 Subject: [PATCH 0104/1447] Enforce that the target makefile must be called TARGET_makefile. 
This allows for better error reporting (b/170331334). For example: ``` make -f tensorflow/lite/micro/tools/make/Makefile microlite TARGET=foo ``` will give the following error: ``` tensorflow/lite/micro/tools/make/Makefile:359: tensorflow/lite/micro/tools/make/targets/foo_makefile.inc: No such file or directory make: *** No rule to make target 'tensorflow/lite/micro/tools/make/targets/foo_makefile.inc'. Stop. ``` --- tensorflow/lite/micro/tools/make/Makefile | 8 +- .../make/targets/apollo3evb_makefile.inc | 271 +++++++++--------- .../tools/make/targets/bluepill_makefile.inc | 102 ++++--- .../make/targets/sparkfun_edge_makefile.inc | 2 + 4 files changed, 186 insertions(+), 197 deletions(-) create mode 100644 tensorflow/lite/micro/tools/make/targets/sparkfun_edge_makefile.inc diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index c3db0181a80..c23ca4e4aee 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -354,11 +354,9 @@ $(eval $(call add_third_party_download,$(RUY_URL),$(RUY_MD5),ruy,)) $(eval $(call add_third_party_download,$(PERSON_MODEL_URL),$(PERSON_MODEL_MD5),person_model_grayscale,)) $(eval $(call add_third_party_download,$(PERSON_MODEL_INT8_URL),$(PERSON_MODEL_INT8_MD5),person_model_int8,)) -# These target-specific makefiles should modify or replace options like -# CXXFLAGS or LIBS to work for a specific targeted architecture. All logic -# based on platforms or architectures should happen within these files, to -# keep this main makefile focused on the sources and dependencies. -include $(wildcard $(MAKEFILE_DIR)/targets/*_makefile.inc) +# The target-specific makefile must have a name that is exactly +# TARGET_makefile.inc +include $(MAKEFILE_DIR)/targets/$(TARGET)_makefile.inc # Load dependencies for optimized kernel implementations. 
include $(wildcard $(MAKEFILE_DIR)/ext_libs/*.inc) diff --git a/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc b/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc index 68792496ec3..4d5e9e542b2 100644 --- a/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/apollo3evb_makefile.inc @@ -1,143 +1,136 @@ -# Settings for apollo3 evb and SparkFun Edge platforms. -ifeq ($(TARGET),$(filter $(TARGET),\ - apollo3evb\ - sparkfun_edge\ - )) - export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) - TARGET_ARCH := cortex-m4 - TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- - TARGET_TOOLCHAIN_ROOT := $(TENSORFLOW_ROOT)$(MAKEFILE_DIR)/downloads/gcc_embedded/bin/ - # Download the Ambiq Apollo3 SDK and set this variable to find the header - # files: - APOLLO3_SDK := $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST) - # Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions - # with the hard interfaces. - GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/ +export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) +TARGET_ARCH := cortex-m4 +TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- +TARGET_TOOLCHAIN_ROOT := $(TENSORFLOW_ROOT)$(MAKEFILE_DIR)/downloads/gcc_embedded/bin/ +# Download the Ambiq Apollo3 SDK and set this variable to find the header +# files: +APOLLO3_SDK := $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST) +# Need a pointer to the GNU ARM toolchain for crtbegin.o for the fp functions +# with the hard interfaces. 
+GCC_ARM := $(MAKEFILE_DIR)/downloads/gcc_embedded/ - $(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) - $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,patch_cmsis)) - $(eval $(call add_third_party_download,$(AM_SDK_URL),$(AM_SDK_MD5),$(AM_SDK_DEST),patch_am_sdk)) +$(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) +$(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,patch_cmsis)) +$(eval $(call add_third_party_download,$(AM_SDK_URL),$(AM_SDK_MD5),$(AM_SDK_DEST),patch_am_sdk)) - ifeq ($(findstring sparkfun,$(TARGET)), sparkfun) - $(eval $(call add_third_party_download,$(SF_BSPS_URL),$(SF_BSPS_MD5),$(AM_SDK_DEST)/$(SF_BSPS_DEST),)) - # Make sure that we download the full Ambiq SDK before the SparkFun BSPs. +ifeq ($(findstring sparkfun,$(TARGET)), sparkfun) + $(eval $(call add_third_party_download,$(SF_BSPS_URL),$(SF_BSPS_MD5),$(AM_SDK_DEST)/$(SF_BSPS_DEST),)) + # Make sure that we download the full Ambiq SDK before the SparkFun BSPs. 
$(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST)/$(SF_BSPS_DEST): $(MAKEFILE_DIR)/downloads/$(AM_SDK_DEST) - endif - - PLATFORM_FLAGS = \ - -DPART_apollo3 \ - -DAM_PACKAGE_BGA \ - -DAM_PART_APOLLO3 \ - -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ - -DTF_LITE_STATIC_MEMORY \ - -DNDEBUG \ - -DTF_LITE_MCU_DEBUG_LOG \ - -D __FPU_PRESENT=1 \ - -DARM_MATH_CM4 \ - -fno-rtti \ - -fmessage-length=0 \ - -fno-exceptions \ - -fno-unwind-tables \ - -ffunction-sections \ - -fdata-sections \ - -funsigned-char \ - -MMD \ - -mcpu=cortex-m4 \ - -mthumb \ - -mfpu=fpv4-sp-d16 \ - -mfloat-abi=hard \ - -std=gnu++11 \ - -Wvla \ - -Wall \ - -Wextra \ - -Wno-missing-field-initializers \ - -Wno-strict-aliasing \ - -Wno-type-limits \ - -Wno-unused-function \ - -Wno-unused-parameter \ - -fno-delete-null-pointer-checks \ - -fno-threadsafe-statics \ - -fomit-frame-pointer \ - -fno-use-cxa-atexit \ - -nostdlib \ - -ggdb \ - -O3 - CXXFLAGS += $(PLATFORM_FLAGS) - CCFLAGS += $(PLATFORM_FLAGS) - LDFLAGS += \ - -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=hard \ - -nostartfiles -static \ - -Wl,--gc-sections -Wl,--entry,Reset_Handler \ - -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \ - -fno-exceptions \ - -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \ - -Wl,-T,$(TENSORFLOW_ROOT)$(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/apollo3evb.ld \ - -Wl,-Map=$(TENSORFLOW_ROOT)$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref - BUILD_TYPE := micro - ifeq ($(TARGET), apollo3evb) - BOARD_BSP_PATH := $(APOLLO3_SDK)/boards/apollo3_evb/bsp - endif - ifeq ($(findstring sparkfun,$(TARGET)), sparkfun) - BOARD_BSP_PATH := $(APOLLO3_SDK)/$(SF_BSPS_DEST)/$(subst sparkfun_,,$(TARGET))/bsp - INCLUDES+= \ - -I$(APOLLO3_SDK)/$(SF_BSPS_DEST)/common/third_party/hm01b0 - endif - MICROLITE_LIBS := \ - $(BOARD_BSP_PATH)/gcc/bin/libam_bsp.a \ - $(APOLLO3_SDK)/mcu/apollo3/hal/gcc/bin/libam_hal.a \ - $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/hard/crtbegin.o \ - -lm - INCLUDES += \ - 
-isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \ - -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \ - -I$(GCC_ARM)/arm-none-eabi/ \ - -I$(APOLLO3_SDK)/mcu/apollo3/ \ - -I$(APOLLO3_SDK)/mcu/apollo3/regs \ - -I$(APOLLO3_SDK)/mcu/apollo3/hal \ - -I$(APOLLO3_SDK)/CMSIS/AmbiqMicro/Include/ \ - -I$(BOARD_BSP_PATH) \ - -I$(APOLLO3_SDK)/devices/ \ - -I$(APOLLO3_SDK)/utils/ \ - - - # The startup_gcc.c file is an altered version of the examples/hello_world/gcc/startup_gcc.c - # file from Ambiq: - # - Increase the stack size from 1k to 20k - # - Change the application entry call from main() to _main() - # The am_*.c files should be copied from the Ambiq Apollo3 SDK - # _main.c contains application and target specific initialization, like - # setting clock speed, default uart setups, etc. and an implementation - # of the DebugLog interfaces. - MICROLITE_CC_SRCS += \ - $(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/startup_gcc.c \ - $(APOLLO3_SDK)/utils/am_util_delay.c \ - $(APOLLO3_SDK)/utils/am_util_faultisr.c \ - $(APOLLO3_SDK)/utils/am_util_id.c \ - $(APOLLO3_SDK)/utils/am_util_stdio.c \ - $(APOLLO3_SDK)/devices/am_devices_led.c - - CMSIS_SRC_DIR := $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source - THIRD_PARTY_CC_SRCS := \ - $(CMSIS_SRC_DIR)/BasicMathFunctions/arm_dot_prod_q15.c \ - $(CMSIS_SRC_DIR)/BasicMathFunctions/arm_mult_q15.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_init_q15.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_q15.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_bitreversal2.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_q15.c \ - $(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_radix4_q15.c \ - $(CMSIS_SRC_DIR)/CommonTables/arm_const_structs.c \ - $(CMSIS_SRC_DIR)/CommonTables/arm_common_tables.c \ - $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_mean_q15.c \ - $(CMSIS_SRC_DIR)/StatisticsFunctions/arm_max_q7.c - - MICRO_SPEECH_TEST_SRCS += \ - $(AP3_MICRO_DIR)/_main.c - - TEST_SCRIPT := 
tensorflow/lite/micro/testing/test_apollo3evb_binary.sh - # These are tests that don't currently work on the Apollo3 board. - EXCLUDED_TESTS := \ - tensorflow/lite/micro/micro_interpreter_test.cc \ - tensorflow/lite/micro/simple_tensor_allocator_test.cc - MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) - endif + +PLATFORM_FLAGS = \ + -DPART_apollo3 \ + -DAM_PACKAGE_BGA \ + -DAM_PART_APOLLO3 \ + -DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \ + -DTF_LITE_STATIC_MEMORY \ + -DNDEBUG \ + -DTF_LITE_MCU_DEBUG_LOG \ + -D __FPU_PRESENT=1 \ + -DARM_MATH_CM4 \ + -fno-rtti \ + -fmessage-length=0 \ + -fno-exceptions \ + -fno-unwind-tables \ + -ffunction-sections \ + -fdata-sections \ + -funsigned-char \ + -MMD \ + -mcpu=cortex-m4 \ + -mthumb \ + -mfpu=fpv4-sp-d16 \ + -mfloat-abi=hard \ + -std=gnu++11 \ + -Wvla \ + -Wall \ + -Wextra \ + -Wno-missing-field-initializers \ + -Wno-strict-aliasing \ + -Wno-type-limits \ + -Wno-unused-function \ + -Wno-unused-parameter \ + -fno-delete-null-pointer-checks \ + -fno-threadsafe-statics \ + -fomit-frame-pointer \ + -fno-use-cxa-atexit \ + -nostdlib \ + -ggdb \ + -O3 +CXXFLAGS += $(PLATFORM_FLAGS) +CCFLAGS += $(PLATFORM_FLAGS) +LDFLAGS += \ + -mthumb -mcpu=cortex-m4 -mfpu=fpv4-sp-d16 -mfloat-abi=hard \ + -nostartfiles -static \ + -Wl,--gc-sections -Wl,--entry,Reset_Handler \ + -Wl,--start-group -lm -lc -lgcc -Wl,--end-group \ + -fno-exceptions \ + -nostdlib --specs=nano.specs -t -lstdc++ -lc -lnosys -lm \ + -Wl,-T,$(TENSORFLOW_ROOT)$(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/apollo3evb.ld \ + -Wl,-Map=$(TENSORFLOW_ROOT)$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref +BUILD_TYPE := micro +ifeq ($(TARGET), apollo3evb) + BOARD_BSP_PATH := $(APOLLO3_SDK)/boards/apollo3_evb/bsp +endif +ifeq ($(findstring sparkfun,$(TARGET)), sparkfun) + BOARD_BSP_PATH := $(APOLLO3_SDK)/$(SF_BSPS_DEST)/$(subst sparkfun_,,$(TARGET))/bsp + INCLUDES+= \ + -I$(APOLLO3_SDK)/$(SF_BSPS_DEST)/common/third_party/hm01b0 +endif 
+MICROLITE_LIBS := \ + $(BOARD_BSP_PATH)/gcc/bin/libam_bsp.a \ + $(APOLLO3_SDK)/mcu/apollo3/hal/gcc/bin/libam_hal.a \ + $(GCC_ARM)/lib/gcc/arm-none-eabi/7.3.1/thumb/v7e-m/fpv4-sp/hard/crtbegin.o \ + -lm +INCLUDES += \ + -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \ + -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Include/ \ + -I$(GCC_ARM)/arm-none-eabi/ \ + -I$(APOLLO3_SDK)/mcu/apollo3/ \ + -I$(APOLLO3_SDK)/mcu/apollo3/regs \ + -I$(APOLLO3_SDK)/mcu/apollo3/hal \ + -I$(APOLLO3_SDK)/CMSIS/AmbiqMicro/Include/ \ + -I$(BOARD_BSP_PATH) \ + -I$(APOLLO3_SDK)/devices/ \ + -I$(APOLLO3_SDK)/utils/ \ + + +# The startup_gcc.c file is an altered version of the examples/hello_world/gcc/startup_gcc.c +# file from Ambiq: +# - Increase the stack size from 1k to 20k +# - Change the application entry call from main() to _main() +# The am_*.c files should be copied from the Ambiq Apollo3 SDK +# _main.c contains application and target specific initialization, like +# setting clock speed, default uart setups, etc. and an implementation +# of the DebugLog interfaces. 
+MICROLITE_CC_SRCS += \ + $(APOLLO3_SDK)/boards/apollo3_evb/examples/hello_world/gcc_patched/startup_gcc.c \ + $(APOLLO3_SDK)/utils/am_util_delay.c \ + $(APOLLO3_SDK)/utils/am_util_faultisr.c \ + $(APOLLO3_SDK)/utils/am_util_id.c \ + $(APOLLO3_SDK)/utils/am_util_stdio.c \ + $(APOLLO3_SDK)/devices/am_devices_led.c + +CMSIS_SRC_DIR := $(MAKEFILE_DIR)/downloads/cmsis/CMSIS/DSP/Source +THIRD_PARTY_CC_SRCS := \ +$(CMSIS_SRC_DIR)/BasicMathFunctions/arm_dot_prod_q15.c \ +$(CMSIS_SRC_DIR)/BasicMathFunctions/arm_mult_q15.c \ +$(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_init_q15.c \ +$(CMSIS_SRC_DIR)/TransformFunctions/arm_rfft_q15.c \ +$(CMSIS_SRC_DIR)/TransformFunctions/arm_bitreversal2.c \ +$(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_q15.c \ +$(CMSIS_SRC_DIR)/TransformFunctions/arm_cfft_radix4_q15.c \ +$(CMSIS_SRC_DIR)/CommonTables/arm_const_structs.c \ +$(CMSIS_SRC_DIR)/CommonTables/arm_common_tables.c \ +$(CMSIS_SRC_DIR)/StatisticsFunctions/arm_mean_q15.c \ +$(CMSIS_SRC_DIR)/StatisticsFunctions/arm_max_q7.c + +MICRO_SPEECH_TEST_SRCS += \ + $(AP3_MICRO_DIR)/_main.c + +TEST_SCRIPT := tensorflow/lite/micro/testing/test_apollo3evb_binary.sh +# These are tests that don't currently work on the Apollo3 board. +EXCLUDED_TESTS := \ + tensorflow/lite/micro/micro_interpreter_test.cc \ + tensorflow/lite/micro/simple_tensor_allocator_test.cc +MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) diff --git a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc index 96bc53d1809..e516554c063 100644 --- a/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/bluepill_makefile.inc @@ -1,64 +1,60 @@ -# Settings for Blue Pill platforms. 
-ifeq ($(TARGET), bluepill) +export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) +TARGET_ARCH := cortex-m3 +TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- - export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) - TARGET_ARCH := cortex-m3 - TARGET_TOOLCHAIN_PREFIX := arm-none-eabi- +$(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) +$(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,patch_cmsis)) +$(eval $(call add_third_party_download,$(STM32_BARE_LIB_URL),$(STM32_BARE_LIB_MD5),stm32_bare_lib,)) - $(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) - $(eval $(call add_third_party_download,$(CMSIS_URL),$(CMSIS_MD5),cmsis,patch_cmsis)) - $(eval $(call add_third_party_download,$(STM32_BARE_LIB_URL),$(STM32_BARE_LIB_MD5),stm32_bare_lib,)) +PLATFORM_FLAGS = \ + -DTF_LITE_MCU_DEBUG_LOG \ + -mcpu=cortex-m3 \ + -mthumb \ + -Wno-vla \ + -Wno-strict-aliasing \ + -Wno-shadow \ + -Wno-type-limits \ + -fomit-frame-pointer \ + -nostdlib - PLATFORM_FLAGS = \ - -DTF_LITE_MCU_DEBUG_LOG \ - -mcpu=cortex-m3 \ - -mthumb \ - -Wno-vla \ - -Wno-strict-aliasing \ - -Wno-shadow \ - -Wno-type-limits \ - -fomit-frame-pointer \ - -nostdlib +# TODO(b/168334217): Currently we always add -DNDEBUG because the build is +# broken w/o it. Remove this workaround once the issue is resolved. +PLATFORM_FLAGS += -DNDEBUG - # TODO(b/168334217): Currently we always add -DNDEBUG because the build is - # broken w/o it. Remove this workaround once the issue is resolved. 
- PLATFORM_FLAGS += -DNDEBUG +CXXFLAGS += $(PLATFORM_FLAGS) -fno-use-cxa-atexit +CCFLAGS += $(PLATFORM_FLAGS) - CXXFLAGS += $(PLATFORM_FLAGS) -fno-use-cxa-atexit - CCFLAGS += $(PLATFORM_FLAGS) +LDFLAGS += \ + -T $(MAKEFILE_DIR)/targets/bluepill/bluepill.lds \ + -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref - LDFLAGS += \ - -T $(MAKEFILE_DIR)/targets/bluepill/bluepill.lds \ - -Wl,-Map=$(MAKEFILE_DIR)/gen/$(TARGET).map,--cref +# Additional include paths needed for the stm_32_bare_lib only. +INCLUDES += \ + -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \ + -I$(MAKEFILE_DIR)/downloads/stm32_bare_lib/include - # Additional include paths needed for the stm_32_bare_lib only. - INCLUDES += \ - -isystem$(MAKEFILE_DIR)/downloads/cmsis/CMSIS/Core/Include/ \ - -I$(MAKEFILE_DIR)/downloads/stm32_bare_lib/include +MICROLITE_CC_SRCS += \ + $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.c) \ + $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.cc) +EXCLUDED_SRCS := \ + $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/debug_log.c +MICROLITE_CC_SRCS := $(filter-out $(EXCLUDED_SRCS), $(MICROLITE_CC_SRCS)) - MICROLITE_CC_SRCS += \ - $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.c) \ - $(wildcard $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/*.cc) - EXCLUDED_SRCS := \ - $(MAKEFILE_DIR)/downloads/stm32_bare_lib/source/debug_log.c - MICROLITE_CC_SRCS := $(filter-out $(EXCLUDED_SRCS), $(MICROLITE_CC_SRCS)) +# TODO(b/143286954): Figure out why some tests fail and enable ince the issues +# are resolved. 
+EXCLUDED_TESTS := \ + tensorflow/lite/micro/micro_interpreter_test.cc \ + tensorflow/lite/micro/micro_allocator_test.cc \ + tensorflow/lite/micro/memory_helpers_test.cc \ + tensorflow/lite/micro/memory_arena_threshold_test.cc \ + tensorflow/lite/micro/kernels/circular_buffer_test.cc +MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) - # TODO(b/143286954): Figure out why some tests fail and enable ince the issues - # are resolved. - EXCLUDED_TESTS := \ - tensorflow/lite/micro/micro_interpreter_test.cc \ - tensorflow/lite/micro/micro_allocator_test.cc \ - tensorflow/lite/micro/memory_helpers_test.cc \ - tensorflow/lite/micro/memory_arena_threshold_test.cc \ - tensorflow/lite/micro/kernels/circular_buffer_test.cc - MICROLITE_TEST_SRCS := $(filter-out $(EXCLUDED_TESTS), $(MICROLITE_TEST_SRCS)) +EXCLUDED_EXAMPLE_TESTS := \ + tensorflow/lite/micro/examples/magic_wand/Makefile.inc \ + tensorflow/lite/micro/examples/micro_speech/Makefile.inc \ + tensorflow/lite/micro/examples/image_recognition_experimental/Makefile.inc +MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS)) - EXCLUDED_EXAMPLE_TESTS := \ - tensorflow/lite/micro/examples/magic_wand/Makefile.inc \ - tensorflow/lite/micro/examples/micro_speech/Makefile.inc \ - tensorflow/lite/micro/examples/image_recognition_experimental/Makefile.inc - MICRO_LITE_EXAMPLE_TESTS := $(filter-out $(EXCLUDED_EXAMPLE_TESTS), $(MICRO_LITE_EXAMPLE_TESTS)) +TEST_SCRIPT := tensorflow/lite/micro/testing/test_bluepill_binary.sh - TEST_SCRIPT := tensorflow/lite/micro/testing/test_bluepill_binary.sh - -endif diff --git a/tensorflow/lite/micro/tools/make/targets/sparkfun_edge_makefile.inc b/tensorflow/lite/micro/tools/make/targets/sparkfun_edge_makefile.inc new file mode 100644 index 00000000000..0a4e53202eb --- /dev/null +++ b/tensorflow/lite/micro/tools/make/targets/sparkfun_edge_makefile.inc @@ -0,0 +1,2 @@ +include $(MAKEFILE_DIR)/targets/apollo3evb_makefile.inc + 
From 8bb1b54d592964b551efef99f099231e19ec7944 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Thu, 8 Oct 2020 14:28:49 -0700 Subject: [PATCH 0105/1447] removed linux/osx makefiles and pulled in the one define into the common makefile. --- tensorflow/lite/micro/tools/make/Makefile | 1 + .../micro/tools/make/targets/linux_x86_makefile.inc | 9 --------- .../lite/micro/tools/make/targets/osx_makefile.inc | 13 ------------- .../tools/make/targets/osx_x86_64_makefile.inc | 10 ---------- 4 files changed, 1 insertion(+), 32 deletions(-) delete mode 100644 tensorflow/lite/micro/tools/make/targets/linux_x86_makefile.inc delete mode 100644 tensorflow/lite/micro/tools/make/targets/osx_makefile.inc delete mode 100644 tensorflow/lite/micro/tools/make/targets/osx_x86_64_makefile.inc diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index c23ca4e4aee..d59ba923f56 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -111,6 +111,7 @@ COMMON_FLAGS := \ -fdata-sections \ -fmessage-length=0 \ -DTF_LITE_STATIC_MEMORY \ + -DTF_LITE_DISABLE_X86_NEON \ $(OPTIMIZATION_LEVEL) \ $(CC_WARNINGS) \ $(TAG_DEFINES) diff --git a/tensorflow/lite/micro/tools/make/targets/linux_x86_makefile.inc b/tensorflow/lite/micro/tools/make/targets/linux_x86_makefile.inc deleted file mode 100644 index 8ea78e8f3e3..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/linux_x86_makefile.inc +++ /dev/null @@ -1,9 +0,0 @@ -# Settings for x86 on Linux -ifeq ($(TARGET), linux) - ifeq ($(TARGET_ARCH), x86_64) - PLATFORM_FLAGS = \ - -DTF_LITE_DISABLE_X86_NEON - CXXFLAGS += $(PLATFORM_FLAGS) - CCFLAGS += $(PLATFORM_FLAGS) - endif -endif diff --git a/tensorflow/lite/micro/tools/make/targets/osx_makefile.inc b/tensorflow/lite/micro/tools/make/targets/osx_makefile.inc deleted file mode 100644 index 9b1e2220575..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/osx_makefile.inc +++ /dev/null @@ -1,13 +0,0 @@ 
-# Settings for Mac OS platforms. -ifeq ($(TARGET), osx) - - # Make sure we can find the embedded GCC compiler. - export PATH := ${PATH}:tensorflow/lite/micro/tools/make/downloads/gcc_embedded/bin/ - - PLATFORM_FLAGS = \ - -DTF_LITE_DISABLE_X86_NEON - - CXXFLAGS += $(PLATFORM_FLAGS) - CCFLAGS += $(PLATFORM_FLAGS) - -endif \ No newline at end of file diff --git a/tensorflow/lite/micro/tools/make/targets/osx_x86_64_makefile.inc b/tensorflow/lite/micro/tools/make/targets/osx_x86_64_makefile.inc deleted file mode 100644 index 78febaf5ddd..00000000000 --- a/tensorflow/lite/micro/tools/make/targets/osx_x86_64_makefile.inc +++ /dev/null @@ -1,10 +0,0 @@ -# Settings for x86 on Mac -ifeq ($(TARGET), osx) - ifeq ($(TARGET_ARCH), x86_64) - PLATFORM_FLAGS = \ - -DTF_LITE_DISABLE_X86_NEON - CXXFLAGS += $(PLATFORM_FLAGS) - CCFLAGS += $(PLATFORM_FLAGS) - endif -endif - From 1a59a417ac509eca82f154b2667551ea722a3d7a Mon Sep 17 00:00:00 2001 From: ShengYang1 Date: Fri, 9 Oct 2020 08:53:16 +0800 Subject: [PATCH 0106/1447] Code clean --- tensorflow/core/common_runtime/mkl_layout_pass.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 977ce9d06da..ede9511e4a7 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -1695,8 +1695,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { } static bool Maxpool3DGradRewrite(const Node* n) { - CHECK_NOTNULL(n); - bool do_rewrite = false; + DCHECK(n); for (const Edge* e : n->in_edges()) { // Rewrite only if there is corresponding Maxpool3D, i.e., workspace is // available @@ -1705,11 +1704,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass { e->src()->type_string() == mkl_op_registry::GetMklOpName(csinfo_.max_pool3d) && e->src_output() == 0) { - do_rewrite = true; - break; + return true; } } - return do_rewrite; + return 
false; } static bool FusedBatchNormExRewrite(const Node* n) { From 9b3d8f2212aa106445ed90a802ae58b18151d641 Mon Sep 17 00:00:00 2001 From: Lukas Geiger Date: Fri, 9 Oct 2020 10:13:47 +0100 Subject: [PATCH 0107/1447] Cleanup Selectv2 broadcasting --- tensorflow/core/kernels/cwise_op_select.cc | 27 +++++----------------- 1 file changed, 6 insertions(+), 21 deletions(-) diff --git a/tensorflow/core/kernels/cwise_op_select.cc b/tensorflow/core/kernels/cwise_op_select.cc index 02a82892fed..a78af454f09 100644 --- a/tensorflow/core/kernels/cwise_op_select.cc +++ b/tensorflow/core/kernels/cwise_op_select.cc @@ -149,21 +149,9 @@ class SelectV2Op : public OpKernel { // The `cond`, `then`, and `else` are broadcastable (bcast.IsValid()), // This matches the behavior of numpy. - // TODO (yongtang): Consolidate into n-ary broadcast, instead of multiple - // 2-ary broadcast. - - // Combine `then` and `else`. - BCast then_else_bcast(BCast::FromShape(then->shape()), - BCast::FromShape(else_->shape()), false); - OP_REQUIRES(ctx, then_else_bcast.IsValid(), - errors::InvalidArgument( - "then ", then->shape().DebugString(), " and else ", - else_->shape().DebugString(), " must be broadcastable")); - // Combine `cond` with `then` and `else`. - BCast bcast( - BCast::FromShape(cond->shape()), - BCast::FromShape(BCast::ToShape(then_else_bcast.output_shape())), - false); + BCastList<3> bcast({cond->shape().dim_sizes(), then->shape().dim_sizes(), + else_->shape().dim_sizes()}, + false); OP_REQUIRES(ctx, bcast.IsValid(), errors::InvalidArgument( "condition ", cond->shape().DebugString(), ", then ", @@ -172,12 +160,9 @@ class SelectV2Op : public OpKernel { // Broadcast `cond`, `then` and `else` to combined shape, // in order to obtain the reshape. 
- BCast cond_bcast(BCast::FromShape(BCast::ToShape(bcast.output_shape())), - BCast::FromShape(cond->shape()), false); - BCast then_bcast(BCast::FromShape(BCast::ToShape(bcast.output_shape())), - BCast::FromShape(then->shape()), false); - BCast else_bcast(BCast::FromShape(BCast::ToShape(bcast.output_shape())), - BCast::FromShape(else_->shape()), false); + BCast cond_bcast(bcast.output_shape(), cond->shape().dim_sizes(), false); + BCast then_bcast(bcast.output_shape(), then->shape().dim_sizes(), false); + BCast else_bcast(bcast.output_shape(), else_->shape().dim_sizes(), false); OP_REQUIRES( ctx, cond_bcast.IsValid() && then_bcast.IsValid() && else_bcast.IsValid(), From 79c0b6c793dad595f1eb7d94cbfc48ad82da0351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Fri, 9 Oct 2020 12:52:35 +0200 Subject: [PATCH 0108/1447] TFLu: Download and modify PATH if GCC is not installed --- .../micro/tools/make/targets/cortex_m_generic_makefile.inc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc b/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc index d56ae722b9f..8546be1cae0 100644 --- a/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc +++ b/tensorflow/lite/micro/tools/make/targets/cortex_m_generic_makefile.inc @@ -112,6 +112,11 @@ ifneq ($(filter cortex-%-generic,$(TARGET)),) # https://developer.arm.com/documentation/100891/0611/troubleshooting/general-troubleshooting-advice MICROLITE_LIBS := $(filter-out -lm,$(MICROLITE_LIBS)) else ifeq ($(filter armgcc,$(ALL_TAGS)),armgcc) + ifeq ($(shell which arm-none-eabi-gcc),) + export PATH := $(MAKEFILE_DIR)/downloads/gcc_embedded/bin/:$(PATH) + $(eval $(call add_third_party_download,$(GCC_EMBEDDED_URL),$(GCC_EMBEDDED_MD5),gcc_embedded,)) + endif + CXX_TOOL := arm-none-eabi-gcc CC_TOOL := arm-none-eabi-gcc AR_TOOL := arm-none-eabi-gcc-ar From f7c67569fc1e76824308dec17b911c3ac7d70cc4 Mon Sep 17 00:00:00 
2001 From: Mahmoud Abuzaina Date: Fri, 9 Oct 2020 08:49:10 -0700 Subject: [PATCH 0109/1447] Addressing review comment --- tensorflow/core/ops/mkl_nn_ops.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tensorflow/core/ops/mkl_nn_ops.cc b/tensorflow/core/ops/mkl_nn_ops.cc index e4853587a6c..7531f91f4e2 100644 --- a/tensorflow/core/ops/mkl_nn_ops.cc +++ b/tensorflow/core/ops/mkl_nn_ops.cc @@ -538,8 +538,8 @@ REGISTER_OP("_MklNativeMaxPool3D") .Attr("workspace_enabled: bool = false") .SetShapeFn(shape_inference::Pool3DShape) .Doc(R"doc( -oneDNN version of MaxPoolGrad that does not depend on layout propagation. -Uses oneDNN APIs to compute gradients of MaxPool operator. +oneDNN version of MaxPool3D operator that does not depend on layout propagation. +Uses oneDNN APIs to perform 3D max pooling on the input. *NOTE*: Do not invoke this operator directly in Python. Graph rewrite pass is expected to invoke these operators. )doc"); From 0542d9f811992f887c8b3a923b0cbbc4a2ec74e8 Mon Sep 17 00:00:00 2001 From: "ag.ramesh" Date: Fri, 9 Oct 2020 09:29:44 -0700 Subject: [PATCH 0110/1447] Addressed review comments. 
--- third_party/llvm-openmp/BUILD | 16 +++++++--------- third_party/llvm/BUILD | 5 ++++- third_party/llvm/expand_cmake_vars.py | 4 ++-- third_party/llvm/llvm.bzl | 14 +++++++------- 4 files changed, 20 insertions(+), 19 deletions(-) diff --git a/third_party/llvm-openmp/BUILD b/third_party/llvm-openmp/BUILD index ee355768777..759e91e60fc 100644 --- a/third_party/llvm-openmp/BUILD +++ b/third_party/llvm-openmp/BUILD @@ -5,7 +5,6 @@ exports_files(["LICENSE.txt"]) load( "@org_tensorflow//third_party/llvm:llvm.bzl", "cmake_var_string", - "dict_add", "expand_cmake_vars", ) load( @@ -14,17 +13,17 @@ load( ) genrule( - name = "il8n_id", + name = "kmp_il8n_id", srcs = [ "runtime/tools/message-converter.pl", "runtime/src/i18n/en_US.txt", ], outs = ["include/kmp_i18n_id.inc"], - cmd = "$(location runtime/tools/message-converter.pl) --os=lin --prefix=kmp_i18n --enum=$@ $(location runtime/src/i18n/en_US.txt)", + cmd = "perl $(location runtime/tools/message-converter.pl) --os=lin --prefix=kmp_i18n --enum=$@ $(location runtime/src/i18n/en_US.txt)", ) genrule( - name = "kmp_i18n_default", + name = "kmp_il8n_default", srcs = [ "runtime/tools/message-converter.pl", "runtime/src/i18n/en_US.txt", @@ -49,12 +48,12 @@ omp_vars = { "LIBOMP_ENABLE_ASSERTIONS": 1, "LIBOMP_ENABLE_SHARED": 1, "LIBOMP_LEGAL_ARCH": "Intel(R) 64", - "LIBOMP_LIB_FILE": "libiomp5.so", + "LIBOMP_LIB_FILE": "libiomp5", "LIBOMP_VERSION_MAJOR": 5, "LIBOMP_VERSION_MINOR": 0, } -omp_all_cmake_vars = cmake_var_string(dict_add(omp_vars)) +omp_all_cmake_vars = cmake_var_string(omp_vars) expand_cmake_vars( name = "config_kmp", @@ -73,7 +72,6 @@ expand_cmake_vars( cc_binary( name = "libiomp5.so", srcs = glob([ - "runtime/src/*.h", "runtime/src/kmp_alloc.cpp", "runtime/src/kmp_atomic.cpp", "runtime/src/kmp_csupport.cpp", @@ -109,8 +107,8 @@ cc_binary( ]) + [ ":config_kmp", ":config_omp", - ":il8n_id", - ":kmp_i18n_default", + ":kmp_il8n_id", + ":kmp_il8n_default", ":ldscript", ], copts = ["-Domp_EXPORTS -D_GNU_SOURCE 
-D_REENTRANT"], diff --git a/third_party/llvm/BUILD b/third_party/llvm/BUILD index f2e079aad76..46cf18dd3da 100644 --- a/third_party/llvm/BUILD +++ b/third_party/llvm/BUILD @@ -2,5 +2,8 @@ py_binary( name = "expand_cmake_vars", srcs = ["expand_cmake_vars.py"], srcs_version = "PY2AND3", - visibility = ["//visibility:public"], + visibility = [ + "@llvm-openmp//:__subpackages__", + "@llvm-project//:__subpackages__", + ], ) diff --git a/third_party/llvm/expand_cmake_vars.py b/third_party/llvm/expand_cmake_vars.py index 067e4f88d5a..a8a4b9673ed 100644 --- a/third_party/llvm/expand_cmake_vars.py +++ b/third_party/llvm/expand_cmake_vars.py @@ -38,10 +38,10 @@ def _parse_args(argv): def _expand_variables(input_str, cmake_vars): - """Expands ${VARIABLE} and @VARIABLE@s in 'input_str', using dictionary 'cmake_vars'. + """Expands ${VARIABLE}s and @VARIABLE@s in 'input_str', using dictionary 'cmake_vars'. Args: - input_str: the string containing ${VARIABLE} and @VARIABLE@ expressions to expand. + input_str: the string containing ${VARIABLE} or @VARIABLE@ expressions to expand. cmake_vars: a dictionary mapping variable names to their values. Returns: diff --git a/third_party/llvm/llvm.bzl b/third_party/llvm/llvm.bzl index c2be9dca302..dcbaab9edd4 100644 --- a/third_party/llvm/llvm.bzl +++ b/third_party/llvm/llvm.bzl @@ -7,7 +7,7 @@ TODO(chandlerc): Currently this expresses include-based dependencies as correctly understood by the build system. """ -def dict_add(*dictionaries): +def _dict_add(*dictionaries): """Returns a new `dict` that has all the entries of the given dictionaries. If the same key is present in more than one of the input dictionaries, the @@ -305,7 +305,7 @@ win32_cmake_vars = { # than hardcoding x86_64. 
llvm_all_cmake_vars = select({ "@org_tensorflow//tensorflow:macos": cmake_var_string( - dict_add( + _dict_add( cmake_vars, llvm_target_cmake_vars("X86", "x86_64-apple-darwin"), posix_cmake_vars, @@ -313,7 +313,7 @@ llvm_all_cmake_vars = select({ ), ), "@org_tensorflow//tensorflow:linux_ppc64le": cmake_var_string( - dict_add( + _dict_add( cmake_vars, llvm_target_cmake_vars("PowerPC", "powerpc64le-unknown-linux_gnu"), posix_cmake_vars, @@ -321,21 +321,21 @@ llvm_all_cmake_vars = select({ ), ), "@org_tensorflow//tensorflow:windows": cmake_var_string( - dict_add( + _dict_add( cmake_vars, llvm_target_cmake_vars("X86", "x86_64-pc-win32"), win32_cmake_vars, ), ), "@org_tensorflow//tensorflow:freebsd": cmake_var_string( - dict_add( + _dict_add( cmake_vars, llvm_target_cmake_vars("X86", "x86_64-unknown-freebsd"), posix_cmake_vars, ), ), "@org_tensorflow//tensorflow:linux_s390x": cmake_var_string( - dict_add( + _dict_add( cmake_vars, llvm_target_cmake_vars("SystemZ", "systemz-unknown-linux_gnu"), posix_cmake_vars, @@ -343,7 +343,7 @@ llvm_all_cmake_vars = select({ ), ), "//conditions:default": cmake_var_string( - dict_add( + _dict_add( cmake_vars, llvm_target_cmake_vars("X86", "x86_64-unknown-linux_gnu"), posix_cmake_vars, From ce41ea78005d839d25b2b4a7da03b40f3545352a Mon Sep 17 00:00:00 2001 From: mdfaijul Date: Fri, 9 Oct 2020 09:41:17 -0700 Subject: [PATCH 0111/1447] Addressed review comments. --- .../core/kernels/mkl/mkl_batch_matmul_op.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc index 66903d8ff7a..c56aa73b7ce 100644 --- a/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc +++ b/tensorflow/core/kernels/mkl/mkl_batch_matmul_op.cc @@ -19,7 +19,7 @@ limitations under the License. // Multiplication (MatMul) operations. We currently register this kernel only // for oneDNN supported data types (float, bfloat16). 
The maximum number of // dimensions (rank) for output tensor is 12 in oneDNN. If output tensor rank -// exceeds 12, we fallback to Eigen library based kernel. +// exceeds 12, we fall back to Eigen library based kernel. #define EIGEN_USE_THREADS @@ -119,11 +119,11 @@ class BatchMatMulMkl : public OpKernel { out_shape.AddDim(lhs_rows); out_shape.AddDim(rhs_cols); // The maximum number of dimensions for a tensor in DNNL is 12. - OP_REQUIRES(ctx, out_shape.dims() <= 12, - errors::InvalidArgument( - "Rank of output tensor is required as <= 12, ", "but is ", - out_shape.dims(), ". Current implementation supports upto ", - "rank 12 tensors.")); + OP_REQUIRES( + ctx, out_shape.dims() <= 12, + errors::InvalidArgument( + "Rank of output tensor must be <= 12, but is ", out_shape.dims(), + ". Current implementation supports upto rank 12 tensors.")); Tensor* out = nullptr; OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out)); @@ -156,7 +156,7 @@ class BatchMatMulMkl : public OpKernel { using dims = dnnl::memory::dims; - // This method makes the rank (ndims) of input same as the output by creating + // This method makes the rank (ndims) of input same as the output by adding // new axes to the input. For example, if input shape is [a, b, c, d] and // output shape is [e, f, g, h, i, j], then the reshaped input would have a // shape of [1, 1, a, b, c, d]. @@ -188,7 +188,7 @@ class BatchMatMulMkl : public OpKernel { // Create dnnl::memory::dims for inputs and output of same rank. // It is assumed here that MatMulBCast object creates output_batch_shape as // a conforming superset of input batch shapes, i.e., ndims_out >= - // ndims_lhs and ndims_out >= ndims_lhs. + // ndims_lhs and ndims_out >= ndims_rhs. 
if (ndims_lhs < ndims_out) { ExpandInputDimsToOutputShape(lhs_shape, out_shape, &lhs_dims); } From 60e1ac246f95708ffbda3e3e9fbaa2c0d980fcc8 Mon Sep 17 00:00:00 2001 From: xiaohong1031 Date: Fri, 9 Oct 2020 10:12:46 -0700 Subject: [PATCH 0112/1447] minor change per code review suggestion --- .../core/common_runtime/mkl_layout_pass.cc | 75 +++++++++---------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/tensorflow/core/common_runtime/mkl_layout_pass.cc b/tensorflow/core/common_runtime/mkl_layout_pass.cc index 977ce9d06da..27e83f42e13 100644 --- a/tensorflow/core/common_runtime/mkl_layout_pass.cc +++ b/tensorflow/core/common_runtime/mkl_layout_pass.cc @@ -1116,7 +1116,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // merged with 'm'. If input 'm' is Conv2D, then check if there exists BiasAdd // node that can be merged with 'm'. static Node* GetConv2DOrBiasAdd(const Node* m) { - CHECK_NOTNULL(m); + DCHECK(m); Node* n = nullptr; DataType T_m; @@ -1283,7 +1283,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // So 1st input of BiasAddGrad connects with 3rd input of // Conv2DBackpropFilter and vice versa. static Node* GetConv2DBackpropFilterOrBiasAddGrad(const Node* m) { - CHECK_NOTNULL(m); + DCHECK(m); Node* n = nullptr; DataType T_m; @@ -1537,7 +1537,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // @return - true (if it is not a depth/batch wise pooling case); // false otherwise. static bool NonDepthBatchWisePoolRewrite(const Node* n) { - CHECK_NOTNULL(n); + DCHECK(n); string data_format_str; TensorFormat data_format; @@ -1564,7 +1564,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { // and use default Eigen. But for depth_radius=2, MKL DNN optimized // path is taken, i.e., eigen node is rewritten by MKl DNN node. 
static bool LrnRewrite(const Node* n) { - CHECK_NOTNULL(n); + DCHECK(n); int depth_radius; TF_CHECK_OK(GetNodeAttr(n->def(), "depth_radius", &depth_radius)); @@ -1582,7 +1582,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { } static bool LrnGradRewrite(const Node* n) { - CHECK_NOTNULL(n); + DCHECK(n); bool do_rewrite = false; for (const Edge* e : n->in_edges()) { @@ -1676,8 +1676,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass { } return true; } + static bool MaxpoolGradRewrite(const Node* n) { - CHECK_NOTNULL(n); + DCHECK(n); bool do_rewrite = false; for (const Edge* e : n->in_edges()) { // Rewrite only if there is corresponding Maxpool, i.e workspace is @@ -1695,8 +1696,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass { } static bool Maxpool3DGradRewrite(const Node* n) { - CHECK_NOTNULL(n); - bool do_rewrite = false; + DCHECK(n); for (const Edge* e : n->in_edges()) { // Rewrite only if there is corresponding Maxpool3D, i.e., workspace is // available @@ -1705,11 +1705,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass { e->src()->type_string() == mkl_op_registry::GetMklOpName(csinfo_.max_pool3d) && e->src_output() == 0) { - do_rewrite = true; - break; + return true; } } - return do_rewrite; + return false; } static bool FusedBatchNormExRewrite(const Node* n) { @@ -2065,7 +2064,7 @@ void MklLayoutRewritePass::GetNodesProducingTFTensorList( int list_length, std::vector* output_nodes) { CHECK_LT(*input_idx, inputs.size()); CHECK_GT(list_length, 0); - CHECK_NOTNULL(output_nodes); + DCHECK(output_nodes); output_nodes->reserve(list_length); while (list_length != 0) { @@ -2102,7 +2101,7 @@ void MklLayoutRewritePass::GetDummyMklTensorNode(std::unique_ptr* g, // device of the original // node. 
.Finalize(&**g, out)); - CHECK_NOTNULL(*out); // Make sure we got a valid object before using it + DCHECK(*out); // Make sure we got a valid object before using it // If number of inputs to the original node is > 0, then we add // control dependency between 1st input (index 0) of the original node and @@ -2130,7 +2129,7 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( int list_length, std::vector* output_nodes) { CHECK_LT(*input_idx, inputs.size()); CHECK_GT(list_length, 0); - CHECK_NOTNULL(output_nodes); + DCHECK(output_nodes); output_nodes->reserve(list_length); while (list_length != 0) { @@ -2158,9 +2157,9 @@ void MklLayoutRewritePass::GetNodesProducingMklTensorList( void MklLayoutRewritePass::GetNodeProducingMklTensor( std::unique_ptr* g, const Node* orig_node, Node* n, int n_output_slot, Node** mkl_node, int* mkl_node_output_slot) { - CHECK_NOTNULL(n); - CHECK_NOTNULL(mkl_node); - CHECK_NOTNULL(mkl_node_output_slot); + DCHECK(n); + DCHECK(mkl_node); + DCHECK(mkl_node_output_slot); // If this is an MKL op, then it will create extra output for MKL layout. DataType T; @@ -2179,7 +2178,7 @@ void MklLayoutRewritePass::GetNodeProducingMklTensor( // DummyMklTensor node has no input and generates only 1 output // (dummy Mkl tensor) as output slot number 0. 
GetDummyMklTensorNode(g, mkl_node, orig_node); - CHECK_NOTNULL(*mkl_node); + DCHECK(*mkl_node); *mkl_node_output_slot = 0; } } @@ -2190,7 +2189,7 @@ int MklLayoutRewritePass::SetUpContiguousInputs( NodeBuilder* nb, const Node* old_node, std::vector* workspace_tensors, bool are_workspace_tensors_available) { - CHECK_NOTNULL(workspace_tensors); + DCHECK(workspace_tensors); CHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS); // TODO(nhasabni): Temporary solution to connect filter input of @@ -2208,7 +2207,7 @@ int MklLayoutRewritePass::SetUpContiguousInputs( Node* filter_node = nullptr; TF_CHECK_OK(old_node->input_node(kConv2DBackpropInputFilterInputSlotIdx, &filter_node)); - CHECK_NOTNULL(filter_node); + DCHECK(filter_node); // Now check which nodes receive from filter_node. Filter feeds as // 2nd input (slot 1) of _MklConv2D, _MklConv2DWithBias, and @@ -2458,7 +2457,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( std::unique_ptr* g, const Node* orig_node, NodeBuilder* nb, std::vector* ws_tensors, bool* are_ws_tensors_added) { bool workspace_edge_added = false; // Default initializer - CHECK_NOTNULL(are_ws_tensors_added); + DCHECK(are_ws_tensors_added); *are_ws_tensors_added = false; // Default initializer DataType T; @@ -2513,7 +2512,7 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( mkl_op_registry::GetMklOpName(ws.fwd_op) && e->dst_input() == ws.bwd_slot) { nb->Attr("workspace_enabled", true); - CHECK_NOTNULL(ws_tensors); + DCHECK(ws_tensors); // Add workspace edge between fwd op and bwd op. ws_tensors->push_back(NodeBuilder::NodeOut(e->src(), ws.ws_fwd_slot)); // Add Mkl tensor edge for workspace edge between fwd op and bwd op. 
@@ -2543,9 +2542,9 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded( Node* dmt_mkl_ws = nullptr; // Dummy Mkl tensor for workspace GetDummyWorkspaceTensorNode(g, &dmt_ws, orig_node); GetDummyMklTensorNode(g, &dmt_mkl_ws, orig_node); - CHECK_NOTNULL(dmt_ws); - CHECK_NOTNULL(dmt_mkl_ws); - CHECK_NOTNULL(ws_tensors); + DCHECK(dmt_ws); + DCHECK(dmt_mkl_ws); + DCHECK(ws_tensors); // We add dummy tensor as workspace tensor. ws_tensors->push_back(NodeBuilder::NodeOut(dmt_ws, 0)); // We add dummy tensor as Mkl tensor for workspace tensor. @@ -3207,8 +3206,8 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr* g, // BiasAdd has only 1 output (at slot 0) and merged node also has only 1 // output (at slot 0). const int kConv2DWithBiasOutputSlot = 0; - CHECK_NOTNULL((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot, e->dst(), - e->dst_input())); + DCHECK((*g)->AddEdge(new_node, kConv2DWithBiasOutputSlot, e->dst(), + e->dst_input())); } } @@ -3501,8 +3500,8 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad( (*g)->AddControlEdge(new_node, e->dst(), true); } } else { - CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeBiasGradOutputIdx, - e->dst(), e->dst_input())); + DCHECK((*g)->AddEdge(new_node, kMergedNodeBiasGradOutputIdx, + e->dst(), e->dst_input())); } } unique_node.clear(); @@ -3515,8 +3514,8 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad( (*g)->AddControlEdge(new_node, e->dst(), true); } } else { - CHECK_NOTNULL((*g)->AddEdge(new_node, kMergedNodeFilterGradOutputIdx, - e->dst(), e->dst_input())); + DCHECK((*g)->AddEdge(new_node, kMergedNodeFilterGradOutputIdx, + e->dst(), e->dst_input())); } } @@ -3537,8 +3536,8 @@ Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad( Status MklLayoutRewritePass::MergeNode(std::unique_ptr* g, Node* m, Node* n) { - CHECK_NOTNULL(m); - CHECK_NOTNULL(n); + DCHECK(m); + DCHECK(n); if (((m->type_string() == csinfo_.bias_add && n->type_string() == 
csinfo_.conv2d)) || @@ -3644,7 +3643,7 @@ Status MklLayoutRewritePass::RewriteNodeForLayoutPropagation( (*g)->AddControlEdge(*new_node, e->dst(), true); } } else { - CHECK_NOTNULL((*g)->AddEdge( + DCHECK((*g)->AddEdge( *new_node, GetTensorDataIndex(e->src_output(), e->src()->num_outputs()), e->dst(), e->dst_input())); @@ -3792,7 +3791,7 @@ MklLayoutRewritePass::CheckForQuantizedNodeRewrite(const Node* n) const { const MklLayoutRewritePass::RewriteInfo* MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const { - CHECK_NOTNULL(n); + DCHECK(n); // QuantizedOps may have attributes other than "T", so decoupled the check // with a function, CheckForQuantizedNodeRewrite(const Node*). @@ -4011,8 +4010,8 @@ bool MklLayoutRewritePass::FixMklMetaDataEdgeIfNeeded(std::unique_ptr* g, if (IsConstant(e_metadata->src())) { Node* e_metadata_dst = e_metadata->dst(); int e_metadata_in_slot = e_metadata->dst_input(); - CHECK_NOTNULL((*g)->AddEdge(n_data, n_metadata_op_slot, e_metadata_dst, - e_metadata_in_slot)); + DCHECK((*g)->AddEdge(n_data, n_metadata_op_slot, e_metadata_dst, + e_metadata_in_slot)); (*g)->RemoveEdge(e_metadata); return true; @@ -4084,7 +4083,7 @@ bool MklLayoutRewritePass::FixMklMetaDataEdges(std::unique_ptr* g, bool MklLayoutRewritePass::RunPass(std::unique_ptr* g) { bool result = false; - CHECK_NOTNULL(g); + DCHECK(g); DumpGraph("Before running MklLayoutRewritePass", &**g); From 7699824d80ba7e8f8b2653b965fbbec8ce65e789 Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 30 Sep 2020 19:51:15 +0000 Subject: [PATCH 0113/1447] Expose TF_RegisterFilesystemPlugin C API This PR is part of the effort for modular file system. In modular file system `RegisterFilesystemPlugin` was a C++ API and there is no directly way to call this API through python. This PR exposes C API `TF_RegisterFilesystemPlugin`, so that it can be used in python bindings. 
Signed-off-by: Yong Tang --- tensorflow/c/BUILD | 1 + tensorflow/c/c_api.cc | 11 +++++++++++ tensorflow/c/c_api.h | 7 +++++++ 3 files changed, 19 insertions(+) diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 677ab3355ff..ba76e2f6705 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -217,6 +217,7 @@ tf_cuda_library( "//tensorflow/core:lib_internal", "//tensorflow/core/distributed_runtime:server_lib", "//tensorflow/core/kernels:logging_ops", + "//tensorflow/c/experimental/filesystem:modular_filesystem", ], }), alwayslink = 1, diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index a03e9227a75..9579efab94d 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/core/platform/platform.h" // NOLINT #if !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) +#include "tensorflow/c/experimental/filesystem/modular_filesystem.h" #include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/framework/ops.h" #include "tensorflow/cc/framework/scope_internal.h" @@ -2606,4 +2607,14 @@ void TF_RegisterLogListener(void (*listener)(const char*)) { #endif // !defined(IS_MOBILE_PLATFORM) && !defined(IS_SLIM_BUILD) } +void TF_RegisterFilesystemPlugin(const char* plugin_filename, + TF_Status* status) { +#if defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD) + status->status = tensorflow::errors::Unimplemented( + "FileSystem plugin functionality is not supported on mobile"); +#else + status->status = tensorflow::RegisterFilesystemPlugin(plugin_filename); +#endif // defined(IS_MOBILE_PLATFORM) || defined(IS_SLIM_BUILD) +} + } // end extern "C" diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h index db5f8fd68f8..3f25f7ec10e 100644 --- a/tensorflow/c/c_api.h +++ b/tensorflow/c/c_api.h @@ -1577,6 +1577,13 @@ TF_CAPI_EXPORT extern void TF_DeleteServer(TF_Server* server); TF_CAPI_EXPORT extern void TF_RegisterLogListener( void (*listener)(const char*)); +// Register a 
FileSystem plugin from filename `plugin_filename`. +// +// On success, place OK in status. +// On failure, place an error status in status. +TF_CAPI_EXPORT extern void TF_RegisterFilesystemPlugin(const char* plugin_filename, + TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif From bcf7e85b0c61b156f02594fff3c8041edda8d82a Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Wed, 30 Sep 2020 19:54:57 +0000 Subject: [PATCH 0114/1447] Expose Python binding of TF_RegisterFilesystemPlugin and tf.experimental.register_filesystem_plugin This PR is parf of modular file system to expose the python binding of C API `TF_RegisterFilesystemPlugin`. In addition, an experimental API `tf.experimental.register_filesystem_plugin` has been setup so that it is possible to register a modular file system plugin with: ``` tf.experimental.register_filesystem_plugin(plugin_path) ``` Signed-off-by: Yong Tang --- .../python/client/tf_session_wrapper.cc | 9 +++++++ tensorflow/python/framework/load_library.py | 25 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tensorflow/python/client/tf_session_wrapper.cc b/tensorflow/python/client/tf_session_wrapper.cc index ac656d322c4..cfbe6cbe42a 100644 --- a/tensorflow/python/client/tf_session_wrapper.cc +++ b/tensorflow/python/client/tf_session_wrapper.cc @@ -1155,6 +1155,15 @@ PYBIND11_MODULE(_pywrap_tf_session, m) { return "TensorHandle"; }); + m.def( + "TF_RegisterFilesystemPlugin", + [](const char* plugin_filename) { + tensorflow::Safe_TF_StatusPtr status = + tensorflow::make_safe(TF_NewStatus()); + TF_RegisterFilesystemPlugin(plugin_filename, status.get()); + tensorflow::MaybeRaiseRegisteredFromTFStatus(status.get()); + }); + py::enum_(m, "TF_DataType") .value("TF_FLOAT", TF_FLOAT) .value("TF_DOUBLE", TF_DOUBLE) diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py index f37b48e76c2..a8c7a4e4185 100644 --- a/tensorflow/python/framework/load_library.py +++ 
b/tensorflow/python/framework/load_library.py @@ -157,3 +157,28 @@ def load_library(library_location): errno.ENOENT, 'The file or folder to load kernel libraries from does not exist.', library_location) + + +@tf_export('experimental.register_filesystem_plugin') +def register_filesystem_plugin(plugin_location): + """Loads a TensorFlow FileSystem plugin. + + Args: + plugin_location: Path to the plugin. + Relative or absolute filesystem plugin path to a dynamic library file. + + Returns: + None + + Raises: + OSError: When the file to be loaded is not found. + RuntimeError: when unable to load the library. + """ + if os.path.exists(plugin_location): + py_tf.TF_RegisterFilesystemPlugin(plugin_location) + + else: + raise OSError( + errno.ENOENT, + 'The file to load file system plugin from does not exist.', + plugin_location) From b5126cbfb26350996d1c3f28931803ec42df9bd4 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Fri, 9 Oct 2020 12:34:39 -0700 Subject: [PATCH 0115/1447] Fix case when target and host_os are the same. --- tensorflow/lite/micro/tools/make/Makefile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index d59ba923f56..73ef25220b0 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -356,8 +356,11 @@ $(eval $(call add_third_party_download,$(PERSON_MODEL_URL),$(PERSON_MODEL_MD5),p $(eval $(call add_third_party_download,$(PERSON_MODEL_INT8_URL),$(PERSON_MODEL_INT8_MD5),person_model_int8,)) # The target-specific makefile must have a name that is exactly -# TARGET_makefile.inc +# TARGET_makefile.inc and is only needed for cross-compilation (i.e. when TARGET +# is different from the HOST_OS). +ifneq ($(TARGET),$(HOST_OS)) include $(MAKEFILE_DIR)/targets/$(TARGET)_makefile.inc +endif # Load dependencies for optimized kernel implementations. 
include $(wildcard $(MAKEFILE_DIR)/ext_libs/*.inc) From 7c991e6c6472430e414622058ab8ad6439aeefb4 Mon Sep 17 00:00:00 2001 From: Advait Jain Date: Fri, 9 Oct 2020 13:02:35 -0700 Subject: [PATCH 0116/1447] fix the arduino build. --- tensorflow/lite/micro/tools/make/Makefile | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/micro/tools/make/Makefile b/tensorflow/lite/micro/tools/make/Makefile index 73ef25220b0..fa1aa3f0baf 100644 --- a/tensorflow/lite/micro/tools/make/Makefile +++ b/tensorflow/lite/micro/tools/make/Makefile @@ -359,7 +359,13 @@ $(eval $(call add_third_party_download,$(PERSON_MODEL_INT8_URL),$(PERSON_MODEL_I # TARGET_makefile.inc and is only needed for cross-compilation (i.e. when TARGET # is different from the HOST_OS). ifneq ($(TARGET),$(HOST_OS)) -include $(MAKEFILE_DIR)/targets/$(TARGET)_makefile.inc + # The arduino is also special in that it does not have an arduino_makefile but + # the TARGET=arduino is still used to create a directory for the generated + # artifacts. We are using a workaround right now and will be separating the + # project generation from the Makefile in the future. + ifneq ($(TARGET),arduino) + include $(MAKEFILE_DIR)/targets/$(TARGET)_makefile.inc + endif endif # Load dependencies for optimized kernel implementations. From 8754ae7756fb89b8ff675021fc636029a3722a4a Mon Sep 17 00:00:00 2001 From: "ag.ramesh" Date: Fri, 9 Oct 2020 17:39:18 -0700 Subject: [PATCH 0117/1447] More changes based on review comments. 
--- third_party/llvm-openmp/BUILD | 19 +++++++++++++++---- third_party/mkl/BUILD | 3 +++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/third_party/llvm-openmp/BUILD b/third_party/llvm-openmp/BUILD index 759e91e60fc..dcfb6d8ff8d 100644 --- a/third_party/llvm-openmp/BUILD +++ b/third_party/llvm-openmp/BUILD @@ -13,7 +13,7 @@ load( ) genrule( - name = "kmp_il8n_id", + name = "kmp_i18n_id", srcs = [ "runtime/tools/message-converter.pl", "runtime/src/i18n/en_US.txt", @@ -23,7 +23,7 @@ genrule( ) genrule( - name = "kmp_il8n_default", + name = "kmp_i18n_default", srcs = [ "runtime/tools/message-converter.pl", "runtime/src/i18n/en_US.txt", @@ -69,6 +69,17 @@ expand_cmake_vars( dst = "include/omp.h", ) +# TODO(Intel-tf) Replace the following cc_binary call with cc_library. +# cc_library should be used for files that are not independently executed. Using +# cc_library here results in the following linking errors. +# ERROR: //tensorflow/BUILD:689:1: Linking of rule '//tensorflow:libtensorflow_framework.so.2.4.0' failed (Exit 1) +# /usr/bin/ld.gold: error: symbol GOMP_parallel_loop_nonmonotonic_guided has undefined version VERSION +# /usr/bin/ld.gold: error: symbol GOMP_parallel_start has undefined version GOMP_1.0 +# /usr/bin/ld.gold: error: symbol GOMP_cancellation_point has undefined version GOMP_4.0 +# /usr/bin/ld.gold: error: symbol omp_set_num_threads has undefined version OMP_1.0 +# ...... +# ...... 
+ cc_binary( name = "libiomp5.so", srcs = glob([ @@ -107,8 +118,8 @@ cc_binary( ]) + [ ":config_kmp", ":config_omp", - ":kmp_il8n_id", - ":kmp_il8n_default", + ":kmp_i18n_id", + ":kmp_i18n_default", ":ldscript", ], copts = ["-Domp_EXPORTS -D_GNU_SOURCE -D_REENTRANT"], diff --git a/third_party/mkl/BUILD b/third_party/mkl/BUILD index 5cee0514a06..8b9e0e6b0bc 100644 --- a/third_party/mkl/BUILD +++ b/third_party/mkl/BUILD @@ -47,6 +47,9 @@ filegroup( visibility = ["//visibility:public"], ) +# TODO(Intel-tf) Remove the following call to cc_library and replace all uses +# of mkl_libs_linux with @llvm-openmp//:libiomp5.so directly. + cc_library( name = "mkl_libs_linux", srcs = [ From 005269b40e6eb3d6c17f4ea2759c2a86beaa27cf Mon Sep 17 00:00:00 2001 From: kushanam Date: Fri, 9 Oct 2020 23:54:47 -0700 Subject: [PATCH 0118/1447] Revert "adding device to base iterator" This reverts commit 6fb1e4e4259333f315094d3ceb92ea3cde7c4174. --- tensorflow/python/distribute/input_lib.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 991b3d269ee..36e42bf2a6b 100644 --- a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -589,8 +589,7 @@ class DistributedIteratorBase(DistributedIteratorInterface): # pylint: disable=super-init-not-called def __init__(self, input_workers, iterators, strategy, - enable_get_next_as_optional, - replication_mode=InputReplicationMode.PER_WORKER): + enable_get_next_as_optional): assert isinstance(input_workers, InputWorkers) if not input_workers.worker_devices: raise ValueError("Should have at least one worker for input iterator.") @@ -599,7 +598,6 @@ class DistributedIteratorBase(DistributedIteratorInterface): self._input_workers = input_workers self._strategy = strategy self._enable_get_next_as_optional = enable_get_next_as_optional - self._replication_mode = replication_mode def 
next(self): return self.__next__() @@ -629,16 +627,12 @@ class DistributedIteratorBase(DistributedIteratorInterface): if not self._enable_get_next_as_optional: replicas = [] for i, worker in enumerate(self._input_workers.worker_devices): - if self._replication_mode == InputReplicationMode.PER_WORKER: - worker_device = worker - else: - worker_device = self._input_workers._worker_device_pairs[i][1][0] if name is not None: - d = tf_device.DeviceSpec.from_string(worker_device) + d = tf_device.DeviceSpec.from_string(worker) new_name = "%s_%s_%d" % (name, d.job, d.task) else: new_name = None - with ops.device(worker_device): + with ops.device(worker): # Make `replicas` a flat list of values across all replicas. replicas.extend( self._iterators[i].get_next_as_list_static_shapes(new_name)) @@ -849,8 +843,7 @@ class DistributedIterator(DistributedIteratorBase, strategy=None, components=None, element_spec=None, - enable_get_next_as_optional=False, - replication_mode=InputReplicationMode.PER_WORKER): + enable_get_next_as_optional=False): if input_workers is None: raise ValueError("`input_workers` should be " "provided.") @@ -867,14 +860,13 @@ class DistributedIterator(DistributedIteratorBase, self._iterators = components self._strategy = strategy self._enable_get_next_as_optional = enable_get_next_as_optional - self._replication_mode = replication_mode else: if (components is not None and element_spec is not None): raise ValueError(error_message) super(DistributedIterator, self).__init__(input_workers, iterators, strategy, - enable_get_next_as_optional, replication_mode) + enable_get_next_as_optional) @property def element_spec(self): @@ -1188,7 +1180,7 @@ class DistributedDatasetsFromFunction(_IterableInput): self._strategy) else: iterator = DistributedIterator(self._input_workers, iterators, - self._strategy, self._replication_mode) + self._strategy) iterator._element_spec = self._element_spec # pylint: disable=protected-access # When async eager is enabled, sometimes the 
iterator may not finish @@ -1583,7 +1575,6 @@ class _SingleWorkerOwnedDatasetIterator(_SingleWorkerDatasetIteratorBase, _SingleWorkerOwnedDatasetIterator from. element_spec: A nested structure of `TypeSpec` objects that represents the type specification of elements of the iterator. - replication_mode: an enum value of `tf.distribute.InputReplicationMode`. """ if worker is None or devices is None: raise ValueError("Both `worker` and `devices` should be provided") From 41597b290a30cbb488e569fb38b5abeb19fb95f2 Mon Sep 17 00:00:00 2001 From: kushanam Date: Sat, 10 Oct 2020 00:59:43 -0700 Subject: [PATCH 0119/1447] apply review changes p1 --- .../collective_all_reduce_strategy.py | 2 +- tensorflow/python/distribute/input_lib.py | 59 ++++++------------- .../python/distribute/mirrored_strategy.py | 4 ++ .../python/distribute/one_device_strategy.py | 2 +- .../distribute/parameter_server_strategy.py | 2 +- tensorflow/python/distribute/tpu_strategy.py | 2 +- 6 files changed, 25 insertions(+), 46 deletions(-) diff --git a/tensorflow/python/distribute/collective_all_reduce_strategy.py b/tensorflow/python/distribute/collective_all_reduce_strategy.py index 7bca1810bfd..fc2b286c137 100644 --- a/tensorflow/python/distribute/collective_all_reduce_strategy.py +++ b/tensorflow/python/distribute/collective_all_reduce_strategy.py @@ -487,7 +487,7 @@ class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended): if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: raise NotImplementedError("InputReplicationMode.PER_REPLICA " "is only supported in `experimental_distribute_datasets_from_function` " - "of mirrored_strategy") + "of tf.distribute.MirroredStrategy") input_context = self._make_input_context() return input_lib.get_distributed_datasets_from_function( dataset_fn=dataset_fn, diff --git a/tensorflow/python/distribute/input_lib.py b/tensorflow/python/distribute/input_lib.py index 36e42bf2a6b..c4d06758625 100644 --- 
a/tensorflow/python/distribute/input_lib.py +++ b/tensorflow/python/distribute/input_lib.py @@ -1147,16 +1147,15 @@ class DistributedDatasetsFromFunction(_IterableInput): "input_contexts (%d)" % (input_workers.num_workers, len(input_contexts))) - self._dataset_fn = dataset_fn self._input_workers = input_workers self._input_contexts = input_contexts self._strategy = strategy self._replication_mode = replication_mode self._datasets, element_spec = ( - _create_datasets_per_worker_with_input_context(self._input_contexts, - self._input_workers, - dataset_fn, - self._replication_mode)) + _create_datasets_from_function_with_input_context(self._input_contexts, + self._input_workers, + dataset_fn, + self._replication_mode)) self._enable_get_next_as_optional = _enable_get_next_as_optional( self._strategy, element_spec) self._element_spec = _create_distributed_tensor_spec( @@ -1164,7 +1163,7 @@ class DistributedDatasetsFromFunction(_IterableInput): def __iter__(self): if (ops.executing_eagerly_outside_functions() or - ops.get_default_graph().building_function): + ops.get_default_graph().building_function): # This is an optional flag that can be used to turn off using # OwnedMultiDeviceIterators and instead use the legacy # MultiDeviceIterators as a stop gap solution that will allow us to roll @@ -1176,8 +1175,10 @@ class DistributedDatasetsFromFunction(_IterableInput): enable_legacy_iterators, self._replication_mode) if enable_legacy_iterators: - iterator = DistributedIteratorV1(self._input_workers, iterators, - self._strategy) + iterator = DistributedIteratorV1(self._input_workers, + iterators, + self._strategy, + enable_get_next_as_optional=self._enable_get_next_as_optional) else: iterator = DistributedIterator(self._input_workers, iterators, self._strategy) @@ -1415,8 +1416,7 @@ def _recover_shape_fn(data, value_structure): class _SingleWorkerDatasetIteratorBase(object): """Iterator for a single `tf.data.Dataset`.""" - def __init__(self, dataset, worker, devices, - 
replication_mode=InputReplicationMode.PER_WORKER): + def __init__(self, dataset, worker, devices): """Create iterator for the `dataset` to fetch data to worker's `devices` . A `MultiDeviceIterator` or `OwnedMultiDeviceIterator` is used to prefetch @@ -1431,7 +1431,6 @@ class _SingleWorkerDatasetIteratorBase(object): self._worker = worker self._devices = devices self._element_spec = dataset.element_spec - self._replication_mode = replication_mode self._make_iterator() def _make_iterator(self): @@ -1575,6 +1574,7 @@ class _SingleWorkerOwnedDatasetIterator(_SingleWorkerDatasetIteratorBase, _SingleWorkerOwnedDatasetIterator from. element_spec: A nested structure of `TypeSpec` objects that represents the type specification of elements of the iterator. + replication_mode: Replication mode for the input function. """ if worker is None or devices is None: raise ValueError("Both `worker` and `devices` should be provided") @@ -1582,6 +1582,7 @@ class _SingleWorkerOwnedDatasetIterator(_SingleWorkerDatasetIteratorBase, error_message = ("Either `dataset` or both `components` and `element_spec` " "need to be provided.") + self._replication_mode = replication_mode if dataset is None: if (components is None or element_spec is None): raise ValueError(error_message) @@ -1589,14 +1590,12 @@ class _SingleWorkerOwnedDatasetIterator(_SingleWorkerDatasetIteratorBase, self._worker = worker self._devices = devices self._iterator = components[0] - self._replication_mode = replication_mode else: if (components is not None or element_spec is not None): raise ValueError(error_message) super(_SingleWorkerOwnedDatasetIterator, self).__init__(dataset=dataset, worker=worker, - devices=devices, - replication_mode=replication_mode) + devices=devices) def _make_iterator(self): """Make appropriate iterator on the dataset.""" @@ -1698,17 +1697,6 @@ class _SingleWorkerDatasetIterator(_SingleWorkerDatasetIteratorBase): return dataset_ops.get_legacy_output_types(self._iterator) -class 
_SingleReplicaDatasetIterator(_SingleWorkerOwnedDatasetIterator): - def __init__(self, dataset, device): - super(_SingleReplicaDatasetIterator, self).__init__(dataset, device, []) - - def _make_iterator(self): - """Make appropriate iterator on the dataset.""" - with ops.device(self._worker): - self._iterator = iter(self._dataset) - - - class _SingleWorkerCallableIterator(object): """Iterator for a single tensor-returning callable.""" @@ -1742,19 +1730,6 @@ class _SingleWorkerCallableIterator(object): return [] -def _create_iterators_per_replica(input_contexts, input_workers, - dataset_fn): - """Create a multidevice iterator per workers given a dataset function.""" - iterators = [] - for i, ctx in enumerate(input_contexts): - devices = input_workers.compute_devices_for_worker(i) - dataset = dataset_fn(ctx) - # Wrapping dataset here (ex. applying options) might result in moving it to the CPU - iterator = _SingleReplicaDatasetIterator(dataset, devices[0]) - iterators.append(iterator) - return iterators - - def _create_iterators_per_worker(worker_datasets, input_workers, enable_legacy_iterators, replication_mode=InputReplicationMode.PER_WORKER): @@ -1777,10 +1752,10 @@ def _create_iterators_per_worker(worker_datasets, input_workers, return iterators -def _create_datasets_per_worker_with_input_context(input_contexts, - input_workers, - dataset_fn, - replication_mode): +def _create_datasets_from_function_with_input_context(input_contexts, + input_workers, + dataset_fn, + replication_mode): """Create device datasets per worker given a dataset function.""" datasets = [] for i, ctx in enumerate(input_contexts): diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py index f75d0ddc3e9..0eef2849b17 100644 --- a/tensorflow/python/distribute/mirrored_strategy.py +++ b/tensorflow/python/distribute/mirrored_strategy.py @@ -234,7 +234,9 @@ class MirroredStrategy(distribute_lib.Strategy): the replicas when writing your own 
training loop. If you are using `.fit` and `.compile` methods available in `tf.keras`, then `tf.keras` will handle the distribution for you. + For example: + ```python my_strategy = tf.distribute.MirroredStrategy() with my_strategy.scope(): @@ -243,12 +245,14 @@ class MirroredStrategy(distribute_lib.Strategy): def replica_fn(input): # process input and return result return result + total_result = 0 for x in dataset: per_replica_result = my_strategy.run(replica_fn, args=(x,)) total_result += my_strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_result, axis=None) return total_result + dist_dataset = my_strategy.experimental_distribute_dataset(dataset) for _ in range(EPOCHS): train_result = distribute_train_epoch(dist_dataset) diff --git a/tensorflow/python/distribute/one_device_strategy.py b/tensorflow/python/distribute/one_device_strategy.py index 003a24c2b6e..4c6b25c700b 100644 --- a/tensorflow/python/distribute/one_device_strategy.py +++ b/tensorflow/python/distribute/one_device_strategy.py @@ -325,7 +325,7 @@ class OneDeviceExtended(distribute_lib.StrategyExtendedV1): if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: raise NotImplementedError("InputReplicationMode.PER_REPLICA " "is only supported in `experimental_distribute_datasets_from_function` " - "of mirrored_strategy") + "of tf.distribute.MirroredStrategy") return input_lib.get_distributed_datasets_from_function( dataset_fn, self._input_workers_with_options(options), diff --git a/tensorflow/python/distribute/parameter_server_strategy.py b/tensorflow/python/distribute/parameter_server_strategy.py index 4b07980e8ec..d7d937cbb45 100644 --- a/tensorflow/python/distribute/parameter_server_strategy.py +++ b/tensorflow/python/distribute/parameter_server_strategy.py @@ -136,7 +136,7 @@ class ParameterServerStrategy(distribute_lib.Strategy): if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: raise 
NotImplementedError("InputReplicationMode.PER_REPLICA " "is only supported in `experimental_distribute_datasets_from_function` " - "of mirrored_strategy") + "of tf.distribute.MirroredStrategy") self._raise_pss_error_if_eager() super(ParameterServerStrategy, self).distribute_datasets_from_function( dataset_fn=dataset_fn, options=options) diff --git a/tensorflow/python/distribute/tpu_strategy.py b/tensorflow/python/distribute/tpu_strategy.py index a2cf20e5c39..9ae87885f60 100644 --- a/tensorflow/python/distribute/tpu_strategy.py +++ b/tensorflow/python/distribute/tpu_strategy.py @@ -819,7 +819,7 @@ class TPUExtended(distribute_lib.StrategyExtendedV1): if options and options.replication_mode == distribute_lib.InputReplicationMode.PER_REPLICA: raise NotImplementedError("InputReplicationMode.PER_REPLICA " "is only supported in `experimental_distribute_datasets_from_function` " - "of mirrored_strategy") + "of tf.distribute.MirroredStrategy") input_workers = self._get_input_workers(options) input_contexts = [] num_workers = input_workers.num_workers From cc30517761e97e7448d438d1ecd763af0a05929b Mon Sep 17 00:00:00 2001 From: Yong Tang Date: Sat, 10 Oct 2020 15:40:47 +0000 Subject: [PATCH 0120/1447] Update API golden to expose tf.experimental.register_filesystem_plugin Signed-off-by: Yong Tang --- tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt | 4 ++++ tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt index c3a84b15dd6..dc60db610ff 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.experimental.pbtxt @@ -20,4 +20,8 @@ tf_module { name: "output_all_intermediates" argspec: "args=[\'state\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "register_filesystem_plugin" + argspec: 
"args=[\'plugin_location\'], varargs=None, keywords=None, defaults=None" + } } diff --git a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt index 58384846276..33c28d715a5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.experimental.pbtxt @@ -28,4 +28,8 @@ tf_module { name: "function_executor_type" argspec: "args=[\'executor_type\'], varargs=None, keywords=None, defaults=None" } + member_method { + name: "register_filesystem_plugin" + argspec: "args=[\'plugin_location\'], varargs=None, keywords=None, defaults=None" + } } From 8897653779b54b220885f50964d1a29f364c3ba3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=B5=20V=C4=83n=20Ngh=C4=A9a?= Date: Sun, 11 Oct 2020 13:44:36 +0700 Subject: [PATCH 0121/1447] Add Neg gradient --- tensorflow/c/eager/gradients_test.cc | 78 ++++++++++++++++++- .../c/experimental/gradients/math_grad.cc | 34 ++++++++ .../c/experimental/gradients/math_grad.h | 1 + .../python/framework/experimental/math_ops.cc | 11 +++ .../python/framework/experimental/math_ops.py | 5 ++ 5 files changed, 128 insertions(+), 1 deletion(-) diff --git a/tensorflow/c/eager/gradients_test.cc b/tensorflow/c/eager/gradients_test.cc index cd4febba8c1..fa775cfa0e4 100644 --- a/tensorflow/c/eager/gradients_test.cc +++ b/tensorflow/c/eager/gradients_test.cc @@ -62,10 +62,10 @@ Status RegisterGradients(GradientRegistry* registry) { TF_RETURN_IF_ERROR(registry->Register("Exp", ExpRegisterer)); TF_RETURN_IF_ERROR(registry->Register("IdentityN", IdentityNRegisterer)); TF_RETURN_IF_ERROR(registry->Register("Sqrt", SqrtRegisterer)); + TF_RETURN_IF_ERROR(registry->Register("Neg", NegRegisterer)); return Status::OK(); } - // Computes // y = inputs[0] + inputs[1] // return grad(y, {inputs[0], inputs[1]}) @@ -199,6 +199,38 @@ Status IdentityNGradModel(AbstractContext* ctx, return Status::OK(); } +// Computes +// y = - inputs[0] +// 
return grad(y, {inputs[0]}) +Status NegGradModel(AbstractContext* ctx, + absl::Span inputs, + absl::Span outputs, + const GradientRegistry& registry) { + TapeVSpace vspace(ctx); + auto tape = new Tape(/*persistent=*/false); + tape->Watch(ToId(inputs[0])); + + std::vector neg_outputs(1); + AbstractContextPtr tape_ctx(new TapeContext(ctx, tape, registry)); + TF_RETURN_IF_ERROR( + ops::Neg(tape_ctx.get(), inputs, absl::MakeSpan(neg_outputs), "Neg")); + + std::unordered_map + source_tensors_that_are_targets; + std::vector out_grads; + TF_RETURN_IF_ERROR(tape->ComputeGradient( + vspace, /*target_tensor_ids=*/{ToId(neg_outputs[0])}, + /*source_tensor_ids=*/{ToId(inputs[0])}, source_tensors_that_are_targets, + /*output_gradients=*/{}, &out_grads, + /*build_default_zeros_grads=*/false)); + for (auto neg_output : neg_outputs) { + neg_output->Unref(); + } + outputs[0] = out_grads[0]; + delete tape; + return Status::OK(); +} + AbstractContext* BuildFunction(const char* fn_name) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); @@ -536,6 +568,50 @@ TEST_P(CppGradients, TestIdentityNGrad) { result_tensor = nullptr; } +TEST_P(CppGradients, TestNegGrad) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + AbstractContextPtr ctx; + { + AbstractContext* ctx_raw = nullptr; + Status s = + BuildImmediateExecutionContext(std::get<1>(GetParam()), &ctx_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + ctx.reset(ctx_raw); + } + + AbstractTensorHandlePtr x; + { + AbstractTensorHandle* x_raw = nullptr; + Status s = TestScalarTensorHandle(ctx.get(), 2.0f, &x_raw); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + x.reset(x_raw); + } + + GradientRegistry registry; + Status s = RegisterGradients(&registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + // Pseudo-code: + // + // tape.watch(x) + // y = - x + // outputs = tape.gradient(y, x) + std::vector outputs(1); + s = RunModel(NegGradModel, ctx.get(), {x.get()}, absl::MakeSpan(outputs),
+ /*use_function=*/!std::get<2>(GetParam()), registry); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + + TF_Tensor* result_tensor; + s = getValue(outputs[0], &result_tensor); + ASSERT_EQ(errors::OK, s.code()) << s.error_message(); + auto result_value = static_cast(TF_TensorData(result_tensor)); + EXPECT_EQ(*result_value, -1.0); + outputs[0]->Unref(); + TF_DeleteTensor(result_tensor); + result_tensor = nullptr; +} + TEST_P(CppGradients, TestSetAttrString) { std::unique_ptr status( TF_NewStatus(), TF_DeleteStatus); diff --git a/tensorflow/c/experimental/gradients/math_grad.cc b/tensorflow/c/experimental/gradients/math_grad.cc index 5cba7b28fda..da937e9ed39 100644 --- a/tensorflow/c/experimental/gradients/math_grad.cc +++ b/tensorflow/c/experimental/gradients/math_grad.cc @@ -24,6 +24,7 @@ using std::vector; using tensorflow::ops::Conj; using tensorflow::ops::MatMul; using tensorflow::ops::Mul; +using tensorflow::ops::Neg; using tensorflow::ops::SqrtGrad; namespace tensorflow { @@ -201,6 +202,30 @@ class MatMulGradientFunction : public GradientFunction { AttrBuilder forward_attrs; }; +class NegGradientFunction : public GradientFunction { + public: + Status Compute(Context* ctx, const IncomingGradients& grad_inputs, + vector* grad_outputs) override { + /* Given upstream grad U and a Neg op Y = -X, the gradients are: + * + * dX = -U + * + */ + + grad_outputs->resize(1); + + // Grad for X + std::vector neg_outputs(1); + std::string name = "Neg_Grad"; + TF_RETURN_IF_ERROR(ops::Neg(ctx->ctx, {grad_inputs[0]}, + absl::MakeSpan(neg_outputs), name.c_str())); + + (*grad_outputs)[0] = neg_outputs[0]; + return Status::OK(); + } + ~NegGradientFunction() override {} +}; + } // namespace BackwardFunction* AddRegisterer(const ForwardOperation& op) { @@ -239,5 +264,14 @@ BackwardFunction* SqrtRegisterer(const ForwardOperation& op) { return new BackwardFunction(gradient_function, default_gradients); } +BackwardFunction* NegRegisterer(const ForwardOperation& op) { + auto 
gradient_function = new NegGradientFunction; + // For ops with a single output, the gradient function is not called if there + // is no incoming gradient. So we do not need to worry about creating zeros + // grads in this case. + auto default_gradients = new PassThroughDefaultGradients(op); + return new BackwardFunction(gradient_function, default_gradients); +} + } // namespace gradients } // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/math_grad.h b/tensorflow/c/experimental/gradients/math_grad.h index 7faeadcca81..38d83b959d2 100644 --- a/tensorflow/c/experimental/gradients/math_grad.h +++ b/tensorflow/c/experimental/gradients/math_grad.h @@ -24,6 +24,7 @@ BackwardFunction* AddRegisterer(const ForwardOperation& op); BackwardFunction* ExpRegisterer(const ForwardOperation& op); BackwardFunction* MatMulRegisterer(const ForwardOperation& op); BackwardFunction* SqrtRegisterer(const ForwardOperation& op); +BackwardFunction* NegRegisterer(const ForwardOperation& op); } // namespace gradients } // namespace tensorflow diff --git a/tensorflow/python/framework/experimental/math_ops.cc b/tensorflow/python/framework/experimental/math_ops.cc index 8a6d8525092..7f2f4f28318 100644 --- a/tensorflow/python/framework/experimental/math_ops.cc +++ b/tensorflow/python/framework/experimental/math_ops.cc @@ -53,5 +53,16 @@ PYBIND11_MODULE(_math_ops, m) { /*transpose_a=*/false, /*transpose_b=*/false)); return outputs[0]; }); + m.def("neg", + [](AbstractContext* ctx, AbstractTensorHandle* a, const char* name) { + int num_outputs = 1; + std::vector outputs(1); + if (!name) { + name = "Neg"; + } + MaybeRaiseRegisteredFromStatus( + ops::Neg(ctx, {a}, absl::MakeSpan(outputs), name)); + return outputs[0]; + }); } } // namespace tensorflow diff --git a/tensorflow/python/framework/experimental/math_ops.py b/tensorflow/python/framework/experimental/math_ops.py index 7b3a171da1f..d5656e99319 100644 --- a/tensorflow/python/framework/experimental/math_ops.py +++ 
b/tensorflow/python/framework/experimental/math_ops.py @@ -30,3 +30,8 @@ def add(a, b, name=None): def mat_mul(a, b, name=None): ctx = context.get_default() return _math_ops.mat_mul(ctx, a, b, name) + + +def neg(a, name=None): + ctx = context.get_default() + return _math_ops.neg(ctx, a, name) From eced5da8f12adc496ce9c8e20491e3af5a10ee1e Mon Sep 17 00:00:00 2001 From: Sanjoy Das Date: Sun, 11 Oct 2020 21:03:56 -0700 Subject: [PATCH 0122/1447] VLOG(1) an error if cuDNN sub-lib pre-initialization fails This lets us detect cases where we've built against cuDNN 8.0.4+ but are running with cuDNN 8.0.3 or older. --- tensorflow/stream_executor/cuda/cuda_dnn.cc | 23 +++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index a7ed5cedb4f..8fa70d500a0 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -291,6 +291,17 @@ port::Status GetLoadedCudnnVersion(CudnnVersion* version) { return port::Status::OK(); } +#if CUDNN_MAJOR >= 8 && (CUDNN_MINOR > 0 || CUDNN_PATCHLEVEL >= 4) +void PreloadCudnnLibrary(cudnnStatus_t (*version_check_fn)(), + absl::string_view sub_library) { + cudnnStatus_t status = version_check_fn(); + if (status != CUDNN_STATUS_SUCCESS) { + VLOG(1) << "Could not pre-initialize cuDNN sub-library " << sub_library + << ". 
Error: " << cudnnGetErrorString(status) << "."; + } +} +#endif + } // namespace CudnnSupport::CudnnSupport(GpuExecutor* parent) : parent_(parent) {} @@ -320,12 +331,12 @@ port::Status CudnnSupport::Init() { // Preload sub libs for cudnn 8.0.4+ #if CUDNN_MAJOR >= 8 && (CUDNN_MINOR > 0 || CUDNN_PATCHLEVEL >= 4) - cudnnOpsInferVersionCheck(); - cudnnOpsTrainVersionCheck(); - cudnnCnnInferVersionCheck(); - cudnnCnnTrainVersionCheck(); - cudnnAdvInferVersionCheck(); - cudnnAdvTrainVersionCheck(); + PreloadCudnnLibrary(cudnnOpsInferVersionCheck, "cudnn_ops_infer"); + PreloadCudnnLibrary(cudnnOpsTrainVersionCheck, "cudnn_ops_train"); + PreloadCudnnLibrary(cudnnCnnInferVersionCheck, "cudnn_cnn_infer"); + PreloadCudnnLibrary(cudnnCnnTrainVersionCheck, "cudnn_cnn_train"); + PreloadCudnnLibrary(cudnnAdvInferVersionCheck, "cudnn_adv_infer"); + PreloadCudnnLibrary(cudnnAdvTrainVersionCheck, "cudnn_adv_train"); #endif cudnn_.reset(new CudnnAccess(cudnn_handle)); From b6bc26301d1277778f7490c43b34ab9c08015f14 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 12 Oct 2020 09:49:31 +0200 Subject: [PATCH 0123/1447] Avoid empty linker_bin_path breaking the build If ctx.attr.linker_bin_path is empty (e.g. if should_download_clang is set) the GPU build would add a lone `-B` to the build which swallows the next argument leading to broken builds. 
Fixes #41856, fixes #42313 --- third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl index afc8132bd15..cf84339528f 100644 --- a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl +++ b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl @@ -500,9 +500,9 @@ def _features(cpu, compiler, ctx): flag_groups = [ flag_group(flags = ( ["-Wl,-no-as-needed"] if cpu == "local" else [] - ) + [ - "-B" + ctx.attr.linker_bin_path, - ]), + ) + ( + ["-B" + ctx.attr.linker_bin_path] if ctx.attr.linker_bin_path else [] + ), flag_group( flags = ["@%{linker_param_file}"], expand_if_available = "linker_param_file", From b24a50b67d3ba04e9b03f9b220cc851b2ec08536 Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Mon, 12 Oct 2020 12:33:32 +0200 Subject: [PATCH 0124/1447] Make sure flag_group.flags is non-empty --- .../gpus/crosstool/cc_toolchain_config.bzl.tpl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl index cf84339528f..90894342e9f 100644 --- a/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl +++ b/third_party/gpus/crosstool/cc_toolchain_config.bzl.tpl @@ -497,12 +497,11 @@ def _features(cpu, compiler, ctx): ), flag_set( actions = all_link_actions(), - flag_groups = [ - flag_group(flags = ( - ["-Wl,-no-as-needed"] if cpu == "local" else [] - ) + ( - ["-B" + ctx.attr.linker_bin_path] if ctx.attr.linker_bin_path else [] - ), + flag_groups = ([ + flag_group(flags = ["-Wl,-no-as-needed"]) + ] if cpu == "local" else []) + ([ + flag_group(flags = ["-B" + ctx.attr.linker_bin_path]) + ] if ctx.attr.linker_bin_path" else []) + [ flag_group( flags = ["@%{linker_param_file}"], expand_if_available = "linker_param_file", From 
9dfedbd7f0fc7f37dd8fac30bcc76c290ba4ebc7 Mon Sep 17 00:00:00 2001 From: Stephan Herhut Date: Mon, 12 Oct 2020 03:38:39 -0700 Subject: [PATCH 0125/1447] Delete MoveScalarComputationsIntoGpuLaunchPass, as this is now available upstream in MLIR core. PiperOrigin-RevId: 336629456 Change-Id: Iddc027ce9f19edbc4acb0030a887a8a18f2f3112 --- .../mlir/tools/kernel_gen/kernel_creator.cc | 2 - .../xla/service/mlir_gpu/kernel_lowering.cc | 2 - .../compiler/xla/service/mlir_gpu/passes.cc | 62 ------------------- .../compiler/xla/service/mlir_gpu/passes.h | 4 -- .../compiler/xla/service/mlir_gpu/passes.td | 9 --- 5 files changed, 79 deletions(-) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc index 00096b6d7db..5cab21e0efb 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc @@ -158,8 +158,6 @@ Status LowerTFtoGPU(mlir::ModuleOp module, bool gpu_binary_only, // Approximate Tanh using standard operations. pm.addNestedPass<::mlir::FuncOp>( ::mlir::mhlo::createLegalizeTrigonometricToApproximationPass()); - // Move scalar operations into the launch to ensure smaller signatures. - pm.addPass(xla::mlir_gpu::createMoveScalarComputationsIntoGpuLaunchPass()); // Take launches to launches with kernels. pm.addPass(::mlir::createGpuKernelOutliningPass()); diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index bb8a990fa6d..c1e5e760074 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -125,8 +125,6 @@ Status LowerLHLOToGPU(mlir::ModuleOp module, LowerLHLOToGPUOptions options) { pm.addNestedPass<::mlir::FuncOp>( ::mlir::mhlo::createLegalizeTrigonometricToApproximationPass()); } - // Move scalar operations into the launch to ensure smaller signatures. 
- pm.addPass(createMoveScalarComputationsIntoGpuLaunchPass()); // Take launches to launches with kernels. pm.addPass(::mlir::createGpuKernelOutliningPass()); // Make sure the kernel signature resembled the original function's diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.cc b/tensorflow/compiler/xla/service/mlir_gpu/passes.cc index f0997701d73..84751bc0507 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/passes.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.cc @@ -203,63 +203,6 @@ struct DeadTempBufferRemovalPass } }; -struct MoveScalarComputationsIntoGpuLaunchPass - : MoveScalarComputationsIntoGpuLaunchPassBase< - MoveScalarComputationsIntoGpuLaunchPass> { - static bool isInliningBeneficiary(mlir::Operation* op) { - return llvm::isa(op); - } - - static bool extractBeneficiaryOps( - mlir::Operation* op, llvm::SmallVectorImpl* ops, - llvm::SetVector args) { - if (!isInliningBeneficiary(op)) { - return false; - } - - ops->push_back(op); - for (auto operand : op->getOperands()) { - // It is an existing arg, keep going. 
- if (args.count(operand)) { - continue; - } - mlir::Operation* definingOp = operand.getDefiningOp(); - if (!definingOp || !extractBeneficiaryOps(definingOp, ops, args)) { - return false; - } - } - return true; - } - - static void inlineOperationsIntoLaunch(mlir::gpu::LaunchOp launch) { - llvm::SetVector used_above; - mlir::getUsedValuesDefinedAbove(launch.body(), used_above); - mlir::BlockAndValueMapping inlined_map; - for (mlir::Value v : used_above) { - llvm::SmallVector ops_to_move; - mlir::Operation* definingOp = v.getDefiningOp(); - if (definingOp && - extractBeneficiaryOps(definingOp, &ops_to_move, used_above)) { - mlir::OpBuilder b(launch.body()); - for (mlir::Operation* op : llvm::reverse(ops_to_move)) { - auto result = b.clone(*op, inlined_map); - for (auto pair : llvm::zip(op->getResults(), result->getResults())) { - mlir::replaceAllUsesInRegionWith(std::get<0>(pair), - std::get<1>(pair), launch.body()); - } - inlined_map.map(op->getResults(), result->getResults()); - } - } - } - } - - void runOnFunction() override { - getFunction().walk( - [](mlir::gpu::LaunchOp launch) { inlineOperationsIntoLaunch(launch); }); - } -}; - struct RewriteKernelSignaturePass : RewriteKernelSignaturePassBase { void runOnFunction() override { @@ -414,11 +357,6 @@ std::unique_ptr createDeadTempBufferRemovalPass() { return absl::make_unique(); } -std::unique_ptr -createMoveScalarComputationsIntoGpuLaunchPass() { - return absl::make_unique(); -} - std::unique_ptr createRewriteKernelSignaturePass() { return absl::make_unique(); } diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.h b/tensorflow/compiler/xla/service/mlir_gpu/passes.h index 19ebe53f7ce..832321387c6 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/passes.h +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.h @@ -37,10 +37,6 @@ std::unique_ptr createStoreForwardingPass(); /// that loads and stores are side-effect free (in bounds, no aliasing, etc.). 
std::unique_ptr createDeadTempBufferRemovalPass(); -/// Moves scalar computations to the GPULaunchOp body. -std::unique_ptr -createMoveScalarComputationsIntoGpuLaunchPass(); - /// Sorts the operands to the kernel for a deterministic order. First operands /// that are defined by function arguments, followed by operands that are /// returned from the function. This only works for simple functions without diff --git a/tensorflow/compiler/xla/service/mlir_gpu/passes.td b/tensorflow/compiler/xla/service/mlir_gpu/passes.td index 1b19fcf5274..55fe15ad6ff 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/passes.td +++ b/tensorflow/compiler/xla/service/mlir_gpu/passes.td @@ -46,15 +46,6 @@ def DeadTempBufferRemovalPass }]; } -def MoveScalarComputationsIntoGpuLaunchPass - : FunctionPass<"mlir-gpu-inline-scalar-computation"> { - let summary = "Pass to Move scalar computations to the GPULaunchOp body."; - let constructor = "createMoveScalarComputationsIntoGpuLaunchPass()"; - let description = [{ - Moves scalar computations to the GPULaunchOp body. 
- }]; -} - def RewriteKernelSignaturePass : FunctionPass<"mlir-gpu-rewrite-signatures"> { let summary = "Rewrite kernel signatures to be deterministic."; From efcaa18c839eb513e6cc894789ec78be1a233977 Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Mon, 12 Oct 2020 05:58:03 -0700 Subject: [PATCH 0126/1447] Integrate LLVM at llvm/llvm-project@473b364a19ff Updates LLVM usage to match [473b364a19ff](https://github.com/llvm/llvm-project/commit/473b364a19ff) PiperOrigin-RevId: 336647877 Change-Id: I779cf2e7192c4620daad5ad1e56559db86657f4d --- tensorflow/workspace.bzl | 4 ++-- third_party/mlir/BUILD | 31 +++++++++++++++++++++++++++---- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 637d3b9e05b..eb19c7877b6 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -712,8 +712,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. - LLVM_COMMIT = "a2291a58bf1c860d026581fee6fe96019dc25440" - LLVM_SHA256 = "2c171a49faeaa520122154b243818b02d0fd5d3fe236aaa01025801040fb91af" + LLVM_COMMIT = "473b364a19ff4ec39abe2ce3da6614d717207966" + LLVM_SHA256 = "20588f418b6dd4b3d5870e5fe94c0eab3251fedbb06015a19aee5ab0be0a7c1c" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/mlir/BUILD b/third_party/mlir/BUILD index f925d035a1d..31a839d81fa 100644 --- a/third_party/mlir/BUILD +++ b/third_party/mlir/BUILD @@ -1271,6 +1271,30 @@ gentbl( ], ) +gentbl( + name = "GPUBaseIncGen", + strip_include_prefix = "include", + tbl_outs = [ + ( + "-gen-dialect-decls -dialect=gpu", + "include/mlir/Dialect/GPU/GPUOpsDialect.h.inc", + ), + ( + "-gen-op-interface-decls", + "include/mlir/Dialect/GPU/GPUOpInterfaces.h.inc", + ), + ( + 
"-gen-op-interface-defs", + "include/mlir/Dialect/GPU/GPUOpInterfaces.cpp.inc", + ), + ], + tblgen = ":mlir-tblgen", + td_file = "include/mlir/Dialect/GPU/GPUBase.td", + td_srcs = [ + ":GPUOpsTdFiles", + ], +) + gentbl( name = "GPUOpsIncGen", strip_include_prefix = "include", @@ -1283,10 +1307,6 @@ gentbl( "-gen-op-defs", "include/mlir/Dialect/GPU/GPUOps.cpp.inc", ), - ( - "-gen-dialect-decls -dialect=gpu", - "include/mlir/Dialect/GPU/GPUOpsDialect.h.inc", - ), ], tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/GPU/GPUOps.td", @@ -1308,12 +1328,14 @@ cc_library( ]), includes = ["include"], deps = [ + ":GPUBaseIncGen", ":GPUOpsIncGen", ":IR", ":LLVMDialect", ":SideEffectInterfaces", ":StandardOps", ":Support", + "@llvm-project//llvm:Support", ], ) @@ -3799,6 +3821,7 @@ cc_library( ":ConversionPassIncGen", ":IR", ":LinalgOps", + ":LinalgTransforms", ":Pass", ":SCFDialect", ":StandardOps", From aa6ff638f6fa0e83f65bcf92c6e9b70125289b08 Mon Sep 17 00:00:00 2001 From: Andy Ly Date: Mon, 12 Oct 2020 07:23:32 -0700 Subject: [PATCH 0127/1447] Move unique id generation for tf.VarHandleOp to an op interface. Other ops can create resource handles, and some parts for determining a unique resource handle can be reused. 
PiperOrigin-RevId: 336658650 Change-Id: Icbb08f28692c8d27e28f731df44c01894cc6f307 --- tensorflow/compiler/mlir/tensorflow/BUILD | 4 +- .../analysis/resource_alias_analysis.cc | 62 +++------------ .../mlir/tensorflow/ir/tf_op_interfaces.h | 79 +++++++++++++++++++ .../mlir/tensorflow/ir/tf_op_interfaces.td | 23 ++++++ .../compiler/mlir/tensorflow/ir/tf_ops.td | 9 ++- .../compiler/mlir/tensorflow/ir/tf_ops_n_z.cc | 22 ++++++ 6 files changed, 148 insertions(+), 51 deletions(-) diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index 199a9c0939c..1c740731acd 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -110,6 +110,8 @@ cc_library( deps = [ ":tensorflow_op_interfaces_inc_gen", ":tensorflow_structs", + "//tensorflow/core:framework", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", ], @@ -810,8 +812,8 @@ cc_library( ], deps = [ ":tensorflow", + ":tensorflow_op_interfaces", ":tensorflow_types", - "//tensorflow/core:framework", "@llvm-project//llvm:Support", "@llvm-project//mlir:Analysis", "@llvm-project//mlir:IR", diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc index 93a55cd9289..cdc9e33e368 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.cc @@ -24,7 +24,6 @@ limitations under the License. #include "llvm/ADT/SCCIterator.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "mlir/Analysis/CallGraph.h" // from @llvm-project @@ -40,8 +39,8 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/core/framework/resource_mgr.h" namespace mlir { namespace TF { @@ -228,51 +227,16 @@ BacktrackAnalysisInfo::BacktrackAnalysisInfo( backtracked_values_.push_back(backtrack_analysis.BacktrackValue(result)); } -namespace { - -//===----------------------------------------------------------------------===// -// ResourceAliasAnalysisInfo helper functions. -//===----------------------------------------------------------------------===// - -constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; - -// Returns if a VarHandleOp is anonymous, which means it always creates a new -// variable. -bool IsResourceHandleAnonymous(VarHandleOp handle) { - return handle.shared_name() == tensorflow::ResourceHandle::ANONYMOUS_NAME; -} - -// Returns a string unique identifier for a non-anonymous VarHandleOp. -std::string GetVarHandleStringId(VarHandleOp handle) { - auto device = handle.getAttrOfType("device"); - return llvm::join( - llvm::ArrayRef{ - handle.container(), handle.shared_name(), - device ? device.getValue() : llvm::StringRef()}, - "/"); -} - -// Finds a unique ID for a VarHandleOp's output. If it is anonymous, always -// creates a new ID; otherwise, tries to reuse the existing ID for the -// referenced variable if it exists, or creates a new one if not. -int64_t GetOrCreateIdForVarHandle(VarHandleOp handle, int64_t& next_id, - llvm::StringMap& name_id_map) { - // Always create a new ID for anonymous handle. - if (IsResourceHandleAnonymous(handle)) return next_id++; - - auto name = GetVarHandleStringId(handle); - auto emplace_res = name_id_map.try_emplace(name, next_id); - // New ID created, increment next_id. 
- if (emplace_res.second) ++next_id; - return emplace_res.first->second; -} - -} // namespace - //===----------------------------------------------------------------------===// // ResourceAliasAnalysisInfo //===----------------------------------------------------------------------===// +namespace { + +constexpr char kResourceArgUniqueIdAttr[] = "tf._resource_arg_unique_id"; + +} // namespace + constexpr int64_t ResourceAliasAnalysisInfo::kUnknownResourceId; // Constructs the analysis info by analyzing the given function. @@ -338,13 +302,13 @@ ResourceAliasAnalysisInfo::ResourceAliasAnalysisInfo( } }); - llvm::StringMap var_handle_name_id_map; + llvm::SmallDenseMap resource_handle_id_map; func_op.walk([&](Operation* op) { - if (auto var_handle = dyn_cast(op)) { - AddValueUniqueIDMapping( - var_handle.resource(), - GetOrCreateIdForVarHandle(var_handle, next_unique_id, - var_handle_name_id_map)); + if (auto resource_alloc = dyn_cast(op)) { + ResourceHandleValueAndId resource = + resource_alloc.GetResourceHandleValueAndId(resource_handle_id_map, + next_unique_id); + AddValueUniqueIDMapping(resource.value, resource.id); } else if (llvm::isa(op)) { for (auto result : filter_resources(op->getResults())) PropagateInputToOutput(op->getOperand(result.getResultNumber()), diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h index 1eb5c89f0fc..17b52c03b6b 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h @@ -16,10 +16,17 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ +#include + +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/StringRef.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_structs.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_verifiers.h" +#include "tensorflow/core/framework/resource_mgr.h" namespace mlir { namespace TF { @@ -49,8 +56,80 @@ struct ContractionFusion { SmallVector additional_attributes; }; +//===----------------------------------------------------------------------===// +// TensorFlow Resource Handles. +//===----------------------------------------------------------------------===// + +inline bool IsResourceHandleAnonymous(StringRef name) { + return name == tensorflow::ResourceHandle::ANONYMOUS_NAME; +} + +// Helper struct representing an identifier for a resource handle. For resource +// handles created explicitly and shared across resource allocator ops, +// `container`, `name`, and `device` can be set. If an resource handle is tied +// to an instance of an operation (e.g. TensorFlow runtime operation caching), +// `op` can be set instead. +struct ResourceHandle { + ResourceHandle(StringRef container, StringRef name, StringRef device, + Operation* op) + : container(container), name(name), device(device), op(op) {} + + bool operator==(const ResourceHandle& rhs) const { + return container == rhs.container && name == rhs.name && + device == rhs.device && op == rhs.op; + } + + // Make ResourceHandle hashable. + friend ::llvm::hash_code hash_value(const ResourceHandle& resource_handle); + + std::string container; + std::string name; + std::string device; + Operation* op = nullptr; +}; + +// Make ResourceHandle hashable. 
+inline ::llvm::hash_code hash_value(const ResourceHandle& resource_handle) { + return ::llvm::hash_combine(resource_handle.container, resource_handle.name, + resource_handle.device, resource_handle.op); +} + +// Helper struct holding a resource handle value and unique id associated to the +// resource handle. +struct ResourceHandleValueAndId { + ResourceHandleValueAndId(Value value, int64_t id) : value(value), id(id) {} + + Value value; + int64_t id = -1; +}; + #include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.h.inc" } // namespace TF } // namespace mlir +namespace llvm { +template <> +struct DenseMapInfo { + static mlir::TF::ResourceHandle getEmptyKey() { + return {/*container=*/"", /*name=*/"", /*device=*/"", /*op=*/nullptr}; + } + + static mlir::TF::ResourceHandle getTombstoneKey() { + return {/*container=*/"", + /*name=*/tensorflow::ResourceHandle::ANONYMOUS_NAME, /*device=*/"", + /*op=*/nullptr}; + } + + static unsigned getHashValue( + const mlir::TF::ResourceHandle& resource_handle) { + return mlir::TF::hash_value(resource_handle); + } + + static bool isEqual(const mlir::TF::ResourceHandle& lhs, + const mlir::TF::ResourceHandle& rhs) { + return lhs == rhs; + } +}; +} // namespace llvm + #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OP_INTERFACES_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td index 3c41c04a0d6..1ed30c89a77 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_op_interfaces.td @@ -125,4 +125,27 @@ def TF_ContractionFusableInterface : OpInterface<"ContractionFusableInterface"> ]; } +//===----------------------------------------------------------------------===// +// TensorFlow Resource Handle Interfaces. 
+//===----------------------------------------------------------------------===// + +def TF_ResourceHandleAllocatorInterface : OpInterface<"ResourceHandleAllocatorInterface"> { + let description = [{ + A resource handle allocator operation is one that creates a resource handle, + or looks up and reuses an existing resource handle. + }]; + + let methods = [ + InterfaceMethod< + /*desc=*/[{Returns the resource handle value and unique id associated with + the resource handle. If a resource handle is reused, then an + existing id will be returned.}], + /*retTy=*/"ResourceHandleValueAndId", + /*methodName=*/"GetResourceHandleValueAndId", + /*args=*/(ins "llvm::SmallDenseMap&":$resource_handle_id_map, + "int64_t&":$next_id) + >, + ]; +} + #endif // TF_OP_INTERFACES diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index 544f07f7075..67ad0fc4e70 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -787,7 +787,7 @@ This operation holds the metadata common to operations of a `tpu.replicate()` co let results = (outs); } -def TF_VarHandleOp : TF_Op<"VarHandleOp", []> { +def TF_VarHandleOp : TF_Op<"VarHandleOp", [TF_ResourceHandleAllocatorInterface]> { let summary = "Creates a handle to a Variable resource from its name."; let description = [{ @@ -816,6 +816,13 @@ Example: TF_DerivedOperandOrResultHandleTypeAttr<"resource">; TF_DerivedOperandOrResultHandleShapeAttr shape = TF_DerivedOperandOrResultHandleShapeAttr<"resource">; + + let extraClassDeclaration = [{ + // TF_ResourceHandleAllocatorInterface: + ResourceHandleValueAndId GetResourceHandleValueAndId( + llvm::SmallDenseMap &resource_handle_id_map, + int64_t &next_id); + }]; } // Multiple variadic operands with different sizes are not supported by the diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index 
519f7e9fcaf..8742a0e2b71 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -27,6 +27,7 @@ limitations under the License. #include "llvm/ADT/APFloat.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Sequence.h" @@ -2388,6 +2389,27 @@ static LogicalResult VerifyUnsortedSegmentReduction(Op op) { return success(); } +//===----------------------------------------------------------------------===// +// VarHandleOp +//===----------------------------------------------------------------------===// + +ResourceHandleValueAndId VarHandleOp::GetResourceHandleValueAndId( + llvm::SmallDenseMap &resource_handle_id_map, + int64_t &next_id) { + // Always create a new ID for anonymous handle. + if (IsResourceHandleAnonymous(shared_name())) return {resource(), next_id++}; + + llvm::StringRef device; + if (auto device_attr = getAttrOfType("device")) + device = device_attr.getValue(); + + ResourceHandle handle(container(), shared_name(), device, /*op=*/nullptr); + auto emplace_res = resource_handle_id_map.try_emplace(handle, next_id); + // New ID created, increment next_id. + if (emplace_res.second) ++next_id; + return {resource(), emplace_res.first->second}; +} + //===----------------------------------------------------------------------===// // VarIsInitializedOp //===----------------------------------------------------------------------===// From 8714ada4ea5d56f8023b348bb31de73612a73c3f Mon Sep 17 00:00:00 2001 From: Jian Li Date: Mon, 12 Oct 2020 08:39:44 -0700 Subject: [PATCH 0128/1447] Separate the model build and tensor allocation call for integer LSTM test. 
PiperOrigin-RevId: 336669565 Change-Id: I13437f34cbe4a1461b724d91fae934757bc903ab --- tensorflow/lite/kernels/lstm_test.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tensorflow/lite/kernels/lstm_test.cc b/tensorflow/lite/kernels/lstm_test.cc index ccdc8193f09..9e0084e813d 100644 --- a/tensorflow/lite/kernels/lstm_test.cc +++ b/tensorflow/lite/kernels/lstm_test.cc @@ -1458,9 +1458,13 @@ class LSTMIntegerOpModel : public SingleOpModel { BuiltinOperator_LSTM, BuiltinOptions_LSTMOptions, CreateLSTMOptions(builder_, ActivationFunctionType_TANH).Union()); - BuildInterpreter({}); // Input sizes are already set + BuildInterpreter(/*input_shapes=*/{}, /*num_threads=*/-1, + /*allow_fp32_relax_to_fp16=*/false, + /*apply_delegate=*/true, /*allocate_and_delegate=*/false); } + void PerformAllocateAndDelegate() { AllocateAndDelegate(true); } + void SetInputToInputWeights(const std::vector& f) { QuantizeAndPopulate(input_to_input_weights_, f); } @@ -1692,6 +1696,8 @@ TEST(IntegerLstmOpTest, NoCifg_NoPeephole_Projection_LayerNorm) { /*use_layer_norm=*/true, /*use_8x8_8_implementation=*/false, ranges, intermediates); + // Do allocate. + lstm.PerformAllocateAndDelegate(); // Set weights. lstm.SetInputToInputWeights(input_to_input_weights); @@ -1859,6 +1865,9 @@ TEST(IntegerLstmOpTest, NoCifg_Peephole_Projection_LayerNorm) { /*use_8x8_8_implementation=*/false, ranges, intermediates); + // Do allocate. + lstm.PerformAllocateAndDelegate(); + // Set weights. lstm.SetInputToInputWeights(input_to_input_weights); lstm.SetInputToCellWeights(input_to_cell_weights); @@ -2026,6 +2035,9 @@ TEST(IntegerLstmOpTest, Cifg_NoPeephole_Projection_LayerNorm_8x8_8) { /*use_8x8_8_implementation=*/true, ranges, intermediates); + // Do allocate. + lstm.PerformAllocateAndDelegate(); + // Set weights. 
// lstm.SetInputToInputWeights(input_to_input_weights); lstm.SetInputToCellWeights(input_to_cell_weights); From 0e6fb5fde517fa497186c931f29d82f98f99cfeb Mon Sep 17 00:00:00 2001 From: Adrian Kuegel Date: Mon, 12 Oct 2020 08:41:15 -0700 Subject: [PATCH 0129/1447] Integrate LLVM at llvm/llvm-project@93377888ae89 Updates LLVM usage to match [93377888ae89](https://github.com/llvm/llvm-project/commit/93377888ae89) PiperOrigin-RevId: 336669842 Change-Id: I4fccb6278cfa172c88ad4d5392d15bce8d11db58 --- .../Dialect/mhlo/transforms/rewriters.h | 2 +- .../mhlo/transforms/hlo_legalize_to_lhlo.cc | 2 +- .../mlir/tools/kernel_gen/kernel_creator.cc | 2 +- .../tools/kernel_gen/transforms/bufferize.cc | 3 +- .../kernel_gen/transforms/bufferize_pass.cc | 2 +- .../xla/service/mlir_gpu/kernel_lowering.cc | 2 +- tensorflow/workspace.bzl | 4 +- third_party/llvm/llvm.autogenerated.BUILD | 44 +++++++++++++++++++ 8 files changed, 53 insertions(+), 8 deletions(-) diff --git a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h index b0a382d6b0f..a58d0c6304c 100644 --- a/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h +++ b/tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/rewriters.h @@ -20,7 +20,7 @@ limitations under the License. 
#include "mlir/IR/MLIRContext.h" #include "mlir/IR/PatternMatch.h" -#include "mlir/Transforms/BufferPlacement.h" +#include "mlir/Transforms/Bufferize.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc index 22338d2847d..75ecafcf718 100644 --- a/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc +++ b/tensorflow/compiler/mlir/hlo/lib/Dialect/mhlo/transforms/hlo_legalize_to_lhlo.cc @@ -34,7 +34,7 @@ limitations under the License. #include "mlir/IR/PatternMatch.h" #include "mlir/IR/StandardTypes.h" #include "mlir/Pass/Pass.h" -#include "mlir/Transforms/BufferPlacement.h" +#include "mlir/Transforms/Bufferize.h" #include "mlir/Transforms/DialectConversion.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc index 5cab21e0efb..6c95323ed37 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc @@ -40,7 +40,7 @@ limitations under the License. 
#include "mlir/Parser.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/Bufferize.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc index e6f242eb59d..df610028741 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize.cc @@ -15,6 +15,8 @@ limitations under the License. // This file implements logic for translating mixed IR to buffer form. +#include "mlir/Transforms/Bufferize.h" // from @llvm-project + #include #include @@ -29,7 +31,6 @@ limitations under the License. #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project namespace mlir { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc index 8ddbb15219f..b1fa880bf4b 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/bufferize_pass.cc @@ -26,7 +26,7 @@ limitations under the License. 
#include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/StandardTypes.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project -#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/Bufferize.h" // from @llvm-project #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/hlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" diff --git a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc index c1e5e760074..a664a316e13 100644 --- a/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc +++ b/tensorflow/compiler/xla/service/mlir_gpu/kernel_lowering.cc @@ -35,7 +35,7 @@ limitations under the License. #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project -#include "mlir/Transforms/BufferPlacement.h" // from @llvm-project +#include "mlir/Transforms/Bufferize.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/IR/lhlo_ops.h" #include "tensorflow/compiler/mlir/hlo/include/mlir-hlo/Dialect/mhlo/transforms/passes.h" diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index eb19c7877b6..9c49611ad7b 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -712,8 +712,8 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ) # Check out LLVM and MLIR from llvm-project. 
- LLVM_COMMIT = "473b364a19ff4ec39abe2ce3da6614d717207966" - LLVM_SHA256 = "20588f418b6dd4b3d5870e5fe94c0eab3251fedbb06015a19aee5ab0be0a7c1c" + LLVM_COMMIT = "93377888ae89560ba6d3976e2762d3d4724c4dfd" + LLVM_SHA256 = "606e772f64024cf6c56b31cd1f5fa700c789a22181c73365498973ac73892f91" LLVM_URLS = [ "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD index e0d64fe867a..579005926a4 100644 --- a/third_party/llvm/llvm.autogenerated.BUILD +++ b/third_party/llvm/llvm.autogenerated.BUILD @@ -1772,6 +1772,50 @@ cc_library( ], ) +cc_library( + name = "CSKYCodeGen", + srcs = glob([ + "lib/Target/CSKY/*.c", + "lib/Target/CSKY/*.cpp", + "lib/Target/CSKY/*.inc", + ]), + hdrs = glob([ + "include/llvm/Target/CSKY/*.h", + "include/llvm/Target/CSKY/*.def", + "include/llvm/Target/CSKY/*.inc", + "lib/Target/CSKY/*.h", + ]), + copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/CSKY"], + deps = [ + ":CSKYInfo", + ":CodeGen", + ":Core", + ":Support", + ":Target", + ":config", + ], +) + +cc_library( + name = "CSKYInfo", + srcs = glob([ + "lib/Target/CSKY/TargetInfo/*.c", + "lib/Target/CSKY/TargetInfo/*.cpp", + "lib/Target/CSKY/TargetInfo/*.inc", + ]), + hdrs = glob([ + "include/llvm/Target/CSKY/TargetInfo/*.h", + "include/llvm/Target/CSKY/TargetInfo/*.def", + "include/llvm/Target/CSKY/TargetInfo/*.inc", + "lib/Target/CSKY/TargetInfo/*.h", + ]), + copts = llvm_copts + ["-Iexternal/llvm-project/llvm/lib/Target/CSKY"], + deps = [ + ":Support", + ":config", + ], +) + cc_library( name = "CodeGen", srcs = glob([ From adcefa251c6c5727568609eb26f936ab689eef58 Mon Sep 17 00:00:00 2001 From: Amit Patankar Date: Mon, 12 Oct 2020 08:45:42 -0700 Subject: [PATCH 0130/1447] Migrate the first macro usage to the 
toolchains repo. PiperOrigin-RevId: 336670581 Change-Id: I7cf8f8d9b977ec59b97e7afcaeb5a94fd726d57e --- tensorflow/core/BUILD | 2 +- tensorflow/opensource_only.files | 1 + tensorflow/workspace.bzl | 11 +++++++++++ third_party/tf_toolchains.BUILD | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 third_party/tf_toolchains.BUILD diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 1350775ae44..4a4e6748baf 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -91,7 +91,7 @@ load("//tensorflow:tensorflow.bzl", "tf_gen_op_libs") load("//tensorflow:tensorflow.bzl", "if_nccl") # buildifier: disable=same-origin-load -load("//tensorflow:tensorflow.bzl", "tensorflow_opensource_extra_deps") +load("@tf_toolchains//macros:cpp.bzl", "tensorflow_opensource_extra_deps") # buildifier: disable=same-origin-load load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu") diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files index 5d7658389be..ff944035b22 100644 --- a/tensorflow/opensource_only.files +++ b/tensorflow/opensource_only.files @@ -204,6 +204,7 @@ tensorflow/third_party/tensorrt/build_defs.bzl.tpl tensorflow/third_party/tensorrt/tensorrt/include/tensorrt_config.h.tpl tensorflow/third_party/tensorrt/tensorrt_configure.bzl tensorflow/third_party/termcolor.BUILD +tensorflow/third_party/tf_toolchains.BUILD tensorflow/third_party/tflite_mobilenet.BUILD tensorflow/third_party/tflite_mobilenet_float.BUILD tensorflow/third_party/tflite_mobilenet_quant.BUILD diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl index 9c49611ad7b..a74ff92b1f8 100755 --- a/tensorflow/workspace.bzl +++ b/tensorflow/workspace.bzl @@ -1185,6 +1185,17 @@ def tf_repositories(path_prefix = "", tf_repo_name = ""): ], ) + tf_http_archive( + name = "tf_toolchains", + sha256 = "eb175afa73e5a33d2b5d2aabcfde6c8c3395fd7001eb5ba765a5cd98cce714ba", + strip_prefix = "toolchains-0.0.2", + build_file = 
clean_dep("//third_party:tf_toolchains.BUILD"), + urls = [ + "http://mirror.tensorflow.org/github.com/tensorflow/toolchains/archive/v0.0.2.tar.gz", + "https://github.com/tensorflow/toolchains/archive/v0.0.2.tar.gz", + ], + ) + def tf_bind(): """Bind targets for some external repositories""" ############################################################################## diff --git a/third_party/tf_toolchains.BUILD b/third_party/tf_toolchains.BUILD new file mode 100644 index 00000000000..ffd0fb0cdc5 --- /dev/null +++ b/third_party/tf_toolchains.BUILD @@ -0,0 +1 @@ +package(default_visibility = ["//visibility:public"]) From 4365f05b2f27ab4420ef8db2f91021eb5c6bd255 Mon Sep 17 00:00:00 2001 From: Jean-Baptiste Lespiau Date: Mon, 12 Oct 2020 08:49:14 -0700 Subject: [PATCH 0131/1447] Add support for jax_debug_nans and fix the last few glitches with the C++ jax.jit. - Sorting the keyword arguments must be done on the string, because we go through the Python path which uses flatten() which sort them by string. - Some error with obj == obj which is the same as obj.is(obj) and not obj.equal(obj). - Moves all the Python tests to the C++ tests (which also run on the _python_jit). PiperOrigin-RevId: 336671123 Change-Id: Ic6a684a77b845e2268693cde14412bc40823530e --- tensorflow/compiler/xla/python/jax_jit.cc | 47 +++++++++++++---------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/tensorflow/compiler/xla/python/jax_jit.cc b/tensorflow/compiler/xla/python/jax_jit.cc index e1528e77e0f..2d392d41f37 100644 --- a/tensorflow/compiler/xla/python/jax_jit.cc +++ b/tensorflow/compiler/xla/python/jax_jit.cc @@ -122,10 +122,10 @@ struct CallSignature { std::vector static_args; // A PyTreeDef for each positional dynamic (i.e. not static) argument. std::vector dynamic_positional_args_treedef; - // Keyword arguments. Sorted by the interned keyword pointers. + // Keyword arguments. Sorted by the keyword name. 
std::vector keyword_args; // Shape and dtype for both the dynamic positional arguments and the keyword - // arguments (sorted by interned keyword pointers). + // arguments (sorted by keyword name). std::vector dynamic_args_signatures; PjRtDevice* device; @@ -190,7 +190,7 @@ std::string CallSignature::DebugString() const { std::vector static_args_str; static_args_str.reserve(static_args.size()); for (auto& static_arg : static_args) { - static_args_str.emplace_back(py::cast(static_arg.str())); + static_args_str.emplace_back(py::cast(py::str(static_arg))); } std::vector signature_str; @@ -271,9 +271,7 @@ class CompiledFunction { private: // Returns nullptr if not present in the cache. - CacheEntry* GetCacheEntryIfPresent(const py::args& args, - const py::kwargs& kwargs, - const CallSignature& signature); + CacheEntry* GetCacheEntryIfPresent(const CallSignature& signature); // Should never return nullptr. CacheEntry* AddCacheEntry(const py::args& args, const py::kwargs& kwargs, const CallSignature& signature, @@ -424,8 +422,10 @@ void FlattenArguments(const py::args& args, const py::kwargs& py_kwargs, // Keyword arguments. std::vector> kwargs(py_kwargs.begin(), py_kwargs.end()); - // We first intern the keys, then sort them (by pointer) and then create - // the signatures. + // We first intern the keys, then sort them (by name, as in the Python path) + // (see also PyTreeDef::Flatten) and then create the signatures. + // TODO(jblespiau): We should be able to sort the keys by interned-key + // pointers, but this requires the Python compilation to do the same. arguments.signature.keyword_args.resize(kwargs.size()); for (size_t i = 0; i < kwargs.size(); ++i) { // Intern the key if not already interned. 
@@ -442,7 +442,7 @@ void FlattenArguments(const py::args& args, const py::kwargs& py_kwargs, std::sort(kwargs.begin(), kwargs.end(), [](const std::pair& a, const std::pair& b) { - return a.first.ptr() < b.first.ptr(); + return a.first < b.first; }); for (size_t i = 0; i < kwargs.size(); ++i) { arguments.signature.keyword_args[i].key = kwargs[i].first; @@ -511,7 +511,7 @@ StatusOr> ScalarToBuffer( "%s", absl::StrCat( "Not supported: The C++ jax jit execution path, only accepts " "DeviceArray, Numpy arrays, or Python scalars. Got type ", - py::cast(scalar.get_type().str()))); + py::cast(py::str(scalar.get_type())))); } const py::dtype* DtypeTo32BitDtype(const py::dtype& dtype) { @@ -524,16 +524,16 @@ const py::dtype* DtypeTo32BitDtype(const py::dtype& dtype) { static const auto* complex64_dt = new py::dtype("complex64"); static const auto* complex128_dt = new py::dtype("complex128"); - if (dtype == *int64_dt) { + if (dtype.equal(*int64_dt)) { return int32_dt; } - if (dtype == *float64_dt) { + if (dtype.equal(*float64_dt)) { return float32_dt; } - if (dtype == *uint64_dt) { + if (dtype.equal(*uint64_dt)) { return uint32_dt; } - if (dtype == *complex128_dt) { + if (dtype.equal(*complex128_dt)) { return complex64_dt; } @@ -568,7 +568,7 @@ Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, const auto& device_array = xla_module->attr("DeviceArray"); static const auto* numpy_module = new py::module(py::module::import("numpy")); - const auto& array = numpy_module->attr("array"); + const auto& np_array = numpy_module->attr("array"); // When the jitted function is not committed, we first check whether any // sticky `DeviceArray` is present and on which device they live. 
See also: @@ -663,7 +663,7 @@ Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, if (!jax_enable_x64) { const py::dtype* to_dtype = DtypeTo32BitDtype(numpy_array.dtype()); if (to_dtype) { - numpy_array = array(numpy_array, to_dtype); + numpy_array = np_array(numpy_array, *to_dtype); } } std::unique_ptr buffer = @@ -702,7 +702,6 @@ Status ConvertArgsToBuffers(bool jax_enable_x64, xla::PyClient& pyclient, } // namespace CacheEntry* CompiledFunction::GetCacheEntryIfPresent( - const py::args& args, const py::kwargs& kwargs, const CallSignature& signature) { auto found_iterator = executables_.find(signature); if (found_iterator != executables_.end()) { // Cache hit! @@ -810,12 +809,11 @@ py::object CompiledFunction::Call(py::args args, py::kwargs kwargs) { return py::cast(cache_miss_(*args, **kwargs))[0]; } - CacheEntry* cache_entry = - GetCacheEntryIfPresent(args, kwargs, arguments.signature); + CacheEntry* cache_entry = GetCacheEntryIfPresent(arguments.signature); if (!cache_entry) { py::object out_and_fastpath_data = cache_miss_(*args, **kwargs); - cache_entry = GetCacheEntryIfPresent(args, kwargs, arguments.signature); + cache_entry = GetCacheEntryIfPresent(arguments.signature); if (!cache_entry) { cache_entry = AddCacheEntry(args, kwargs, arguments.signature, out_and_fastpath_data); @@ -868,6 +866,15 @@ void BuildJaxjitSubmodule(pybind11::module& m) { }); // Only for testing purposes + jitlib.def("_DtypeTo32BitDtype", [](const py::object obj) -> py::object { + py::dtype dtype = py::dtype::from_args(obj); + const py::dtype* res = DtypeTo32BitDtype(dtype); + if (res) { + return *res; + } else { + return py::none(); + } + }); jitlib.def("_is_float0", &IsFloat0); jitlib.def("_is_trivial", &HasTrivialLazyExpr); jitlib.def("_ScalarToBuffer", [](py::handle scalar, bool jax_enable_x64, From b5aae2985b8cda1181e92e41f88453f954d4e728 Mon Sep 17 00:00:00 2001 From: Feng Liu Date: Mon, 12 Oct 2020 08:58:24 -0700 Subject: [PATCH 0132/1447] Print the tfr 
external functions in order PiperOrigin-RevId: 336672681 Change-Id: Ib385a1c1130bdc2535fc5727165e3973c6515262 --- .../compiler/mlir/tfr/python/tfr_gen.py | 2 +- .../compiler/mlir/tfr/python/tfr_gen_test.py | 25 +++++++++---------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/tensorflow/compiler/mlir/tfr/python/tfr_gen.py b/tensorflow/compiler/mlir/tfr/python/tfr_gen.py index 0311c6a3136..f8622d11511 100644 --- a/tensorflow/compiler/mlir/tfr/python/tfr_gen.py +++ b/tensorflow/compiler/mlir/tfr/python/tfr_gen.py @@ -271,7 +271,7 @@ class OpDefCache(object): def mlir_external_funcs(self): tfr_funcs = [] - for op_def, derived_attrs in self._op_defs.values(): + for _, (op_def, derived_attrs) in sorted(self._op_defs.items()): tfr_func = '\ntfr.func @tf__{}_('.format(_camel_to_snake(op_def.name)) # tensor inputs diff --git a/tensorflow/compiler/mlir/tfr/python/tfr_gen_test.py b/tensorflow/compiler/mlir/tfr/python/tfr_gen_test.py index e7b8c491a52..88696490c4a 100644 --- a/tensorflow/compiler/mlir/tfr/python/tfr_gen_test.py +++ b/tensorflow/compiler/mlir/tfr/python/tfr_gen_test.py @@ -462,26 +462,25 @@ class TFRGenTensorTest(TFRGenTestBase): CHECK-NEXT: tfr.return %[[call]] : !tfr.tensor CHECK-NEXT: } - CHECK-LABEL: tfr.func @tf__test_complex_tf_op_(!tfr.tensor,!tfr.tensor,i64{tfr.name="N"}) -> (!tfr.tensor_list) attributes {N,T,Tlen} - - CHECK-LABEL: tfr.func @tf__split_v_(!tfr.tensor,!tfr.tensor,!tfr.tensor,i64{tfr.name="num_split"}) -> (!tfr.tensor_list) attributes {T,Tlen,i32_,num_split} - - CHECK-LABEL: tfr.func @tf__pack_(!tfr.tensor_list,i64{tfr.name="axis"}) -> (!tfr.tensor) attributes {N,T,axis} - - CHECK-LABEL: tfr.func @tf__test_identity_op_(!tfr.tensor) -> (!tfr.tensor) attributes {T} - - CHECK-LABEL: tfr.func @tf__identity_(!tfr.tensor) -> (!tfr.tensor) attributes {T} - - CHECK-LABEL: tfr.func @tf__test_two_inputs_op_(!tfr.tensor,!tfr.tensor,i1{tfr.name="pred"}) -> (!tfr.tensor) attributes {T,pred} - CHECK-LABEL: tfr.func 
@tf__add_(!tfr.tensor,!tfr.tensor) -> (!tfr.tensor) attributes {T} CHECK-LABEL: tfr.func @tf__concat_(!tfr.tensor,!tfr.tensor_list) -> (!tfr.tensor) attributes {N,T,i32_} - CHECK-LABEL: tfr.func @tf__test_input_n_op_(!tfr.tensor_list) -> (!tfr.tensor) attributes {N,T} + CHECK-LABEL: tfr.func @tf__identity_(!tfr.tensor) -> (!tfr.tensor) attributes {T} + + CHECK-LABEL: tfr.func @tf__pack_(!tfr.tensor_list,i64{tfr.name="axis"}) -> (!tfr.tensor) attributes {N,T,axis} + + CHECK-LABEL: tfr.func @tf__split_v_(!tfr.tensor,!tfr.tensor,!tfr.tensor,i64{tfr.name="num_split"}) -> (!tfr.tensor_list) attributes {T,Tlen,i32_,num_split} CHECK-LABEL: tfr.func @tf__test_two_inputs_op_(!tfr.tensor,!tfr.tensor,i1{tfr.name="pred"}) -> (!tfr.tensor) attributes {T,pred} + CHECK-LABEL: tfr.func @tf__test_complex_tf_op_(!tfr.tensor,!tfr.tensor,i64{tfr.name="N"}) -> (!tfr.tensor_list) attributes {N,T,Tlen} + + CHECK-LABEL: tfr.func @tf__test_identity_op_(!tfr.tensor) -> (!tfr.tensor) attributes {T} + + CHECK-LABEL: tfr.func @tf__test_two_inputs_op_(!tfr.tensor,!tfr.tensor,i1{tfr.name="pred"}) -> (!tfr.tensor) attributes {T,pred} + + CHECK-LABEL: tfr.func @tf__test_input_n_op_(!tfr.tensor_list) -> (!tfr.tensor) attributes {N,T} """ self._check_code(mlir_code, mlir_code_exp) From 484ba7174994be3b66721b1ea73b57e77e1b24db Mon Sep 17 00:00:00 2001 From: Bixia Zheng Date: Mon, 12 Oct 2020 09:01:05 -0700 Subject: [PATCH 0133/1447] [TF:TRT] Add SetLayerName for setting the name of TensorRT layers. Add function SetLayerName and use it to set the name of TensorRT layers. 
PiperOrigin-RevId: 336673222 Change-Id: I1223adb79771e5d1f53f1e5911d321ec7470963e --- .../tf2tensorrt/convert/convert_nodes.cc | 88 ++++++++++--------- .../tf2tensorrt/convert/convert_nodes.h | 10 ++- .../tf2tensorrt/convert/convert_nodes_test.cc | 29 ++++-- 3 files changed, 74 insertions(+), 53 deletions(-) diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc index 6752009c37b..aa5455b94b3 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.cc @@ -429,6 +429,19 @@ Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l, return Status::OK(); } +// Sets the name of an ILayer using the name of the node_def. If the operation +// represented by the ILayer is generated by the converter to support the +// conversion of node_def, callers need to specify a non-empty sub_op_name +// appended to the name of the node_def to avoid layer name conflicts. +void SetLayerName(nvinfer1::ILayer* layer, const NodeDef& node_def, + absl::string_view sub_op_name = "") { + if (sub_op_name.empty()) { + layer->setName(node_def.name().c_str()); + } else { + layer->setName(absl::StrCat(node_def.name(), "-", sub_op_name).c_str()); + } +} + nvinfer1::ITensor* Converter::CreateConstantLayer( const TRT_ShapedWeights& weights, const nvinfer1::Dims& dims) { nvinfer1::Weights trt_weights = weights.GetTrtWeights(); @@ -1475,8 +1488,9 @@ Status Converter::GetTensorOrWeights(const string& name, Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, const std::vector& order_with_batch_dim, - absl::string_view name, - nvinfer1::ITensor** output_tensor) { + nvinfer1::ITensor** output_tensor, + const NodeDef& node_def, + absl::string_view sub_op_name) { const auto dims = input_tensor->getDimensions(); const int order_size = use_implicit_batch_ ? 
order_with_batch_dim.size() - 1 : order_with_batch_dim.size(); @@ -1491,7 +1505,8 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor, nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose"); - layer->setName(std::basic_string(name).c_str()); + SetLayerName(layer, node_def, sub_op_name); + MarkQuantizationRangesAsInferrable(input_tensor, layer->getOutput(0)); nvinfer1::Permutation permutation; @@ -2186,7 +2201,7 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, const bool need_transpose = (data_format == "NHWC"); if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 1, 2}, StrCat(node_def.name(), "_to_NCHW"), &tensor)); + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } // Dimensions of transposed tensor. const auto tensor_dim = tensor->getDimensions(); @@ -2252,7 +2267,7 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, #else layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); #endif - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); layer->setNbGroups(num_groups); conv_layer = layer; } else { @@ -2269,7 +2284,7 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, #else layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); #endif - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); layer->setNbGroups(num_groups); layer->setDilation(dilation); conv_layer = layer; @@ -2311,8 +2326,7 @@ Status ConvertConv2DHelper(OpConverterParams* params, int group, // Restore transpose. 
if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, StrCat(node_def.name(), "_to_NHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -2370,7 +2384,7 @@ Status ConvertTranspose(OpConverterParams* params) { // Start conversion. nvinfer1::ITensor* output_tensor = nullptr; TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - input_tensor, perm, params->node_def.name(), &output_tensor)); + input_tensor, perm, &output_tensor, params->node_def)); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); } @@ -2834,7 +2848,7 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, } if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, transpose_order, StrCat(node_def.name(), "_for_pad"), &tensor)); + tensor, transpose_order, &tensor, node_def, "for_pad")); } // Add padding layer nvinfer1::IPaddingLayer* layer = params->converter->network()->addPadding( @@ -2846,8 +2860,7 @@ Status ConvertStridedSliceHelper(OpConverterParams* params, // Restore transpose if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, inv_transpose_order, StrCat(node_def.name(), "_after_pad"), - &tensor)); + tensor, inv_transpose_order, &tensor, node_def, "after_pad")); } // Reshape for shrink_axis. 
if (final_shape) { @@ -3166,8 +3179,7 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, const bool need_transpose = is_ndhwc; if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 4, 1, 2, 3}, StrCat(node_def.name(), "_to_NCDHW"), - &tensor)); + tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW")); } // group == 0 signifies that this is a depthwise convolution, so set @@ -3206,7 +3218,7 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); layer->setNbGroups(num_groups); conv_layer = layer; } else { @@ -3222,7 +3234,7 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); layer->setNbGroups(num_groups); layer->setDilationNd(dilation_dhw); conv_layer = layer; @@ -3232,8 +3244,7 @@ Status ConvertConv3DHelper(OpConverterParams* params, int group, // Restore transpose. if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 4, 1}, StrCat(node_def.name(), "_to_NDHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -3302,8 +3313,7 @@ Status ConvertPool3D(OpConverterParams* params) { if (data_format == "NDHWC") { // NDHWC => NCDHW TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 4, 1, 2, 3}, StrCat(node_def.name(), "_to_NCDHW"), - &tensor)); + tensor, {0, 4, 1, 2, 3}, &tensor, node_def, "to_NCDHW")); } const nvinfer1::Dims3 stride(tf_stride[d_index], tf_stride[h_index], @@ -3324,14 +3334,13 @@ Status ConvertPool3D(OpConverterParams* params) { // SAME_UPPER means that post padding is preferred. 
layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); } - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); if (data_format == "NDHWC") { // NCDHW => NDHWC TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 4, 1}, StrCat(node_def.name(), "_to_NDHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 4, 1}, &output_tensor, node_def, "to_NDHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -3426,7 +3435,7 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { const bool need_transpose = (data_format == "NHWC"); if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 1, 2}, StrCat(node_def.name(), "_to_NCHW"), &tensor)); + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } nvinfer1::DimsHW kernel_size; @@ -3482,7 +3491,7 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { #else conv_layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); #endif - conv_layer->setName(node_def.name().c_str()); + SetLayerName(conv_layer, node_def); conv_layer->setNbGroups(1); conv_layer->setDilation(dilation); nvinfer1::ITensor* output_tensor = conv_layer->getOutput(0); @@ -3498,8 +3507,7 @@ Status ConvertFusedConv2DBiasActivation(OpConverterParams* params) { // Restore transpose. 
if (need_transpose) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, StrCat(node_def.name(), "_to_NHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -3541,7 +3549,7 @@ Status ConvertPool(OpConverterParams* params) { h_index = 1; w_index = 2; TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 1, 2}, StrCat(node_def.name(), "_to_NCHW"), &tensor)); + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); } const auto tf_stride = attrs.get>("strides"); @@ -3604,13 +3612,12 @@ Status ConvertPool(OpConverterParams* params) { #else layer->setPadding(nvinfer1::DimsHW{padding[0].first, padding[1].first}); #endif - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); if (data_format == "NHWC") { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 2, 3, 1}, StrCat(node_def.name(), "_to_NHWC"), - &output_tensor)); + output_tensor, {0, 2, 3, 1}, &output_tensor, node_def, "to_NHWC")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -3748,7 +3755,7 @@ Status ConvertActivation(OpConverterParams* params) { params->converter->network()->addActivation(*inputs.at(0).tensor(), op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); // Set parameters. 
#if IS_TRT_VERSION_GE(5, 1, 2, 0) if (node_def.op() == "Elu") { @@ -3852,7 +3859,7 @@ Status ConvertRelu6(OpConverterParams* params) { TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); layer->setAlpha(0.0f); layer->setBeta(6.0f); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -4307,7 +4314,7 @@ Status ConvertBinary(OpConverterParams* params) { nvinfer1::ILayer* layer = params->converter->network()->addElementWise( *tensor_l, *tensor_r, op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); nvinfer1::ITensor* trt_tensor = layer->getOutput(0); #if IS_TRT_VERSION_GE(5, 1, 0, 0) @@ -4408,7 +4415,7 @@ Status ConvertUnary(OpConverterParams* params) { nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(*tensor, op_pair->second); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); // Set quantization ranges. 
@@ -4696,7 +4703,7 @@ Status ConvertPad(OpConverterParams* params) { if (pad_index[0] == 1) { legit_pad = false; TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 2, 1}, StrCat(node_def.name(), "_to_pad"), &tensor)); + tensor, {0, 3, 2, 1}, &tensor, node_def, "to_pad")); permuted_pad_index[0] = 3; } @@ -4719,8 +4726,7 @@ Status ConvertPad(OpConverterParams* params) { if (!legit_pad) { TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output_tensor, {0, 3, 2, 1}, StrCat(node_def.name(), "_from_pad"), - &output_tensor)); + output_tensor, {0, 3, 2, 1}, &output_tensor, node_def, "from_pad")); } params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); @@ -5057,7 +5063,7 @@ Status ConvertFusedBatchNorm(OpConverterParams* params) { combined_scale_weights.GetTrtWeights(), dummy_power_weights.GetTrtWeights()); TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name()); - layer->setName(node_def.name().c_str()); + SetLayerName(layer, node_def); nvinfer1::ITensor* output_tensor = layer->getOutput(0); params->outputs->push_back(TRT_TensorOrWeights(output_tensor)); return Status::OK(); @@ -5864,7 +5870,7 @@ Status ConvertResize(OpConverterParams* params) { // Transpose tensor from NHWC to NCHW format. TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - tensor, {0, 3, 1, 2}, StrCat(node_def.name(), "_to_NCHW"), &tensor)); + tensor, {0, 3, 1, 2}, &tensor, node_def, "to_NCHW")); // Calculate output dimensions. 
// Given input dimensions [N, C, H, W] and output size [H_out, W_out], @@ -5891,7 +5897,7 @@ Status ConvertResize(OpConverterParams* params) { nvinfer1::ITensor* output = layer->getOutput(0); TF_RETURN_IF_ERROR(params->converter->TransposeTensor( - output, {0, 2, 3, 1}, StrCat(node_def.name(), "_to_NHWC"), &output)); + output, {0, 2, 3, 1}, &output, node_def, "to_NHWC")); params->outputs->push_back(TRT_TensorOrWeights(output)); // Success return Status::OK(); diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h index a621735fad1..27d713b0fc1 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes.h @@ -515,11 +515,15 @@ class Converter { // Transpose 'input_tensor' with given permutation 'order_with_batch_dim' to // 'output_tensor'. The permutation 'order_with_batch_dim' contains the batch - // dimension which should always be 0. + // dimension which should always be 0. If this is for adding a transpose layer + // to support the conversion of 'node_def', callers need to provide a + // non-empty 'sub_op_name' appended to the name of 'node_def' to avoid layer + // name conflicts. Status TransposeTensor(nvinfer1::ITensor* input_tensor, const std::vector& order_with_batch_dim, - absl::string_view name, - nvinfer1::ITensor** output_tensor); + nvinfer1::ITensor** output_tensor, + const NodeDef& node_def, + absl::string_view sub_op_name = ""); // Converts 'input' into 'tensor' with shape specified by 'dims' (which // doesn't contain the batch dimension). 
diff --git a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc index 95f3f47efb1..8e79c63b902 100644 --- a/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc +++ b/tensorflow/compiler/tf2tensorrt/convert/convert_nodes_test.cc @@ -204,6 +204,16 @@ void ExpectTrtDimsEqualsArray(const std::vector& lhs, << " actual: " << DebugString(rhs); } +void ExpectTrtLayerNames(absl::Span names, + nvinfer1::INetworkDefinition* network) { + EXPECT_EQ(network->getNbLayers(), names.size()); + + for (int i = 0; i < network->getNbLayers(); i++) { + auto layer = network->getLayer(i); + EXPECT_EQ(layer->getName(), names[i]); + } +} + Matcher> ArrayFloatNear(const std::vector& values, float max_abs_error = 1e-5, bool nan_sensitive = false) { @@ -887,24 +897,25 @@ TEST_F(ConverterTest, TransposeTensor) { nvinfer1::ITensor* input_tensor = converter_->network()->addInput( "", nvinfer1::DataType::kFLOAT, GetTestDims({2, 3, 5})); nvinfer1::ITensor* output_tensor = nullptr; - + NodeDef dummy_node_def = MakeNodeDef("dummy_op", "DummyOp", {}); // Rank doesn't match. ExpectStatus( - converter_->TransposeTensor(input_tensor, {0, 1}, "Bad perm", - &output_tensor), + converter_->TransposeTensor(input_tensor, {0, 1}, &output_tensor, + dummy_node_def, "sub1"), error::INVALID_ARGUMENT, "Rank of perm for transpose does not match with that of the input"); // Transpose at batch dimension. - ExpectStatus(converter_->TransposeTensor(input_tensor, {1, 0, 2, 3}, - "Batch perm", &output_tensor), - error::UNIMPLEMENTED, - "Transpose at batch dimension is not supported."); + ExpectStatus( + converter_->TransposeTensor(input_tensor, {1, 0, 2, 3}, &output_tensor, + dummy_node_def, "sub2"), + error::UNIMPLEMENTED, "Transpose at batch dimension is not supported."); // OK. 
- TF_EXPECT_OK(converter_->TransposeTensor(input_tensor, {0, 3, 1, 2}, "OK", - &output_tensor)); + TF_EXPECT_OK(converter_->TransposeTensor( + input_tensor, {0, 3, 1, 2}, &output_tensor, dummy_node_def, "sub3")); ExpectTrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions()); + ExpectTrtLayerNames({"dummy_op-sub3"}, converter_->network()); } void TestPrepareTensorForShape( From 2e56481abcef1dd1625fba465a5d02ee6b347842 Mon Sep 17 00:00:00 2001 From: Mark Daoust Date: Mon, 12 Oct 2020 09:20:40 -0700 Subject: [PATCH 0134/1447] Add doctests, `dict` initialization and `__getitem__` to `tf.lookup` PiperOrigin-RevId: 336677229 Change-Id: If888fbbc0aeb737348e387e6728f825ebd4d118e --- .../python/kernel_tests/lookup_ops_test.py | 1972 ++++++++--------- tensorflow/python/ops/lookup_ops.py | 188 +- 2 files changed, 1085 insertions(+), 1075 deletions(-) diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py index d6f796d5947..d564da12c27 100644 --- a/tensorflow/python/kernel_tests/lookup_ops_test.py +++ b/tensorflow/python/kernel_tests/lookup_ops_test.py @@ -195,6 +195,20 @@ class StaticHashTableTest(BaseLookupTableTest): result = self.evaluate(output) self.assertAllEqual([0, 1, -1], result) + def testStaticHashTableGetItem(self): + default_val = constant_op.constant(-1, dtypes.int64) + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([0, 1, 2], dtypes.int64) + table = self.getHashTable()(lookup_ops.KeyValueTensorInitializer( + keys, values), default_val) + self.initialize_table(table) + + input_string = constant_op.constant(["brain", "salad", "tank"]) + output = table[input_string] + + result = self.evaluate(output) + self.assertAllEqual([0, 1, -1], result) + def testStaticHashTableWithSparseTensorInput(self): default_val = constant_op.constant(-1, dtypes.int64) keys = constant_op.constant(["brain", "salad", "surgery"]) @@ -972,6 +986,21 @@ class 
StaticVocabularyTableTest(BaseLookupTableTest): self.assertAllEqual([0, 1, 2, 3], self.evaluate(out)) self.assertEqual(vocab_size + oov_buckets, self.evaluate(table.size())) + def testStaticVocabularyTableGetItem(self): + vocab_file = self._createVocabFile("feat_to_id_1.txt") + vocab_size = 3 + oov_buckets = 1 + table = self.getVocabularyTable()(lookup_ops.TextFileIdTableInitializer( + vocab_file, vocab_size=vocab_size), oov_buckets) + + self.initialize_table(table) + + input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"]) + + out = table[input_string] + self.assertAllEqual([0, 1, 2, 3], self.evaluate(out)) + self.assertEqual(vocab_size + oov_buckets, self.evaluate(table.size())) + def testInt32StaticVocabularyTable(self): vocab_file = self._createVocabFile("feat_to_id_2.txt", ("42", "1", "-1000")) vocab_size = 3 @@ -1244,71 +1273,85 @@ class StaticVocabularyTableTest(BaseLookupTableTest): class DenseHashTableOpTest(test.TestCase): def testBasic(self): - with self.cached_session(): + keys = constant_op.constant([11, 12, 13, 14], dtypes.int64) + values = constant_op.constant([0, 1, 2, 3], dtypes.int64) + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=0, + deleted_key=-1) + self.assertAllEqual(0, self.evaluate(table.size())) - keys = constant_op.constant([11, 12, 13, 14], dtypes.int64) - values = constant_op.constant([0, 1, 2, 3], dtypes.int64) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(4, self.evaluate(table.size())) + + remove_string = constant_op.constant([12, 15], dtypes.int64) + self.evaluate(table.remove(remove_string)) + self.assertAllEqual(3, self.evaluate(table.size())) + + input_string = constant_op.constant([11, 12, 15], dtypes.int64) + output = table.lookup(input_string) + self.assertAllEqual([3], output.get_shape()) + + result = self.evaluate(output) + self.assertAllEqual([0, -1, -1], result) + + def testGetItem(self): + keys = constant_op.constant([11, 12, 13, 
14], dtypes.int64) + values = constant_op.constant([0, 1, 2, 3], dtypes.int64) + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=0, + deleted_key=-1) + + self.evaluate(table.insert(keys, values)) + + input_string = constant_op.constant([11, 12, 15], dtypes.int64) + output = table[input_string] + self.assertAllEqual([3], output.get_shape()) + + result = self.evaluate(output) + self.assertAllEqual([0, 1, -1], result) + + def testBasicBool(self): + keys = constant_op.constant([11, 12, 13, 14], dtypes.int64) + values = constant_op.constant([True, True, True, True], dtypes.bool) + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.bool, + default_value=False, + empty_key=0, + deleted_key=-1) + self.assertAllEqual(0, self.evaluate(table.size())) + + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(4, self.evaluate(table.size())) + + remove_string = constant_op.constant([11, 15], dtypes.int64) + self.evaluate(table.remove(remove_string)) + self.assertAllEqual(3, self.evaluate(table.size())) + + input_string = constant_op.constant([11, 12, 15], dtypes.int64) + output = table.lookup(input_string) + self.assertAllEqual([3], output.get_shape()) + + result = self.evaluate(output) + self.assertAllEqual([False, True, False], result) + + def testSameEmptyAndDeletedKey(self): + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "Empty and deleted keys"): table = lookup_ops.DenseHashTable( dtypes.int64, dtypes.int64, default_value=-1, - empty_key=0, - deleted_key=-1) + empty_key=42, + deleted_key=42) self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(4, self.evaluate(table.size())) - - remove_string = constant_op.constant([12, 15], dtypes.int64) - self.evaluate(table.remove(remove_string)) - self.assertAllEqual(3, self.evaluate(table.size())) - - input_string = constant_op.constant([11, 12, 15], dtypes.int64) - output = 
table.lookup(input_string) - self.assertAllEqual([3], output.get_shape()) - - result = self.evaluate(output) - self.assertAllEqual([0, -1, -1], result) - - def testBasicBool(self): - with self.cached_session(): - - keys = constant_op.constant([11, 12, 13, 14], dtypes.int64) - values = constant_op.constant([True, True, True, True], dtypes.bool) - table = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.bool, - default_value=False, - empty_key=0, - deleted_key=-1) - self.assertAllEqual(0, self.evaluate(table.size())) - - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(4, self.evaluate(table.size())) - - remove_string = constant_op.constant([11, 15], dtypes.int64) - self.evaluate(table.remove(remove_string)) - self.assertAllEqual(3, self.evaluate(table.size())) - - input_string = constant_op.constant([11, 12, 15], dtypes.int64) - output = table.lookup(input_string) - self.assertAllEqual([3], output.get_shape()) - - result = self.evaluate(output) - self.assertAllEqual([False, True, False], result) - - def testSameEmptyAndDeletedKey(self): - with self.cached_session(): - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "Empty and deleted keys"): - table = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=-1, - empty_key=42, - deleted_key=42) - self.assertAllEqual(0, self.evaluate(table.size())) - @test_util.run_v1_only("uses placeholders") def testLookupUnknownShape(self): with self.cached_session(): @@ -1331,212 +1374,203 @@ class DenseHashTableOpTest(test.TestCase): self.assertAllEqual([0, 1, -1], result) def testMapStringToFloat(self): - with self.cached_session(): + keys = constant_op.constant(["a", "b", "c", "d"], dtypes.string) + values = constant_op.constant([0.0, 1.1, 2.2, 3.3], dtypes.float32) + default_value = constant_op.constant(-1.5, dtypes.float32) + table = lookup_ops.DenseHashTable( + dtypes.string, + dtypes.float32, + default_value=default_value, + empty_key="", + deleted_key="$") + 
self.assertAllEqual(0, self.evaluate(table.size())) - keys = constant_op.constant(["a", "b", "c", "d"], dtypes.string) - values = constant_op.constant([0.0, 1.1, 2.2, 3.3], dtypes.float32) - default_value = constant_op.constant(-1.5, dtypes.float32) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(4, self.evaluate(table.size())) + + remove_string = constant_op.constant(["b", "e"]) + self.evaluate(table.remove(remove_string)) + self.assertAllEqual(3, self.evaluate(table.size())) + + input_string = constant_op.constant(["a", "b", "d", "e"], dtypes.string) + output = table.lookup(input_string) + self.assertAllEqual([4], output.get_shape()) + + result = self.evaluate(output) + self.assertAllClose([0, -1.5, 3.3, -1.5], result) + + def testMapInt64ToFloat(self): + for float_dtype in [dtypes.float32, dtypes.float64]: + keys = constant_op.constant([11, 12, 13, 14], dtypes.int64) + values = constant_op.constant([0.0, 1.1, 2.2, 3.3], float_dtype) + default_value = constant_op.constant(-1.5, float_dtype) table = lookup_ops.DenseHashTable( - dtypes.string, - dtypes.float32, + dtypes.int64, + float_dtype, default_value=default_value, - empty_key="", - deleted_key="$") + empty_key=0, + deleted_key=-1) self.assertAllEqual(0, self.evaluate(table.size())) self.evaluate(table.insert(keys, values)) self.assertAllEqual(4, self.evaluate(table.size())) - remove_string = constant_op.constant(["b", "e"]) + remove_string = constant_op.constant([12, 15], dtypes.int64) self.evaluate(table.remove(remove_string)) self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant(["a", "b", "d", "e"], dtypes.string) + input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64) output = table.lookup(input_string) self.assertAllEqual([4], output.get_shape()) result = self.evaluate(output) self.assertAllClose([0, -1.5, 3.3, -1.5], result) - def testMapInt64ToFloat(self): - for float_dtype in [dtypes.float32, dtypes.float64]: - with 
self.cached_session(): - - keys = constant_op.constant([11, 12, 13, 14], dtypes.int64) - values = constant_op.constant([0.0, 1.1, 2.2, 3.3], float_dtype) - default_value = constant_op.constant(-1.5, float_dtype) - table = lookup_ops.DenseHashTable( - dtypes.int64, - float_dtype, - default_value=default_value, - empty_key=0, - deleted_key=-1) - self.assertAllEqual(0, self.evaluate(table.size())) - - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(4, self.evaluate(table.size())) - - remove_string = constant_op.constant([12, 15], dtypes.int64) - self.evaluate(table.remove(remove_string)) - self.assertAllEqual(3, self.evaluate(table.size())) - - input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64) - output = table.lookup(input_string) - self.assertAllEqual([4], output.get_shape()) - - result = self.evaluate(output) - self.assertAllClose([0, -1.5, 3.3, -1.5], result) - def testVectorValues(self): - with self.cached_session(): - keys = constant_op.constant([11, 12, 13], dtypes.int64) - values = constant_op.constant([[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]], - dtypes.int64) - default_value = constant_op.constant([-1, -2, -3, -4], dtypes.int64) - table = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=default_value, - empty_key=0, - deleted_key=-1, - initial_num_buckets=4) - self.assertAllEqual(0, self.evaluate(table.size())) + keys = constant_op.constant([11, 12, 13], dtypes.int64) + values = constant_op.constant([[0, 1, 2, 3], [3, 4, 5, 6], [6, 7, 8, 9]], + dtypes.int64) + default_value = constant_op.constant([-1, -2, -3, -4], dtypes.int64) + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=default_value, + empty_key=0, + deleted_key=-1, + initial_num_buckets=4) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) - self.assertAllEqual(4, len(self.evaluate(table.export()[0]))) + 
self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) + self.assertAllEqual(4, len(self.evaluate(table.export()[0]))) - self.evaluate( - table.insert( - constant_op.constant([14], dtypes.int64), - constant_op.constant([[2, 3, 4, 5]], dtypes.int64))) - self.assertAllEqual(4, self.evaluate(table.size())) - self.assertAllEqual(8, len(self.evaluate(table.export()[0]))) + self.evaluate( + table.insert( + constant_op.constant([14], dtypes.int64), + constant_op.constant([[2, 3, 4, 5]], dtypes.int64))) + self.assertAllEqual(4, self.evaluate(table.size())) + self.assertAllEqual(8, len(self.evaluate(table.export()[0]))) - remove_string = constant_op.constant([12, 16], dtypes.int64) - self.evaluate(table.remove(remove_string)) - self.assertAllEqual(3, self.evaluate(table.size())) - self.assertAllEqual(8, len(self.evaluate(table.export()[0]))) + remove_string = constant_op.constant([12, 16], dtypes.int64) + self.evaluate(table.remove(remove_string)) + self.assertAllEqual(3, self.evaluate(table.size())) + self.assertAllEqual(8, len(self.evaluate(table.export()[0]))) - input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64) - output = table.lookup(input_string) - self.assertAllEqual([4, 4], - output.shape, - msg="Saw shape: %s" % output.shape) + input_string = constant_op.constant([11, 12, 14, 15], dtypes.int64) + output = table.lookup(input_string) + self.assertAllEqual([4, 4], + output.shape, + msg="Saw shape: %s" % output.shape) - result = self.evaluate(output) - self.assertAllEqual( - [[0, 1, 2, 3], [-1, -2, -3, -4], [2, 3, 4, 5], [-1, -2, -3, -4]], - result) + result = self.evaluate(output) + self.assertAllEqual( + [[0, 1, 2, 3], [-1, -2, -3, -4], [2, 3, 4, 5], [-1, -2, -3, -4]], + result) def testVectorKeys(self): - with self.cached_session(): - keys = constant_op.constant([[0, 1], [1, 2], [1, 3]], dtypes.int64) - values = constant_op.constant([10, 11, 12], dtypes.int64) - empty_key = constant_op.constant([0, 3], 
dtypes.int64) - deleted_key = constant_op.constant([-1, -1], dtypes.int64) - default_value = constant_op.constant(-1, dtypes.int64) - table = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=default_value, - empty_key=empty_key, - deleted_key=deleted_key, - initial_num_buckets=8) - self.assertAllEqual(0, self.evaluate(table.size())) + keys = constant_op.constant([[0, 1], [1, 2], [1, 3]], dtypes.int64) + values = constant_op.constant([10, 11, 12], dtypes.int64) + empty_key = constant_op.constant([0, 3], dtypes.int64) + deleted_key = constant_op.constant([-1, -1], dtypes.int64) + default_value = constant_op.constant(-1, dtypes.int64) + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=default_value, + empty_key=empty_key, + deleted_key=deleted_key, + initial_num_buckets=8) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - self.evaluate( - table.insert( - constant_op.constant([[0, 0]], dtypes.int64), - constant_op.constant([13], dtypes.int64))) - self.assertAllEqual(4, self.evaluate(table.size())) - self.assertAllEqual(8, len(self.evaluate(table.export()[0]))) + self.evaluate( + table.insert( + constant_op.constant([[0, 0]], dtypes.int64), + constant_op.constant([13], dtypes.int64))) + self.assertAllEqual(4, self.evaluate(table.size())) + self.assertAllEqual(8, len(self.evaluate(table.export()[0]))) - remove_string = constant_op.constant([[1, 2], [7, 8]], dtypes.int64) - self.evaluate(table.remove(remove_string)) - self.assertAllEqual(3, self.evaluate(table.size())) - self.assertAllEqual(8, len(self.evaluate(table.export()[0]))) + remove_string = constant_op.constant([[1, 2], [7, 8]], dtypes.int64) + self.evaluate(table.remove(remove_string)) + self.assertAllEqual(3, self.evaluate(table.size())) + 
self.assertAllEqual(8, len(self.evaluate(table.export()[0]))) - input_string = constant_op.constant([[0, 1], [1, 2], [1, 3], [0, 2]], - dtypes.int64) - output = table.lookup(input_string) - self.assertAllEqual([4], output.get_shape()) + input_string = constant_op.constant([[0, 1], [1, 2], [1, 3], [0, 2]], + dtypes.int64) + output = table.lookup(input_string) + self.assertAllEqual([4], output.get_shape()) - result = self.evaluate(output) - self.assertAllEqual([10, -1, 12, -1], result) + result = self.evaluate(output) + self.assertAllEqual([10, -1, 12, -1], result) def testResize(self): - with self.cached_session(): - keys = constant_op.constant([11, 12, 13], dtypes.int64) - values = constant_op.constant([0, 1, 2], dtypes.int64) - table = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=-1, - empty_key=0, - deleted_key=-1, - initial_num_buckets=4) - self.assertAllEqual(0, self.evaluate(table.size())) + keys = constant_op.constant([11, 12, 13], dtypes.int64) + values = constant_op.constant([0, 1, 2], dtypes.int64) + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=0, + deleted_key=-1, + initial_num_buckets=4) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) - self.assertAllEqual(4, len(self.evaluate(table.export()[0]))) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) + self.assertAllEqual(4, len(self.evaluate(table.export()[0]))) - keys2 = constant_op.constant([12, 99], dtypes.int64) - self.evaluate(table.remove(keys2)) - self.assertAllEqual(2, self.evaluate(table.size())) - self.assertAllEqual(4, len(self.evaluate(table.export()[0]))) + keys2 = constant_op.constant([12, 99], dtypes.int64) + self.evaluate(table.remove(keys2)) + self.assertAllEqual(2, self.evaluate(table.size())) + self.assertAllEqual(4, len(self.evaluate(table.export()[0]))) - 
keys3 = constant_op.constant([13, 14, 15, 16, 17], dtypes.int64) - values3 = constant_op.constant([3, 4, 5, 6, 7], dtypes.int64) + keys3 = constant_op.constant([13, 14, 15, 16, 17], dtypes.int64) + values3 = constant_op.constant([3, 4, 5, 6, 7], dtypes.int64) - self.evaluate(table.insert(keys3, values3)) - self.assertAllEqual(6, self.evaluate(table.size())) - self.assertAllEqual(16, len(self.evaluate(table.export()[0]))) + self.evaluate(table.insert(keys3, values3)) + self.assertAllEqual(6, self.evaluate(table.size())) + self.assertAllEqual(16, len(self.evaluate(table.export()[0]))) - keys4 = constant_op.constant([10, 11, 12, 13, 14, 15, 16, 17, 18], - dtypes.int64) - output = table.lookup(keys4) - self.assertAllEqual([-1, 0, -1, 3, 4, 5, 6, 7, -1], self.evaluate(output)) + keys4 = constant_op.constant([10, 11, 12, 13, 14, 15, 16, 17, 18], + dtypes.int64) + output = table.lookup(keys4) + self.assertAllEqual([-1, 0, -1, 3, 4, 5, 6, 7, -1], self.evaluate(output)) def testExport(self): - with self.cached_session(): + keys = constant_op.constant([11, 12, 13, 14], dtypes.int64) + values = constant_op.constant([1, 2, 3, 4], dtypes.int64) + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=100, + deleted_key=200, + initial_num_buckets=8) + self.assertAllEqual(0, self.evaluate(table.size())) - keys = constant_op.constant([11, 12, 13, 14], dtypes.int64) - values = constant_op.constant([1, 2, 3, 4], dtypes.int64) - table = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=-1, - empty_key=100, - deleted_key=200, - initial_num_buckets=8) - self.assertAllEqual(0, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(4, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(4, self.evaluate(table.size())) + keys2 = constant_op.constant([12, 15], dtypes.int64) + self.evaluate(table.remove(keys2)) + self.assertAllEqual(3, 
self.evaluate(table.size())) - keys2 = constant_op.constant([12, 15], dtypes.int64) - self.evaluate(table.remove(keys2)) - self.assertAllEqual(3, self.evaluate(table.size())) + exported_keys, exported_values = table.export() - exported_keys, exported_values = table.export() + np_keys = self.evaluate(exported_keys) + np_values = self.evaluate(exported_values) - np_keys = self.evaluate(exported_keys) - np_values = self.evaluate(exported_values) + self.assertAllEqual(8, len(np_keys)) + self.assertAllEqual(8, len(np_values)) - self.assertAllEqual(8, len(np_keys)) - self.assertAllEqual(8, len(np_values)) - - # pair up keys and values, drop extra added dimension - pairs = np.dstack((np_keys.flatten(), np_values.flatten()))[0] - # sort by key - pairs = pairs[pairs[:, 0].argsort()] - self.assertAllEqual([[11, 1], [13, 3], [14, 4], [100, 0], [100, 0], - [100, 0], [100, 0], [200, 2]], pairs) + # pair up keys and values, drop extra added dimension + pairs = np.dstack((np_keys.flatten(), np_values.flatten()))[0] + # sort by key + pairs = pairs[pairs[:, 0].argsort()] + self.assertAllEqual([[11, 1], [13, 3], [14, 4], [100, 0], [100, 0], + [100, 0], [100, 0], [200, 2]], pairs) @test_util.run_v1_only("Saver V1 only") def testSaveRestore(self): @@ -1910,137 +1944,134 @@ class DenseHashTableOpTest(test.TestCase): self.assertAllEqual([0, 1, -1, 3, -1], output) def testReprobe(self): - with self.cached_session(): - # Insert 6 keys into a table with 8 buckets. - # The values are chosen to make sure collisions occur when using GCC STL - keys = constant_op.constant([11, 12, 13, 19, 20, 21], dtypes.int64) - values = constant_op.constant([51, 52, 53, 54, 55, 56], dtypes.int64) - table = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=-1, - empty_key=0, - deleted_key=-1, - initial_num_buckets=8) - self.assertAllEqual(0, self.evaluate(table.size())) + # Insert 6 keys into a table with 8 buckets. 
+ # The values are chosen to make sure collisions occur when using GCC STL + keys = constant_op.constant([11, 12, 13, 19, 20, 21], dtypes.int64) + values = constant_op.constant([51, 52, 53, 54, 55, 56], dtypes.int64) + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=0, + deleted_key=-1, + initial_num_buckets=8) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(6, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(6, self.evaluate(table.size())) - input_string = constant_op.constant([10, 11, 12, 13, 14, 19, 20, 21, 22], - dtypes.int64) - output = table.lookup(input_string) - self.assertAllEqual([9], output.get_shape()) + input_string = constant_op.constant([10, 11, 12, 13, 14, 19, 20, 21, 22], + dtypes.int64) + output = table.lookup(input_string) + self.assertAllEqual([9], output.get_shape()) - result = self.evaluate(output) - self.assertAllEqual([-1, 51, 52, 53, -1, 54, 55, 56, -1], result) + result = self.evaluate(output) + self.assertAllEqual([-1, 51, 52, 53, -1, 54, 55, 56, -1], result) def testCustomEmptyKey(self): - with self.cached_session(): - keys = constant_op.constant([11, 0, 13], dtypes.int64) - values = constant_op.constant([0, 1, 2], dtypes.int64) - table = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=-1, - empty_key=12, - deleted_key=-1) - self.assertAllEqual(0, self.evaluate(table.size())) + keys = constant_op.constant([11, 0, 13], dtypes.int64) + values = constant_op.constant([0, 1, 2], dtypes.int64) + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=12, + deleted_key=-1) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, 
self.evaluate(table.size())) - input_string = constant_op.constant([11, 0, 15], dtypes.int64) - output = table.lookup(input_string) - self.assertAllEqual([3], output.get_shape()) + input_string = constant_op.constant([11, 0, 15], dtypes.int64) + output = table.lookup(input_string) + self.assertAllEqual([3], output.get_shape()) - result = self.evaluate(output) - self.assertAllEqual([0, 1, -1], result) + result = self.evaluate(output) + self.assertAllEqual([0, 1, -1], result) def testErrors(self): - with self.cached_session(): - table = lookup_ops.DenseHashTable( + table = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=0, + deleted_key=-1) + + # Inserting the empty key returns an error + keys1 = constant_op.constant([11, 0], dtypes.int64) + values1 = constant_op.constant([0, 1], dtypes.int64) + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "empty_key"): + self.evaluate(table.insert(keys1, values1)) + + # Looking up the empty key returns an error + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "empty_key"): + self.evaluate(table.lookup(keys1)) + + # Inserting the deleted key returns an error + keys2 = constant_op.constant([11, -1], dtypes.int64) + values2 = constant_op.constant([0, 1], dtypes.int64) + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "deleted_key"): + self.evaluate(table.insert(keys2, values2)) + + # Looking up the empty key returns an error + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "deleted_key"): + self.evaluate(table.lookup(keys2)) + + # Arbitrary tensors of keys are not supported + keys = constant_op.constant([[11, 0], [12, 1]], dtypes.int64) + values = constant_op.constant([[11, 0], [12, 1]], dtypes.int64) + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "Expected key shape"): + self.evaluate(table.lookup(keys)) + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "Expected key shape"): + 
self.evaluate(table.insert(keys, values)) + + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "Number of buckets must be"): + table2 = lookup_ops.DenseHashTable( dtypes.int64, dtypes.int64, default_value=-1, - empty_key=0, - deleted_key=-1) + empty_key=17, + deleted_key=-1, + initial_num_buckets=12) + self.assertAllEqual(0, self.evaluate(table2.size())) - # Inserting the empty key returns an error - keys1 = constant_op.constant([11, 0], dtypes.int64) - values1 = constant_op.constant([0, 1], dtypes.int64) - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "empty_key"): - self.evaluate(table.insert(keys1, values1)) + with self.assertRaisesRegex( + errors_impl.InvalidArgumentError, + "Empty and deleted keys must have same shape"): + table3 = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=42, + deleted_key=[1, 2]) + self.assertAllEqual(0, self.evaluate(table3.size())) - # Looking up the empty key returns an error - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "empty_key"): - self.evaluate(table.lookup(keys1)) + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "Empty and deleted keys cannot be equal"): + table4 = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=42, + deleted_key=42) + self.assertAllEqual(0, self.evaluate(table4.size())) - # Inserting the deleted key returns an error - keys2 = constant_op.constant([11, -1], dtypes.int64) - values2 = constant_op.constant([0, 1], dtypes.int64) - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "deleted_key"): - self.evaluate(table.insert(keys2, values2)) - - # Looking up the empty key returns an error - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "deleted_key"): - self.evaluate(table.lookup(keys2)) - - # Arbitrary tensors of keys are not supported - keys = constant_op.constant([[11, 0], [12, 1]], dtypes.int64) - values = 
constant_op.constant([[11, 0], [12, 1]], dtypes.int64) - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "Expected key shape"): - self.evaluate(table.lookup(keys)) - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "Expected key shape"): - self.evaluate(table.insert(keys, values)) - - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "Number of buckets must be"): - table2 = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=-1, - empty_key=17, - deleted_key=-1, - initial_num_buckets=12) - self.assertAllEqual(0, self.evaluate(table2.size())) - - with self.assertRaisesRegex( - errors_impl.InvalidArgumentError, - "Empty and deleted keys must have same shape"): - table3 = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=-1, - empty_key=42, - deleted_key=[1, 2]) - self.assertAllEqual(0, self.evaluate(table3.size())) - - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "Empty and deleted keys cannot be equal"): - table4 = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=-1, - empty_key=42, - deleted_key=42) - self.assertAllEqual(0, self.evaluate(table4.size())) - - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "Empty and deleted keys cannot be equal"): - table5 = lookup_ops.DenseHashTable( - dtypes.int64, - dtypes.int64, - default_value=-1, - empty_key=[1, 2, 3], - deleted_key=[1, 2, 3]) - self.assertAllEqual(0, self.evaluate(table5.size())) + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "Empty and deleted keys cannot be equal"): + table5 = lookup_ops.DenseHashTable( + dtypes.int64, + dtypes.int64, + default_value=-1, + empty_key=[1, 2, 3], + deleted_key=[1, 2, 3]) + self.assertAllEqual(0, self.evaluate(table5.size())) @test_util.run_in_graph_and_eager_modes def testStringToResource(self): @@ -2091,68 +2122,65 @@ class IndexTableFromFile(test.TestCase): def test_string_index_table_from_file(self): 
vocabulary_file = self._createVocabFile("f2i_vocab1.txt") - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, num_oov_buckets=1) - ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, 3), self.evaluate(ids)) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, num_oov_buckets=1) + ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) + + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, 3), self.evaluate(ids)) def test_string_index_table_from_multicolumn_file(self): vocabulary_file = self._createVocabFile( "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1")) - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, - num_oov_buckets=1, - key_column_index=0, - value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER) - ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, + num_oov_buckets=1, + key_column_index=0, + value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER) + ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, 3), self.evaluate(ids)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, 3), self.evaluate(ids)) def 
test_string_index_table_from_multicolumn_file_custom_delimiter(self): vocabulary_file = self._createVocabFile( "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1")) - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, - num_oov_buckets=1, - key_column_index=0, - value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER, - delimiter=" ") - ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, + num_oov_buckets=1, + key_column_index=0, + value_column_index=lookup_ops.TextFileIndex.LINE_NUMBER, + delimiter=" ") + ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, 3), self.evaluate(ids)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, 3), self.evaluate(ids)) def test_string_index_table_from_file_tensor_filename(self): vocabulary_file = self._createVocabFile("f2i_vocab1.txt") - with self.cached_session(): - vocabulary_file = constant_op.constant(vocabulary_file) - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, num_oov_buckets=1) - ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) + vocabulary_file = constant_op.constant(vocabulary_file) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, num_oov_buckets=1) + ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, 3), self.evaluate(ids)) - if not 
context.executing_eagerly(): - self.assertEqual(1, - len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS))) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, 3), self.evaluate(ids)) + if not context.executing_eagerly(): + self.assertEqual(1, + len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS))) @test_util.run_v1_only("placeholder usage") def test_string_index_table_from_file_placeholder_filename(self): @@ -2175,70 +2203,64 @@ class IndexTableFromFile(test.TestCase): def test_int32_index_table_from_file(self): vocabulary_file = self._createVocabFile( "f2i_vocab2.txt", values=("42", "1", "-1000")) - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, - num_oov_buckets=1, - key_dtype=dtypes.int32) - ids = table.lookup( - constant_op.constant((1, -1000, 11), dtype=dtypes.int32)) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, + num_oov_buckets=1, + key_dtype=dtypes.int32) + ids = table.lookup(constant_op.constant((1, -1000, 11), dtype=dtypes.int32)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, 3), self.evaluate(ids)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, 3), self.evaluate(ids)) def test_int64_index_table_from_file(self): vocabulary_file = self._createVocabFile( "f2i_vocab3.txt", values=("42", "1", "-1000")) - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, - num_oov_buckets=1, - key_dtype=dtypes.int64) - ids = table.lookup( - constant_op.constant((1, -1000, 11), dtype=dtypes.int64)) + table = 
lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, + num_oov_buckets=1, + key_dtype=dtypes.int64) + ids = table.lookup(constant_op.constant((1, -1000, 11), dtype=dtypes.int64)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, 3), self.evaluate(ids)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, 3), self.evaluate(ids)) def test_index_table_from_file_with_default_value(self): default_value = -42 vocabulary_file = self._createVocabFile("f2i_vocab4.txt") - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, default_value=default_value) - ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, default_value=default_value) + ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, default_value), self.evaluate(ids)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, default_value), self.evaluate(ids)) def test_index_table_from_file_with_oov_buckets(self): vocabulary_file = self._createVocabFile("f2i_vocab5.txt") - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, num_oov_buckets=1000) - ids = table.lookup( - constant_op.constant(["salad", "surgery", "tarkus", "toccata"])) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, 
num_oov_buckets=1000) + ids = table.lookup( + constant_op.constant(["salad", "surgery", "tarkus", "toccata"])) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual( - ( - 1, # From vocabulary file. - 2, # From vocabulary file. - 867, # 3 + fingerprint("tarkus") mod 300. - 860), # 3 + fingerprint("toccata") mod 300. - self.evaluate(ids)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual( + ( + 1, # From vocabulary file. + 2, # From vocabulary file. + 867, # 3 + fingerprint("tarkus") mod 300. + 860), # 3 + fingerprint("toccata") mod 300. + self.evaluate(ids)) def test_index_table_from_file_fails_with_empty_vocabulary_file_name(self): self.assertRaises( @@ -2269,26 +2291,24 @@ class IndexTableFromFile(test.TestCase): def test_index_table_from_file_with_vocab_size_too_small(self): vocabulary_file = self._createVocabFile("f2i_vocab6.txt") - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, vocab_size=2) - ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, vocab_size=2) + ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, -1, -1), self.evaluate(ids)) - self.assertEqual(2, self.evaluate(table.size())) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, -1, -1), self.evaluate(ids)) + self.assertEqual(2, self.evaluate(table.size())) def 
test_index_table_from_file_with_vocab_size_too_large(self): vocabulary_file = self._createVocabFile("f2i_vocab7.txt") - with self.cached_session(): - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "Invalid vocab_size"): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, vocab_size=4) - self.evaluate(table.initializer) + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "Invalid vocab_size"): + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, vocab_size=4) + self.evaluate(table.initializer) def test_index_table_from_file_with_vocab_size(self): vocabulary_file = self._createVocabFile("f2i_vocab8.txt") @@ -2299,50 +2319,46 @@ class IndexTableFromFile(test.TestCase): vocabulary_file=vocabulary_file, vocab_size=0) - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, vocab_size=3) - ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, vocab_size=3) + ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, -1), self.evaluate(ids)) - self.assertEqual(3, self.evaluate(table.size())) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, -1), self.evaluate(ids)) + self.assertEqual(3, self.evaluate(table.size())) def test_index_table_from_file_with_invalid_hashers(self): vocabulary_file = self._createVocabFile("invalid_hasher.txt") - with self.cached_session(): - with self.assertRaises(TypeError): - lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, - vocab_size=3, - num_oov_buckets=1, - 
hasher_spec=1) - - table = lookup_ops.index_table_from_file( + with self.assertRaises(TypeError): + lookup_ops.index_table_from_file( vocabulary_file=vocabulary_file, vocab_size=3, num_oov_buckets=1, - hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None)) + hasher_spec=1) - self.assertRaises(ValueError, table.lookup, - constant_op.constant(["salad", "surgery", "tarkus"])) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, + vocab_size=3, + num_oov_buckets=1, + hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None)) + + self.assertRaises(ValueError, table.lookup, + constant_op.constant(["salad", "surgery", "tarkus"])) def test_index_table_from_file_table_ref_with_oov_buckets(self): vocabulary_file = self._createVocabFile("f2i_vocab9.txt") - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, num_oov_buckets=1) - self.assertIsNotNone(table.resource_handle) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, num_oov_buckets=1) + self.assertIsNotNone(table.resource_handle) def test_index_table_from_file_table_ref_without_oov_buckets(self): vocabulary_file = self._createVocabFile("f2i_vocab10.txt") - with self.cached_session(): - table = lookup_ops.index_table_from_file( - vocabulary_file=vocabulary_file, num_oov_buckets=0) - self.assertIsNotNone(table.resource_handle) + table = lookup_ops.index_table_from_file( + vocabulary_file=vocabulary_file, num_oov_buckets=0) + self.assertIsNotNone(table.resource_handle) class IndexTableFromTensor(test.TestCase): @@ -2365,75 +2381,67 @@ class IndexTableFromTensor(test.TestCase): self.assertAllEqual((1, 2, 3), self.evaluate(ids)) def test_int32_index_table_from_tensor_with_tensor_init(self): - with self.cached_session(): - table = lookup_ops.index_table_from_tensor( - vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int32) - ids = table.lookup( - constant_op.constant((1, -1000, 11), 
dtype=dtypes.int32)) + table = lookup_ops.index_table_from_tensor( + vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int32) + ids = table.lookup(constant_op.constant((1, -1000, 11), dtype=dtypes.int32)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.FailedPreconditionError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, 3), self.evaluate(ids)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.FailedPreconditionError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, 3), self.evaluate(ids)) def test_int64_index_table_from_tensor_with_tensor_init(self): - with self.cached_session(): - table = lookup_ops.index_table_from_tensor( - vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int64) - ids = table.lookup( - constant_op.constant((1, -1000, 11), dtype=dtypes.int64)) + table = lookup_ops.index_table_from_tensor( + vocabulary_list=(42, 1, -1000), num_oov_buckets=1, dtype=dtypes.int64) + ids = table.lookup(constant_op.constant((1, -1000, 11), dtype=dtypes.int64)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.FailedPreconditionError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, 3), self.evaluate(ids)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.FailedPreconditionError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, 3), self.evaluate(ids)) def test_index_table_from_tensor_with_default_value(self): default_value = -42 - with self.cached_session(): - table = lookup_ops.index_table_from_tensor( - vocabulary_list=["brain", "salad", "surgery"], - default_value=default_value) - ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) + table = lookup_ops.index_table_from_tensor( + vocabulary_list=["brain", "salad", 
"surgery"], + default_value=default_value) + ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"])) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.FailedPreconditionError): - self.evaluate(ids) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((1, 2, default_value), self.evaluate(ids)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.FailedPreconditionError): + self.evaluate(ids) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((1, 2, default_value), self.evaluate(ids)) def test_index_table_from_tensor_missing_vocabulary_list(self): - with self.cached_session(): - with self.assertRaisesRegex(ValueError, - "vocabulary_list must be specified"): - lookup_ops.index_table_from_tensor( - vocabulary_list=None, num_oov_buckets=1) + with self.assertRaisesRegex(ValueError, + "vocabulary_list must be specified"): + lookup_ops.index_table_from_tensor( + vocabulary_list=None, num_oov_buckets=1) def test_index_table_from_tensor_empty_vocabulary_list(self): - with self.cached_session(): - with self.assertRaisesRegex(errors_impl.OpError, - "keys and values cannot be empty"): - _ = lookup_ops.index_table_from_tensor( - vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1) - self.evaluate(lookup_ops.tables_initializer()) + with self.assertRaisesRegex(errors_impl.OpError, + "keys and values cannot be empty"): + _ = lookup_ops.index_table_from_tensor( + vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1) + self.evaluate(lookup_ops.tables_initializer()) def test_index_table_from_tensor_with_invalid_hashers(self): - with self.cached_session(): - with self.assertRaises(TypeError): - lookup_ops.index_table_from_tensor( - vocabulary_list=["brain", "salad", "surgery"], - num_oov_buckets=1, - hasher_spec=1) - - table = lookup_ops.index_table_from_tensor( + with self.assertRaises(TypeError): + lookup_ops.index_table_from_tensor( vocabulary_list=["brain", 
"salad", "surgery"], num_oov_buckets=1, - hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None)) + hasher_spec=1) - self.assertRaises(ValueError, table.lookup, - constant_op.constant(["salad", "surgery", "tarkus"])) + table = lookup_ops.index_table_from_tensor( + vocabulary_list=["brain", "salad", "surgery"], + num_oov_buckets=1, + hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None)) + + self.assertRaises(ValueError, table.lookup, + constant_op.constant(["salad", "surgery", "tarkus"])) class IndexToStringTableFromFileTest(test.TestCase): @@ -2450,147 +2458,135 @@ class IndexToStringTableFromFileTest(test.TestCase): type_funcs = [str, constant_op.constant] for type_func in type_funcs: vocabulary_file = type_func(vocabulary_path) - with self.cached_session(): - table = lookup_ops.index_to_string_table_from_file( - vocabulary_file=vocabulary_file) - features = table.lookup( - constant_op.constant([0, 1, 2, 3], dtypes.int64)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(features) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"), - self.evaluate(features)) + table = lookup_ops.index_to_string_table_from_file( + vocabulary_file=vocabulary_file) + features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(features) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"), + self.evaluate(features)) def test_index_to_string_table_from_multicolumn_file(self): vocabulary_file = self._createVocabFile( "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1")) - with self.cached_session(): - table = lookup_ops.index_to_string_table_from_file( - vocabulary_file=vocabulary_file, - key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER, - value_column_index=0) - features = 
table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(features) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"), - self.evaluate(features)) + table = lookup_ops.index_to_string_table_from_file( + vocabulary_file=vocabulary_file, + key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER, + value_column_index=0) + features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(features) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"), + self.evaluate(features)) def test_index_to_string_table_from_multicolumn_file_custom_delimiter(self): vocabulary_file = self._createVocabFile( "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1")) - with self.cached_session(): - table = lookup_ops.index_to_string_table_from_file( - vocabulary_file=vocabulary_file, - key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER, - value_column_index=0, - delimiter=" ") - features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(features) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"), - self.evaluate(features)) + table = lookup_ops.index_to_string_table_from_file( + vocabulary_file=vocabulary_file, + key_column_index=lookup_ops.TextFileIndex.LINE_NUMBER, + value_column_index=0, + delimiter=" ") + features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(features) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((b"brain", 
b"salad", b"surgery", b"UNK"), + self.evaluate(features)) def test_index_to_string_table_with_default_value(self): default_value = b"NONE" vocabulary_file = self._createVocabFile("f2i_vocab2.txt") - with self.cached_session(): - table = lookup_ops.index_to_string_table_from_file( - vocabulary_file=vocabulary_file, default_value=default_value) - features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(features) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((b"salad", b"surgery", default_value), - self.evaluate(features)) + table = lookup_ops.index_to_string_table_from_file( + vocabulary_file=vocabulary_file, default_value=default_value) + features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(features) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((b"salad", b"surgery", default_value), + self.evaluate(features)) def test_index_to_string_table_with_vocab_size_too_small(self): default_value = b"NONE" vocabulary_file = self._createVocabFile("f2i_vocab2.txt") - with self.cached_session(): - table = lookup_ops.index_to_string_table_from_file( - vocabulary_file=vocabulary_file, - vocab_size=2, - default_value=default_value) - features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(features) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((b"salad", default_value, default_value), - self.evaluate(features)) + table = lookup_ops.index_to_string_table_from_file( + vocabulary_file=vocabulary_file, + vocab_size=2, + default_value=default_value) + features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64)) + if not context.executing_eagerly(): + with 
self.assertRaises(errors_impl.OpError): + self.evaluate(features) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((b"salad", default_value, default_value), + self.evaluate(features)) def test_index_to_string_table_with_vocab_size_too_large(self): vocabulary_file = self._createVocabFile("f2i_vocab6.txt") - with self.cached_session(): - with self.assertRaisesRegex(errors_impl.InvalidArgumentError, - "Invalid vocab_size"): - _ = lookup_ops.index_to_string_table_from_file( - vocabulary_file=vocabulary_file, vocab_size=4) - self.evaluate(lookup_ops.tables_initializer()) + with self.assertRaisesRegex(errors_impl.InvalidArgumentError, + "Invalid vocab_size"): + _ = lookup_ops.index_to_string_table_from_file( + vocabulary_file=vocabulary_file, vocab_size=4) + self.evaluate(lookup_ops.tables_initializer()) def test_index_to_string_table_with_vocab_size(self): vocabulary_file = self._createVocabFile("f2i_vocab7.txt") - with self.cached_session(): - table = lookup_ops.index_to_string_table_from_file( - vocabulary_file=vocabulary_file, vocab_size=3) - features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64)) + table = lookup_ops.index_to_string_table_from_file( + vocabulary_file=vocabulary_file, vocab_size=3) + features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64)) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(features) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((b"salad", b"surgery", b"UNK"), - self.evaluate(features)) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(features) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((b"salad", b"surgery", b"UNK"), self.evaluate(features)) class IndexToStringTableFromTensorTest(test.TestCase): def test_index_to_string_table_from_tensor(self): - with self.cached_session(): - vocabulary_list = constant_op.constant(["brain", 
"salad", "surgery"]) - table = lookup_ops.index_to_string_table_from_tensor( - vocabulary_list=vocabulary_list) + vocabulary_list = constant_op.constant(["brain", "salad", "surgery"]) + table = lookup_ops.index_to_string_table_from_tensor( + vocabulary_list=vocabulary_list) - indices = constant_op.constant([0, 1, 2, 3], dtypes.int64) - features = table.lookup(indices) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(features) - self.evaluate(lookup_ops.tables_initializer()) + indices = constant_op.constant([0, 1, 2, 3], dtypes.int64) + features = table.lookup(indices) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(features) + self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"), - self.evaluate(features)) + self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"), + self.evaluate(features)) def test_duplicate_entries(self): - with self.cached_session(): - vocabulary_list = constant_op.constant(["hello", "hello"]) - table = lookup_ops.index_to_string_table_from_tensor( - vocabulary_list=vocabulary_list) - indices = constant_op.constant([0, 1, 4], dtypes.int64) - features = table.lookup(indices) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((b"hello", b"hello", b"UNK"), self.evaluate(features)) + vocabulary_list = constant_op.constant(["hello", "hello"]) + table = lookup_ops.index_to_string_table_from_tensor( + vocabulary_list=vocabulary_list) + indices = constant_op.constant([0, 1, 4], dtypes.int64) + features = table.lookup(indices) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((b"hello", b"hello", b"UNK"), self.evaluate(features)) def test_index_to_string_with_default_value(self): default_value = b"NONE" - with self.cached_session(): - vocabulary_list = constant_op.constant(["brain", "salad", "surgery"]) - table = 
lookup_ops.index_to_string_table_from_tensor( - vocabulary_list=vocabulary_list, default_value=default_value) - indices = constant_op.constant([1, 2, 4], dtypes.int64) - features = table.lookup(indices) - if not context.executing_eagerly(): - with self.assertRaises(errors_impl.OpError): - self.evaluate(features) - self.evaluate(lookup_ops.tables_initializer()) - self.assertAllEqual((b"salad", b"surgery", default_value), - self.evaluate(features)) + vocabulary_list = constant_op.constant(["brain", "salad", "surgery"]) + table = lookup_ops.index_to_string_table_from_tensor( + vocabulary_list=vocabulary_list, default_value=default_value) + indices = constant_op.constant([1, 2, 4], dtypes.int64) + features = table.lookup(indices) + if not context.executing_eagerly(): + with self.assertRaises(errors_impl.OpError): + self.evaluate(features) + self.evaluate(lookup_ops.tables_initializer()) + self.assertAllEqual((b"salad", b"surgery", default_value), + self.evaluate(features)) class IdTableWithHashBucketsTest(test.TestCase): @@ -2753,45 +2749,40 @@ class IdTableWithHashBucketsTest(test.TestCase): def testIdTableWithHashBucketsInitializationAcrossSessions(self): vocab_file = self._createVocabFile("feat_to_id_5.txt") - with self.cached_session(): - default_value = -1 - vocab_size = 3 - oov_buckets = 1 - table1 = lookup_ops.IdTableWithHashBuckets( - lookup_ops.StaticHashTable( - lookup_ops.TextFileIdTableInitializer( - vocab_file, vocab_size=vocab_size), default_value), - oov_buckets) + default_value = -1 + vocab_size = 3 + oov_buckets = 1 + table1 = lookup_ops.IdTableWithHashBuckets( + lookup_ops.StaticHashTable( + lookup_ops.TextFileIdTableInitializer( + vocab_file, vocab_size=vocab_size), default_value), oov_buckets) - self.evaluate(table1.initializer) + self.evaluate(table1.initializer) - input_string_1 = constant_op.constant( - ["brain", "salad", "surgery", "UNK"]) + input_string_1 = constant_op.constant(["brain", "salad", "surgery", "UNK"]) - out1 = 
table1.lookup(input_string_1) + out1 = table1.lookup(input_string_1) - self.assertAllEqual([0, 1, 2, 3], self.evaluate(out1)) - self.assertEqual(vocab_size + oov_buckets, self.evaluate(table1.size())) + self.assertAllEqual([0, 1, 2, 3], self.evaluate(out1)) + self.assertEqual(vocab_size + oov_buckets, self.evaluate(table1.size())) - with self.cached_session(): - default_value = -1 - vocab_size = 3 - oov_buckets = 1 + default_value = -1 + vocab_size = 3 + oov_buckets = 1 - # Underlying lookup table already initialized in previous session. - # No need to call self.evaluate(table2.initializer) - table2 = lookup_ops.IdTableWithHashBuckets( - lookup_ops.StaticHashTable( - lookup_ops.TextFileIdTableInitializer( - vocab_file, vocab_size=vocab_size), default_value), - oov_buckets) + # Underlying lookup table already initialized in previous session. + # No need to call self.evaluate(table2.initializer) + table2 = lookup_ops.IdTableWithHashBuckets( + lookup_ops.StaticHashTable( + lookup_ops.TextFileIdTableInitializer( + vocab_file, vocab_size=vocab_size), default_value), oov_buckets) - input_string_2 = constant_op.constant(["fruit", "salad", "UNK"]) + input_string_2 = constant_op.constant(["fruit", "salad", "UNK"]) - out2 = table2.lookup(input_string_2) + out2 = table2.lookup(input_string_2) - self.assertAllEqual([3, 1, 3], self.evaluate(out2)) - self.assertEqual(vocab_size + oov_buckets, self.evaluate(table2.size())) + self.assertAllEqual([3, 1, 3], self.evaluate(out2)) + self.assertEqual(vocab_size + oov_buckets, self.evaluate(table2.size())) def testIdTableWithHashBucketsWithMultipleInitializersDifferentDefault(self): vocab_file = self._createVocabFile("feat_to_id_6.txt") @@ -2980,84 +2971,79 @@ class IdTableWithHashBucketsTest(test.TestCase): def testIdTableWithHashBucketsWithInvalidHashers(self): vocab_file = self._createVocabFile("feat_to_id_4.txt") - with self.cached_session(): - default_value = -1 - vocab_size = 3 - oov_buckets = 1 - lookup_table = 
lookup_ops.StaticHashTable( - lookup_ops.TextFileIdTableInitializer( - vocab_file, vocab_size=vocab_size), default_value) + default_value = -1 + vocab_size = 3 + oov_buckets = 1 + lookup_table = lookup_ops.StaticHashTable( + lookup_ops.TextFileIdTableInitializer( + vocab_file, vocab_size=vocab_size), default_value) - with self.assertRaises(TypeError): - lookup_ops.IdTableWithHashBuckets( - lookup_table, oov_buckets, hasher_spec=1) + with self.assertRaises(TypeError): + lookup_ops.IdTableWithHashBuckets( + lookup_table, oov_buckets, hasher_spec=1) + table = lookup_ops.IdTableWithHashBuckets( + lookup_table, + oov_buckets, + hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None)) + + input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"]) + + with self.assertRaises(ValueError): + table.lookup(input_string) + + with self.assertRaises(ValueError): + table = lookup_ops.IdTableWithHashBuckets( + lookup_table, oov_buckets, hasher_spec=lookup_ops.StrongHashSpec([])) + + with self.assertRaises(ValueError): table = lookup_ops.IdTableWithHashBuckets( lookup_table, oov_buckets, - hasher_spec=lookup_ops.HasherSpec("my-awesome-hash", None)) + hasher_spec=lookup_ops.StrongHashSpec([1, 2, 3])) - input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"]) - - with self.assertRaises(ValueError): - table.lookup(input_string) - - with self.assertRaises(ValueError): - table = lookup_ops.IdTableWithHashBuckets( - lookup_table, - oov_buckets, - hasher_spec=lookup_ops.StrongHashSpec([])) - - with self.assertRaises(ValueError): - table = lookup_ops.IdTableWithHashBuckets( - lookup_table, - oov_buckets, - hasher_spec=lookup_ops.StrongHashSpec([1, 2, 3])) - - with self.assertRaises(TypeError): - table = lookup_ops.IdTableWithHashBuckets( - lookup_table, - oov_buckets, - hasher_spec=lookup_ops.StrongHashSpec([None, 2])) + with self.assertRaises(TypeError): + table = lookup_ops.IdTableWithHashBuckets( + lookup_table, + oov_buckets, + 
hasher_spec=lookup_ops.StrongHashSpec([None, 2])) def testIdTableWithHashBucketsNoInnerTable(self): - with self.cached_session(): - table = lookup_ops.IdTableWithHashBuckets(None, num_oov_buckets=1) - self.assertIsNone(table.resource_handle) + table = lookup_ops.IdTableWithHashBuckets(None, num_oov_buckets=1) + self.assertIsNone(table.resource_handle) class MutableHashTableOpTest(test.TestCase): def testMutableHashTable(self): - with self.cached_session(): - default_val = -1 - keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"]) - values = constant_op.constant([0, 1, 2, 3], dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) - self.assertAllEqual(0, self.evaluate(table.size())) + default_val = -1 + keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"]) + values = constant_op.constant([0, 1, 2, 3], dtypes.int64) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(4, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(4, self.evaluate(table.size())) - remove_string = constant_op.constant(["tarkus", "tank"]) - self.evaluate(table.remove(remove_string)) - self.assertAllEqual(3, self.evaluate(table.size())) + remove_string = constant_op.constant(["tarkus", "tank"]) + self.evaluate(table.remove(remove_string)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant(["brain", "salad", "tank"]) - output = table.lookup(input_string) - self.assertAllEqual([3], output.get_shape()) + input_string = constant_op.constant(["brain", "salad", "tank"]) + output = table.lookup(input_string) + self.assertAllEqual([3], output.get_shape()) - result = self.evaluate(output) - self.assertAllEqual([0, 1, -1], result) + result = self.evaluate(output) + self.assertAllEqual([0, 1, -1], result) - 
exported_keys, exported_values = table.export() + exported_keys, exported_values = table.export() - # exported data is in the order of the internal map, i.e. undefined - sorted_keys = np.sort(self.evaluate(exported_keys)) - sorted_values = np.sort(self.evaluate(exported_values)) - self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys) - self.assertAllEqual([0, 1, 2], sorted_values) + # exported data is in the order of the internal map, i.e. undefined + sorted_keys = np.sort(self.evaluate(exported_keys)) + sorted_values = np.sort(self.evaluate(exported_values)) + self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys) + self.assertAllEqual([0, 1, 2], sorted_values) @test_util.run_v1_only("SaverV1") def testSaveRestore(self): @@ -3256,370 +3242,354 @@ class MutableHashTableOpTest(test.TestCase): self.assertAllEqual([b"-", b"a", b"b"], output) def testMutableHashTableOfTensors(self): - with self.cached_session(): - default_val = constant_op.constant([-1, -1], dtypes.int64) - keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"]) - values = constant_op.constant([[0, 1], [2, 3], [4, 5], [6, 7]], - dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) - self.assertAllEqual(0, self.evaluate(table.size())) + default_val = constant_op.constant([-1, -1], dtypes.int64) + keys = constant_op.constant(["brain", "salad", "surgery", "tarkus"]) + values = constant_op.constant([[0, 1], [2, 3], [4, 5], [6, 7]], + dtypes.int64) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(4, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(4, self.evaluate(table.size())) - remove_string = constant_op.constant(["tarkus", "tank"]) - self.evaluate(table.remove(remove_string)) - self.assertAllEqual(3, 
self.evaluate(table.size())) + remove_string = constant_op.constant(["tarkus", "tank"]) + self.evaluate(table.remove(remove_string)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant(["brain", "salad", "tank"]) - output = table.lookup(input_string) - self.assertAllEqual([3, 2], output.get_shape()) + input_string = constant_op.constant(["brain", "salad", "tank"]) + output = table.lookup(input_string) + self.assertAllEqual([3, 2], output.get_shape()) - result = self.evaluate(output) - self.assertAllEqual([[0, 1], [2, 3], [-1, -1]], result) + result = self.evaluate(output) + self.assertAllEqual([[0, 1], [2, 3], [-1, -1]], result) - exported_keys, exported_values = table.export() - # exported data is in the order of the internal map, i.e. undefined - sorted_keys = np.sort(self.evaluate(exported_keys)) - sorted_values = np.sort(self.evaluate(exported_values), axis=0) - self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys) - sorted_expected_values = np.sort([[4, 5], [2, 3], [0, 1]], axis=0) - self.assertAllEqual(sorted_expected_values, sorted_values) + exported_keys, exported_values = table.export() + # exported data is in the order of the internal map, i.e. 
undefined + sorted_keys = np.sort(self.evaluate(exported_keys)) + sorted_values = np.sort(self.evaluate(exported_values), axis=0) + self.assertAllEqual([b"brain", b"salad", b"surgery"], sorted_keys) + sorted_expected_values = np.sort([[4, 5], [2, 3], [0, 1]], axis=0) + self.assertAllEqual(sorted_expected_values, sorted_values) def testMutableHashTableExportInsert(self): - with self.cached_session(): - default_val = constant_op.constant([-1, -1], dtypes.int64) - keys = constant_op.constant(["brain", "salad", "surgery"]) - values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64) - table1 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) - self.assertAllEqual(0, self.evaluate(table1.size())) - self.evaluate(table1.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table1.size())) + default_val = constant_op.constant([-1, -1], dtypes.int64) + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64) + table1 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) + self.assertAllEqual(0, self.evaluate(table1.size())) + self.evaluate(table1.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table1.size())) - input_string = constant_op.constant(["brain", "salad", "tank"]) - expected_output = [[0, 1], [2, 3], [-1, -1]] - output1 = table1.lookup(input_string) - self.assertAllEqual(expected_output, self.evaluate(output1)) + input_string = constant_op.constant(["brain", "salad", "tank"]) + expected_output = [[0, 1], [2, 3], [-1, -1]] + output1 = table1.lookup(input_string) + self.assertAllEqual(expected_output, self.evaluate(output1)) - exported_keys, exported_values = table1.export() - self.assertAllEqual(3, self.evaluate(exported_keys).size) - self.assertAllEqual(6, self.evaluate(exported_values).size) + exported_keys, exported_values = table1.export() + self.assertAllEqual(3, self.evaluate(exported_keys).size) + 
self.assertAllEqual(6, self.evaluate(exported_values).size) - # Populate a second table from the exported data - table2 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) - self.assertAllEqual(0, self.evaluate(table2.size())) - self.evaluate(table2.insert(exported_keys, exported_values)) - self.assertAllEqual(3, self.evaluate(table2.size())) + # Populate a second table from the exported data + table2 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) + self.assertAllEqual(0, self.evaluate(table2.size())) + self.evaluate(table2.insert(exported_keys, exported_values)) + self.assertAllEqual(3, self.evaluate(table2.size())) - # Verify lookup result is still the same - output2 = table2.lookup(input_string) - self.assertAllEqual(expected_output, self.evaluate(output2)) + # Verify lookup result is still the same + output2 = table2.lookup(input_string) + self.assertAllEqual(expected_output, self.evaluate(output2)) def testMutableHashTableOfTensorsInvalidShape(self): - with self.cached_session(): - default_val = constant_op.constant([-1, -1], dtypes.int64) - keys = constant_op.constant(["brain", "salad", "surgery"]) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) + default_val = constant_op.constant([-1, -1], dtypes.int64) + keys = constant_op.constant(["brain", "salad", "surgery"]) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) - # Shape [6] instead of [3, 2] - values = constant_op.constant([0, 1, 2, 3, 4, 5], dtypes.int64) - with self.assertRaisesOpError("Expected shape"): - self.evaluate(table.insert(keys, values)) - - # Shape [2,3] instead of [3, 2] - values = constant_op.constant([[0, 1, 2], [3, 4, 5]], dtypes.int64) - with self.assertRaisesOpError("Expected shape"): - self.evaluate(table.insert(keys, values)) - - # Shape [2, 2] instead of [3, 2] - values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64) - with self.assertRaisesOpError("Expected 
shape"): - self.evaluate(table.insert(keys, values)) - - # Shape [3, 1] instead of [3, 2] - values = constant_op.constant([[0], [2], [4]], dtypes.int64) - with self.assertRaisesOpError("Expected shape"): - self.evaluate(table.insert(keys, values)) - - # Valid Insert - values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64) + # Shape [6] instead of [3, 2] + values = constant_op.constant([0, 1, 2, 3, 4, 5], dtypes.int64) + with self.assertRaisesOpError("Expected shape"): self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + + # Shape [2,3] instead of [3, 2] + values = constant_op.constant([[0, 1, 2], [3, 4, 5]], dtypes.int64) + with self.assertRaisesOpError("Expected shape"): + self.evaluate(table.insert(keys, values)) + + # Shape [2, 2] instead of [3, 2] + values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64) + with self.assertRaisesOpError("Expected shape"): + self.evaluate(table.insert(keys, values)) + + # Shape [3, 1] instead of [3, 2] + values = constant_op.constant([[0], [2], [4]], dtypes.int64) + with self.assertRaisesOpError("Expected shape"): + self.evaluate(table.insert(keys, values)) + + # Valid Insert + values = constant_op.constant([[0, 1], [2, 3], [4, 5]], dtypes.int64) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) def testMutableHashTableInvalidDefaultValue(self): - with self.cached_session(): - default_val = constant_op.constant([[-1, -1]], dtypes.int64) - with self.assertRaisesOpError("Default value must be a vector"): - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) - self.assertAllEqual(0, self.evaluate(table.size())) + default_val = constant_op.constant([[-1, -1]], dtypes.int64) + with self.assertRaisesOpError("Default value must be a vector"): + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) + self.assertAllEqual(0, self.evaluate(table.size())) def 
testMutableHashTableDuplicateInsert(self): - with self.cached_session(): - default_val = -1 - keys = constant_op.constant(["brain", "salad", "surgery", "brain"]) - values = constant_op.constant([0, 1, 2, 3], dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) - self.assertAllEqual(0, self.evaluate(table.size())) + default_val = -1 + keys = constant_op.constant(["brain", "salad", "surgery", "brain"]) + values = constant_op.constant([0, 1, 2, 3], dtypes.int64) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant(["brain", "salad", "tank"]) - output = table.lookup(input_string) + input_string = constant_op.constant(["brain", "salad", "tank"]) + output = table.lookup(input_string) - result = self.evaluate(output) - self.assertAllEqual([3, 1, -1], result) + result = self.evaluate(output) + self.assertAllEqual([3, 1, -1], result) def testMutableHashTableFindHighRank(self): - with self.cached_session(): - default_val = -1 - keys = constant_op.constant(["brain", "salad", "surgery"]) - values = constant_op.constant([0, 1, 2], dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) + default_val = -1 + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([0, 1, 2], dtypes.int64) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant([["brain", "salad"], - ["tank", "tarkus"]]) - output = 
table.lookup(input_string) - self.assertAllEqual([2, 2], output.get_shape()) + input_string = constant_op.constant([["brain", "salad"], + ["tank", "tarkus"]]) + output = table.lookup(input_string) + self.assertAllEqual([2, 2], output.get_shape()) - result = self.evaluate(output) - self.assertAllEqual([[0, 1], [-1, -1]], result) + result = self.evaluate(output) + self.assertAllEqual([[0, 1], [-1, -1]], result) def testMutableHashTableInsertHighRank(self): - with self.cached_session(): - default_val = -1 - keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]]) - values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) + default_val = -1 + keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]]) + values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(4, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(4, self.evaluate(table.size())) - input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"]) - output = table.lookup(input_string) + input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"]) + output = table.lookup(input_string) - result = self.evaluate(output) - self.assertAllEqual([0, 1, 3, -1], result) + result = self.evaluate(output) + self.assertAllEqual([0, 1, 3, -1], result) def testMutableHashTableRemoveHighRank(self): - with self.test_session(): - default_val = -1 - keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]]) - values = constant_op.constant([[0, 1], [2, 3]], dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) + default_val = -1 + keys = constant_op.constant([["brain", "salad"], ["surgery", "tank"]]) + values = constant_op.constant([[0, 1], [2, 3]], 
dtypes.int64) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(4, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(4, self.evaluate(table.size())) - remove_string = constant_op.constant(["salad", "tarkus"]) - self.evaluate(table.remove(remove_string)) - self.assertAllEqual(3, self.evaluate(table.size())) + remove_string = constant_op.constant(["salad", "tarkus"]) + self.evaluate(table.remove(remove_string)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"]) - output = table.lookup(input_string) + input_string = constant_op.constant(["brain", "salad", "tank", "tarkus"]) + output = table.lookup(input_string) - result = self.evaluate(output) - self.assertAllEqual([0, -1, 3, -1], result) + result = self.evaluate(output) + self.assertAllEqual([0, -1, 3, -1], result) def testMutableHashTableOfTensorsFindHighRank(self): - with self.cached_session(): - default_val = constant_op.constant([-1, -1, -1], dtypes.int64) - keys = constant_op.constant(["brain", "salad", "surgery"]) - values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]], - dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) + default_val = constant_op.constant([-1, -1, -1], dtypes.int64) + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]], + dtypes.int64) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant([["brain", "salad"], - ["tank", "tarkus"]]) - output = table.lookup(input_string) - 
self.assertAllEqual([2, 2, 3], output.get_shape()) + input_string = constant_op.constant([["brain", "salad"], + ["tank", "tarkus"]]) + output = table.lookup(input_string) + self.assertAllEqual([2, 2, 3], output.get_shape()) - result = self.evaluate(output) - self.assertAllEqual( - [[[0, 1, 2], [2, 3, 4]], [[-1, -1, -1], [-1, -1, -1]]], result) + result = self.evaluate(output) + self.assertAllEqual( + [[[0, 1, 2], [2, 3, 4]], [[-1, -1, -1], [-1, -1, -1]]], result) def testMutableHashTableOfTensorsRemoveHighRank(self): - with self.test_session(): - default_val = constant_op.constant([-1, -1, -1], dtypes.int64) - keys = constant_op.constant(["brain", "salad", "surgery"]) - values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]], - dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) + default_val = constant_op.constant([-1, -1, -1], dtypes.int64) + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]], + dtypes.int64) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - remove_string = constant_op.constant([["brain", "tank"]]) - self.evaluate(table.remove(remove_string)) - self.assertAllEqual(2, self.evaluate(table.size())) + remove_string = constant_op.constant([["brain", "tank"]]) + self.evaluate(table.remove(remove_string)) + self.assertAllEqual(2, self.evaluate(table.size())) - input_string = constant_op.constant([["brain", "salad"], - ["surgery", "tank"]]) - output = table.lookup(input_string) - self.assertAllEqual([2, 2, 3], output.get_shape()) + input_string = constant_op.constant([["brain", "salad"], + ["surgery", "tank"]]) + output = table.lookup(input_string) + self.assertAllEqual([2, 2, 3], output.get_shape()) - 
result = self.evaluate(output) - self.assertAllEqual( - [[[-1, -1, -1], [2, 3, 4]], [[4, 5, 6], [-1, -1, -1]]], result) + result = self.evaluate(output) + self.assertAllEqual( + [[[-1, -1, -1], [2, 3, 4]], [[4, 5, 6], [-1, -1, -1]]], result) def testMultipleMutableHashTables(self): - with self.cached_session(): - default_val = -1 - keys = constant_op.constant(["brain", "salad", "surgery"]) - values = constant_op.constant([0, 1, 2], dtypes.int64) + default_val = -1 + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([0, 1, 2], dtypes.int64) - table1 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) - table2 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) - table3 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) - self.evaluate(table1.insert(keys, values)) - self.evaluate(table2.insert(keys, values)) - self.evaluate(table3.insert(keys, values)) + table1 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) + table2 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) + table3 = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) + self.evaluate(table1.insert(keys, values)) + self.evaluate(table2.insert(keys, values)) + self.evaluate(table3.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table1.size())) - self.assertAllEqual(3, self.evaluate(table2.size())) - self.assertAllEqual(3, self.evaluate(table3.size())) + self.assertAllEqual(3, self.evaluate(table1.size())) + self.assertAllEqual(3, self.evaluate(table2.size())) + self.assertAllEqual(3, self.evaluate(table3.size())) - input_string = constant_op.constant(["brain", "salad", "tank"]) - output1 = table1.lookup(input_string) - output2 = table2.lookup(input_string) - output3 = table3.lookup(input_string) + input_string = constant_op.constant(["brain", "salad", "tank"]) + output1 = table1.lookup(input_string) + output2 = 
table2.lookup(input_string) + output3 = table3.lookup(input_string) - out1, out2, out3 = self.evaluate([output1, output2, output3]) - self.assertAllEqual([0, 1, -1], out1) - self.assertAllEqual([0, 1, -1], out2) - self.assertAllEqual([0, 1, -1], out3) + out1, out2, out3 = self.evaluate([output1, output2, output3]) + self.assertAllEqual([0, 1, -1], out1) + self.assertAllEqual([0, 1, -1], out2) + self.assertAllEqual([0, 1, -1], out3) def testMutableHashTableWithTensorDefault(self): - with self.cached_session(): - default_val = constant_op.constant(-1, dtypes.int64) - keys = constant_op.constant(["brain", "salad", "surgery"]) - values = constant_op.constant([0, 1, 2], dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) + default_val = constant_op.constant(-1, dtypes.int64) + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([0, 1, 2], dtypes.int64) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant(["brain", "salad", "tank"]) - output = table.lookup(input_string) + input_string = constant_op.constant(["brain", "salad", "tank"]) + output = table.lookup(input_string) - result = self.evaluate(output) - self.assertAllEqual([0, 1, -1], result) + result = self.evaluate(output) + self.assertAllEqual([0, 1, -1], result) def testSignatureMismatch(self): - with self.cached_session(): - default_val = -1 - keys = constant_op.constant(["brain", "salad", "surgery"]) - values = constant_op.constant([0, 1, 2], dtypes.int64) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, - default_val) + default_val = -1 + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([0, 1, 2], dtypes.int64) + 
table = lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, + default_val) - # insert with keys of the wrong type - with self.assertRaises(ValueError): - self.evaluate(table.insert(constant_op.constant([4, 5, 6]), values)) + # insert with keys of the wrong type + with self.assertRaises(ValueError): + self.evaluate(table.insert(constant_op.constant([4, 5, 6]), values)) - # insert with values of the wrong type - with self.assertRaises(ValueError): - self.evaluate(table.insert(keys, constant_op.constant(["a", "b", "c"]))) + # insert with values of the wrong type + with self.assertRaises(ValueError): + self.evaluate(table.insert(keys, constant_op.constant(["a", "b", "c"]))) - self.assertAllEqual(0, self.evaluate(table.size())) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string_ref = variables.Variable("brain") - input_int64_ref = variables.Variable(-1, dtype=dtypes.int64) - self.evaluate(variables.global_variables_initializer()) + input_string_ref = variables.Variable("brain") + input_int64_ref = variables.Variable(-1, dtype=dtypes.int64) + self.evaluate(variables.global_variables_initializer()) - # Ref types do not produce an insert signature mismatch. - self.evaluate(table.insert(input_string_ref, input_int64_ref)) - self.assertAllEqual(3, self.evaluate(table.size())) + # Ref types do not produce an insert signature mismatch. + self.evaluate(table.insert(input_string_ref, input_int64_ref)) + self.assertAllEqual(3, self.evaluate(table.size())) - # Ref types do not produce a lookup signature mismatch. - self.assertEqual(-1, self.evaluate(table.lookup(input_string_ref))) + # Ref types do not produce a lookup signature mismatch. 
+ self.assertEqual(-1, self.evaluate(table.lookup(input_string_ref))) - # lookup with keys of the wrong type - input_string = constant_op.constant([1, 2, 3], dtypes.int64) - with self.assertRaises(ValueError): - self.evaluate(table.lookup(input_string)) + # lookup with keys of the wrong type + input_string = constant_op.constant([1, 2, 3], dtypes.int64) + with self.assertRaises(ValueError): + self.evaluate(table.lookup(input_string)) - # default value of the wrong type - with self.assertRaises(TypeError): - lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, "UNK") + # default value of the wrong type + with self.assertRaises(TypeError): + lookup_ops.MutableHashTable(dtypes.string, dtypes.int64, "UNK") def testMutableHashTableStringFloat(self): - with self.cached_session(): - default_val = -1.5 - keys = constant_op.constant(["brain", "salad", "surgery"]) - values = constant_op.constant([0, 1.1, 2.2], dtypes.float32) - table = lookup_ops.MutableHashTable(dtypes.string, dtypes.float32, - default_val) - self.assertAllEqual(0, self.evaluate(table.size())) + default_val = -1.5 + keys = constant_op.constant(["brain", "salad", "surgery"]) + values = constant_op.constant([0, 1.1, 2.2], dtypes.float32) + table = lookup_ops.MutableHashTable(dtypes.string, dtypes.float32, + default_val) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant(["brain", "salad", "tank"]) - output = table.lookup(input_string) + input_string = constant_op.constant(["brain", "salad", "tank"]) + output = table.lookup(input_string) - result = self.evaluate(output) - self.assertAllClose([0, 1.1, default_val], result) + result = self.evaluate(output) + self.assertAllClose([0, 1.1, default_val], result) def testMutableHashTableIntFloat(self): - with 
self.cached_session(): - default_val = -1.0 - keys = constant_op.constant([3, 7, 0], dtypes.int64) - values = constant_op.constant([7.5, -1.2, 9.9], dtypes.float32) - table = lookup_ops.MutableHashTable(dtypes.int64, dtypes.float32, - default_val) - self.assertAllEqual(0, self.evaluate(table.size())) + default_val = -1.0 + keys = constant_op.constant([3, 7, 0], dtypes.int64) + values = constant_op.constant([7.5, -1.2, 9.9], dtypes.float32) + table = lookup_ops.MutableHashTable(dtypes.int64, dtypes.float32, + default_val) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = constant_op.constant([7, 0, 11], dtypes.int64) - output = table.lookup(input_string) + input_string = constant_op.constant([7, 0, 11], dtypes.int64) + output = table.lookup(input_string) - result = self.evaluate(output) - self.assertAllClose([-1.2, 9.9, default_val], result) + result = self.evaluate(output) + self.assertAllClose([-1.2, 9.9, default_val], result) def testMutableHashTableInt64String(self): - with self.cached_session(): - default_val = "n/a" - keys = constant_op.constant([0, 1, 2], dtypes.int64) - values = constant_op.constant(["brain", "salad", "surgery"]) - table = lookup_ops.MutableHashTable(dtypes.int64, dtypes.string, - default_val) - self.assertAllEqual(0, self.evaluate(table.size())) + default_val = "n/a" + keys = constant_op.constant([0, 1, 2], dtypes.int64) + values = constant_op.constant(["brain", "salad", "surgery"]) + table = lookup_ops.MutableHashTable(dtypes.int64, dtypes.string, + default_val) + self.assertAllEqual(0, self.evaluate(table.size())) - self.evaluate(table.insert(keys, values)) - self.assertAllEqual(3, self.evaluate(table.size())) + self.evaluate(table.insert(keys, values)) + self.assertAllEqual(3, self.evaluate(table.size())) - input_string = 
constant_op.constant([0, 1, 3], dtypes.int64) - output = table.lookup(input_string) + input_string = constant_op.constant([0, 1, 3], dtypes.int64) + output = table.lookup(input_string) - result = self.evaluate(output) - self.assertAllEqual((b"brain", b"salad", b"n/a"), result) + result = self.evaluate(output) + self.assertAllEqual((b"brain", b"salad", b"n/a"), result) def testExportShapeInference(self): default_value = -1 diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py index e53629250c9..f99102fee52 100644 --- a/tensorflow/python/ops/lookup_ops.py +++ b/tensorflow/python/ops/lookup_ops.py @@ -146,6 +146,10 @@ class LookupInterface(trackable.TrackableResource): """Looks up `keys` in a table, outputs the corresponding values.""" raise NotImplementedError + def __getitem__(self, keys): + """Looks up `keys` in a table, outputs the corresponding values.""" + return self.lookup(keys) + class InitializableLookupTableBase(LookupInterface): """Initializable lookup table interface. @@ -255,14 +259,28 @@ class StaticHashTable(InitializableLookupTableBase): Example usage: - ```python - keys_tensor = tf.constant([1, 2]) - vals_tensor = tf.constant([3, 4]) - input_tensor = tf.constant([1, 5]) - table = tf.lookup.StaticHashTable( - tf.lookup.KeyValueTensorInitializer(keys_tensor, vals_tensor), -1) - print(table.lookup(input_tensor)) - ``` + >>> keys_tensor = tf.constant(['a', 'b', 'c']) + >>> vals_tensor = tf.constant([7, 8, 9]) + >>> input_tensor = tf.constant(['a', 'f']) + >>> table = tf.lookup.StaticHashTable( + ... tf.lookup.KeyValueTensorInitializer(keys_tensor, vals_tensor), + ... 
default_value=-1) + >>> table.lookup(input_tensor).numpy() + array([ 7, -1], dtype=int32) + + Or for more pythonic code: + + >>> table[input_tensor].numpy() + array([ 7, -1], dtype=int32) + + The result of a lookup operation has the same shape as the argument: + + >>> input_tensor = tf.constant([['a', 'b'], ['c', 'd']]) + >>> table[input_tensor].numpy() + array([[ 7, 8], + [ 9, -1]], dtype=int32) + + """ def __init__(self, initializer, default_value, name=None): @@ -422,16 +440,15 @@ class DatasetInitializer(TableInitializerBase): """Creates a table initializer from a `tf.data.Dataset`. Sample usage: - ```python - keys = tf.data.Dataset.range(100) - values = tf.data.Dataset.range(100).map( - lambda x: string_ops.as_string(x * 2)) - ds = tf.data.Dataset.zip((keys, values)) - init = tf.lookup.experimental.DatasetInitializer(ds) - table = tf.lookup.StaticHashTable(init, "") - output = table.lookup([0, 1, 2]) - assertEquals(outputs, ["0", "2", "4"]) - ``` + + >>> keys = tf.data.Dataset.range(100) + >>> values = tf.data.Dataset.range(100).map( + ... lambda x: string_ops.as_string(x * 2)) + >>> ds = tf.data.Dataset.zip((keys, values)) + >>> init = tf.lookup.experimental.DatasetInitializer(ds) + >>> table = tf.lookup.StaticHashTable(init, "") + >>> table.lookup(tf.constant([0, 1, 2], dtype=tf.int64)).numpy() + array([b'0', b'2', b'4'], dtype=object) Attributes: dataset: A `tf.data.Dataset` object that produces tuples of scalars. The @@ -479,7 +496,19 @@ class DatasetInitializer(TableInitializerBase): @tf_export("lookup.KeyValueTensorInitializer") class KeyValueTensorInitializer(TableInitializerBase): - """Table initializers given `keys` and `values` tensors.""" + """Table initializers given `keys` and `values` tensors. 
+ + >>> keys_tensor = tf.constant(['a', 'b', 'c']) + >>> vals_tensor = tf.constant([7, 8, 9]) + >>> input_tensor = tf.constant(['a', 'f']) + >>> init = tf.lookup.KeyValueTensorInitializer(keys_tensor, vals_tensor) + >>> table = tf.lookup.StaticHashTable( + ... init, + ... default_value=-1) + >>> table.lookup(input_tensor).numpy() + array([ 7, -1], dtype=int32) + + """ def __init__(self, keys, values, key_dtype=None, value_dtype=None, name=None): """Constructs a table initializer object based on keys and values tensors. @@ -537,7 +566,7 @@ class KeyValueTensorInitializer(TableInitializerBase): class TextFileIndex(object): """The key and value content to get from each line. - This class defines the key and value used for tf.lookup.TextFileInitializer. + This class defines the key and value used for `tf.lookup.TextFileInitializer`. The key and value content to get from each line is specified either by the following, or a value `>=0`. @@ -555,7 +584,7 @@ class TextFileIndex(object): @tf_export("lookup.TextFileInitializer") class TextFileInitializer(TableInitializerBase): - """Table initializers from a text file. + r"""Table initializers from a text file. This initializer assigns one entry in the table for each line in the file. @@ -574,11 +603,11 @@ class TextFileInitializer(TableInitializerBase): For example if we have a file with the following content: - ``` - emerson 10 - lake 20 - palmer 30 - ``` + >>> import tempfile + >>> f = tempfile.NamedTemporaryFile(delete=False) + >>> content='\n'.join(["emerson 10", "lake 20", "palmer 30",]) + >>> f.file.write(content.encode('utf-8')) + >>> f.file.close() The following snippet initializes a table with the first column as keys and second column as values: @@ -587,12 +616,13 @@ class TextFileInitializer(TableInitializerBase): * `lake -> 20` * `palmer -> 30` - ```python - table = tf.lookup.StaticHashTable(tf.lookup.TextFileInitializer( - "test.txt", tf.string, 0, tf.int64, 1, delimiter=" "), -1) - ... 
- table.init.run() - ``` + >>> init= tf.lookup.TextFileInitializer( + ... filename=f.name, + ... key_dtype=tf.string, key_index=0, + ... value_dtype=tf.int64, value_index=1, + ... delimiter=" ") + >>> table = tf.lookup.StaticHashTable(init, default_value=-1) + >>> table.lookup(tf.constant(['palmer','lake','tarkus'])).numpy() Similarly to initialize the whole line as keys and the line number as values. @@ -600,13 +630,14 @@ class TextFileInitializer(TableInitializerBase): * `lake 20 -> 1` * `palmer 30 -> 2` - ```python - table = tf.lookup.StaticHashTable(tf.lookup.TextFileInitializer( - "test.txt", tf.string, tf.lookup.TextFileIndex.WHOLE_LINE, - tf.int64, tf.lookup.TextFileIndex.LINE_NUMBER, delimiter=" "), -1) - ... - table.init.run() - ``` + >>> init = tf.lookup.TextFileInitializer( + ... filename=f.name, + ... key_dtype=tf.string, key_index=tf.lookup.TextFileIndex.WHOLE_LINE, + ... value_dtype=tf.int64, value_index=tf.lookup.TextFileIndex.LINE_NUMBER, + ... delimiter=" ") + >>> table = tf.lookup.StaticHashTable(init, -1) + >>> table.lookup(tf.constant('palmer 30')).numpy() + 2 """ def __init__(self, @@ -1106,45 +1137,53 @@ class IdTableWithHashBuckets(LookupInterface): @tf_export("lookup.StaticVocabularyTable", v1=[]) class StaticVocabularyTable(LookupInterface): - r"""String to Id table wrapper that assigns out-of-vocabulary keys to buckets. + r"""String to Id table that assigns out-of-vocabulary keys to hash buckets. For example, if an instance of `StaticVocabularyTable` is initialized with a string-to-id initializer that maps: - * `emerson -> 0` - * `lake -> 1` - * `palmer -> 2` + >>> init = tf.lookup.KeyValueTensorInitializer( + ... keys=tf.constant(['emerson', 'lake', 'palmer']), + ... values=tf.constant([0, 1, 2], dtype=tf.int64)) + >>> table = tf.lookup.StaticVocabularyTable( + ... init, + ... 
num_oov_buckets=5) The `Vocabulary` object will performs the following mapping: * `emerson -> 0` * `lake -> 1` * `palmer -> 2` - * ` -> bucket_id`, where bucket_id will be between `3` and - `3 + num_oov_buckets - 1`, calculated by: + * ` -> bucket_id`, where `bucket_id` will be between `3` and + `3 + num_oov_buckets - 1 = 7`, calculated by: `hash() % num_oov_buckets + vocab_size` - If input_tensor is `["emerson", "lake", "palmer", "king", "crimson"]`, - the lookup result is `[0, 1, 2, 4, 7]`. + If input_tensor is: + + >>> input_tensor = tf.constant(["emerson", "lake", "palmer", + ... "king", "crimson"]) + >>> table[input_tensor].numpy() + array([0, 1, 2, 6, 7]) If `initializer` is None, only out-of-vocabulary buckets are used. Example usage: - ```python - num_oov_buckets = 3 - input_tensor = tf.constant(["emerson", "lake", "palmer", "king", "crimnson"]) - table = tf.lookup.StaticVocabularyTable( - tf.lookup.TextFileInitializer( - filename, - key_dtype=tf.string, key_index=tf.lookup.TextFileIndex.WHOLE_LINE, - value_dtype=tf.int64, value_index=tf.lookup.TextFileIndex.LINE_NUMBER, - delimiter="\t"), - num_oov_buckets) - out = table.lookup(input_tensor). - table.init.run() - print(out.eval()) - ``` + >>> num_oov_buckets = 3 + >>> vocab = ["emerson", "lake", "palmer", "crimnson"] + >>> import tempfile + >>> f = tempfile.NamedTemporaryFile(delete=False) + >>> f.write('\n'.join(vocab).encode('utf-8')) + >>> f.close() + + >>> init = tf.lookup.TextFileInitializer( + ... f.name, + ... key_dtype=tf.string, key_index=tf.lookup.TextFileIndex.WHOLE_LINE, + ... value_dtype=tf.int64, value_index=tf.lookup.TextFileIndex.LINE_NUMBER) + >>> table = tf.lookup.StaticVocabularyTable(init, num_oov_buckets) + >>> table.lookup(tf.constant(["palmer", "crimnson" , "king", + ... "tarkus", "black", "moon"])).numpy() + array([2, 3, 5, 6, 6, 4]) The hash function used for generating out-of-vocabulary buckets ID is Fingerprint64. 
@@ -1158,8 +1197,8 @@ class StaticVocabularyTable(LookupInterface): """Construct a `StaticVocabularyTable` object. Args: - initializer: A TableInitializerBase object that contains the data used to - initialize the table. If None, then we only use out-of-vocab buckets. + initializer: A `TableInitializerBase` object that contains the data used + to initialize the table. If None, then we only use out-of-vocab buckets. num_oov_buckets: Number of buckets to use for out-of-vocabulary keys. Must be greater than zero. lookup_key_dtype: Data type of keys passed to `lookup`. Defaults to @@ -1926,17 +1965,18 @@ class DenseHashTable(LookupInterface): Example usage: - ```python - table = tf.lookup.DenseHashTable(key_dtype=tf.int64, - value_dtype=tf.int64, - default_value=-1, - empty_key=0, - deleted_key=-1) - - sess.run(table.insert(keys, values)) - out = table.lookup(query_keys) - print(out.eval()) - ``` + >>> table = tf.lookup.experimental.DenseHashTable( + ... key_dtype=tf.string, + ... value_dtype=tf.int64, + ... default_value=-1, + ... empty_key='', + ... deleted_key='$') + >>> keys = tf.constant(['a', 'b', 'c']) + >>> values = tf.constant([0, 1, 2], dtype=tf.int64) + >>> table.insert(keys, values) + >>> table.remove(tf.constant(['c'])) + >>> table.lookup(tf.constant(['a', 'b', 'c','d'])).numpy() + array([ 0, 1, -1, -1]) """ # TODO(andreasst): consider extracting common code with MutableHashTable into From 38718b4ed5feabbd4019fa24c828bdc1733289c9 Mon Sep 17 00:00:00 2001 From: Zhenyu Tan Date: Mon, 12 Oct 2020 09:45:35 -0700 Subject: [PATCH 0135/1447] Remove forward compatibility check for bincount. 
PiperOrigin-RevId: 336682477 Change-Id: Id80821c153be5d6649777f3587d890da2ac6f084 --- tensorflow/python/ops/bincount_ops.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/tensorflow/python/ops/bincount_ops.py b/tensorflow/python/ops/bincount_ops.py index 758f0180a84..bda86bf7461 100644 --- a/tensorflow/python/ops/bincount_ops.py +++ b/tensorflow/python/ops/bincount_ops.py @@ -18,7 +18,6 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor @@ -120,26 +119,6 @@ def bincount(arr, """ name = "bincount" if name is None else name with ops.name_scope(name): - # Somehow forward compatible needs to be False. - if not binary_output and axis is None: - arr = ops.convert_to_tensor(arr, name="arr", dtype=dtypes.int32) - array_is_nonempty = math_ops.reduce_prod(array_ops.shape(arr)) > 0 - output_size = math_ops.cast(array_is_nonempty, dtypes.int32) * ( - math_ops.reduce_max(arr) + 1) - if minlength is not None: - minlength = ops.convert_to_tensor( - minlength, name="minlength", dtype=dtypes.int32) - output_size = gen_math_ops.maximum(minlength, output_size) - if maxlength is not None: - maxlength = ops.convert_to_tensor( - maxlength, name="maxlength", dtype=dtypes.int32) - output_size = gen_math_ops.minimum(maxlength, output_size) - if weights is not None: - weights = ops.convert_to_tensor(weights, name="weights") - return gen_math_ops.unsorted_segment_sum(weights, arr, output_size) - weights = constant_op.constant([], dtype) - return gen_math_ops.bincount(arr, output_size, weights) - if not isinstance(arr, sparse_tensor.SparseTensor): arr = ragged_tensor.convert_to_tensor_or_ragged_tensor(arr, name="arr") if weights is not None: From cbade8270ef7eff9eb0bbe607b3a4d4563812299 Mon Sep 17 00:00:00 2001 
From: Pankaj Kanwar Date: Mon, 12 Oct 2020 09:57:45 -0700 Subject: [PATCH 0136/1447] Update Release notes for CUDA 11 and CuDNN 8.0.2 support. PiperOrigin-RevId: 336684814 Change-Id: I285f27126291c85cfb36e64edfcea11a6f422c0a --- RELEASE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/RELEASE.md b/RELEASE.md index 18649653304..f1eead827e5 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -34,6 +34,7 @@ shape assumptions (note that you can pass shapes with `None` entries for axes that are meant to be dynamic). You can also disable the input checking entirely by setting `model.input_spec = None`. +* TF pip packages now use CUDA11 and cuDNN 8.0.2. * XLA:CPU and XLA:GPU devices are no longer registered by default. Use `TF_XLA_FLAGS=--tf_xla_enable_xla_devices` if you really need them (to be removed). From 411fd532b578d2c4ebe89fefdc2b6962ae13e0a9 Mon Sep 17 00:00:00 2001 From: Jiho Choi Date: Mon, 12 Oct 2020 10:00:04 -0700 Subject: [PATCH 0137/1447] Fix TraceMe instrumentation for the padding size. 
PiperOrigin-RevId: 336685281 Change-Id: I831b71a42ae5ab0de8de51ec1b1f0223772d46b1 --- tensorflow/core/kernels/batching_util/BUILD | 2 ++ .../core/kernels/batching_util/batch_resource_base.cc | 7 +++++++ .../core/kernels/batching_util/shared_batch_scheduler.h | 4 +--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD index 16de576d2c9..8f233957032 100644 --- a/tensorflow/core/kernels/batching_util/BUILD +++ b/tensorflow/core/kernels/batching_util/BUILD @@ -249,6 +249,8 @@ cc_library( "//tensorflow/core/kernels/batching_util:threadsafe_status", "//tensorflow/core/platform:status", "//tensorflow/core/platform:thread_annotations", + "//tensorflow/core/profiler/lib:traceme", + "//tensorflow/core/profiler/lib:traceme_encode", "//tensorflow/core/util:incremental_barrier", ], ) diff --git a/tensorflow/core/kernels/batching_util/batch_resource_base.cc b/tensorflow/core/kernels/batching_util/batch_resource_base.cc index 98175b5b9d0..d638760b833 100644 --- a/tensorflow/core/kernels/batching_util/batch_resource_base.cc +++ b/tensorflow/core/kernels/batching_util/batch_resource_base.cc @@ -20,6 +20,8 @@ limitations under the License. 
#include "tensorflow/core/kernels/batching_util/concat_split_util.h" #include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/monitoring/percentile_sampler.h" +#include "tensorflow/core/profiler/lib/traceme.h" +#include "tensorflow/core/profiler/lib/traceme_encode.h" #include "tensorflow/core/util/incremental_barrier.h" namespace tensorflow { @@ -202,6 +204,11 @@ Status BatchResourceBase::ConcatInputTensors( const int padded_batch_size = RoundToLowestAllowedBatchSize(batch.size()); const int padding_amount = padded_batch_size - batch.size(); + profiler::TraceMe trace_me([padded_batch_size, padding_amount]() { + return profiler::TraceMeEncode( + "ConcatInputTensors", {{"batch_size_after_padding", padded_batch_size}, + {"padding_amount", padding_amount}}); + }); RecordPaddingSize(padding_amount, GetModelName(context), padded_batch_size); RecordProcessedBatchSize(padded_batch_size, GetModelName(context)); diff --git a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h index 04b84e6054e..9bb853f708e 100644 --- a/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h +++ b/tensorflow/core/kernels/batching_util/shared_batch_scheduler.h @@ -809,9 +809,7 @@ void Queue::ProcessBatch(std::unique_ptr> batch) { profiler::TraceMeConsumer trace_me( [&] { return profiler::TraceMeEncode( - "ProcessBatch", - {{"size", batch->size()}, - {"padding", max_execution_batch_size() - batch->size()}}); + "ProcessBatch", {{"batch_size_before_padding", batch->size()}}); }, profiler::ContextType::kSharedBatchScheduler, batch->traceme_context_id()); From d5bfcbf8b6200cb79e7f41de44ca78328e76687d Mon Sep 17 00:00:00 2001 From: Scott Zhu Date: Mon, 12 Oct 2020 10:28:36 -0700 Subject: [PATCH 0138/1447] Update TFOpLambda layer to handle ops that are not public TF API. The error of non tf public API is raised at get_config(), and it shouldn't prevent the execution of the layer. 
PiperOrigin-RevId: 336691891 Change-Id: I13cdf3486bf24a9d56b7e16be9074c263bdac1cc --- tensorflow/python/keras/layers/core.py | 6 +++++- tensorflow/python/keras/layers/core_test.py | 15 +++++++++++++++ 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py index 34d896f9486..212ce42ddaa 100644 --- a/tensorflow/python/keras/layers/core.py +++ b/tensorflow/python/keras/layers/core.py @@ -1312,8 +1312,12 @@ class TFOpLambda(Layer): # (For standard layers users could just set `name` when creating the # layer to work around a collision, but they can't do that for # auto-generated layers) + if self.symbol: + name = 'tf.' + self.symbol + else: + name = self.function.__name__ kwargs['name'] = K.unique_object_name( - 'tf.' + self.symbol, zero_based=True, avoid_observed_names=True) + name, zero_based=True, avoid_observed_names=True) kwargs['autocast'] = False # Decorate the function to produce this layer's call method diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py index b7a11d32c71..3f113bf8cbb 100644 --- a/tensorflow/python/keras/layers/core_test.py +++ b/tensorflow/python/keras/layers/core_test.py @@ -559,5 +559,20 @@ class CoreLayersTest(keras_parameterized.TestCase): self.assertAllEqual(np.ones((10, 20)), layer([x, y])) +@keras_parameterized.run_all_keras_modes +class TFOpLambdaTest(keras_parameterized.TestCase): + + def test_non_tf_symbol(self): + def dummy_func(a, b): + return a + b + + layer = core.TFOpLambda(dummy_func) + self.assertIsNone(layer.symbol) + self.assertEqual(layer.name, 'dummy_func') + + with self.assertRaisesRegex(ValueError, 'was generated from .*dummy_func'): + layer.get_config() + + if __name__ == '__main__': test.main() From 52df91c5634e6c666843849a1c6ff29b3d2676be Mon Sep 17 00:00:00 2001 From: Pankaj Kanwar Date: Mon, 12 Oct 2020 10:30:20 -0700 Subject: [PATCH 0139/1447] Create a V2 Op to stop the gradient 
when the input is out of range. PiperOrigin-RevId: 336692325 Change-Id: I36fd3fcfc58a30d5218beca512fbfc7c24b8b5cb --- RELEASE.md | 5 + tensorflow/cc/gradients/array_grad.cc | 29 ++-- tensorflow/compiler/tests/unary_ops_test.py | 6 +- .../api_def_QuantizeAndDequantizeV4.pbtxt | 8 ++ .../api_def_QuantizeAndDequantizeV4Grad.pbtxt | 8 ++ .../api_def_QuantizeAndDequantizeV4.pbtxt | 3 + .../api_def_QuantizeAndDequantizeV4Grad.pbtxt | 3 + .../api_def_QuantizeAndDequantizeV4.pbtxt | 4 + .../api_def_QuantizeAndDequantizeV4Grad.pbtxt | 4 + .../kernels/quantize_and_dequantize_op.cc | 126 ++++++++++++++++++ .../core/kernels/quantize_and_dequantize_op.h | 71 ++++++++++ .../quantize_and_dequantize_op_gpu.cu.cc | 40 ++++++ .../quantize_and_dequantize_op_test.cc | 48 +++++++ tensorflow/core/ops/array_ops.cc | 64 +++++++++ .../eager/pywrap_gradient_exclusions.cc | 7 +- .../python/kernel_tests/array_ops_test.py | 21 ++- tensorflow/python/ops/array_ops.py | 113 +++++++++++++++- .../tools/api/golden/v1/tensorflow.pbtxt | 4 + .../golden/v1/tensorflow.quantization.pbtxt | 4 + .../api/golden/v1/tensorflow.raw_ops.pbtxt | 8 ++ .../tools/api/golden/v2/tensorflow.pbtxt | 4 + .../golden/v2/tensorflow.quantization.pbtxt | 4 + .../api/golden/v2/tensorflow.raw_ops.pbtxt | 8 ++ 23 files changed, 574 insertions(+), 18 deletions(-) create mode 100644 tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV4.pbtxt create mode 100644 tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV4Grad.pbtxt create mode 100644 tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV4.pbtxt create mode 100644 tensorflow/core/api_def/java_api/api_def_QuantizeAndDequantizeV4Grad.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV4.pbtxt create mode 100644 tensorflow/core/api_def/python_api/api_def_QuantizeAndDequantizeV4Grad.pbtxt diff --git a/RELEASE.md b/RELEASE.md index f1eead827e5..78cbbd5c27a 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -47,6 
+47,11 @@ * `tf.data.experimental.service.WorkerServer` now takes a config tuple instead of individual arguments. Usages should be updated to `tf.data.experimental.service.WorkerServer(worker_config)`. +* `tf.quantization.quantize_and_dequantize_v2` has been introduced, which + updates the gradient definition for quantization which is outside the range + to be 0. To simulate the V1 the behavior of + tf.quantization.quantize_and_dequantize(...) use + tf.grad_pass_through(tf.quantization.quantize_and_dequantize_v2)(...). ## Known Caveats diff --git a/tensorflow/cc/gradients/array_grad.cc b/tensorflow/cc/gradients/array_grad.cc index e9173227aad..480243a29e6 100644 --- a/tensorflow/cc/gradients/array_grad.cc +++ b/tensorflow/cc/gradients/array_grad.cc @@ -15,13 +15,12 @@ limitations under the License. #include +#include "tensorflow/cc/framework/grad_op_registry.h" +#include "tensorflow/cc/framework/gradients.h" #include "tensorflow/cc/ops/array_ops_internal.h" #include "tensorflow/cc/ops/standard_ops.h" #include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/cc/framework/grad_op_registry.h" -#include "tensorflow/cc/framework/gradients.h" - namespace tensorflow { namespace ops { namespace { @@ -90,15 +89,25 @@ Status QuantizeAndDequantizeGrad(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("QuantizeAndDequantize", QuantizeAndDequantizeGrad); -Status QuantizeAndDequantizeV2Grad(const Scope& scope, const Operation& op, - const std::vector& grad_inputs, - std::vector* grad_outputs) { - grad_outputs->push_back(Identity(scope, grad_inputs[0])); - grad_outputs->push_back(NoGradient()); - grad_outputs->push_back(NoGradient()); +Status QuantizeAndDequantizeV4GradHelper(const Scope& scope, + const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + Input input = Shape(scope, op.input(0)); + Input input_min = op.input(1); + Input input_max = op.input(2); + int64 axis; + TF_RETURN_IF_ERROR(GetNodeAttr(op.node()->attrs(), 
"axis", &axis)); + auto qdq_v4_grad = QuantizeAndDequantizeV4Grad( + scope, grad_inputs[0], input, input_min, input_max, + QuantizeAndDequantizeV4Grad::Axis(axis)); + grad_outputs->push_back(qdq_v4_grad.input_backprop); + grad_outputs->push_back(qdq_v4_grad.input_min_backprop); + grad_outputs->push_back(qdq_v4_grad.input_max_backprop); return scope.status(); } -REGISTER_GRADIENT_OP("QuantizeAndDequantizeV2", QuantizeAndDequantizeV2Grad); +REGISTER_GRADIENT_OP("QuantizeAndDequantizeV4", + QuantizeAndDequantizeV4GradHelper); Status QuantizeAndDequantizeV3Grad(const Scope& scope, const Operation& op, const std::vector& grad_inputs, diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index b5f82bcff12..f3f6fa8ae52 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -542,7 +542,7 @@ class UnaryOpsTest(xla_test.XLATestCase): for dtype in self.float_types: def quantize_and_dequantize_v2(x): - return array_ops.quantize_and_dequantize_v2( + return array_ops.quantize_and_dequantize( x, -127, 127, signed_input=True, num_bits=8) self._assertOpOutputMatchesExpected( @@ -551,7 +551,7 @@ class UnaryOpsTest(xla_test.XLATestCase): expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype)) def quantize_and_dequantize_v2_round_half_up(x): - return array_ops.quantize_and_dequantize_v2( + return array_ops.quantize_and_dequantize( x, -1, 1.0, @@ -575,7 +575,7 @@ class UnaryOpsTest(xla_test.XLATestCase): dtype=dtype)) def quantize_and_dequantize_v2_round_half_to_even(x): - return array_ops.quantize_and_dequantize_v2( + return array_ops.quantize_and_dequantize( x, -1.0, 1.0, diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV4.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV4.pbtxt new file mode 100644 index 00000000000..a84ccb78436 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV4.pbtxt @@ -0,0 +1,8 
@@ +op { + graph_op_name: "QuantizeAndDequantizeV4" + summary: "Returns the gradient of `QuantizeAndDequantizeV4`." + description: <