diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py index 5c63ee7a97b..47ae2765822 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py @@ -71,7 +71,8 @@ def RunLSTM(sess, is_training=True, dropout=0., num_dirs=True, - dtype=dtypes.float32): + dtype=dtypes.float32, + num_proj=None): # TODO(jamesqin): add multi-layer tests. # TODO(jamesqin): add multi-dir tests assert num_layers == 1 @@ -94,7 +95,8 @@ def RunLSTM(sess, initial_h_op = variable_scope.get_variable( "initial_h_op", initializer=np.random.rand(batch_size, - num_units).astype(dtype.as_numpy_dtype), + num_proj if num_proj else num_units) + .astype(dtype.as_numpy_dtype), dtype=dtype) initial_c_op = variable_scope.get_variable( "initial_c_op", @@ -115,13 +117,19 @@ def RunLSTM(sess, with variable_scope.variable_scope("test", initializer=initializer): w = variable_scope.get_variable( "rnn/lstm_cell/kernel", - shape=[input_size + num_units, num_units * 4], + shape=[input_size + (num_proj if num_proj else num_units), + num_units * 4], dtype=dtype) b = variable_scope.get_variable( "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype) + if num_proj: + pw = variable_scope.get_variable( + "rnn/lstm_cell/projection/kernel", + shape=[num_units, num_proj], dtype=dtype) # canonical lstm. must set forget_bias to 0. to align with cudnn lstm. - cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True) + cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True, + num_proj=num_proj if num_proj else None) outputs_op, state_tuple_op = rnn.dynamic_rnn( cell, inputs_static, @@ -134,8 +142,12 @@ def RunLSTM(sess, # Convert to cudnn opaque param. 
format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM( - num_layers, num_units, input_size) - opaque_params = format_converter.tf_canonical_to_opaque([w, b]) + num_layers, num_units, input_size, + num_proj=num_proj if num_proj else None) + if num_proj: + opaque_params = format_converter.tf_canonical_to_opaque([w, b], [pw,]) + else: + opaque_params = format_converter.tf_canonical_to_opaque([w, b]) cu_initial_h_op = array_ops.expand_dims( initial_h_op, axis=(0 if time_major else 1)) @@ -150,16 +162,22 @@ def RunLSTM(sess, time_major=time_major, dropout=dropout, is_training=is_training, - rnn_mode=cudnn_rnn_ops.CUDNN_LSTM) + rnn_mode=cudnn_rnn_ops.CUDNN_LSTM, + num_proj=num_proj if num_proj else None) # Remove the trivial 1st dimension. cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple( c=array_ops.squeeze(cu_c_op, axis=0 if time_major else 1), h=array_ops.squeeze(cu_h_op, axis=0 if time_major else 1)) if is_training: - (inp_grad_op, hgrad_op, - cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients( - outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b]) + if num_proj: + (inp_grad_op, hgrad_op, + cgrad_op, wgrad_op, bgrad_op, pwgrad_op) = gradients_impl.gradients( + outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b, pw]) + else: + (inp_grad_op, hgrad_op, + cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients( + outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b]) (cu_inp_grad_op, cu_hgrad_op, cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients( @@ -170,10 +188,16 @@ def RunLSTM(sess, # Remove the trivial 1st dimension cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0 if time_major else 1) - cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical( - opaque_grad_op) + if num_proj: + cu_wgrad_op, cu_bgrad_op, cu_pwgrad_op = \ + format_converter.opaque_to_tf_canonical(opaque_grad_op) + else: + cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical( + opaque_grad_op) cu_wgrad_op = cu_wgrad_op[0] 
cu_bgrad_op = cu_bgrad_op[0] + if num_proj: + cu_pwgrad_op = cu_pwgrad_op[0] # cudnn lstm has 2 biases each gate. When converting to tf canonical format, # the two biases are summed into one. Thus here bias gradient should be # halved when comparing with tf lstm. @@ -183,18 +207,30 @@ def RunLSTM(sess, sess.run(init_op) if is_training: - outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([ - outputs_op, state_tuple_op, inp_grad_op, - (hgrad_op, cgrad_op), wgrad_op, bgrad_op - ]) - (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad, - cu_bgrad) = sess.run( - [ - cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op, - (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op - ], - feed_dict={inputs: inputs_np} if dynamic_shape_input else None) - + if num_proj: + outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad, pwgrad = \ + sess.run([ + outputs_op, state_tuple_op, inp_grad_op, + (hgrad_op, cgrad_op), wgrad_op, bgrad_op, pwgrad_op + ]) + (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad, + cu_bgrad, cu_pwgrad) = sess.run([ + cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op, + (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op, cu_pwgrad_op + ], + feed_dict={inputs: inputs_np} if dynamic_shape_input else None) + else: + outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([ + outputs_op, state_tuple_op, inp_grad_op, + (hgrad_op, cgrad_op), wgrad_op, bgrad_op + ]) + (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad, + cu_bgrad) = sess.run([ + cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op, + (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op + ], + feed_dict={inputs: inputs_np} if dynamic_shape_input else None) + logging.vlog(1, "outputs: %s" % outputs) logging.vlog(1, "cu_outputs: %s" % cu_outputs) logging.vlog(1, "state_tuple: %s" % str(state_tuple)) @@ -205,11 +241,20 @@ def RunLSTM(sess, logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad)) logging.vlog(1, "wgrad: %s" % 
str(wgrad)) logging.vlog(1, "bgrad: %s" % str(bgrad)) + if num_proj: + logging.vlog(1, "pwgrad: %s" % str(pwgrad)) logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad)) logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad)) - return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, - cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, - cu_bgrad) + if num_proj: + logging.vlog(1, "cu_pwgrad: %s" % str(cu_pwgrad)) + if num_proj: + return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, + cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, pwgrad, + cu_wgrad, cu_bgrad, cu_pwgrad) + else: + return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, + cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, + cu_bgrad) else: outputs, state_tuple = sess.run([outputs_op, state_tuple_op]) cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op], @@ -226,35 +271,43 @@ def RunLSTM(sess, # Basic set of RNN configs to test. They can be further extended in relevant # test (e.g. adding num_dirs). 
+#NAMED_RNN_TESTCASES = ({ +# "testcase_name": "xsmall", +# "num_units": 1, +# "input_size": 1, +# "batch_size": 1, +# "time": 1, +# "num_layers": 1, +#}, { +# "testcase_name": "small", +# "num_units": 4, +# "input_size": 4, +# "batch_size": 4, +# "time": 4, +# "num_layers": 1, +#}, { +# "testcase_name": "medium", +# "num_units": 128, +# "input_size": 64, +# "batch_size": 8, +# "time": 16, +# "num_layers": 1, +#}, { +# "testcase_name": "large", +# "num_units": 128, +# "input_size": 128, +# "batch_size": 16, +# "time": 32, +# "num_layers": 1, +#}) NAMED_RNN_TESTCASES = ({ - "testcase_name": "xsmall", - "num_units": 1, - "input_size": 1, - "batch_size": 1, - "time": 1, - "num_layers": 1, -}, { "testcase_name": "small", "num_units": 4, "input_size": 4, "batch_size": 4, "time": 4, "num_layers": 1, -}, { - "testcase_name": "medium", - "num_units": 128, - "input_size": 64, - "batch_size": 8, - "time": 16, - "num_layers": 1, -}, { - "testcase_name": "large", - "num_units": 128, - "input_size": 128, - "batch_size": 16, - "time": 32, - "num_layers": 1, -}) +}, ) def ExpandNamedTestCases(inputs, *remove_keys, **extra_configs): @@ -349,19 +402,22 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): time_major, dynamic_shape_input=False, rtol=3e-6, - atol=3e-6): + atol=3e-6, + num_proj=None): with self.session(use_gpu=True) as sess: - (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, cu_inp_grad, - state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunLSTM( - sess, - num_units, - input_size, - batch_size, - time, - num_layers, - variable_seq_lengths=variable_seq_lengths, - time_major=time_major, - dynamic_shape_input=dynamic_shape_input) + if num_proj is not None and num_proj != 0: + (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, + cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, pwgrad, cu_wgrad, + cu_bgrad, cu_pwgrad) = RunLSTM( + sess, num_units, input_size, batch_size, time, num_layers, + 
variable_seq_lengths=variable_seq_lengths, + time_major=time_major, dynamic_shape_input=dynamic_shape_input, num_proj=num_proj) + else: + (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, + cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, + cu_bgrad) = RunLSTM(sess, num_units, input_size, batch_size, time, + num_layers, variable_seq_lengths=variable_seq_lengths, + time_major=time_major, dynamic_shape_input=dynamic_shape_input, num_proj=num_proj) self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol) for s, cu_s in zip(state_tuple, cu_state_tuple): @@ -371,6 +427,8 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol) self.assertAllClose(bgrad, cu_bgrad, rtol=rtol, atol=atol) self.assertAllClose(wgrad, cu_wgrad, rtol=rtol, atol=atol) + if num_proj is not None and num_proj != 0: + self.assertAllClose(pwgrad, cu_pwgrad, rtol=rtol, atol=atol) @parameterized.named_parameters( ExpandNamedTestCases( @@ -378,10 +436,15 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], + "use_proj": [True, False], })) @test_util.run_gpu_only def test_training(self, num_units, input_size, batch_size, time, num_layers, - variable_seq_lengths, time_major, dynamic_shape_input): + variable_seq_lengths, time_major, dynamic_shape_input, + use_proj): + num_proj = num_units // 2 + if use_proj and num_proj == 0: + self.skipTest("num_proj cannot be 0") self._test_training_helper( num_units, input_size, @@ -391,7 +454,8 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): dtypes.float32, variable_seq_lengths=variable_seq_lengths, time_major=time_major, - dynamic_shape_input=dynamic_shape_input) + dynamic_shape_input=dynamic_shape_input, + num_proj=num_proj if use_proj else None) @parameterized.named_parameters( ExpandNamedTestCases( @@ -399,11 +463,15 @@ class 
CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], + "use_proj": [True, False], })) @test_util.run_gpu_only def test_training_fp16(self, num_units, input_size, batch_size, time, num_layers, variable_seq_lengths, time_major, - dynamic_shape_input): + dynamic_shape_input, use_proj): + num_proj = num_units // 2 + if use_proj and num_proj == 0: + self.skipTest("num_proj cannot be 0") self._test_training_helper( num_units, input_size, @@ -415,7 +483,8 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): atol=5e-4, variable_seq_lengths=variable_seq_lengths, time_major=time_major, - dynamic_shape_input=dynamic_shape_input) + dynamic_shape_input=dynamic_shape_input, + num_proj=num_proj if use_proj else None) @parameterized.named_parameters( ExpandNamedTestCases( @@ -423,10 +492,15 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], + "use_proj": [True, False], })) @test_util.run_gpu_only def test_inference(self, num_units, input_size, batch_size, time, num_layers, - variable_seq_lengths, time_major, dynamic_shape_input): + variable_seq_lengths, time_major, dynamic_shape_input, + use_proj): + num_proj = num_units // 2 + if use_proj and num_proj == 0: + self.skipTest("num_proj cannot be 0") with self.session(use_gpu=True) as sess: (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM( sess, @@ -438,7 +512,8 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): is_training=False, variable_seq_lengths=variable_seq_lengths, time_major=time_major, - dynamic_shape_input=dynamic_shape_input) + dynamic_shape_input=dynamic_shape_input, + num_proj=num_proj if use_proj else None) self.assertAllClose(outputs, cu_outputs) # h @@ -452,11 +527,15 @@ class 
CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], + "use_proj": [True, False], })) @test_util.run_gpu_only def test_inference_fp16(self, num_units, input_size, batch_size, time, num_layers, variable_seq_lengths, time_major, - dynamic_shape_input): + dynamic_shape_input, use_proj): + num_proj = num_units // 2 + if use_proj and num_proj == 0: + self.skipTest("num_proj cannot be 0") with self.session(use_gpu=True) as sess: (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM( sess, @@ -469,7 +548,8 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): dtype=dtypes.float16, variable_seq_lengths=variable_seq_lengths, time_major=time_major, - dynamic_shape_input=dynamic_shape_input) + dynamic_shape_input=dynamic_shape_input, + num_proj=num_proj if use_proj else None) rtol, atol = 5e-3, 5e-4 self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol) @@ -486,12 +566,16 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], + "use_proj": [True, False], })) @test_util.run_gpu_only def test_inference_with_dropout(self, num_units, input_size, batch_size, time, num_layers, variable_seq_lengths, time_major, - dynamic_shape_input): + dynamic_shape_input, use_proj): """Validates that dropout does not affect Cudnn Rnn inference.""" + num_proj = num_units // 2 + if use_proj and num_proj == 0: + self.skipTest("num_proj cannot be 0") # Hand-picked dropouts are used below (0. and 1.) 
with ops.Graph().as_default() as g: with self.session(use_gpu=True, graph=g) as sess: @@ -507,7 +591,8 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): dropout=0., variable_seq_lengths=variable_seq_lengths, time_major=time_major, - dynamic_shape_input=dynamic_shape_input) + dynamic_shape_input=dynamic_shape_input, + num_proj=num_proj if use_proj else None) with ops.Graph().as_default() as g: with self.session(use_gpu=True, graph=g) as sess: @@ -522,7 +607,8 @@ class CudnnLSTMTest(test_util.TensorFlowTestCase, parameterized.TestCase): dropout=1., variable_seq_lengths=variable_seq_lengths, time_major=time_major, - dynamic_shape_input=dynamic_shape_input) + dynamic_shape_input=dynamic_shape_input, + num_proj=num_proj if use_proj else None) self.assertAllClose(cu_outputs, cu_outputs2) # h @@ -890,40 +976,61 @@ class CudnnParamsFormatConverterTest(test_util.TensorFlowTestCase, parameterized.TestCase): """Class for testing various format converters.""" - def _test_lstm_helper(self, num_units, input_size, num_layers, direction): + def _test_lstm_helper(self, num_units, input_size, num_layers, direction, + num_proj=None): with self.session(use_gpu=True) as sess: random_seed.set_random_seed(0) np.random.seed(0) num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2 format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM( - num_layers, num_units, input_size, direction=direction) + num_layers, num_units, input_size, direction=direction, + num_proj=num_proj if num_proj else None) - ws, bs = [], [] + ws, bs, pws = [], [], [] for _ in range(num_layers * num_dirs): w = constant_op.constant( - np.random.rand(input_size + num_units, 4 * num_units), + np.random.rand(input_size + (num_proj if num_proj else num_units), + 4 * num_units), dtype=dtypes.float32) b = constant_op.constant( np.random.rand(4 * num_units), dtype=dtypes.float32) ws.append(w) bs.append(b) + if num_proj: + pw = constant_op.constant( + 
np.random.rand(num_units, num_proj), dtype=dtypes.float32) + pws.append(pw) + + if num_proj: + opaque_params = format_converter.tf_canonical_to_opaque(ws + bs, pws) + else: + opaque_params = format_converter.tf_canonical_to_opaque(ws + bs) - opaque_params = format_converter.tf_canonical_to_opaque(ws + bs) opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size( cudnn_rnn_ops.CUDNN_LSTM, num_layers, num_units, input_size, - direction=direction) + direction=direction, + num_proj=num_proj if num_proj else None) - ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params) + if num_proj: + ws_r, bs_r, pws_r = format_converter.opaque_to_tf_canonical( + opaque_params) + ws, ws_r, pws, bs, bs_r, pws_r = sess.run([ws, ws_r, pws, bs, bs_r, + pws_r]) + else: + ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params) + ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r]) # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical() # returns the original input. - ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r]) for w, w_r in zip(ws, ws_r): self.assertAllClose(w, w_r) + if num_proj: + for pw, pw_r in zip(pws, pws_r): + self.assertAllClose(pw, pw_r) for b, b_r in zip(bs, bs_r): self.assertAllClose(b, b_r) @@ -942,6 +1049,18 @@ class CudnnParamsFormatConverterTest(test_util.TensorFlowTestCase, self._test_lstm_helper(num_units, input_size, num_layers, cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) + @parameterized.named_parameters((c["testcase_name"], c["num_units"], + c["input_size"], c["num_layers"]) + for c in NAMED_RNN_TESTCASES) + @test_util.run_gpu_only + def test_lstmp(self, num_units, input_size, num_layers): + num_proj = num_units // 2 + if num_proj == 0: + self.skipTest("num_proj cannot be 0") + self._test_lstm_helper(num_units, input_size, num_layers, + cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION, + num_proj=num_proj) + @parameterized.named_parameters((c["testcase_name"], c["num_units"], c["input_size"], c["num_layers"]) for c in 
NAMED_RNN_TESTCASES) @@ -950,6 +1069,18 @@ class CudnnParamsFormatConverterTest(test_util.TensorFlowTestCase, self._test_lstm_helper(num_units, input_size, num_layers, cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION) + @parameterized.named_parameters((c["testcase_name"], c["num_units"], + c["input_size"], c["num_layers"]) + for c in NAMED_RNN_TESTCASES) + @test_util.run_gpu_only + def test_lstmp_bidi(self, num_units, input_size, num_layers): + num_proj = num_units // 2 + if num_proj == 0: + self.skipTest("num_proj cannot be 0") + self._test_lstm_helper(num_units, input_size, num_layers, + cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION, + num_proj=num_proj) + def _test_gru_helper(self, num_units, input_size, num_layers, direction): with self.session(use_gpu=True) as sess: random_seed.set_random_seed(0) diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index fd3dc975779..c3bcd02fdda 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -184,6 +184,7 @@ class CudnnParamsFormatConverter(object): num_layers, num_units, input_size, + num_proj=None, input_mode=CUDNN_INPUT_LINEAR_MODE, direction=CUDNN_RNN_UNIDIRECTION): """Constructor. @@ -193,6 +194,8 @@ class CudnnParamsFormatConverter(object): num_units: the number of units within the RNN model. input_size: the size of the input, it could be different from the num_units. + num_proj: The output dimensionality for the projection matrices. + If None or 0, no projection is performed. input_mode: indicate whether there is a linear projection between the input and the actual computation before the first layer. It could be one of 'linear_input', 'skip_input' or 'auto_select'. 
* 'linear_input' @@ -207,14 +210,16 @@ class CudnnParamsFormatConverter(object): self._input_size = input_size self._num_units = num_units self._input_mode = input_mode + self._num_proj = num_proj self._direction = direction self._num_dirs = 1 if self._direction == CUDNN_RNN_UNIDIRECTION else 2 self._num_params = ( self._num_params_per_layer * self._num_layers * self._num_dirs) - def tf_canonical_to_opaque(self, tf_canonicals): + def tf_canonical_to_opaque(self, tf_canonicals, weights_proj=None): r"""Converts tf canonical weights to cudnn opaque param.""" - cu_weights, cu_biases = self._tf_canonical_to_cu_canonical(tf_canonicals) + cu_weights, cu_biases = self._tf_canonical_to_cu_canonical(tf_canonicals, + weights_proj) cu_weights = [array_ops.reshape(w, [-1]) for w in cu_weights] opaque_params = self._cu_canonical_to_opaque(cu_weights, cu_biases) return opaque_params @@ -222,8 +227,14 @@ class CudnnParamsFormatConverter(object): def opaque_to_tf_canonical(self, opaque_param): r"""Converts cudnn opaque param to tf canonical weights.""" cu_weights, cu_biases = self._opaque_to_cu_canonical(opaque_param) - weights, biases = self._cu_canonical_to_tf_canonical(cu_weights, cu_biases) - return weights, biases + if self._num_proj: + weights, biases, weights_proj = self._cu_canonical_to_tf_canonical( + cu_weights, cu_biases) + return weights, biases, weights_proj + else: + weights, biases = self._cu_canonical_to_tf_canonical( + cu_weights, cu_biases) + return weights, biases def _opaque_to_cu_canonical(self, opaque_param): """Converts opaque params to Cudnn canonical format. @@ -235,15 +246,31 @@ class CudnnParamsFormatConverter(object): 2 list for weights and biases respectively. 
""" with ops.device("/gpu:0"): - weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical( - num_layers=self._num_layers, - num_units=self._num_units, - input_size=self._input_size, - params=opaque_param, - num_params=self._num_params, - rnn_mode=self._rnn_mode, - input_mode=self._input_mode, - direction=self._direction) + if self._num_proj: + num_params_weights = (self._num_params + + 1 * self._num_layers * self._num_dirs) + num_params_biases = self._num_params + weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical_v2( + num_layers=self._num_layers, + num_units=self._num_units, + input_size=self._input_size, + params=opaque_param, + rnn_mode=self._rnn_mode, + input_mode=self._input_mode, + direction=self._direction, + num_params_weights=num_params_weights, + num_params_biases=num_params_biases, + num_proj=self._num_proj) + else: + weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical( + num_layers=self._num_layers, + num_units=self._num_units, + input_size=self._input_size, + params=opaque_param, + num_params=self._num_params, + rnn_mode=self._rnn_mode, + input_mode=self._input_mode, + direction=self._direction) return (weights, biases) def _cu_canonical_to_opaque(self, cu_weights, cu_biases): @@ -256,16 +283,28 @@ class CudnnParamsFormatConverter(object): a single opaque tensor. 
""" with ops.device("/gpu:0"): - return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params( - num_layers=self._num_layers, - num_units=self._num_units, - input_size=self._input_size, - weights=cu_weights, - biases=cu_biases, - rnn_mode=self._rnn_mode, - input_mode=self._input_mode, - direction=self._direction) - + if self._num_proj: + return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params_v2( + num_layers=self._num_layers, + num_units=self._num_units, + input_size=self._input_size, + weights=cu_weights, + biases=cu_biases, + rnn_mode=self._rnn_mode, + input_mode=self._input_mode, + num_proj=self._num_proj, + direction=self._direction) + else: + return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params( + num_layers=self._num_layers, + num_units=self._num_units, + input_size=self._input_size, + weights=cu_weights, + biases=cu_biases, + rnn_mode=self._rnn_mode, + input_mode=self._input_mode, + direction=self._direction) + def _cu_canonical_to_tf_canonical(self, cu_weights, cu_biases): r"""Transform from Cudnn canonical to tf canonical. @@ -289,9 +328,11 @@ class CudnnParamsFormatConverter(object): 1 tuple, tf canonical weights and biases. 
""" tf_weights, tf_biases = [], [] + tf_weights_proj = [] layer_weights_num = self._num_params_per_layer * self._num_dirs layer_biases_num = layer_weights_num + layer_weights_num += (1 * self._num_dirs) if self._num_proj else 0 for i in range(self._num_layers): layer_weights = cu_weights[i * layer_weights_num:(i + 1) * @@ -299,7 +340,7 @@ class CudnnParamsFormatConverter(object): layer_biases = cu_biases[i * layer_biases_num:(i + 1) * layer_biases_num] if self._direction == CUDNN_RNN_UNIDIRECTION: self._cu_canonical_to_tf_canonical_single_layer( - layer_weights, layer_biases, tf_weights, tf_biases) + layer_weights, layer_biases, tf_weights, tf_biases, tf_weights_proj) else: fw_weights = layer_weights[:len(layer_weights) // 2] bw_weights = layer_weights[len(layer_weights) // 2:] @@ -311,6 +352,7 @@ class CudnnParamsFormatConverter(object): fw_biases, tf_weights, tf_biases, + tf_weights_proj, ) self._cu_canonical_to_tf_canonical_single_layer( @@ -318,11 +360,16 @@ class CudnnParamsFormatConverter(object): bw_biases, tf_weights, tf_biases, + tf_weights_proj, ) - return (tf_weights, tf_biases) + if self._num_proj: + return (tf_weights, tf_biases, tf_weights_proj) + else: + return (tf_weights, tf_biases) def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases, - tf_weights, tf_biases): + tf_weights, tf_biases, + tf_weights_proj=None): r"""Transform single layer Cudnn canonicals to tf canonicals. The elements of cu_weights, cu_biases are laid out in the following format: @@ -337,7 +384,7 @@ class CudnnParamsFormatConverter(object): """ raise NotImplementedError("Abstract method") - def _tf_canonical_to_cu_canonical(self, tf_canonicals): + def _tf_canonical_to_cu_canonical(self, tf_canonicals, weights_proj): r"""Transform from tf canonical to Cudnn canonical. This is the reverse routine of _TransformCanonical(). 
@@ -356,6 +403,7 @@ class CudnnParamsFormatConverter(object): --------------- |fwd |bak | --------------- + weights_proj: (optional) weights matrices for projection Returns: 2 lists: the recovered cudnn canonical weights and biases. """ @@ -370,6 +418,9 @@ class CudnnParamsFormatConverter(object): layer_biases = biases[i * layer_biases_num:(i + 1) * layer_biases_num] if self._direction == CUDNN_RNN_UNIDIRECTION: cu_weights.extend(self._tf_to_cudnn_weights(i, *layer_weights)) + if weights_proj is not None: + pw = array_ops.transpose(weights_proj[i]) + cu_weights.append(pw) cu_biases.extend(self._tf_to_cudnn_biases(*layer_biases)) else: fw_weights, bw_weights = layer_weights[:len( @@ -377,9 +428,15 @@ class CudnnParamsFormatConverter(object): fw_biases, bw_biases = layer_biases[:len( layer_biases) // 2], layer_biases[len(layer_biases) // 2:] cu_weights.extend(self._tf_to_cudnn_weights(i, *fw_weights)) + if weights_proj is not None: + pw0 = array_ops.transpose(weights_proj[2*i+0]) + cu_weights.append(pw0) cu_biases.extend(self._tf_to_cudnn_biases(*fw_biases)) cu_weights.extend(self._tf_to_cudnn_weights(i, *bw_weights)) + if weights_proj is not None: + pw1 = array_ops.transpose(weights_proj[2*i+1]) + cu_weights.append(pw1) cu_biases.extend(self._tf_to_cudnn_biases(*bw_biases)) return cu_weights, cu_biases @@ -415,7 +472,10 @@ class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter): def _cudnn_to_tf_weights(self, *cu_weights): r"""Stitching cudnn canonical weights to generate tf canonical weights.""" - w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o = cu_weights + if self._num_proj: + w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o, pw = cu_weights + else: + w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o = cu_weights # pylint: disable=invalid-name W_i = array_ops.concat([w_i, r_i], axis=1) @@ -425,7 +485,11 @@ class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter): # pylint: enable=invalid-name # Cudnn LSTM weights are in ifco order, other tf LSTMs are in icfo order. 
reordered = self._cudnn_to_tf_gate_params(* [W_i, W_f, W_c, W_o]) - return (array_ops.transpose(array_ops.concat(reordered, axis=0)),) + if self._num_proj: + return (array_ops.transpose(array_ops.concat(reordered, axis=0)), + array_ops.transpose(pw)) + else: + return (array_ops.transpose(array_ops.concat(reordered, axis=0)),) def _tf_to_cudnn_weights(self, layer, *tf_weights): r"""Reverse the operations in StitchWeights().""" @@ -434,7 +498,7 @@ class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter): if layer == 0: input_weight_width = input_size else: - input_weight_width = num_units + input_weight_width = self._num_proj if self._num_proj else num_units if self._direction == CUDNN_RNN_BIDIRECTION: input_weight_width *= 2 @@ -444,10 +508,15 @@ class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter): W_i, W_f, W_c, W_o = self._tf_to_cudnn_gate_params(*array_ops.split( w, 4, axis=0)) - w_i, r_i = array_ops.split(W_i, [input_weight_width, num_units], axis=1) - w_c, r_c = array_ops.split(W_c, [input_weight_width, num_units], axis=1) - w_f, r_f = array_ops.split(W_f, [input_weight_width, num_units], axis=1) - w_o, r_o = array_ops.split(W_o, [input_weight_width, num_units], axis=1) + hidden_state_width = self._num_proj if self._num_proj else num_units + w_i, r_i = array_ops.split(W_i, [input_weight_width, hidden_state_width], + axis=1) + w_c, r_c = array_ops.split(W_c, [input_weight_width, hidden_state_width], + axis=1) + w_f, r_f = array_ops.split(W_f, [input_weight_width, hidden_state_width], + axis=1) + w_o, r_o = array_ops.split(W_o, [input_weight_width, hidden_state_width], + axis=1) return w_i, w_f, w_c, w_o, r_i, r_f, r_c, r_o # pylint: enable=invalid-name @@ -483,10 +552,16 @@ class CudnnParamsFormatConverterLSTM(CudnnParamsFormatConverter): return b_wi, b_wf, b_wc, b_wo, b_ri, b_rf, b_rc, b_ro def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases, - tf_weights, tf_biases): - (w,) = self._cudnn_to_tf_weights(*cu_weights) + 
tf_weights, tf_biases, + tf_weights_proj=None): + if self._num_proj: + (w, pw) = self._cudnn_to_tf_weights(*cu_weights) + tf_weights.append(w) + tf_weights_proj.append(pw) + else: + (w,) = self._cudnn_to_tf_weights(*cu_weights) + tf_weights.append(w) (b,) = self._cudnn_to_tf_biases(*cu_biases) - tf_weights.append(w) tf_biases.append(b) @@ -554,7 +629,8 @@ class CudnnParamsFormatConverterGRU(CudnnParamsFormatConverter): return b_wi, b_wr, b_wh, b_ri, b_rr, b_rh def _cu_canonical_to_tf_canonical_single_layer(self, cu_weights, cu_biases, - tf_weights, tf_biases): + tf_weights, tf_biases, + tf_weights_proj=None): # pylint: disable=invalid-name W_ir, w_h, r_h = self._cudnn_to_tf_weights(*cu_weights) b_ir, b_wh, b_rh = self._cudnn_to_tf_biases(*cu_biases) @@ -727,8 +803,9 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject): def format_converter(self): if self._format_converter is None: self._format_converter = self._format_converter_cls( - self._num_layers, self._num_units, self._input_size, self._input_mode, - self._direction) + self._num_layers, self._num_units, self._input_size, + input_mode=self._input_mode, + direction=self._direction) return self._format_converter def restore(self, restored_tensors, restored_shapes): @@ -962,6 +1039,7 @@ def _cudnn_rnn(inputs, direction=CUDNN_RNN_UNIDIRECTION, dropout=0., seed=0, + num_proj=None, name=None): """Cudnn RNN. @@ -999,6 +1077,8 @@ def _cudnn_rnn(inputs, dropout: whether to enable dropout. With it is 0, dropout is disabled. seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. + num_proj: The output dimensionality for the projection matrices. + If None or 0, no projection is performed. name: name of the operation. 
Returns: outputs, output_h, output_c @@ -1027,13 +1107,15 @@ def _cudnn_rnn(inputs, if sequence_lengths is not None: args["sequence_lengths"] = sequence_lengths args["time_major"] = time_major + args["num_proj"] = 0 if num_proj is None else num_proj outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args) - elif time_major is False: + elif time_major is False or num_proj: batch_size = array_ops.shape(inputs)[0] max_time = array_ops.shape(inputs)[1] sequence_lengths = array_ops.fill([batch_size], max_time) args["sequence_lengths"] = sequence_lengths args["time_major"] = time_major + args["num_proj"] = 0 if num_proj is None else num_proj outputs, output_h, output_c, _, _ = gen_cudnn_rnn_ops.cudnn_rnnv3(**args) elif use_cudnn_v2 != "1": outputs, output_h, output_c, _ = gen_cudnn_rnn_ops.cudnn_rnn(**args) @@ -1053,6 +1135,7 @@ def cudnn_lstm(inputs, direction=CUDNN_RNN_UNIDIRECTION, dropout=0., seed=0, + num_proj=None, name=None): """Cudnn LSTM. @@ -1089,13 +1172,15 @@ def cudnn_lstm(inputs, dropout: whether to enable dropout. With it is 0, dropout is disabled. seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. + num_proj: The output dimensionality for the projection matrices. + If None or 0, no projection is performed. name: name of the operation. 
Returns: outputs, output_h, output_c """ return _cudnn_rnn(inputs, input_h, input_c, params, is_training, CUDNN_LSTM, sequence_lengths, time_major, input_mode, direction, - dropout, seed, name) + dropout, seed, num_proj, name) def _cudnn_rnn_no_input_c(inputs, @@ -1151,7 +1236,7 @@ def _cudnn_rnn_no_input_c(inputs, input_c = array_ops.constant([], dtype=input_h.dtype) outputs, output_h, _ = _cudnn_rnn( inputs, input_h, input_c, params, is_training, rnn_mode, sequence_lengths, - time_major, input_mode, direction, dropout, seed, name) + time_major, input_mode, direction, dropout, seed, None, name) return outputs, output_h @@ -1322,6 +1407,7 @@ def cudnn_rnn_opaque_params_to_canonical(rnn_mode, direction=CUDNN_RNN_UNIDIRECTION, dropout=0, seed=0, + num_proj=None, name=None): """Convert cudnn opaque params to canonical. @@ -1346,6 +1432,8 @@ def cudnn_rnn_opaque_params_to_canonical(rnn_mode, dropout: whether to enable dropout. With it is 0, dropout is disabled. seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. + num_proj: The output dimensionality for the projection matrices. + If None or 0, no projection is performed. name: name of the operation. 
Returns: weights list and bias list @@ -1358,19 +1446,39 @@ def cudnn_rnn_opaque_params_to_canonical(rnn_mode, check_input_mode(input_mode) num_params = _get_num_params(rnn_mode, num_layers, direction) seed, seed2 = random_seed.get_seed(seed) - weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical( - rnn_mode=rnn_mode, - num_layers=num_layers, - num_units=num_units, - input_size=input_size, - params=params, - input_mode=input_mode, - direction=direction, - dropout=dropout, - seed=seed, - seed2=seed2, - num_params=num_params, - name=name) + num_dirs = 1 if direction == CUDNN_RNN_UNIDIRECTION else 2 + if num_proj is not None and num_proj != 0: + num_params_weights = (num_params + 1 * num_layers * num_dirs) + num_params_biases = num_params + weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical_v2( + rnn_mode=rnn_mode, + num_layers=num_layers, + num_units=num_units, + input_size=input_size, + params=params, + input_mode=input_mode, + direction=direction, + dropout=dropout, + seed=seed, + seed2=seed2, + num_params_weights=num_params_weights, + num_params_biases=num_params_biases, + num_proj=num_proj, + name=name) + else: + weights, biases = gen_cudnn_rnn_ops.cudnn_rnn_params_to_canonical( + rnn_mode=rnn_mode, + num_layers=num_layers, + num_units=num_units, + input_size=input_size, + params=params, + input_mode=input_mode, + direction=direction, + dropout=dropout, + seed=seed, + seed2=seed2, + num_params=num_params, + name=name) return weights, biases @@ -1384,6 +1492,7 @@ def cudnn_rnn_canonical_to_opaque_params(rnn_mode, direction=CUDNN_RNN_UNIDIRECTION, dropout=0, seed=0, + num_proj=None, name=None): """Converts params from the canonical format to a specific format of cuDNN. @@ -1409,6 +1518,8 @@ def cudnn_rnn_canonical_to_opaque_params(rnn_mode, dropout: whether to enable dropout. With it is 0, dropout is disabled. seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. 
+ num_proj: The output dimensionality for the projection matrices. + If None or 0, no projection is performed. name: name of the operation. Returns: an opaque Cudnn param. @@ -1419,20 +1530,35 @@ def cudnn_rnn_canonical_to_opaque_params(rnn_mode, check_direction(direction) check_input_mode(input_mode) seed, seed2 = random_seed.get_seed(seed) - return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params( - rnn_mode=rnn_mode, - num_layers=num_layers, - num_units=num_units, - input_size=input_size, - weights=weights, - biases=biases, - input_mode=input_mode, - direction=direction, - dropout=dropout, - seed=seed, - seed2=seed2, - name=name) - + if num_proj is not None and num_proj != 0: + return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params_v2( + rnn_mode=rnn_mode, + num_layers=num_layers, + num_units=num_units, + input_size=input_size, + weights=weights, + biases=biases, + input_mode=input_mode, + direction=direction, + dropout=dropout, + seed=seed, + seed2=seed2, + num_proj=num_proj, + name=name) + else: + return gen_cudnn_rnn_ops.cudnn_rnn_canonical_to_params( + rnn_mode=rnn_mode, + num_layers=num_layers, + num_units=num_units, + input_size=input_size, + weights=weights, + biases=biases, + input_mode=input_mode, + direction=direction, + dropout=dropout, + seed=seed, + seed2=seed2, + name=name) def cudnn_rnn_opaque_params_size(rnn_mode, num_layers, @@ -1443,6 +1569,7 @@ def cudnn_rnn_opaque_params_size(rnn_mode, dtype=dtypes.float32, dropout=0, seed=0, + num_proj=None, name=None): """Returns opaque params size for specific Cudnn config. @@ -1467,6 +1594,8 @@ def cudnn_rnn_opaque_params_size(rnn_mode, dropout: whether to enable dropout. With it is 0, dropout is disabled. seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. + num_proj: The output dimensionality for the projection matrices. + If None or 0, no projection is performed. name: name of the operation. Returns: a int, size of Cudnn opaque params. 
@@ -1482,6 +1611,7 @@ def cudnn_rnn_opaque_params_size(rnn_mode, num_layers=num_layers, num_units=num_units, input_size=input_size, + num_proj=num_proj, T=dtype, S=dtypes.int32, dropout=dropout, @@ -1510,7 +1640,8 @@ class _CudnnRNN(object): direction=CUDNN_RNN_UNIDIRECTION, dtype=dtypes.float32, dropout=0., - seed=0): + seed=0, + num_proj=None): """Creates a CudnnRNN model from model spec. Args: @@ -1534,6 +1665,8 @@ class _CudnnRNN(object): dropout: whether to enable dropout. With it is 0, dropout is disabled. seed: the op seed used for initializing dropout. See `tf.set_random_seed` for behavior. + num_proj: The output dimensionality for the projection matrices. + If None or 0, no projection is performed. Raises: ValueError: if direction is invalid. """ @@ -1546,6 +1679,7 @@ class _CudnnRNN(object): self._dtype = dtype self._dropout = dropout self._seed = seed + self._num_proj = num_proj @property def input_mode(self): @@ -1571,6 +1705,10 @@ class _CudnnRNN(object): def direction(self): return self._direction + @property + def num_proj(self): + return self._num_proj + def params_size(self): """Calculates the size of the opaque parameter buffer needed for this model. @@ -1582,6 +1720,7 @@ class _CudnnRNN(object): num_layers=self._num_layers, num_units=self._num_units, input_size=self._input_size, + num_proj=self._num_proj, dtype=self._dtype, dropout=self._dropout, seed=self._seed, @@ -1637,7 +1776,8 @@ class _CudnnRNN(object): input_mode=self._input_mode, direction=self._direction, dropout=self._dropout, - seed=self._seed) + seed=self._seed, + num_proj=self._num_proj) def params_to_canonical(self, params): """Converts params from a specific format of cuDNN to the canonical format. 
@@ -1657,7 +1797,8 @@ class _CudnnRNN(object): input_mode=self._input_mode, direction=self._direction, dropout=self._dropout, - seed=self._seed) + seed=self._seed, + num_proj=self._num_proj) def canonical_to_params(self, weights, biases): """Converts params from the canonical format to a specific format of cuDNN. @@ -1679,7 +1820,8 @@ class _CudnnRNN(object): input_mode=self._input_mode, direction=self._direction, dropout=self._dropout, - seed=self._seed) + seed=self._seed, + num_proj=self._num_proj) class CudnnLSTM(_CudnnRNN): @@ -1697,7 +1839,8 @@ class CudnnLSTM(_CudnnRNN): direction=CUDNN_RNN_UNIDIRECTION, dtype=dtypes.float32, dropout=0., - seed=0): + seed=0, + num_proj=None): """Creates a Cudnn LSTM model from model spec. Args: @@ -1716,6 +1859,8 @@ class CudnnLSTM(_CudnnRNN): dtype: dtype of params, tf.float32 or tf.float64. dropout: whether to enable dropout. With it is 0, dropout is disabled. seed: the seed used for initializing dropout. + num_proj: The output dimensionality for the projection matrices. + If None or 0, no projection is performed. """ super(CudnnLSTM, self).__init__( CUDNN_LSTM, @@ -1726,7 +1871,8 @@ class CudnnLSTM(_CudnnRNN): direction=direction, dtype=dtype, dropout=dropout, - seed=seed) + seed=seed, + num_proj=num_proj) def __call__(self, input_data, diff --git a/tensorflow/core/kernels/cudnn_rnn_ops.cc b/tensorflow/core/kernels/cudnn_rnn_ops.cc index d43fe747333..e70fa88b2cf 100644 --- a/tensorflow/core/kernels/cudnn_rnn_ops.cc +++ b/tensorflow/core/kernels/cudnn_rnn_ops.cc @@ -502,20 +502,23 @@ struct CudnnRnnModelShapes { int dir_count; int max_seq_length; int batch_size; + int c_num_units; TensorShape input_shape; TensorShape output_shape; TensorShape hidden_state_shape; + TensorShape c_state_shape; // At present only fields related to cached RnnDescriptor are concerned. 
bool IsCompatibleWith(const CudnnRnnModelShapes& rhs) const { return num_layers == rhs.num_layers && input_size == rhs.input_size && - num_units == rhs.num_units && dir_count == rhs.dir_count; + num_units == rhs.num_units && dir_count == rhs.dir_count && + c_num_units == rhs.c_num_units; } string DebugString() const { return strings::Printf( "[num_layers, input_size, num_units, dir_count, max_seq_length, " - "batch_size]: [%d, %d, %d, %d, %d, %d] ", + "batch_size, c_num_units]: [%d, %d, %d, %d, %d, %d, %d] ", num_layers, input_size, num_units, dir_count, max_seq_length, - batch_size); + batch_size, c_num_units); } }; @@ -562,6 +565,7 @@ Status ExtractForwardInput(OpKernelContext* context, const CudnnModelTypes& model_types, bool time_major, const Tensor** input, const Tensor** input_h, const Tensor** input_c, const Tensor** params, + const int num_proj, CudnnRnnModelShapes* model_shapes) { TF_RETURN_IF_ERROR(context->input("input", input)); TF_RETURN_IF_ERROR(context->input("input_h", input_h)); @@ -615,12 +619,48 @@ Status ExtractForwardInput(OpKernelContext* context, model_shapes->hidden_state_shape.DebugString()); } if (model_types.HasInputC()) { - if ((*input_h)->shape() != (*input_c)->shape()) { - return errors::InvalidArgument( - "input_h and input_c must have the same shape: ", - (*input_h)->shape().DebugString(), " ", - (*input_c)->shape().DebugString()); + model_shapes->c_num_units = (*input_c)->dim_size(2); + if (time_major) { + model_shapes->c_state_shape = + TensorShape({model_shapes->dir_count * model_shapes->num_layers, + model_shapes->batch_size, model_shapes->c_num_units}); + } else { + model_shapes->c_state_shape = + TensorShape({model_shapes->batch_size, + model_shapes->dir_count * model_shapes->num_layers, + model_shapes->c_num_units}); } + if (num_proj == 0) { + if ((*input_h)->shape() != (*input_c)->shape()) { + return errors::InvalidArgument( + "input_h and input_c must have the same shape w/o projection: ", + 
(*input_h)->shape().DebugString(), " ", + (*input_c)->shape().DebugString()); + } + } else { + if ((*input_h)->dim_size(2) > (*input_c)->dim_size(2) || + num_proj != (*input_h)->dim_size(2) || + (*input_h)->dim_size(0) != (*input_c)->dim_size(0) || + (*input_h)->dim_size(1) != (*input_c)->dim_size(1)) { + return errors::InvalidArgument( + "Invalid input_h and input_c w/ projection size: ", num_proj, " ", + (*input_h)->shape().DebugString(), " ", + (*input_c)->shape().DebugString()); + } + } + } else { + // dummy c_state_shape TODO(kaixih): remove the time_major branch + if (time_major) { + model_shapes->c_state_shape = + TensorShape({model_shapes->dir_count * model_shapes->num_layers, + model_shapes->batch_size, model_shapes->num_units}); + } else { + model_shapes->c_state_shape = + TensorShape({model_shapes->batch_size, + model_shapes->dir_count * model_shapes->num_layers, + model_shapes->num_units}); + } + model_shapes->c_num_units = 0; } if (time_major) { model_shapes->output_shape = @@ -639,18 +679,19 @@ Status ExtractForwardInput(OpKernelContext* context, const CudnnModelTypes& model_types, bool time_major, const Tensor** input, const Tensor** input_h, const Tensor** input_c, const Tensor** params, - const Tensor** sequence_lengths, + const Tensor** sequence_lengths, const int num_proj, CudnnRnnModelShapes* model_shapes) { TF_RETURN_IF_ERROR(context->input("sequence_lengths", sequence_lengths)); return ExtractForwardInput(context, model_types, time_major, input, input_h, - input_c, params, model_shapes); + input_c, params, num_proj, model_shapes); } template Status CreateForwardAndBackwardIODescriptors( OpKernelContext* context, const CudnnRnnModelShapes& model_shapes, std::unique_ptr* input_desc, - std::unique_ptr* state_desc, + std::unique_ptr* h_state_desc, + std::unique_ptr* c_state_desc, std::unique_ptr* output_desc, const absl::Span& seq_lengths, bool time_major) { StreamExecutor* executor = context->op_device_context()->stream()->parent(); @@ -658,6 
+699,7 @@ Status CreateForwardAndBackwardIODescriptors( const TensorShape& input_shape = model_shapes.input_shape; const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape; + const TensorShape& c_state_shape = model_shapes.c_state_shape; const TensorShape& output_shape = model_shapes.output_shape; DCHECK_EQ(input_shape.dims(), 3); @@ -689,13 +731,28 @@ Status CreateForwardAndBackwardIODescriptors( hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(1), hidden_state_shape.dim_size(2), data_type); TF_RETURN_IF_ERROR(hidden_state_desc_s.status()); - *state_desc = hidden_state_desc_s.ConsumeValueOrDie(); + *h_state_desc = hidden_state_desc_s.ConsumeValueOrDie(); } else { auto hidden_state_desc_s = executor->createRnnStateTensorDescriptor( hidden_state_shape.dim_size(1), hidden_state_shape.dim_size(0), hidden_state_shape.dim_size(2), data_type); TF_RETURN_IF_ERROR(hidden_state_desc_s.status()); - *state_desc = hidden_state_desc_s.ConsumeValueOrDie(); + *h_state_desc = hidden_state_desc_s.ConsumeValueOrDie(); + } + + DCHECK_EQ(c_state_shape.dims(), 3); + if (time_major) { + auto c_state_desc_s = executor->createRnnStateTensorDescriptor( + c_state_shape.dim_size(0), c_state_shape.dim_size(1), + c_state_shape.dim_size(2), data_type); + TF_RETURN_IF_ERROR(c_state_desc_s.status()); + *c_state_desc = c_state_desc_s.ConsumeValueOrDie(); + } else { + auto c_state_desc_s = executor->createRnnStateTensorDescriptor( + c_state_shape.dim_size(1), c_state_shape.dim_size(0), + c_state_shape.dim_size(2), data_type); + TF_RETURN_IF_ERROR(c_state_desc_s.status()); + *c_state_desc = c_state_desc_s.ConsumeValueOrDie(); } DCHECK_EQ(output_shape.dims(), 3); @@ -739,7 +796,8 @@ Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc, ScratchAllocator* workspace_allocator, ProfileResult* output_profile_result) { std::unique_ptr input_desc; - std::unique_ptr state_desc; + std::unique_ptr h_state_desc; + std::unique_ptr c_state_desc; std::unique_ptr 
output_desc; absl::Span seq_lengths; @@ -748,8 +806,8 @@ Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc, sequence_lengths->template flat().data(), model_shapes.batch_size); } TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors( - context, model_shapes, &input_desc, &state_desc, &output_desc, - seq_lengths, time_major)); + context, model_shapes, &input_desc, &h_state_desc, &c_state_desc, + &output_desc, seq_lengths, time_major)); auto input_data = AsDeviceMemory(input); auto input_h_data = AsDeviceMemory(input_h); @@ -769,11 +827,11 @@ Status DoForward(OpKernelContext* context, const RnnDescriptor& rnn_desc, Stream* stream = context->op_device_context()->stream(); bool launch_success = stream - ->ThenRnnForward(rnn_desc, *input_desc, input_data, *state_desc, - input_h_data, *state_desc, input_c_data, params_data, - *output_desc, &output_data, *state_desc, - &output_h_data, *state_desc, &output_c_data, - is_training, reserve_space_allocator, + ->ThenRnnForward(rnn_desc, *input_desc, input_data, *h_state_desc, + input_h_data, *c_state_desc, input_c_data, + params_data, *output_desc, &output_data, + *h_state_desc, &output_h_data, *c_state_desc, + &output_c_data, is_training, reserve_space_allocator, workspace_allocator, output_profile_result) .ok(); return launch_success @@ -801,7 +859,8 @@ Status DoBackward( ScratchAllocator* workspace_allocator, ProfileResult* output_profile_result) { std::unique_ptr input_desc; - std::unique_ptr state_desc; + std::unique_ptr h_state_desc; + std::unique_ptr c_state_desc; std::unique_ptr output_desc; absl::Span seq_lengths; @@ -810,8 +869,8 @@ Status DoBackward( sequence_lengths->template flat().data(), model_shapes.batch_size); } TF_RETURN_IF_ERROR(CreateForwardAndBackwardIODescriptors( - context, model_shapes, &input_desc, &state_desc, &output_desc, - seq_lengths, time_major)); + context, model_shapes, &input_desc, &h_state_desc, &c_state_desc, + &output_desc, seq_lengths, time_major)); auto input_data 
= AsDeviceMemory(input); auto input_h_data = AsDeviceMemory(input_h); @@ -847,15 +906,16 @@ Status DoBackward( Stream* stream = context->op_device_context()->stream(); bool launch_success = stream - ->ThenRnnBackward(rnn_desc, *input_desc, input_data, *state_desc, - input_h_data, *state_desc, input_c_data, - params_data, *output_desc, output_data, *state_desc, - output_h_data, *state_desc, output_c_data, - output_backprop_data, output_h_backprop_data, - output_c_backprop_data, &input_backprop_data, - &input_h_backprop_data, &input_c_backprop_data, - ¶ms_backprop_data, &reserve_space_uint8, - workspace_allocator, output_profile_result) + ->ThenRnnBackward(rnn_desc, *input_desc, input_data, *h_state_desc, + input_h_data, *c_state_desc, input_c_data, + params_data, *output_desc, output_data, + *h_state_desc, output_h_data, *c_state_desc, + output_c_data, output_backprop_data, + output_h_backprop_data, output_c_backprop_data, + &input_backprop_data, &input_h_backprop_data, + &input_c_backprop_data, ¶ms_backprop_data, + &reserve_space_uint8, workspace_allocator, + output_profile_result) .ok(); return launch_success ? Status::OK() @@ -932,7 +992,7 @@ class CudnnRNNKernelCommon : public OpKernel { bool ResetRndGenState() { return reset_rnd_gen_state_; } template - Status ExtractCudnnRNNParamsInfo(OpKernelContext* context, + Status ExtractCudnnRNNParamsInfo(OpKernelContext* context, int num_proj, std::unique_ptr* rnn_desc) { const Tensor* num_layers_t = nullptr; TF_RETURN_IF_ERROR(context->input("num_layers", &num_layers_t)); @@ -953,6 +1013,9 @@ class CudnnRNNKernelCommon : public OpKernel { } int input_size = input_size_t->scalar()(); + int h_num_units = (num_proj == 0 ? num_units : num_proj); + int c_num_units = (num_proj == 0 ? 
0 : num_units); + RnnInputMode input_mode; TF_RETURN_IF_ERROR( ToRNNInputMode(rnn_input_mode(), num_units, input_size, &input_mode)); @@ -962,9 +1025,10 @@ class CudnnRNNKernelCommon : public OpKernel { // random number generator, therefore set state_allocator to nullptr. const AlgorithmConfig algo_config; auto rnn_desc_s = stream->parent()->createRnnDescriptor( - num_layers, num_units, input_size, /*batch_size=*/0, input_mode, - rnn_direction_mode(), rnn_mode(), ToDataType::value, algo_config, - dropout(), seed(), /* state_allocator=*/nullptr); + num_layers, h_num_units, input_size, /*c_size=*/c_num_units, + /*batch_size=*/0, input_mode, rnn_direction_mode(), rnn_mode(), + ToDataType::value, algo_config, dropout(), seed(), + /* state_allocator=*/nullptr); if (!rnn_desc_s.ok()) { return FromExecutorStatus(rnn_desc_s); } @@ -983,9 +1047,9 @@ class CudnnRNNKernelCommon : public OpKernel { se::dnn::DataType data_type = ToDataType::value; auto rnn_desc_s = executor->createRnnDescriptor( model_shapes.num_layers, model_shapes.num_units, - model_shapes.input_size, model_shapes.batch_size, input_mode, - rnn_direction_mode(), rnn_mode(), data_type, algo_config, dropout(), - seed(), dropout_state_allocator); + model_shapes.input_size, model_shapes.c_num_units, + model_shapes.batch_size, input_mode, rnn_direction_mode(), rnn_mode(), + data_type, algo_config, dropout(), seed(), dropout_state_allocator); TF_RETURN_IF_ERROR(rnn_desc_s.status()); *rnn_desc = rnn_desc_s.ConsumeValueOrDie(); @@ -1035,11 +1099,18 @@ template class CudnnRNNParamsSizeOp : public CudnnRNNKernelCommon { public: explicit CudnnRNNParamsSizeOp(OpKernelConstruction* context) - : CudnnRNNKernelCommon(context) {} + : CudnnRNNKernelCommon(context) { + if (context->HasAttr("num_proj")) { + OP_REQUIRES_OK(context, context->GetAttr("num_proj", &num_proj_)); + } else { + num_proj_ = 0; + } + } void Compute(OpKernelContext* context) override { std::unique_ptr rnn_desc; - OP_REQUIRES_OK(context, 
ExtractCudnnRNNParamsInfo(context, &rnn_desc)); + OP_REQUIRES_OK(context, + ExtractCudnnRNNParamsInfo(context, num_proj_, &rnn_desc)); int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes(); CHECK(params_size_in_bytes % sizeof(T) == 0) << "params_size_in_bytes must be multiple of element size"; @@ -1049,6 +1120,9 @@ class CudnnRNNParamsSizeOp : public CudnnRNNKernelCommon { OP_REQUIRES_OK(context, context->allocate_output(0, {1}, &output_t)); *output_t->template flat().data() = params_size; } + + private: + int num_proj_; }; #define REGISTER_GPU(T) \ @@ -1074,7 +1148,33 @@ class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { public: explicit CudnnRNNParamsToCanonical(OpKernelConstruction* context) : CudnnRNNKernelCommon(context) { - OP_REQUIRES_OK(context, context->GetAttr("num_params", &num_params_)); + if (context->HasAttr("num_params")) { + OP_REQUIRES_OK(context, context->GetAttr("num_params", &num_params_)); + } else { + num_params_ = 0; + } + if (context->HasAttr("num_params_weights")) { + OP_REQUIRES_OK(context, context->GetAttr("num_params_weights", + &num_params_weights_)); + } else { + num_params_weights_ = 0; + } + if (context->HasAttr("num_params_biases")) { + OP_REQUIRES_OK(context, + context->GetAttr("num_params_biases", + &num_params_biases_)); + } else { + num_params_biases_ = 0; + } + if (context->HasAttr("num_proj")) { + OP_REQUIRES_OK(context, context->GetAttr("num_proj", &num_proj_)); + } else { + num_proj_ = 0; + } + if (num_proj_ == 0) { + num_params_weights_ = num_params_; + num_params_biases_ = num_params_; + } } void Compute(OpKernelContext* context) override { @@ -1083,7 +1183,8 @@ class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { Stream* stream = context->op_device_context()->stream(); std::unique_ptr rnn_desc; - OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo(context, &rnn_desc)); + OP_REQUIRES_OK(context, + ExtractCudnnRNNParamsInfo(context, num_proj_, &rnn_desc)); int64 params_size_in_bytes = 
rnn_desc->ParamsSizeInBytes(); CHECK(params_size_in_bytes % sizeof(T) == 0) << "params_size_in_bytes must be multiple of element size"; @@ -1109,25 +1210,38 @@ class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { if (rnn_direction_mode() == RnnDirectionMode::kRnnBidirectional) { num_dirs = 2; } - const int num_params_per_layer = num_params_ / num_layers / num_dirs; + const int num_params_weights_per_layer = + num_params_weights_ / num_layers / num_dirs; // Number of params applied on inputs. The rest are applied on recurrent // hidden states. - const int num_params_input_state = num_params_per_layer / 2; - CHECK(num_params_ % (num_layers * num_dirs) == 0) - << "Number of params is not a multiple of num_layers * num_dirs."; - CHECK(num_params_per_layer % 2 == 0) - << "Number of params per layer is not a even number."; + const int num_params_input_state = num_params_weights_per_layer / 2; + CHECK(num_params_weights_ % (num_layers * num_dirs) == 0) + << "Number of params (weights) is not a multiple of num_layers * " + "num_dirs."; + CHECK(num_params_biases_ % (num_layers * num_dirs) == 0) + << "Number of params (bias) is not a multiple of num_layers * " + "num_dirs."; + if (num_proj_ == 0) { + CHECK(num_params_weights_per_layer % 2 == 0) + << "Number of params per layer is not a even number w/o projection."; + } else { + CHECK(num_params_weights_per_layer % 2 != 0) + << "Number of params per layer is not a odd number w/ projection."; + } - CHECK(num_params_ == rnn_desc->ParamsWeightRegions().size()) - << "Number of params mismatch. Expected " << num_params_ << ", got " - << rnn_desc->ParamsWeightRegions().size(); + CHECK(num_params_weights_ == rnn_desc->ParamsWeightRegions().size()) + << "C Number of params mismatch. Expected " << num_params_weights_ + << ", got " << rnn_desc->ParamsWeightRegions().size(); + int h_num_units = (num_proj_ == 0 ? num_units : num_proj_); + int c_num_units = (num_proj_ == 0 ? 
0 : num_units); for (int i = 0; i < rnn_desc->ParamsWeightRegions().size(); i++) { int64 size_in_bytes = rnn_desc->ParamsWeightRegions()[i].size; int64 size = size_in_bytes / sizeof(T); - const int layer_idx = i / num_params_per_layer; - const int index_within_layer = i % num_params_per_layer; - int width = 0, height = num_units; - // In CuDNN layout, each layer has num_params_per_layer params, with the + const int layer_idx = i / num_params_weights_per_layer; + const int index_within_layer = i % num_params_weights_per_layer; + int width = 0, height = (num_proj_ == 0 ? h_num_units : c_num_units); + // In CuDNN layout, each layer has num_params_weights_per_layer params, + // with the // first half a.k.a num_params_input_state params applied on the inputs, // and the second half on the recurrent hidden states. bool apply_on_input_state = index_within_layer < num_params_input_state; @@ -1135,7 +1249,7 @@ class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { if (layer_idx == 0 && apply_on_input_state) { width = input_size; } else { - width = num_units; + width = h_num_units; } } else { if (apply_on_input_state) { @@ -1145,15 +1259,19 @@ class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { } else { // Following layers, cell inputs are concatenated outputs of // its prior layer. - width = 2 * num_units; + width = 2 * h_num_units; } } else { - width = num_units; + width = h_num_units; } } CHECK(size == width * height) << "Params size mismatch. 
Expected " << width * height << ", got " << size; Tensor* output = nullptr; + int id_in_layer = i % num_params_weights_per_layer; + if (num_proj_ != 0 && id_in_layer == num_params_weights_per_layer-1) { + std::swap(height, width); + } OP_REQUIRES_OK(context, context->allocate_output( i, TensorShape({height, width}), &output)); DeviceMemoryBase data_src_ptr = SliceDeviceMemory( @@ -1162,10 +1280,11 @@ class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { stream->ThenMemcpy(&data_dst_ptr, data_src_ptr, size_in_bytes); } - OP_REQUIRES(context, num_params_ == rnn_desc->ParamsBiasRegions().size(), - errors::InvalidArgument("Number of params mismatch. Expected ", - num_params_, ", got ", - rnn_desc->ParamsBiasRegions().size())); + OP_REQUIRES( + context, num_params_biases_ == rnn_desc->ParamsBiasRegions().size(), + errors::InvalidArgument("A Number of params mismatch. Expected ", + num_params_biases_, ", got ", + rnn_desc->ParamsBiasRegions().size())); for (int i = 0; i < rnn_desc->ParamsBiasRegions().size(); i++) { int64 size_in_bytes = rnn_desc->ParamsBiasRegions()[i].size; int64 size = size_in_bytes / sizeof(T); @@ -1175,7 +1294,7 @@ class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { Tensor* output = nullptr; OP_REQUIRES_OK(context, - context->allocate_output(num_params_ + i, + context->allocate_output(num_params_weights_ + i, TensorShape({size}), &output)); DeviceMemoryBase data_src_ptr = SliceDeviceMemory( input_ptr, rnn_desc->ParamsBiasRegions()[i].offset, size_in_bytes); @@ -1186,6 +1305,9 @@ class CudnnRNNParamsToCanonical : public CudnnRNNKernelCommon { private: int num_params_; + int num_params_weights_; + int num_params_biases_; + int num_proj_; }; #define REGISTER_GPU(T) \ @@ -1201,17 +1323,37 @@ TF_CALL_float(REGISTER_GPU); TF_CALL_double(REGISTER_GPU); #undef REGISTER_GPU +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER(Name("CudnnRNNParamsToCanonicalV2") \ + .Device(DEVICE_GPU) \ + .HostMemory("num_layers") \ + 
.HostMemory("num_units") \ + .HostMemory("input_size") \ + .TypeConstraint("T"), \ + CudnnRNNParamsToCanonical); +TF_CALL_half(REGISTER_GPU); +TF_CALL_float(REGISTER_GPU); +TF_CALL_double(REGISTER_GPU); +#undef REGISTER_GPU + // Convert weight and bias params from the canonical form to a // platform-specific layout. template class CudnnRNNCanonicalToParams : public CudnnRNNKernelCommon { public: explicit CudnnRNNCanonicalToParams(OpKernelConstruction* context) - : CudnnRNNKernelCommon(context) {} + : CudnnRNNKernelCommon(context) { + if (context->HasAttr("num_proj")) { + OP_REQUIRES_OK(context, context->GetAttr("num_proj", &num_proj_)); + } else { + num_proj_ = 0; + } + } void Compute(OpKernelContext* context) override { std::unique_ptr rnn_desc; - OP_REQUIRES_OK(context, ExtractCudnnRNNParamsInfo(context, &rnn_desc)); + OP_REQUIRES_OK(context, + ExtractCudnnRNNParamsInfo(context, num_proj_, &rnn_desc)); int64 params_size_in_bytes = rnn_desc->ParamsSizeInBytes(); CHECK(params_size_in_bytes % sizeof(T) == 0) << "params_size_in_bytes must be multiple of element size"; @@ -1232,6 +1374,9 @@ class CudnnRNNCanonicalToParams : public CudnnRNNKernelCommon { RestoreParams(biases, rnn_desc->ParamsBiasRegions(), &output_ptr, stream); } + + private: + int num_proj_; }; #define REGISTER_GPU(T) \ @@ -1247,6 +1392,19 @@ TF_CALL_float(REGISTER_GPU); TF_CALL_double(REGISTER_GPU); #undef REGISTER_GPU +#define REGISTER_GPU(T) \ + REGISTER_KERNEL_BUILDER(Name("CudnnRNNCanonicalToParamsV2") \ + .Device(DEVICE_GPU) \ + .HostMemory("num_layers") \ + .HostMemory("num_units") \ + .HostMemory("input_size") \ + .TypeConstraint("T"), \ + CudnnRNNCanonicalToParams); +TF_CALL_half(REGISTER_GPU); +TF_CALL_float(REGISTER_GPU); +TF_CALL_double(REGISTER_GPU); +#undef REGISTER_GPU + // Run the forward operation of the RNN model. 
template class CudnnRNNForwardOp : public CudnnRNNKernelCommon { @@ -1264,14 +1422,14 @@ class CudnnRNNForwardOp : public CudnnRNNKernelCommon { void Compute(OpKernelContext* context) override { AlgorithmConfig algo_config; ComputeAndReturnAlgorithm(context, &algo_config, /*var_seq_lengths=*/false, - /*time_major=*/true); + /*time_major=*/true, /*num_proj=*/0); } protected: virtual void ComputeAndReturnAlgorithm(OpKernelContext* context, AlgorithmConfig* output_algo_config, bool var_seq_lengths, - bool time_major) { + bool time_major, int num_proj) { CHECK_NE(output_algo_config, nullptr); const Tensor* input = nullptr; @@ -1284,11 +1442,13 @@ class CudnnRNNForwardOp : public CudnnRNNKernelCommon { OP_REQUIRES_OK(context, ExtractForwardInput(context, model_types(), time_major, &input, &input_h, &input_c, ¶ms, - &sequence_lengths, &model_shapes)); + &sequence_lengths, num_proj, + &model_shapes)); } else { OP_REQUIRES_OK(context, ExtractForwardInput( context, model_types(), time_major, &input, - &input_h, &input_c, ¶ms, &model_shapes)); + &input_h, &input_c, ¶ms, num_proj, + &model_shapes)); } RnnInputMode input_mode; OP_REQUIRES_OK(context, @@ -1362,13 +1522,14 @@ class CudnnRNNForwardOp : public CudnnRNNKernelCommon { Tensor** output_c) { const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape; const TensorShape& output_shape = model_shapes.output_shape; + const TensorShape& c_state_shape = model_shapes.c_state_shape; TF_RETURN_IF_ERROR(context->allocate_output(0, output_shape, output)); TF_RETURN_IF_ERROR( context->allocate_output(1, hidden_state_shape, output_h)); if (HasInputC()) { TF_RETURN_IF_ERROR( - context->allocate_output(2, hidden_state_shape, output_c)); + context->allocate_output(2, c_state_shape, output_c)); } else { // Only LSTM uses input_c and output_c. So for all other models, we only // need to create dummy outputs. 
@@ -1414,7 +1575,7 @@ class CudnnRNNForwardOpV2 AlgorithmConfig best_algo_config; CudnnRNNForwardOp::ComputeAndReturnAlgorithm( context, &best_algo_config, /*var_seq_lengths=*/false, - /*time_major=*/true); + /*time_major=*/true, /*num_proj=*/0); if (!context->status().ok()) { return; } @@ -1613,13 +1774,18 @@ class CudnnRNNForwardOpV3 explicit CudnnRNNForwardOpV3(OpKernelConstruction* context) : CudnnRNNForwardOp(context) { OP_REQUIRES_OK(context, context->GetAttr("time_major", &time_major_)); + if (context->HasAttr("num_proj")) { + OP_REQUIRES_OK(context, context->GetAttr("num_proj", &num_proj_)); + } else { + num_proj_ = 0; + } } void Compute(OpKernelContext* context) override { AlgorithmConfig best_algo_config; CudnnRNNForwardOp::ComputeAndReturnAlgorithm( context, &best_algo_config, /*var_seq_lengths=*/true, - /*time_major=*/time_major()); + /*time_major=*/time_major(), num_proj_); if (!context->status().ok()) { return; } @@ -1631,6 +1797,9 @@ class CudnnRNNForwardOpV3 OP_REQUIRES_OK(context, context->allocate_output(4, {}, &output_host_reserved)); } + + private: + int num_proj_; }; #define REGISTER_GPU(T) \ @@ -1654,12 +1823,12 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { : CudnnRNNKernelCommon(context) {} void Compute(OpKernelContext* context) override { - ComputeImpl(context, false, true); + ComputeImpl(context, false, true, 0); } protected: virtual void ComputeImpl(OpKernelContext* context, bool var_seq_lengths, - bool time_major) { + bool time_major, int num_proj) { const Tensor* input = nullptr; const Tensor* input_h = nullptr; const Tensor* input_c = nullptr; @@ -1670,11 +1839,13 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { OP_REQUIRES_OK(context, ExtractForwardInput(context, model_types(), time_major, &input, &input_h, &input_c, ¶ms, - &sequence_lengths, &model_shapes)); + &sequence_lengths, num_proj, + &model_shapes)); } else { OP_REQUIRES_OK(context, ExtractForwardInput( context, model_types(), time_major, &input, - 
&input_h, &input_c, ¶ms, &model_shapes)); + &input_h, &input_c, ¶ms, num_proj, + &model_shapes)); } RnnInputMode input_mode; OP_REQUIRES_OK(context, @@ -1757,6 +1928,7 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { TF_RETURN_IF_ERROR(context->input("reserve_space", reserve_space)); const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape; const TensorShape& output_shape = model_shapes.output_shape; + const TensorShape& c_state_shape = model_shapes.c_state_shape; if (output_shape != (*output)->shape()) { return errors::InvalidArgument( @@ -1782,16 +1954,16 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { } if (model_types.HasInputC()) { - if (hidden_state_shape != (*output_c)->shape()) { + if (c_state_shape != (*output_c)->shape()) { return errors::InvalidArgument( "Invalid output_c shape: ", (*output_c)->shape().DebugString(), " ", - hidden_state_shape.DebugString()); + c_state_shape.DebugString()); } - if (hidden_state_shape != (*output_c_backprop)->shape()) { + if (c_state_shape != (*output_c_backprop)->shape()) { return errors::InvalidArgument( "Invalid output_c_backprop shape: ", (*output_c_backprop)->shape().DebugString(), " ", - hidden_state_shape.DebugString()); + c_state_shape.DebugString()); } } return Status::OK(); @@ -1804,6 +1976,7 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { Tensor** input_c_backprop, Tensor** params_backprop) { const TensorShape& input_shape = model_shapes.input_shape; const TensorShape& hidden_state_shape = model_shapes.hidden_state_shape; + const TensorShape& c_state_shape = model_shapes.c_state_shape; TF_RETURN_IF_ERROR( context->allocate_output(0, input_shape, input_backprop)); @@ -1811,7 +1984,7 @@ class CudnnRNNBackwardOp : public CudnnRNNKernelCommon { context->allocate_output(1, hidden_state_shape, input_h_backprop)); if (HasInputC()) { TF_RETURN_IF_ERROR( - context->allocate_output(2, hidden_state_shape, input_c_backprop)); + context->allocate_output(2, c_state_shape, 
input_c_backprop)); } else { // Only LSTM uses input_c and output_c. So for all other models, we only // need to create dummy outputs. @@ -1879,11 +2052,20 @@ class CudnnRNNBackwardOpV3 explicit CudnnRNNBackwardOpV3(OpKernelConstruction* context) : CudnnRNNBackwardOp(context) { OP_REQUIRES_OK(context, context->GetAttr("time_major", &time_major_)); + if (context->HasAttr("num_proj")) { + OP_REQUIRES_OK(context, context->GetAttr("num_proj", &num_proj_)); + } else { + num_proj_ = 0; + } } void Compute(OpKernelContext* context) override { - CudnnRNNBackwardOp::ComputeImpl(context, true, time_major()); + CudnnRNNBackwardOp::ComputeImpl(context, true, time_major(), + num_proj_); } + + private: + int num_proj_; }; #define REGISTER_GPU(T) \ diff --git a/tensorflow/core/ops/cudnn_rnn_ops.cc b/tensorflow/core/ops/cudnn_rnn_ops.cc index 9b22ccdeeec..1dd7659e137 100644 --- a/tensorflow/core/ops/cudnn_rnn_ops.cc +++ b/tensorflow/core/ops/cudnn_rnn_ops.cc @@ -49,6 +49,7 @@ REGISTER_OP("CudnnRNNParamsSize") .Attr("dropout: float = 0.0") .Attr("seed: int = 0") .Attr("seed2: int = 0") + .Attr("num_proj: int = 0") .Output("params_size: S") .SetShapeFn([](InferenceContext* c) { ShapeHandle unused; @@ -166,11 +167,13 @@ REGISTER_OP("CudnnRNNV3") .Attr("dropout: float = 0.0") .Attr("seed: int = 0") .Attr("seed2: int = 0") + .Attr("num_proj: int = 0") .Attr("is_training: bool = true") .Attr("time_major: bool = true") .SetShapeFn([](InferenceContext* c) { auto input_shape = c->input(0); auto input_h_shape = c->input(1); + auto input_c_shape = c->input(2); auto max_seq_length = c->Dim(input_shape, 0); auto batch_size = c->Dim(input_shape, 1); auto num_units = c->Dim(input_h_shape, 2); @@ -185,7 +188,7 @@ REGISTER_OP("CudnnRNNV3") c->MakeShape({max_seq_length, batch_size, output_size}); auto output_h_shape = input_h_shape; auto output_c_shape TF_ATTRIBUTE_UNUSED = - (rnn_mode == "lstm") ? output_h_shape : c->MakeShape({}); + (rnn_mode == "lstm") ? 
input_c_shape : c->MakeShape({}); c->set_output(0, output_shape); c->set_output(1, output_h_shape); c->set_output(2, output_c_shape); @@ -293,6 +296,7 @@ REGISTER_OP("CudnnRNNBackpropV3") .Attr("dropout: float = 0.0") .Attr("seed: int = 0") .Attr("seed2: int = 0") + .Attr("num_proj: int = 0") .Attr("time_major: bool = true") .SetShapeFn([](InferenceContext* c) { auto input_shape = c->input(0); @@ -338,6 +342,43 @@ REGISTER_OP("CudnnRNNParamsToCanonical") return Status::OK(); }); +REGISTER_OP("CudnnRNNParamsToCanonicalV2") + .Input("num_layers: int32") + .Input("num_units: int32") + .Input("input_size: int32") + .Input("params: T") + .Output("weights: num_params_weights * T") + .Output("biases: num_params_biases * T") + .Attr("T: {float16, float32, float64}") + .Attr("num_params_weights: int") + .Attr("num_params_biases: int") + .Attr(kRNNModeAttrs) + .Attr(kRNNInputModeAttrs) + .Attr(kRNNDirectionAttrs) + .Attr("dropout: float = 0.0") + .Attr("seed: int = 0") + .Attr("seed2: int = 0") + .Attr("num_proj: int = 0") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle unused; + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 1, &unused)); + int num_params_weights; + int num_params_biases; + TF_RETURN_IF_ERROR(c->GetAttr("num_params_weights", &num_params_weights)); + TF_RETURN_IF_ERROR(c->GetAttr("num_params_biases", &num_params_biases)); + // Set shape for weight matrices + for (int i = 0; i < num_params_weights; i++) { + c->set_output(i, c->Matrix(InferenceContext::kUnknownDim, + InferenceContext::kUnknownDim)); + } + // Set shape for bias vectors + for (int i = 0; i < num_params_biases; i++) { + c->set_output(num_params_weights + i, + c->Vector(InferenceContext::kUnknownDim)); + } + return Status::OK(); + }); + REGISTER_OP("CudnnRNNCanonicalToParams") .Input("num_layers: int32") .Input("num_units: int32") @@ -358,4 +399,26 @@ REGISTER_OP("CudnnRNNCanonicalToParams") return Status::OK(); }); +REGISTER_OP("CudnnRNNCanonicalToParamsV2") + .Input("num_layers: int32") + 
.Input("num_units: int32") + .Input("input_size: int32") + .Input("weights: num_params_weights * T") + .Input("biases: num_params_biases * T") + .Output("params: T") + .Attr("T: {float16, float32, float64}") + .Attr("num_params_weights: int") + .Attr("num_params_biases: int") + .Attr(kRNNModeAttrs) + .Attr(kRNNInputModeAttrs) + .Attr(kRNNDirectionAttrs) + .Attr("dropout: float = 0.0") + .Attr("seed: int = 0") + .Attr("seed2: int = 0") + .Attr("num_proj: int = 0") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->Vector(InferenceContext::kUnknownDim)); + return Status::OK(); + }); + } // namespace tensorflow diff --git a/tensorflow/python/ops/cudnn_rnn_grad.py b/tensorflow/python/ops/cudnn_rnn_grad.py index 9ce906121f2..3c93e0b8ec9 100644 --- a/tensorflow/python/ops/cudnn_rnn_grad.py +++ b/tensorflow/python/ops/cudnn_rnn_grad.py @@ -98,6 +98,7 @@ def _cudnn_rnn_backwardv3(op, *grads): seed=op.get_attr("seed"), seed2=op.get_attr("seed2"), time_major=op.get_attr("time_major"), + num_proj=op.get_attr("num_proj"), rnn_mode=op.get_attr("rnn_mode"), input_mode=op.get_attr("input_mode"), direction=op.get_attr("direction")) + (None,) diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc index c0cc00c7208..4e9d569f669 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.cc +++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc @@ -1002,8 +1002,8 @@ class CudnnRnnParamsDescriptor { class CudnnRnnDescriptor : public dnn::RnnDescriptor { CudnnRnnDescriptor(const CudnnHandle& cudnn, gpu::RnnDescriptor rnn_desc, PersistentRnnPlan rnn_plan, int num_layers, - int hidden_size, int input_size, int batch_size, - cudnnRNNInputMode_t input_mode, + int hidden_size, int input_size, int c_size, + int batch_size, cudnnRNNInputMode_t input_mode, cudnnDirectionMode_t direction_mode, cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type, cudnnDataType_t compute_type, @@ -1015,6 +1015,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor 
{ num_layers_(num_layers), hidden_size_(hidden_size), input_size_(input_size), + c_size_(c_size), batch_size_(batch_size), rnn_algo_(ToCudnnRNNAlgo(algorithm_config.algorithm())), input_mode_(input_mode), @@ -1031,7 +1032,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { static port::StatusOr Create( const CudnnHandle& cudnn, int num_layers, int hidden_size, int input_size, - int batch_size, cudnnRNNInputMode_t input_mode, + int c_size, int batch_size, cudnnRNNInputMode_t input_mode, cudnnDirectionMode_t direction_mode, cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type, cudnnDataType_t compute_type, const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed, @@ -1044,12 +1045,29 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm()); // TODO: allow the user to choose an algorithm. - RETURN_IF_CUDNN_ERROR(cudnnSetRNNDescriptor_v6( - cudnn.handle(), /*rnnDesc=*/rnn_desc.get(), /*hiddenSize=*/hidden_size, - /*numLayers=*/num_layers, /*dropoutDesc=*/dropout_desc.handle(), - /*inputMode=*/input_mode, /*direction=*/direction_mode, - /*mode=*/rnn_mode, /*algo=*/rnn_algo, - /*dataType=*/compute_type)); + if (c_size != 0 && hidden_size < c_size) { + RETURN_IF_CUDNN_ERROR(cudnnSetRNNDescriptor_v6( + cudnn.handle(), /*rnnDesc=*/rnn_desc.get(), /*hiddenSize=*/c_size, + /*numLayers=*/num_layers, /*dropoutDesc=*/dropout_desc.handle(), + /*inputMode=*/input_mode, /*direction=*/direction_mode, + /*mode=*/rnn_mode, /*algo=*/rnn_algo, /*dataType=*/compute_type)); +#if CUDNN_VERSION >= 7101 + RETURN_IF_CUDNN_ERROR(cudnnSetRNNProjectionLayers( + cudnn.handle(), /*rnnDesc=*/rnn_desc.get(), + /*recProjSize=*/hidden_size, /*outProjSize=*/0)); +#else + return port::Status(port::error::INVALID_ARGUMENT, + "No supported cudnnSetRNNProjectionLayers when " + "CUDNN_VERSION < 7.1.1"); +#endif + } else { + RETURN_IF_CUDNN_ERROR(cudnnSetRNNDescriptor_v6( + cudnn.handle(), 
/*rnnDesc=*/rnn_desc.get(), + /*hiddenSize=*/hidden_size, /*numLayers=*/num_layers, + /*dropoutDesc=*/dropout_desc.handle(), /*inputMode=*/input_mode, + /*direction=*/direction_mode, /*mode=*/rnn_mode, /*algo=*/rnn_algo, + /*dataType=*/compute_type)); + } // TODO: For now, we only use cudnnRNN**Ex API to process padded inputs. // But in the future if these APIs are used to process full length arrays, @@ -1098,9 +1116,9 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { #endif return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan), - num_layers, hidden_size, input_size, batch_size, - input_mode, direction_mode, rnn_mode, data_type, - compute_type, algorithm_config, + num_layers, hidden_size, input_size, c_size, + batch_size, input_mode, direction_mode, rnn_mode, + data_type, compute_type, algorithm_config, std::move(dropout_desc), std::move(params_desc)); } @@ -1108,6 +1126,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { int num_layers() const { return num_layers_; } int hidden_size() const { return hidden_size_; } int input_size() const { return input_size_; } + int c_size() const { return c_size_; } int batch_size() const { return batch_size_; } cudnnRNNInputMode_t input_mode() const { return input_mode_; } cudnnDirectionMode_t direction_mode() const { return direction_mode_; } @@ -1136,6 +1155,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor { int num_layers_; int hidden_size_; int input_size_; + int c_size_; // batch_size_ is set to -1 when not using CUDNN_RNN_ALGO_PERSIST_DYNAMIC // algorithm. int batch_size_; @@ -1240,6 +1260,62 @@ port::StatusOr CudnnRnnParamsDescriptor::Create( (type == 0 ? 
weights : biases).push_back(region); } } + int hidden_size_v; + int num_layers_v; + cudnnDropoutDescriptor_t dropout_desc; + cudnnRNNInputMode_t input_mode; + cudnnDirectionMode_t direction; + cudnnRNNMode_t mode; + cudnnRNNAlgo_t algo; + cudnnDataType_t data_dype; + RETURN_IF_CUDNN_ERROR(cudnnGetRNNDescriptor( + /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc, + /*hiddenSize=*/&hidden_size_v, + /*numLayers=*/&num_layers_v, + /*dropoutDesc=*/&dropout_desc, + /*inputMode=*/&input_mode, + /*direction=*/&direction, + /*mode=*/&mode, + /*algo=*/&algo, + /*dataType=*/&data_type)); + int rec_proj_size_v; + int out_proj_size_v; +#if CUDNN_VERSION >= 7101 + RETURN_IF_CUDNN_ERROR(cudnnGetRNNProjectionLayers( + /*handle=*/cudnn.handle(), + /*rnnDesc=*/rnn_desc, + /*recProjSize*/ &rec_proj_size_v, + /*outProjSize*/ &out_proj_size_v)); +#else + return port::Status(port::error::INVALID_ARGUMENT, + "No supported cudnnGetRNNProjectionLayers when " + "CUDNN_VERSION < 7.1.1"); +#endif + if (rec_proj_size_v != hidden_size_v) { + void* offset = nullptr; + int region_id = 8; + RETURN_IF_CUDNN_ERROR(cudnnGetRNNLinLayerMatrixParams( + /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc, + /*layer=*/layer, /*xDesc=*/input_desc.get(), + /*wDesc=*/filter_desc.get(), + /*w=*/nullptr, /*linLayerID=*/region_id, + /*linLayerMatDesc=*/region_desc_handle.get(), + /*linLayerMat or linLayerBias=*/&offset)); + int dims[] = {1, 1, 1}; + cudnnDataType_t data_type; + cudnnTensorFormat_t tensor_format; + int n_dims; + RETURN_IF_CUDNN_ERROR(cudnnGetFilterNdDescriptor( + /*filterDesc=*/region_desc_handle.get(), + /*nbDimsRequested=*/sizeof(dims) / sizeof(dims[0]), + /*dataType=*/&data_type, /*format=*/&tensor_format, + /*nbDims=*/&n_dims, /*filterDimA=*/dims)); + int64 size = + dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type); + dnn::RnnDescriptor::ParamsRegion region = { + reinterpret_cast(offset), size}; + weights.push_back(region); + } } return 
CudnnRnnParamsDescriptor(std::move(filter_desc), params_size_in_bytes, @@ -1404,6 +1480,7 @@ struct RnnModelDims { int max_seq_length = 0; int hidden_size = 0; int input_size = 0; + int c_size = 0; int dir_count = 0; }; @@ -1429,6 +1506,7 @@ port::StatusOr ExtractAndCheckRnnForward( model_dims.max_seq_length = input_desc.max_seq_length(); model_dims.hidden_size = rnn_desc.hidden_size(); model_dims.input_size = input_desc.data_size(); + model_dims.c_size = rnn_desc.c_size(); model_dims.dir_count = (rnn_desc.direction_mode() == CUDNN_BIDIRECTIONAL) ? 2 : 1; @@ -1441,7 +1519,7 @@ port::StatusOr ExtractAndCheckRnnForward( } if (!(input_h_desc.num_layers() == input_c_desc.num_layers() && input_h_desc.batch_size() == input_c_desc.batch_size() && - input_h_desc.data_size() == input_c_desc.data_size())) { + input_h_desc.data_size() <= input_c_desc.data_size())) { return port::Status(port::error::INVALID_ARGUMENT, "Invalid input_c shape"); } if (!(output_desc.max_seq_length() == model_dims.max_seq_length && @@ -1458,7 +1536,7 @@ port::StatusOr ExtractAndCheckRnnForward( } if (!(input_h_desc.num_layers() == output_c_desc.num_layers() && input_h_desc.batch_size() == output_c_desc.batch_size() && - input_h_desc.data_size() == output_c_desc.data_size())) { + input_h_desc.data_size() <= output_c_desc.data_size())) { return port::Status(port::error::INVALID_ARGUMENT, "Invalid output_c shape"); } @@ -1814,7 +1892,7 @@ port::Status CudnnSupport::DoRnnBackwardImpl( port::StatusOr> CudnnSupport::createRnnDescriptor( - int num_layers, int hidden_size, int input_size, int batch_size, + int num_layers, int hidden_size, int input_size, int c_size, int batch_size, dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, dnn::RnnMode rnn_mode, dnn::DataType data_type, const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed, @@ -1825,7 +1903,7 @@ CudnnSupport::createRnnDescriptor( SE_ASSIGN_OR_RETURN( CudnnRnnDescriptor rnn_desc, CudnnRnnDescriptor::Create( - 
cudnn, num_layers, hidden_size, input_size, batch_size, + cudnn, num_layers, hidden_size, input_size, c_size, batch_size, ToCudnnRnnInputMode(input_mode), ToCudnnRnnDirectionMode(direction_mode), ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type), GetRnnComputeType(data_type), diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h index 3a49469651c..b0da5cbd3b9 100644 --- a/tensorflow/stream_executor/cuda/cuda_dnn.h +++ b/tensorflow/stream_executor/cuda/cuda_dnn.h @@ -48,11 +48,11 @@ class CudnnSupport : public dnn::DnnSupport { port::StatusOr GetVersion() override; port::StatusOr> createRnnDescriptor( - int num_layers, int hidden_size, int input_size, int batch_size, - dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, - dnn::RnnMode rnn_mode, dnn::DataType data_type, - const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed, - ScratchAllocator* state_allocator) override; + int num_layers, int hidden_size, int input_size, int c_size, + int batch_size, dnn::RnnInputMode input_mode, + dnn::RnnDirectionMode direction_mode, dnn::RnnMode rnn_mode, + dnn::DataType data_type, const dnn::AlgorithmConfig& algorithm_config, + float dropout, uint64 seed, ScratchAllocator* state_allocator) override; port::StatusOr> createRnnSequenceTensorDescriptor(int max_seq_length, int batch_size, diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h index 34de1512ee5..295bedba233 100644 --- a/tensorflow/stream_executor/dnn.h +++ b/tensorflow/stream_executor/dnn.h @@ -2052,7 +2052,7 @@ class DnnSupport { // is no longer in use. 
virtual port::StatusOr> createRnnDescriptor(int num_layers, int hidden_size, int input_size, - int batch_size, dnn::RnnInputMode input_mode, + int c_size, int batch_size, dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, dnn::RnnMode rnn_mode, dnn::DataType data_type, const dnn::AlgorithmConfig& algorithm_config, diff --git a/tensorflow/stream_executor/stream_executor_pimpl.cc b/tensorflow/stream_executor/stream_executor_pimpl.cc index 2870c3883e2..6541a138f73 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.cc +++ b/tensorflow/stream_executor/stream_executor_pimpl.cc @@ -379,7 +379,7 @@ bool StreamExecutor::GetBlasGemmAlgorithms( port::StatusOr> StreamExecutor::createRnnDescriptor( - int num_layers, int hidden_size, int input_size, int batch_size, + int num_layers, int hidden_size, int input_size, int c_size, int batch_size, dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, dnn::RnnMode rnn_mode, dnn::DataType data_type, const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed, @@ -390,7 +390,7 @@ StreamExecutor::createRnnDescriptor( "Fail to find the dnn implementation."); } return dnn_support->createRnnDescriptor( - num_layers, hidden_size, input_size, batch_size, input_mode, + num_layers, hidden_size, input_size, c_size, batch_size, input_mode, direction_mode, rnn_mode, data_type, algorithm_config, dropout, seed, state_allocator); } diff --git a/tensorflow/stream_executor/stream_executor_pimpl.h b/tensorflow/stream_executor/stream_executor_pimpl.h index 7ded071467f..533213bd194 100644 --- a/tensorflow/stream_executor/stream_executor_pimpl.h +++ b/tensorflow/stream_executor/stream_executor_pimpl.h @@ -405,11 +405,11 @@ class StreamExecutor { // Create an RNN descriptor based on model shapes and configurations. // The caller retains the ownership of the descriptor. 
port::StatusOr> createRnnDescriptor( - int num_layers, int hidden_size, int input_size, int batch_size, - dnn::RnnInputMode input_mode, dnn::RnnDirectionMode direction_mode, - dnn::RnnMode rnn_mode, dnn::DataType data_type, - const dnn::AlgorithmConfig &algorithm_config, float dropout, uint64 seed, - ScratchAllocator *state_allocator); + int num_layers, int hidden_size, int input_size, int c_size, + int batch_size, dnn::RnnInputMode input_mode, + dnn::RnnDirectionMode direction_mode, dnn::RnnMode rnn_mode, + dnn::DataType data_type, const dnn::AlgorithmConfig &algorithm_config, + float dropout, uint64 seed, ScratchAllocator *state_allocator); // Create a RNN sequence descriptor that specifies either the input or output // sequence. The caller retains the ownership of the returned descriptor. diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt index a809e92d3a5..c4f9210aae2 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt @@ -1036,6 +1036,14 @@ tf_module { name: "cross" argspec: "args=[\'a\', \'b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "cudnn_rnn_canonical_to_params_v2" + argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'None\'], " + } + member_method { + name: "cudnn_rnn_params_to_canonical_v2" + argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params_weights\', \'num_params_biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', 
\'None\'], " + } member_method { name: "cumprod" argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt index a2fa54f0214..2fdb508eda5 100644 --- a/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v1/tensorflow.raw_ops.pbtxt @@ -766,27 +766,35 @@ tf_module { } member_method { name: "CudnnRNNBackpropV3" - argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'None\'], " + argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'True\', \'None\'], " } member_method { name: "CudnnRNNCanonicalToParams" argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], " } + member_method { + name: "CudnnRNNCanonicalToParamsV2" + argspec: "args=[\'num_layers\', \'num_units\', 
\'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'None\'], " + } member_method { name: "CudnnRNNParamsSize" - argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'T\', \'S\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], " + argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'T\', \'S\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'None\'], " } member_method { name: "CudnnRNNParamsToCanonical" argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], " } + member_method { + name: "CudnnRNNParamsToCanonicalV2" + argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params_weights\', \'num_params_biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'None\'], " + } member_method { name: "CudnnRNNV2" argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', 
\'0\', \'0\', \'True\', \'None\'], " } member_method { name: "CudnnRNNV3" - argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'True\', \'None\'], " + argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'is_training\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'True\', \'True\', \'None\'], " } member_method { name: "Cumprod" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt index 6e7cc51a285..39d8cbaea81 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt @@ -540,6 +540,14 @@ tf_module { name: "cosh" argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], " } + member_method { + name: "cudnn_rnn_canonical_to_params_v2" + argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'None\'], " + } + member_method { + name: "cudnn_rnn_params_to_canonical_v2" + argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params_weights\', \'num_params_biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', 
\'0\', \'0\', \'0\', \'None\'], " + } member_method { name: "cumsum" argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], " diff --git a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt index a2fa54f0214..2fdb508eda5 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.raw_ops.pbtxt @@ -766,27 +766,35 @@ tf_module { } member_method { name: "CudnnRNNBackpropV3" - argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'None\'], " + argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'output\', \'output_h\', \'output_c\', \'output_backprop\', \'output_h_backprop\', \'output_c_backprop\', \'reserve_space\', \'host_reserved\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'True\', \'None\'], " } member_method { name: "CudnnRNNCanonicalToParams" argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], " } + member_method { + name: "CudnnRNNCanonicalToParamsV2" + argspec: "args=[\'num_layers\', 
\'num_units\', \'input_size\', \'weights\', \'biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'None\'], " + } member_method { name: "CudnnRNNParamsSize" - argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'T\', \'S\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], " + argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'T\', \'S\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'None\'], " } member_method { name: "CudnnRNNParamsToCanonical" argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'None\'], " } + member_method { + name: "CudnnRNNParamsToCanonicalV2" + argspec: "args=[\'num_layers\', \'num_units\', \'input_size\', \'params\', \'num_params_weights\', \'num_params_biases\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'None\'], " + } member_method { name: "CudnnRNNV2" argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', 
\'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'None\'], " } member_method { name: "CudnnRNNV3" - argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'is_training\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'True\', \'True\', \'None\'], " + argspec: "args=[\'input\', \'input_h\', \'input_c\', \'params\', \'sequence_lengths\', \'rnn_mode\', \'input_mode\', \'direction\', \'dropout\', \'seed\', \'seed2\', \'num_proj\', \'is_training\', \'time_major\', \'name\'], varargs=None, keywords=None, defaults=[\'lstm\', \'linear_input\', \'unidirectional\', \'0\', \'0\', \'0\', \'0\', \'True\', \'True\', \'None\'], " } member_method { name: "Cumprod" diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt index a4d728c6976..1db8d32de3c 100644 --- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt +++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt @@ -10,7 +10,7 @@ tf_module { } member_method { name: "audio" - argspec: "args=[\'name\', \'data\', \'sample_rate\', \'step\', \'max_outputs\', \'encoding\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'3\', \'None\', \'None\'], " + argspec: "args=[\'name\', \'data\', \'sample_rate\', \'step\', \'max_outputs\', \'encoding\', \'description\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], " } member_method { name: "create_file_writer" @@ -26,11 +26,11 @@ tf_module { } member_method { name: "histogram" - argspec: "args=[\'name\', \'data\', \'step\', \'buckets\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], " + argspec: "args=[\'name\', \'data\', \'step\', \'buckets\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " } 
member_method { name: "image" - argspec: "args=[\'name\', \'data\', \'step\', \'max_outputs\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'3\', \'None\'], " + argspec: "args=[\'name\', \'data\', \'step\', \'max_outputs\', \'description\'], varargs=None, keywords=None, defaults=[\'3\', \'None\'], " } member_method { name: "import_event" @@ -42,7 +42,7 @@ tf_module { } member_method { name: "scalar" - argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "summary_scope" @@ -50,7 +50,7 @@ tf_module { } member_method { name: "text" - argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], " + argspec: "args=[\'name\', \'data\', \'step\', \'description\'], varargs=None, keywords=None, defaults=[\'None\'], " } member_method { name: "trace_export"