# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Tests for Grappler AutoMixedPrecision."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os

from absl.testing import parameterized
import numpy as np

from tensorflow.core.framework import types_pb2
from tensorflow.core.protobuf import config_pb2
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python import tf2
from tensorflow.python.client import session
from tensorflow.python.data.ops import dataset_ops
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes
from tensorflow.python.framework import function
from tensorflow.python.framework import ops
from tensorflow.python.framework import random_seed
from tensorflow.python.framework import test_util
from tensorflow.python.layers import layers
from tensorflow.python.ops import array_ops
from tensorflow.python.ops import control_flow_ops
from tensorflow.python.ops import init_ops
from tensorflow.python.ops import math_ops
from tensorflow.python.ops import nn
from tensorflow.python.ops import nn_impl
from tensorflow.python.ops import random_ops
from tensorflow.python.ops import tensor_array_ops
from tensorflow.python.ops import variables
from tensorflow.python.ops.losses import losses
from tensorflow.python.platform import test
from tensorflow.python.training import adam
from tensorflow.python.training import gradient_descent

def _input(shape):
  """Generates an input of a given shape."""
  return variables.Variable(random_ops.truncated_normal(shape, seed=0))


def _weight(shape):
  """Generates a weight of a given shape."""
  # Note that the lambda is needed to allow construction inside loops.
  return variables.Variable(
      lambda: init_ops.glorot_uniform_initializer(seed=0)(shape))


def _bias(shape):
  """Generates a bias of a given shape."""
  return constant_op.constant(0.1, shape=shape)


def _conv2d(x, w):
  """Returns a 2d convolution layer with full stride."""
  return nn.conv2d(x, w, strides=[1, 1, 1, 1], padding='SAME')


def _conv3d(x, w):
  """Returns a 3d convolution layer with full stride."""
  return nn.conv3d(x, w, strides=[1, 1, 1, 1, 1], padding='SAME')


def _max_pool_2x2(x):
  """Downsamples a feature map by 2X."""
  return nn.max_pool(
      x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')


def _fused_batchnorm(x, scale, offset):
  """Batchnorm."""
  return nn_impl.fused_batch_norm(
      x, scale=scale, offset=offset, is_training=True)


def _conv_bn(x):
  """Conv followed by batchnorm."""
  i = array_ops.reshape(x, [-1, 8, 8, 1])
  f = _weight([3, 3, 1, 6])
  x = _conv2d(i, f)
  s = _weight([6])
  o = _weight([6])
  y, _, _ = _fused_batchnorm(x, s, o)
  y = array_ops.identity(y)
  return y


def _conv3d_bn(x):
  """Conv3D followed by batchnorm."""
  i = array_ops.reshape(x, [-1, 8, 8, 8, 1])
  f = _weight([3, 3, 3, 1, 6])
  x = _conv3d(i, f)
  s = _weight([6])
  o = _weight([6])
  x = array_ops.reshape(x, [-1, 8, 8, 6])
  y, _, _ = _fused_batchnorm(x, s, o)
  y = array_ops.identity(y)
  return y


def _matmul_act(x):
  """Matmul followed by activation."""
  i = array_ops.reshape(x, [8, 8])
  f = _weight([8, 8])
  x = math_ops.matmul(i, f)
  y = nn.relu(x)
  return y


def _conv_pool(x):
  """(Conv -> bias -> relu -> max_pool) x2."""
  x_image = array_ops.reshape(x, [-1, 8, 8, 1])
  w_conv1 = _weight([3, 3, 1, 6])
  b_conv1 = _bias([6])
  h_conv1 = nn.relu(nn.bias_add(_conv2d(x_image, w_conv1), b_conv1))
  h_pool1 = _max_pool_2x2(h_conv1)
  w_conv2 = _weight([3, 3, 6, 4])
  b_conv2 = _bias([4])
  h_conv2 = nn.relu(nn.bias_add(_conv2d(h_pool1, w_conv2), b_conv2))
  h_pool2 = _max_pool_2x2(h_conv2)
  return h_pool2


def _simple_loop(x, functor):
  """Simple loop whose body is provided by the functor."""
  init = (constant_op.constant(0), x)
  c = lambda i, j: i < 4
  b = lambda i, j: (i + 1, functor(j))
  ij = control_flow_ops.while_loop(c, b, init)
  return ij
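
# A note on the loop above: the counter starts at 0 and the condition is
# i < 4, so `_simple_loop(x, f)` returns (4, f(f(f(f(x))))) -- the functor is
# applied to x four times inside a single tf.while_loop.
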

def _loop_vars_intertwined(x0, y0, functor_x, functor_y):
  """Loop whose loop variables are intertwined."""
  c = lambda i, j, x, y: j < 4
  b = lambda i, j, x, y: (j + 1, i + 1, functor_y(y), functor_x(x))
  init = (constant_op.constant(0), constant_op.constant(0), x0, y0)
  ijzw = control_flow_ops.while_loop(c, b, init)
  return ijzw
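
# In the body above, each output position is fed from a different input
# position (the two counters swap, as do x and y), so a dtype decision made
# for one loop variable has to propagate to its swapped partner on the next
# iteration. This is what makes the loop variables "intertwined".
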

def _lstm_cell(prev_c, prev_h, x):
  """Create an LSTM cell."""
  # i: input gate
  # f: forget gate
  # o: output gate
  # c: cell state
  # x: input
  # h: embedding
  bias = _bias([4])
  w = _weight([8, 16])
  ifoc = math_ops.matmul(array_ops.concat([x, prev_h], axis=1), w)
  i, f, o, c = array_ops.split(ifoc, 4, axis=1)
  i = math_ops.sigmoid(nn.bias_add(i, bias))
  f = math_ops.sigmoid(nn.bias_add(f, bias))
  o = math_ops.sigmoid(nn.bias_add(o, bias))
  c = math_ops.tanh(nn.bias_add(c, bias))
  next_c = f * prev_c + i * c
  next_h = o * math_ops.tanh(next_c)
  return next_c, next_h
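
# The cell above implements the standard LSTM update, with one fused matmul
# and a single shared bias producing all four gates:
#   i, f, o = sigmoid(W . [x, h_prev] + b)   (input/forget/output gates)
#   c_tilde = tanh(W . [x, h_prev] + b)      (candidate cell state)
#   c_next  = f * c_prev + i * c_tilde
#   h_next  = o * tanh(c_next)
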

def _recurrent_lstm(c, h):
  """Dynamic single-layer LSTM with TensorArray."""

  def cond(i, c, h, ta_x):
    del c, h, ta_x
    return i < 4

  def body(i, c, h, ta_x):
    x = ta_x.read(i)
    next_c, next_h = _lstm_cell(c, h, x)
    return (i + 1, next_c, next_h, ta_x)

  ta_x = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=4)
  for i in range(0, 4):
    ta_x = ta_x.write(
        i, constant_op.constant(0.1, shape=[8, 4], dtype=dtypes.float32))
  init = (constant_op.constant(0), c, h, ta_x)
  r = control_flow_ops.while_loop(cond, body, init)
  return r


def _make_node_with_color(color, input_tensor, name=None):
  """Returns a node representative of the specified list type."""
  color = color.lower()
  if color == 'w':  # White node
    weights = _weight(input_tensor.get_shape().as_list())
    return math_ops.matmul(input_tensor, weights, name=name)
  if color == 'g':  # Gray node
    return math_ops.add(input_tensor, 0.1, name=name)
  if color == 'c':  # Clear node
    return nn.relu(input_tensor, name=name)
  if color == 'b':  # Black node
    return math_ops.pow(math_ops.pow(input_tensor, 2.), 0.5, name=name)
  raise ValueError('Invalid node color: ' + str(color))
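
# The four colors correspond to the op lists consulted by the auto mixed
# precision pass: white ops (e.g. MatMul) are always converted to the lower
# precision, gray ops (e.g. Add) are converted only when their neighbors are,
# clear ops (e.g. Relu) are type-agnostic and follow the surrounding ops, and
# black ops (the Pow pair here) are numerically dangerous and kept in fp32.
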

def _build_simple_loop_graph(inp_colors, body_colors, out_colors):
  """Builds a test graph with a simple loop."""
  a = _input([8, 8])
  for i, color in enumerate(inp_colors):
    a = _make_node_with_color(color, a, 'input_%i' % i)

  def body(x):
    for i, color in enumerate(body_colors):
      x = _make_node_with_color(color, x, 'body_%i' % i)
    return x

  _, a = _simple_loop(a, body)
  for i, color in enumerate(out_colors):
    a = _make_node_with_color(color, a, 'output_%i' % i)
  a = array_ops.identity(a)
  return a
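
# For example (illustrative), _build_simple_loop_graph('W', 'gbg', 'W')
# produces a MatMul named 'input_0', a loop whose body is Add -> Pow pair ->
# Add (named 'body_0'..'body_2', which end up under the 'while/' name scope),
# and a MatMul named 'output_0' after the loop.
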

def _get_config(auto_mixed_precision_mode):
  """Returns a ConfigProto with auto mixed precision enabled if appropriate."""
  rewrite_config = rewriter_config_pb2.RewriterConfig(
      # do not remove duplicated nodes
      arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
      # do not turn Conv2D and other nodes into _FusedConv2D
      remapping=rewriter_config_pb2.RewriterConfig.OFF,
  )
  if auto_mixed_precision_mode == 'cuda':
    rewrite_config.auto_mixed_precision = rewriter_config_pb2.RewriterConfig.ON
  elif auto_mixed_precision_mode == 'mkl':
    rewrite_config.auto_mixed_precision_mkl = (
        rewriter_config_pb2.RewriterConfig.ON)
  else:
    assert auto_mixed_precision_mode is None
  rewrite_config.min_graph_nodes = -1
  graph_options = config_pb2.GraphOptions(
      rewrite_options=rewrite_config, build_cost_model=1)
  config = config_pb2.ConfigProto(graph_options=graph_options)
  config.graph_options.optimizer_options.opt_level = -1
  return config
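
# Notes on the config above: arithmetic optimization and remapping are turned
# off so the optimized graph keeps the node names the tests assert on, and
# min_graph_nodes = -1 forces grappler to run even on these tiny test graphs
# (graphs below a minimum node count are normally left untouched).
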

def _is_cast_to_fp16(node_name):
  return node_name.endswith('-CastToFp16-AutoMixedPrecision')


def _is_cast_to_bf16(node_name):
  return node_name.endswith('-CastToBf16-AutoMixedPrecision')


def _is_cast_to_fp32(node_name):
  return node_name.endswith('-CastToFp32-AutoMixedPrecision')


def _count_casts(mode, nodes):
  """Counts the number of casts to f16/bf16 and to fp32."""
  num_to_fp16 = 0
  num_to_bf16 = 0
  num_to_fp32 = 0
  for node in nodes:
    if _is_cast_to_fp16(node.name):
      num_to_fp16 += 1
    elif _is_cast_to_bf16(node.name):
      num_to_bf16 += 1
    elif _is_cast_to_fp32(node.name):
      num_to_fp32 += 1
  if mode == 'cuda':
    assert num_to_bf16 == 0
    return num_to_fp16, num_to_fp32
  else:
    assert mode == 'mkl'
    assert num_to_fp16 == 0
    return num_to_bf16, num_to_fp32
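
# The helpers above match only the suffix that the grappler pass appends to
# the casts it inserts; a full node name looks something like
# 'Conv2D-0-CastToFp16-AutoMixedPrecision' (name shown for illustration).
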

def _build_node_map(nodes):
  node_map = {}
  for node in nodes:
    node_map[node.name] = node
  return node_map


def _example_noninlined_funcdef_shape(op):
  return [op.inputs[0].shape]


@function.Defun(
    shape_func=_example_noninlined_funcdef_shape,
    func_name='example_noninlined_funcdef_grad',
    noinline=True)
def _example_noninlined_funcdef_grad(features, grad):
  """Gradient of Swish function defined below."""
  sigmoid_features = math_ops.sigmoid(features)
  activation_grad = (
      sigmoid_features * (1.0 + features * (1.0 - sigmoid_features)))
  return grad * activation_grad


@function.Defun(
    grad_func=_example_noninlined_funcdef_grad,
    shape_func=_example_noninlined_funcdef_shape,
    func_name='example_noninlined_funcdef',
    noinline=True)
def _example_noninlined_funcdef(features):
  """Computes the Swish activation function: `x * sigmoid(x)`."""
  return features * math_ops.sigmoid(features)
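
# Sanity check for the gradient defined above: for swish(x) = x * sigmoid(x),
#   d/dx swish(x) = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))
#                 = sigmoid(x) * (1 + x * (1 - sigmoid(x))),
# which is exactly the activation_grad computed in
# _example_noninlined_funcdef_grad.
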

class AutoMixedPrecisionTest(test.TestCase, parameterized.TestCase):
  """Tests the Grappler auto mixed precision optimizer."""
  IGNORE_PERF_VAR = 'TF_AUTO_MIXED_PRECISION_GRAPH_REWRITE_IGNORE_PERFORMANCE'

  # TODO(benbarsdell): Add tests for eager mode with a tf.function.

  def setUp(self):
    super(AutoMixedPrecisionTest, self).setUp()
    # Enable the CUDA tests to be run on pre-Volta GPUs by telling the grappler
    # pass to ignore performance and always transform the graph.
    self._original_ignore_perf_value = os.getenv(self.IGNORE_PERF_VAR)
    os.environ[self.IGNORE_PERF_VAR] = '1'

  def tearDown(self):
    if self._original_ignore_perf_value is not None:
      os.environ[self.IGNORE_PERF_VAR] = self._original_ignore_perf_value
    else:
      del os.environ[self.IGNORE_PERF_VAR]
    super(AutoMixedPrecisionTest, self).tearDown()

  def _lower_precision_dtype(self, mode):
    return dtypes.float16 if mode == 'cuda' else dtypes.bfloat16

  def _assert_output_f16(self, mode, node_map, node_name, output_port=0):
    self.assertEqual(node_map[node_name].output_info[output_port].dtype,
                     self._lower_precision_dtype(mode).as_datatype_enum)

  def _run(self, mode, fetches):
    """Runs the graph and returns the evaluation of the fetches."""
    with session.Session(config=_get_config(None)) as sess:
      sess.run(variables.global_variables_initializer())
      output_val_ref = self.evaluate(fetches)

    with session.Session(config=_get_config(mode)) as sess:
      sess.run(variables.global_variables_initializer())
      metadata = config_pb2.RunMetadata()
      output_val = sess.run(fetches, run_metadata=metadata)

    return output_val_ref, output_val, metadata.cost_graph
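
  # The reference run above uses _get_config(None), i.e. the same graph with
  # the mixed precision rewrite disabled, while the second run enables it.
  # Because _get_config sets build_cost_model=1, metadata.cost_graph describes
  # the optimized graph, including the output dtype of every node, which is
  # what _assert_output_f16 inspects.
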
  def _maybe_skip(self, mode):
    if mode == 'cuda' and not test.is_gpu_available(cuda_only=True):
      self.skipTest('No GPU is available')
    if mode == 'mkl' and not test_util.IsMklEnabled():
      self.skipTest('MKL is not enabled')

  def _run_simple_loop_test(self, mode, inp, body, out):
    """Runs a test of a simple loop.

    The loop has different node colors in different sections of the graph. The
    arguments must be strings where each character represents the color of a
    node in that section of the graph: w = white, g = gray, c = clear,
    b = black. CAPITALIZED characters indicate that the node is expected to be
    changed to DT_HALF (or DT_BFLOAT16 in mkl mode) during graph optimization.

      inp -> loop [ body ] -> out.

    Args:
      mode: Either 'cuda' or 'mkl'.
      inp: A string of letters indicating the colors and expected dtypes of
        the input nodes.
      body: A string of letters indicating the colors and expected dtypes of
        the body nodes.
      out: A string of letters indicating the colors and expected dtypes of
        the output nodes.
    """
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    expected_types = []
    for section in [inp, body, out]:
      section_expected_types = []
      for color in section:
        if color.isupper():
          expected_type = self._lower_precision_dtype(mode).as_datatype_enum
        else:
          expected_type = types_pb2.DT_FLOAT
        section_expected_types.append(expected_type)
      expected_types.append(section_expected_types)

    a = _build_simple_loop_graph(inp, body, out)
    output_val_ref, output_val, cost_graph = self._run(mode, a)
    node_map = _build_node_map(cost_graph.node)

    section_names = ['input', 'while/body', 'output']
    all_types_correct = True
    for section_name, section_expected_types in zip(section_names,
                                                    expected_types):
      for i, expected_type in enumerate(section_expected_types):
        node_name = section_name + '_%i' % i
        output_port = 0
        optimized_type = node_map[node_name].output_info[output_port].dtype
        if optimized_type != expected_type:
          print('Expected node %s to have type %s but got type %s' %
                (node_name, expected_type, optimized_type))
          all_types_correct = False
    self.assertTrue(all_types_correct)
    if mode == 'mkl':
      self.assertAllClose(output_val_ref, output_val, atol=2e-2, rtol=2e-2)
    else:
      self.assertAllClose(output_val_ref, output_val, atol=2e-3, rtol=1e-3)

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_deprecated_v1
  @test_util.disable_xla('This test does not pass with XLA')
  def test_conv_bn(self, mode):
    """Test graph with convolution followed by batch norm."""
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    x = _input([2, 8, 8, 1])
    x = _conv_bn(x)
    output = _conv_bn(x)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)
    num_to_f16, num_to_fp32 = _count_casts(mode, cost_graph.node)

    self._assert_output_f16(mode, node_map, 'Conv2D')
    self._assert_output_f16(mode, node_map, 'FusedBatchNormV3')
    self._assert_output_f16(mode, node_map, 'Conv2D_1')
    self.assertEqual(num_to_f16, 3)  # Before Conv2D:0, Conv2D:1, Conv2D_1:1
    self.assertEqual(num_to_fp32, 1)  # After FusedBatchNormV3:0
    if mode == 'mkl':
      tol = 1e-2
    elif test.is_built_with_rocm():
      # The default tolerance (1e-3) results in a tiny fraction (<1%) of
      # miscompares on the ROCm platform, hence the tolerance bump.
      tol = 2e-3
    else:
      tol = 1e-3
    self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_deprecated_v1
  @test_util.disable_xla('This test does not pass with XLA')
  def test_conv3d_bn(self, mode):
    """Test graph with 3D convolution followed by batch norm."""
    self._maybe_skip(mode)
    if mode == 'cuda':
      # TODO(reedwm): enable these tests when cuDNN is upgraded to >= 7.6.2.
      self.skipTest('Test case should be skipped when cuDNN < 7.6.2')
    random_seed.set_random_seed(0)
    x = _input([2, 8, 8, 8, 1])
    x = _conv3d_bn(x)
    output = _conv3d_bn(x)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)
    num_to_fp16, num_to_fp32 = _count_casts(mode, cost_graph.node)

    self._assert_output_f16(mode, node_map, 'Conv3D')
    self._assert_output_f16(mode, node_map, 'FusedBatchNormV3')
    self._assert_output_f16(mode, node_map, 'Conv3D_1')
    self.assertEqual(num_to_fp16, 3)  # Before Conv3D:0, Conv3D:1, Conv3D_1:1
    self.assertEqual(num_to_fp32, 1)  # After FusedBatchNormV3:0
    self.assertAllClose(output_val_ref, output_val, atol=1e-2, rtol=1e-2)

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_deprecated_v1
  @test_util.disable_xla('This test does not pass with XLA')
  def test_conv3d(self, mode):
    """Test gradient ops with a 3D convolution graph."""
    self._maybe_skip(mode)
    if mode == 'cuda':
      # TODO(reedwm): enable these tests when cuDNN is upgraded to >= 7.6.2.
      self.skipTest('Test case should be skipped when cuDNN < 7.6.2')
    random_seed.set_random_seed(0)
    x = _input([2, 8, 8, 8, 1])
    f = _weight([3, 3, 3, 1, 6])
    y = _conv3d(x, f)
    y = array_ops.identity(y)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(y, [x, f])
    output = (y, g)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)
    self._assert_output_f16(mode, node_map, 'Conv3D')
    self._assert_output_f16(mode, node_map,
                            'gradients/Conv3D_grad/Conv3DBackpropInputV2')
    self._assert_output_f16(mode, node_map,
                            'gradients/Conv3D_grad/Conv3DBackpropFilterV2')

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    tol = 5e-2 if mode == 'mkl' else 1e-3
    self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)

  # TODO(reedwm): Fix and enable this test with MKL. Currently this crashes
  # with MKL.
  @parameterized.parameters(['cuda'])
  @test_util.run_deprecated_v1
  @test_util.disable_xla('This test does not pass with XLA')
  def test_conv_bn_dropout(self, mode):
    """Test dropout precision of convolution batch norm graph."""
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    x = _input([2, 8, 8, 1])
    y = _conv_bn(x)
    y = nn.dropout(y, rate=0.5)
    y = math_ops.add(y, 1, name='addition')
    y = _conv_bn(y)
    y = array_ops.identity(y)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(y, [x])
    output = (y, g)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)
    self._assert_output_f16(mode, node_map, 'Conv2D')
    self._assert_output_f16(mode, node_map, 'FusedBatchNormV3')
    # We do not assert dropout's dtype because we do not want to rely on the
    # node names of dropout's internal implementation.
    self._assert_output_f16(mode, node_map, 'addition')
    self._assert_output_f16(mode, node_map, 'Conv2D_1')

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    # The default tolerance (1e-3) results in a tiny fraction (<1%) of
    # miscompares on the ROCm platform, hence the tolerance bump.
    tol = 2e-3 if test.is_built_with_rocm() else 1e-3
    self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)

  # TODO(reedwm): Fix and enable this test with MKL. Currently this crashes
  # with MKL.
  @parameterized.parameters(['cuda'])
  @test_util.run_deprecated_v1
  @test_util.disable_xla('This test does not pass with XLA')
  def test_conv_pool(self, mode):
    """Test graph with convolution followed by pooling."""
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    x = _input([2, 8, 8, 1])
    output = _conv_pool(x)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)
    num_to_f16, num_to_fp32 = _count_casts(mode, cost_graph.node)

    self._assert_output_f16(mode, node_map, 'Conv2D')
    self._assert_output_f16(mode, node_map, 'Relu')
    self._assert_output_f16(mode, node_map, 'MaxPool')
    self._assert_output_f16(mode, node_map, 'Conv2D_1')
    self.assertEqual(num_to_f16, 4)
    self.assertEqual(num_to_fp32, 1)
    tol = 5e-3 if mode == 'mkl' else 1e-3
    self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('b/138749235')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_simple_loop(self, mode):
    """Test graph with a while loop."""
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    x = _input([8, 8])
    y = _simple_loop(x, _matmul_act)[1]
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(y, [x])
    output = (y, g)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)

    self._assert_output_f16(mode, node_map, 'while/MatMul')
    self._assert_output_f16(mode, node_map, 'while/Relu')
    tol = 1e-2 if mode == 'mkl' else 1e-3
    self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('b/138749235')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_loop_with_vars_intertwined(self, mode):
    """Test graph with a while loop whose loop variables are intertwined."""
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    x = _input([8, 8])
    _, _, k, l = _loop_vars_intertwined(
        array_ops.ones(array_ops.shape(x)), x, _matmul_act, _matmul_act)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(k, [x])
    output = (k, l, g)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)

    self._assert_output_f16(mode, node_map, 'while/MatMul')
    self._assert_output_f16(mode, node_map, 'while/Relu')
    self._assert_output_f16(mode, node_map, 'while/MatMul_1')
    self._assert_output_f16(mode, node_map, 'while/Relu_1')
    tol = 5e-3 if mode == 'mkl' else 1e-3
    self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)

  @parameterized.parameters(['cuda'])
  @test_util.run_deprecated_v1
  @test_util.disable_xla('This test does not pass with XLA')
  def test_multi_paths(self, mode):
    """Test graph with multiple paths."""
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    x = _input([2, 8, 8, 3])
    x1, x2, x3 = array_ops.split(x, num_or_size_splits=3, axis=3)
    y1 = _conv_pool(x1)
    y2 = _conv_pool(x2)
    y3 = _conv_pool(x3)
    y = array_ops.concat([y1, y2, y3], axis=3)
    y = array_ops.identity(y)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(y, [x])
    output = (y, g)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)

    self._assert_output_f16(mode, node_map, 'split')
    for suffix in [''] + ['_%i' % i for i in range(1, 6)]:
      self._assert_output_f16(mode, node_map, 'Conv2D' + suffix)
      self._assert_output_f16(mode, node_map, 'Relu' + suffix)
      self._assert_output_f16(mode, node_map, 'MaxPool' + suffix)
    self._assert_output_f16(mode, node_map, 'concat')
    self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_deprecated_v1
  @test_util.disable_xla('This test does not pass with XLA')
  def test_multi_paths_2(self, mode):
    """Test graph with multiple paths."""
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    x = _input([8, 8])
    y1 = _matmul_act(x)
    y2 = _matmul_act(x)
    y = y1 + y2 + x
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(y, [x])
    output = (g, y)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)

    self._assert_output_f16(mode, node_map, 'MatMul')
    self._assert_output_f16(mode, node_map, 'Relu')
    self._assert_output_f16(mode, node_map, 'MatMul_1')
    self._assert_output_f16(mode, node_map, 'Relu_1')
    if mode == 'mkl':
      tol = 2e-2
    elif test.is_built_with_rocm():
      # The default tolerance (1e-3) results in a tiny fraction (<1%) of
      # miscompares on the ROCm platform, hence the tolerance bump.
      tol = 2e-3
    else:
      tol = 1e-3
    self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)

  @parameterized.parameters(['cuda'])  # MKL doesn't support bf16 Sigmoid
  @test_util.run_v1_only('b/138749235')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_recurrent_lstm(self, mode):
    """Test graph with a recurrent LSTM."""
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    init_c = _input([8, 4])
    init_h = _input([8, 4])
    _, _, h, _ = _recurrent_lstm(init_c, init_h)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(h, [init_c, init_h])
    output = (h, g)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)

    self._assert_output_f16(mode, node_map, 'while/concat')
    self._assert_output_f16(mode, node_map, 'while/MatMul')
    self._assert_output_f16(mode, node_map, 'while/split')
    self._assert_output_f16(mode, node_map, 'while/Sigmoid')
    self._assert_output_f16(mode, node_map, 'while/Sigmoid_1')
    self._assert_output_f16(mode, node_map, 'while/Sigmoid_2')
    self._assert_output_f16(mode, node_map, 'while/Tanh')
    self._assert_output_f16(mode, node_map, 'while/Tanh_1')
    self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('v1 loop test')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_propagation_through_simple_loop_1(self, mode):
    self._run_simple_loop_test(mode, 'W', 'C', 'C')

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('v1 loop test')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_propagation_through_simple_loop_2(self, mode):
    self._run_simple_loop_test(mode, 'C', 'C', 'W')

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('v1 loop test')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_propagation_through_simple_loop_3(self, mode):
    self._run_simple_loop_test(mode, 'W', 'G', 'W')

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('v1 loop test')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_propagation_through_simple_loop_4(self, mode):
    self._run_simple_loop_test(mode, 'W', 'gbg', 'W')

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('b/138749235')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_propagation_through_simple_loop_5(self, mode):
    self._run_simple_loop_test(mode, 'b', 'gWC', 'c')

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('b/138749235')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_propagation_through_simple_loop_6(self, mode):
    self._run_simple_loop_test(mode, 'b', 'CWCG', 'C')

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('b/138749235')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_propagation_through_simple_loop_7(self, mode):
    self._run_simple_loop_test(mode, 'C', 'GWCG', 'C')

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_v1_only('b/138749235')
  @test_util.disable_xla('This test does not pass with XLA')
  def test_propagation_through_simple_loop_8(self, mode):
    self._run_simple_loop_test(mode, 'C', 'CgbgWC', 'g')

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_deprecated_v1
  @test_util.disable_xla('This test does not pass with XLA')
  def test_noninlined_funcdef(self, mode):
    """Test graph with non-inlined function subgraph.

    This requires the grappler pass to handle an OpDef that only appears in
    the graph's function registry instead of the global op registry.

    Args:
      mode: Either 'cuda' or 'mkl'.
    """
    self._maybe_skip(mode)
    random_seed.set_random_seed(0)
    x = _input([8, 8])
    y = _matmul_act(x)
    y = _example_noninlined_funcdef(y)
    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.01)
    g = optimizer.compute_gradients(y, [x])
    output = (g, y)

    output_val_ref, output_val, cost_graph = self._run(mode, output)
    node_map = _build_node_map(cost_graph.node)

    self._assert_output_f16(mode, node_map, 'MatMul')
    tol = 1e-2 if mode == 'mkl' else 1e-3
    self.assertAllClose(output_val_ref, output_val, atol=tol, rtol=tol)

  @parameterized.parameters(['cuda', 'mkl'])
  @test_util.run_deprecated_v1
  @test_util.disable_xla('This test does not pass with XLA')
  def test_ingraph_train_loop(self, mode):
    """Tests a graph containing a while loop around a training update.

    This requires the grappler pass to take special care with its handling of
    Enter ops that appear in front of reads from non-resource variables. See
    the use of NodeImplicitlyReadsVariable in auto_mixed_precision.cc.

    Args:
      mode: Either 'cuda' or 'mkl'.
    """
    self._maybe_skip(mode)
    if tf2.enabled():
      # This test tests non-resource variables, which are only used in TF1.
      self.skipTest('TensorFlow 1 required')
    random_seed.set_random_seed(1234)
    np.random.seed(1234)
    num_iter, bs, nchan, nclass = 100, 64, 32, 100

    data = np.random.normal(size=(bs * num_iter, nchan)).astype(np.float32)
    labels = np.random.randint(nclass, size=(bs * num_iter,))
    ds = dataset_ops.Dataset.from_tensor_slices((data, labels))
    ds = ds.batch(bs).prefetch(3)
    it = ds.make_one_shot_iterator()

    def body(_, i):
      i += 1
      x, yt = it.get_next()
      dense = layers.Dense(nclass)
      y = dense(x)
      loss = losses.sparse_softmax_cross_entropy(yt, y)
      opt = adam.AdamOptimizer()
      train_op = opt.minimize(loss, var_list=dense.trainable_weights)
      with ops.control_dependencies([train_op]):
        loss = array_ops.identity(loss)
      return loss, i

    begin, end = constant_op.constant(0), constant_op.constant(num_iter)
    loss, _ = control_flow_ops.while_loop(
        lambda loss, i: math_ops.less(i, end), body, [0.0, begin])

    output_val_ref, output_val, cost_graph = self._run(mode, loss)
    node_map = _build_node_map(cost_graph.node)

    self._assert_output_f16(mode, node_map, 'while/dense/MatMul')
    self._assert_output_f16(mode, node_map,
                            'while/gradients/while/dense/MatMul_grad/MatMul_1')
    self.assertAllClose(output_val_ref, output_val, atol=1e-3, rtol=1e-3)

  # TODO(benbarsdell): Add tests for list ops (TensorList*) that pass through
  # graph source/sink nodes, similar to the TensorListThroughFunction C++
  # test. Tests here will have the advantage of catching changes in the types
  # of ops that are added to the graph.


if __name__ == '__main__':
  test.main()