Add bfloat16 support to more cwise CPU ops

PiperOrigin-RevId: 325248465
Change-Id: I68b3be2af4f9acedb76ab6077bf5dac9ac6eeb72
Reed Wanderman-Milne, 2020-08-06 09:41:59 -07:00 (committed by TensorFlower Gardener)
parent 3f24d131d7
commit 696a4a76ce
23 changed files with 112 additions and 62 deletions
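
A minimal usage sketch (not taken from the commit) of what these registrations enable: element-wise ops such as Ceil, Cos, Exp, Floor, FloorMod, Pow, Sign, and Tanh gain CPU kernels for bfloat16, so calls like the following no longer raise a missing-kernel error on CPU.

import tensorflow as tf

x = tf.constant([0.25, 1.5, 2.75], dtype=tf.bfloat16)
with tf.device("/CPU:0"):
  print(tf.math.ceil(x))   # Ceil kernel, now registered for bfloat16
  print(tf.math.cos(x))    # Cos
  print(tf.math.tanh(x))   # Tanh (TanhGrad covers the backward pass)
  print(tf.math.pow(x, tf.constant(2.0, dtype=tf.bfloat16)))  # Pow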

View File

@ -16,7 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, double);
REGISTER4(UnaryOp, CPU, "Ceil", functor::ceil, float, Eigen::half, bfloat16,
double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Ceil", functor::ceil, float, Eigen::half, double);

View File

@ -156,6 +156,7 @@ struct TernaryClipOp<CPUDevice, T> {
INSTANTIATE_CPU(Eigen::half);
INSTANTIATE_CPU(float);
INSTANTIATE_CPU(double);
INSTANTIATE_CPU(bfloat16);
INSTANTIATE_CPU(int8);
INSTANTIATE_CPU(int16);
INSTANTIATE_CPU(int32);
@ -173,6 +174,7 @@ INSTANTIATE_CPU(uint16);
REGISTER_CPU_KERNEL(Eigen::half);
REGISTER_CPU_KERNEL(float);
REGISTER_CPU_KERNEL(double);
REGISTER_CPU_KERNEL(bfloat16);
REGISTER_CPU_KERNEL(int8);
REGISTER_CPU_KERNEL(int16);
REGISTER_CPU_KERNEL(int32);
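
A rough sketch (illustrative, not from the commit) of the clipping path these bfloat16 instantiations are meant to cover, clipping a bfloat16 tensor on CPU:

import tensorflow as tf

x = tf.constant([-2.0, 0.5, 3.0], dtype=tf.bfloat16)
lo = tf.constant(-1.0, dtype=tf.bfloat16)
hi = tf.constant(1.0, dtype=tf.bfloat16)
with tf.device("/CPU:0"):
  y = tf.clip_by_value(x, lo, hi)  # -> [-1, 0.5, 1] in bfloat16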

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Cos", functor::cos, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Cos", functor::cos, float, Eigen::half, double);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Cosh", functor::cosh, float, double, complex64,
complex128);
REGISTER5(UnaryOp, CPU, "Cosh", functor::cosh, float, double, bfloat16,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Exp", functor::exp, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER5(UnaryOp, GPU, "Exp", functor::exp, float, Eigen::half, double,

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Expm1", functor::expm1, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Expm1", functor::expm1, float, Eigen::half, double);
#endif

View File

@ -16,7 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, double);
REGISTER4(UnaryOp, CPU, "Floor", functor::floor, float, Eigen::half, bfloat16,
double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Floor", functor::floor, float, Eigen::half, double);

View File

@ -18,8 +18,8 @@ limitations under the License.
namespace tensorflow {
REGISTER6(BinaryOp, CPU, "FloorDiv", functor::safe_floor_div, uint8, uint16,
int8, int16, int32, int64);
REGISTER3(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
Eigen::half, double);
REGISTER4(BinaryOp, CPU, "FloorDiv", functor::floor_div_real, float,
Eigen::half, bfloat16, double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER4(BinaryOp, GPU, "FloorDiv", functor::floor_div, uint8, uint16, int16,

View File

@ -18,7 +18,8 @@ limitations under the License.
namespace tensorflow {
REGISTER3(BinaryOp, CPU, "FloorMod", functor::safe_floor_mod, int32, int64,
uint64);
REGISTER2(BinaryOp, CPU, "FloorMod", functor::floor_fmod, float, double);
REGISTER3(BinaryOp, CPU, "FloorMod", functor::floor_fmod, bfloat16, float,
double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// A special GPU kernel for int32.

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
double);
REGISTER4(UnaryOp, CPU, "IsFinite", functor::isfinite, float, Eigen::half,
bfloat16, double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "IsFinite", functor::isfinite, float, Eigen::half,

View File

@ -16,7 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER3(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, double);
REGISTER4(UnaryOp, CPU, "IsInf", functor::isinf, float, Eigen::half, bfloat16,
double);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "IsInf", functor::isinf, float, Eigen::half, double);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Log1p", functor::log1p, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Log1p", functor::log1p, float, Eigen::half, double);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, double,
complex64, complex128);
REGISTER6(BinaryOp, CPU, "Pow", functor::pow, float, Eigen::half, bfloat16,
double, complex64, complex128);
REGISTER2(BinaryOp, CPU, "Pow", functor::safe_pow, int32, int64);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

View File

@ -30,15 +30,8 @@ REGISTER3(SimpleBinaryOp, GPU, "InvGrad", functor::inverse_grad, float,
Eigen::half, double);
#endif
#ifdef ENABLE_INTEL_MKL_BFLOAT16
// Since Eigen backend does not support bfloat16 ops, we are selectively
// enabling them for MKL backend.
REGISTER6(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half,
double, complex64, complex128, bfloat16);
#else
REGISTER5(UnaryOp, CPU, "Reciprocal", functor::inverse, float, Eigen::half,
double, complex64, complex128);
#endif // ENABLE_INTEL_MKL_BFLOAT16
bfloat16, double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
double, int64);
@ -47,8 +40,8 @@ REGISTER4(UnaryOp, GPU, "Reciprocal", functor::inverse, float, Eigen::half,
REGISTER(UnaryOp, SYCL, "Reciprocal", functor::inverse, float);
#endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double, complex64, complex128);
REGISTER6(SimpleBinaryOp, CPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, bfloat16, double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(SimpleBinaryOp, GPU, "ReciprocalGrad", functor::inverse_grad, float,
Eigen::half, double);
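
The hunk above drops the ENABLE_INTEL_MKL_BFLOAT16 guard, so bfloat16 Reciprocal and ReciprocalGrad are registered for the default CPU backend rather than only the MKL build. A minimal sketch (illustrative, assuming GradientTape handles bfloat16 like the other float types) that exercises both kernels:

import tensorflow as tf

x = tf.constant([0.5, 2.0, 4.0], dtype=tf.bfloat16)
with tf.device("/CPU:0"):
  with tf.GradientTape() as tape:
    tape.watch(x)
    y = tf.math.reciprocal(x)   # forward: Reciprocal kernel
  dy_dx = tape.gradient(y, x)   # backward: ReciprocalGrad, the gradient -1 / x**2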

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER7(UnaryOp, CPU, "Sign", functor::sign, float, double, int32, int64,
complex64, Eigen::half, complex128);
REGISTER8(UnaryOp, CPU, "Sign", functor::sign, float, double, int32, int64,
complex64, Eigen::half, bfloat16, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER6(UnaryOp, GPU, "Sign", functor::sign, float, Eigen::half, double,
int64, complex64, complex128);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Sin", functor::sin, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Sin", functor::sin, float, Eigen::half, double);

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER4(UnaryOp, CPU, "Sinh", functor::sinh, float, double, complex64,
complex128);
REGISTER5(UnaryOp, CPU, "Sinh", functor::sinh, float, double, bfloat16,
complex64, complex128);
#if TENSORFLOW_USE_SYCL
#define REGISTER_SYCL_KERNEL(TYPE) \

View File

@ -16,8 +16,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_common.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Tan", functor::tan, Eigen::half, float, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Tan", functor::tan, Eigen::half, bfloat16, float,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(UnaryOp, GPU, "Tan", functor::tan, Eigen::half, float, double);

View File

@ -17,8 +17,8 @@ limitations under the License.
#include "tensorflow/core/kernels/cwise_ops_gradients.h"
namespace tensorflow {
REGISTER5(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, double,
complex64, complex128);
REGISTER6(UnaryOp, CPU, "Tanh", functor::tanh, float, Eigen::half, bfloat16,
double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#ifndef MLIR_GENERATED_GPU_KERNELS_ENABLED
@ -30,8 +30,8 @@ REGISTER3(UnaryOp, GPU, "Tanh", functor::tanh, float, Eigen::half, double);
REGISTER2(UnaryOp, SYCL, "Tanh", functor::tanh, float, double);
#endif // TENSORFLOW_USE_SYCL
REGISTER5(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
Eigen::half, double, complex64, complex128);
REGISTER6(SimpleBinaryOp, CPU, "TanhGrad", functor::tanh_grad, float,
Eigen::half, bfloat16, double, complex64, complex128);
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
REGISTER3(SimpleBinaryOp, GPU, "TanhGrad", functor::tanh_grad, float,
Eigen::half, double);

View File

@ -67,6 +67,7 @@ class ClipTest(test.TestCase):
dtypes.float16,
dtypes.float32,
dtypes.float64,
dtypes.bfloat16,
dtypes.int16,
dtypes.int32,
dtypes.int64,
@ -88,6 +89,7 @@ class ClipTest(test.TestCase):
dtypes.float16,
dtypes.float32,
dtypes.float64,
dtypes.bfloat16,
dtypes.int16,
dtypes.int32,
dtypes.int64,
@ -110,6 +112,7 @@ class ClipTest(test.TestCase):
dtypes.float16,
dtypes.float32,
dtypes.float64,
dtypes.bfloat16,
dtypes.int16,
dtypes.int32,
dtypes.int64,
@ -132,6 +135,7 @@ class ClipTest(test.TestCase):
dtypes.float16,
dtypes.float32,
dtypes.float64,
dtypes.bfloat16,
dtypes.int16,
dtypes.int32,
dtypes.int64,

View File

@ -841,6 +841,9 @@ class MathOpsOverloadTest(test.TestCase):
def _compareBinary(self, x, y, dtype, np_func, tf_func):
np_ans = np_func(x, y).astype(dtype.as_numpy_dtype)
if dtype == dtypes_lib.bfloat16:
# assertAllClose does not properly handle bfloat16 values
np_ans = np_ans.astype(np.float32)
self.assertAllClose(np_ans,
self._computeTensorAndLiteral(x, y, dtype, tf_func))
self.assertAllClose(np_ans,
@ -857,6 +860,7 @@ class MathOpsOverloadTest(test.TestCase):
dtypes_lib.float16,
dtypes_lib.float32,
dtypes_lib.float64,
dtypes_lib.bfloat16,
dtypes_lib.int32,
dtypes_lib.int64,
dtypes_lib.complex64,
@ -920,12 +924,16 @@ class MathOpsOverloadTest(test.TestCase):
class IsFiniteInfNanTest(test.TestCase):
def _compare(self, x, use_gpu):
np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
with test_util.device(use_gpu=use_gpu):
inx = ops.convert_to_tensor(x)
ofinite, oinf, onan = math_ops.is_finite(inx), math_ops.is_inf(
inx), math_ops.is_nan(inx)
tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])
if x.dtype == dtypes_lib.bfloat16.as_numpy_dtype:
# Numpy will implicitly convert bfloat16 value to float16, so we cast to
# float32 to avoid this.
x = x.astype(np.float32)
np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
self.assertAllEqual(np_inf, tf_inf)
self.assertAllEqual(np_nan, tf_nan)
self.assertAllEqual(np_finite, tf_finite)
@ -934,11 +942,18 @@ class IsFiniteInfNanTest(test.TestCase):
self.assertShapeEqual(np_finite, ofinite)
def _testDtype(self, dtype):
fi = np.finfo(dtype)
data = np.array([
0, -1, 1, fi.resolution, -fi.resolution, fi.min, fi.max, -np.inf,
np.inf, np.nan
]).astype(dtype)
if dtype != dtypes_lib.bfloat16.as_numpy_dtype:
fi = np.finfo(dtype)
data = np.array([
0, -1, 1, fi.resolution, -fi.resolution, fi.min, fi.max, -np.inf,
np.inf, np.nan
]).astype(dtype)
else:
# np.finfo does not support bfloat16
data = np.array([
0, -1, 1, 0.01, -0.01, -3.3895e+38, 3.3895e+38, -np.inf, np.inf,
np.nan
]).astype(dtype)
self._compare(data, use_gpu=False)
self._compare(data, use_gpu=True)
@ -951,6 +966,9 @@ class IsFiniteInfNanTest(test.TestCase):
def testDouble(self):
self._testDtype(np.float64)
def testBfloat16(self):
self._testDtype(dtypes_lib.bfloat16.as_numpy_dtype)
def testSqrt(self):
for dtype in [np.float16, np.float32, np.float64]:
fi = np.finfo(dtype)
@ -998,8 +1016,8 @@ class RoundingTest(test.TestCase):
def _testDtype(self, dtype):
data = (np.arange(-3, 3) / 4.).reshape(1, 3, 2).astype(dtype)
self._compare(data)
# TODO: rint op is not supported for float16
if dtype is np.float16:
# TODO(reedwm): rint op is not supported for float16 and bfloat16
if dtype in (np.float16, dtypes_lib.bfloat16.as_numpy_dtype):
return
self._compare_values(data)
x = [0.5, 0.5000001]
@ -1012,10 +1030,12 @@ class RoundingTest(test.TestCase):
self._compare_values(x, y=y)
def testTypes(self):
self.skipTest("b/131162241")
for dtype in [np.float16, np.float32, np.float64]:
with self.subTest(dtype=dtype):
self._testDtype(dtype)
# TODO(b/131162241): Enable test for GPU
with ops.device("/CPU:0"):
for dtype in [np.float16, np.float32, np.float64,
dtypes_lib.bfloat16.as_numpy_dtype]:
with self.subTest(dtype=dtype):
self._testDtype(dtype)
class ComplexMakeRealImagTest(test.TestCase):
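
The test changes above work around two quirks of the NumPy bfloat16 extension type: assertAllClose and np.finfo do not handle it, and some NumPy reference ops go through float16, whose range (max ~6.55e4) cannot hold bfloat16's largest finite values (~3.39e38). A small sketch of the float32-reference pattern used here (variable names are illustrative):

import numpy as np
import tensorflow as tf

bf16 = tf.bfloat16.as_numpy_dtype
data = np.array([0.0, 1.0, 3.3895e38, -3.3895e38, np.inf, np.nan], dtype=bf16)

# Compute the NumPy reference in float32 so large finite bfloat16 values are
# not squashed to inf by an intermediate float16 conversion.
ref_finite = np.isfinite(data.astype(np.float32))
ref_inf = np.isinf(data.astype(np.float32))

tf_finite = tf.math.is_finite(tf.constant(data)).numpy()
tf_inf = tf.math.is_inf(tf.constant(data)).numpy()
np.testing.assert_array_equal(ref_finite, tf_finite)
np.testing.assert_array_equal(ref_inf, tf_inf)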

View File

@ -61,6 +61,8 @@ def _default_tolerance(dtype):
Args:
dtype: A datatype.
"""
if dtype == dtypes_lib.bfloat16.as_numpy_dtype:
return 5e-3
if dtype == np.float16:
return 5e-3
elif dtype in (np.float32, np.complex64):
@ -81,12 +83,7 @@ class UnaryOpTest(test.TestCase):
np_ans = np_func(x)
with self.cached_session(use_gpu=False):
inx = ops.convert_to_tensor(x)
if x.dtype in (np.float32, np.float64,
dtypes_lib.bfloat16.as_numpy_dtype):
y = 1.1 * tf_func(inx)
np_ans *= 1.1
else:
y = tf_func(inx)
y = tf_func(inx)
tf_cpu = self.evaluate(y)
self.assertShapeEqual(np_ans, y)
if x.dtype == np.float16:
@ -99,7 +96,7 @@ class UnaryOpTest(test.TestCase):
if x.dtype in (np.complex64, np.complex128) and tf_func == math_ops.sign:
return # Return early
if x.dtype == np.float16:
if x.dtype in (np.float16, dtypes_lib.bfloat16.as_numpy_dtype):
s = list(np.shape(x))
jacob_t, _ = gradient_checker.compute_gradient(
inx, s, y, s, x_init_value=x)
@ -108,7 +105,7 @@ class UnaryOpTest(test.TestCase):
yf = tf_func(inxf)
_, jacob_n = gradient_checker.compute_gradient(
inxf, s, yf, s, x_init_value=xf, delta=1e-2)
jacob_n = jacob_n.astype(np.float16)
jacob_n = jacob_n.astype(x.dtype)
self.assertAllClose(jacob_t, jacob_n, rtol=grad_rtol, atol=grad_atol)
elif x.dtype in (np.float32, np.complex64):
s = list(np.shape(x))
@ -384,13 +381,36 @@ class UnaryOpTest(test.TestCase):
self._compareBothSparse(y, np.sign, math_ops.sign)
self._compareBothSparse(x, np.vectorize(math.erf), math_ops.erf, tol=1e-3)
@test_util.run_deprecated_v1
def testBFloat16Basic(self):
def compute_f32(np_func):
"""Decorator to compute Numpy function with float32 math."""
def f(x):
y = np_func(x.astype(np.float32))
return y.astype(x.dtype)
return f
bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
x = np.arange(-6, 6,
2).reshape(1, 3, 2).astype(dtypes_lib.bfloat16.as_numpy_dtype)
y = (x + .5).astype(bfloat16) # no zero
z = (x + 15.5).astype(bfloat16) # all positive
self._compareCpu(x, np.abs, math_ops.abs)
self._compareCpu(x, np.abs, _ABS)
self._compareBoth(x, np.negative, math_ops.negative)
self._compareBoth(x, np.negative, _NEG)
self._compareCpu(y, compute_f32(self._inv), math_ops.reciprocal)
self._compareCpu(x, np.exp, math_ops.exp)
self._compareCpu(x, np.expm1, math_ops.expm1)
self._compareCpu(z, compute_f32(np.log), math_ops.log)
self._compareCpu(z, compute_f32(np.log1p), math_ops.log1p)
self._compareCpu(y, np.sign, math_ops.sign)
self._compareBoth(x, compute_f32(np.sin), math_ops.sin)
self._compareBoth(x, compute_f32(np.cos), math_ops.cos)
self._compareBoth(x, compute_f32(np.tan), math_ops.tan)
self._compareBoth(x, compute_f32(np.sinh), math_ops.sinh)
self._compareBoth(x, compute_f32(np.cosh), math_ops.cosh)
self._compareBoth(x, compute_f32(np.tanh), math_ops.tanh)
def testInt8Basic(self):
x = np.arange(-6, 6, 2).reshape(1, 3, 2).astype(np.int8)
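
The compute_f32 helper and the 5e-3 tolerance above reflect bfloat16's precision: with an 8-bit significand, a single rounding step can introduce a relative error of up to 2**-8 (about 3.9e-3), so the NumPy reference is computed in float32 and the comparison uses a loose tolerance. A standalone sketch of that pattern (the helper mirrors the one in the diff; the rest is illustrative):

import numpy as np
import tensorflow as tf

bf16 = tf.bfloat16.as_numpy_dtype

def compute_f32(np_func):
  """Computes np_func in float32, then casts the result back to the input dtype."""
  def f(x):
    return np_func(x.astype(np.float32)).astype(x.dtype)
  return f

x = np.linspace(-3.0, 3.0, 7).astype(bf16)
expected = compute_f32(np.tanh)(x)             # float32 reference, rounded to bfloat16
actual = tf.math.tanh(tf.constant(x)).numpy()  # CPU bfloat16 Tanh kernel
np.testing.assert_allclose(actual.astype(np.float32),
                           expected.astype(np.float32),
                           rtol=5e-3, atol=5e-3)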

View File

@ -476,6 +476,13 @@ class DivAndModTest(test_util.TensorFlowTestCase):
# % array_ops.constant(divs))
# self.assertAllEqual(tf2_result, tf_result)
def testFloorModBfloat16(self):
nums, divs = self.floatTestData()
tf_result = math_ops.floormod(math_ops.cast(nums, dtypes.bfloat16),
math_ops.cast(divs, dtypes.bfloat16))
np_result = nums % divs
self.assertAllEqual(tf_result, np_result)
def testTruncateModInt(self):
nums, divs = self.intTestData()
tf_result = math_ops.truncatemod(nums, divs)