From dcd7431b6d0f4681941860b6b20c3edfd618035b Mon Sep 17 00:00:00 2001
From: Anush Elangovan
Date: Fri, 24 Nov 2017 14:48:36 -0600
Subject: [PATCH] [XLA] Guard AVX, SSE and NEON instructions

On OSX you currently run into linker errors because unsupported
instructions are registered: clang on OSX doesn't respect
__attribute__((weak)) (see PR#14893). So we add ifdefs to register only
the supported instructions. Also register __sincos on __APPLE__
platforms.

TEST=tensorflow/compiler/aot/tests:tfcompile builds successfully on
OSX (10.13.2)
---
 .../xla/service/cpu/cpu_runtime_avx.h    |  6 ++--
 .../xla/service/cpu/cpu_runtime_neon.h   |  6 ++--
 .../xla/service/cpu/cpu_runtime_sse4_1.h |  7 ++--
 .../xla/service/cpu/simple_orc_jit.cc    | 32 ++++++++++++++++---
 4 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
index acfada8540d..74ae6d00c91 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_avx.h
@@ -38,14 +38,16 @@ typedef float V8F32AVX __attribute__((__vector_size__(32)));
 
 extern "C" {
 
+#ifdef __AVX__
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
 xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_ExpV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V8F32AVX x);
 
 xla::cpu::runtime::V8F32AVX __xla_cpu_runtime_LogV8F32AVX(
-    xla::cpu::runtime::V8F32AVX x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V8F32AVX x);
+#endif
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_AVX_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
index 75cb16b2739..645a43858fb 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_neon.h
@@ -49,14 +49,16 @@ struct V4F32NEON;
 
 extern "C" {
 
+#ifdef __ARM_NEON__
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
 xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_ExpV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32NEON x);
 
 xla::cpu::runtime::V4F32NEON __xla_cpu_runtime_LogV4F32NEON(
-    xla::cpu::runtime::V4F32NEON x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32NEON x);
+#endif  // __ARM_NEON__
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_NEON_H_
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
index 96587d10d2b..1bd8494bf84 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime_sse4_1.h
@@ -39,14 +39,17 @@ typedef float V4F32SSE __attribute__((__vector_size__(16)));
 
 extern "C" {
 
+#ifdef __SSE4_1__
 // The following functions are vectorized versions of a selection of libm
 // library functions.
 // References to these functions are created by the LLVM vectorizer.
 xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_ExpV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32SSE x);
 
 xla::cpu::runtime::V4F32SSE __xla_cpu_runtime_LogV4F32SSE(
-    xla::cpu::runtime::V4F32SSE x) TF_ATTRIBUTE_WEAK;
+    xla::cpu::runtime::V4F32SSE x);
+#endif
+
 }
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_RUNTIME_SSE4_1_H_
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index cda27833079..c942cd6bf12 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -102,9 +102,21 @@ llvm::StringRef GetHostCpuName() {
 
 CompilerFunctor::VectorIntrinsics GetAvailableIntrinsics() {
   CompilerFunctor::VectorIntrinsics intrinsics;
-  intrinsics.sse_intrinsics = (&__xla_cpu_runtime_ExpV4F32SSE != nullptr);
-  intrinsics.avx_intrinsics = (&__xla_cpu_runtime_ExpV8F32AVX != nullptr);
-  intrinsics.neon_intrinsics = (&__xla_cpu_runtime_ExpV4F32NEON != nullptr);
+#ifdef __SSE4_1__
+  intrinsics.sse_intrinsics = true;
+#else
+  intrinsics.sse_intrinsics = false;
+#endif
+#ifdef __AVX__
+  intrinsics.avx_intrinsics = true;
+#else
+  intrinsics.avx_intrinsics = false;
+#endif
+#ifdef __ARM_NEON__
+  intrinsics.neon_intrinsics = true;
+#else
+  intrinsics.neon_intrinsics = false;
+#endif
   return intrinsics;
 }
 
@@ -201,12 +213,18 @@ bool RegisterKnownJITSymbols() {
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32);
   REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64);
+#ifdef __ARM_NEON__
   REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32NEON);
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
-  REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32NEON);
+#endif
+#ifdef __SSE4_1__
+  REGISTER_CPU_RUNTIME_SYMBOL(ExpV4F32SSE);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV4F32SSE);
+#endif
+#ifdef __AVX__
+  REGISTER_CPU_RUNTIME_SYMBOL(ExpV8F32AVX);
   REGISTER_CPU_RUNTIME_SYMBOL(LogV8F32AVX);
+#endif
   REGISTER_CPU_RUNTIME_SYMBOL(ParallelForkJoin);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseInfeedBufferAfterDequeue);
   REGISTER_CPU_RUNTIME_SYMBOL(ReleaseOutfeedBufferAfterPopulation);
@@ -275,7 +293,11 @@
   REGISTER_LIBM_SYMBOL(scalbln, double (*)(double, long));
   REGISTER_LIBM_SYMBOL(scalbn, double (*)(double, int));
   REGISTER_LIBM_SYMBOL(sin, double (*)(double));
+#ifdef __APPLE__
+  REGISTER_LIBM_SYMBOL(__sincos, void (*)(double, double*, double*));
+#else
   REGISTER_LIBM_SYMBOL(sincos, void (*)(double, double*, double*));
+#endif
   REGISTER_LIBM_SYMBOL(sinh, double (*)(double));
   REGISTER_LIBM_SYMBOL(sqrt, double (*)(double));
   REGISTER_LIBM_SYMBOL(tan, double (*)(double));
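
Note on the core of the change: the old code declared the vectorized libm
entry points with TF_ATTRIBUTE_WEAK (i.e. __attribute__((weak))) and probed
availability at runtime via `&symbol != nullptr`. That works on ELF targets,
where an unresolved weak symbol resolves to a null address, but Apple's
toolchain rejects the unresolved reference at link time. Below is a minimal
standalone sketch of the compile-time pattern that replaces it; it is not
part of the patch and only mirrors the names used in simple_orc_jit.cc:

#include <cstdio>

// Mirrors CompilerFunctor::VectorIntrinsics from simple_orc_jit.cc.
struct VectorIntrinsics {
  bool sse_intrinsics;
  bool avx_intrinsics;
  bool neon_intrinsics;
};

// Each flag is now fixed at compile time: __SSE4_1__, __AVX__ and
// __ARM_NEON__ are predefined by GCC/Clang when the corresponding
// instruction set is enabled (e.g. via -msse4.1 or -mavx, or on an ARM
// target built with NEON support).
VectorIntrinsics GetAvailableIntrinsics() {
  VectorIntrinsics intrinsics;
#ifdef __SSE4_1__
  intrinsics.sse_intrinsics = true;
#else
  intrinsics.sse_intrinsics = false;
#endif
#ifdef __AVX__
  intrinsics.avx_intrinsics = true;
#else
  intrinsics.avx_intrinsics = false;
#endif
#ifdef __ARM_NEON__
  intrinsics.neon_intrinsics = true;
#else
  intrinsics.neon_intrinsics = false;
#endif
  return intrinsics;
}

int main() {
  VectorIntrinsics v = GetAvailableIntrinsics();
  std::printf("sse4.1=%d avx=%d neon=%d\n", v.sse_intrinsics,
              v.avx_intrinsics, v.neon_intrinsics);
}

Because the decision moves to compile time, a build without (say) AVX never
declares, registers, or references the AVX symbols at all, so there is
nothing left for the OSX linker to complain about.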
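The sincos hunk applies the same platform-conditional idea at the libm
level: Apple's libm does not export a `sincos` symbol; its combined sin/cos
entry point is `__sincos`, which is why the patch registers that name under
__APPLE__. A hedged sketch of the same dispatch outside the JIT follows;
the helper name portable_sincos and the explicit extern declaration are
illustrative assumptions, not TensorFlow code:

#include <cmath>
#include <cstdio>

#ifdef __APPLE__
// Assumed declaration of Apple's combined sin/cos routine; the symbol name
// matches what the patch registers under __APPLE__.
extern "C" void __sincos(double x, double* sin_out, double* cos_out);
#endif

// Compute sin(x) and cos(x) in one call, dispatching on the platform.
void portable_sincos(double x, double* sin_out, double* cos_out) {
#ifdef __APPLE__
  __sincos(x, sin_out, cos_out);
#else
  // glibc declares sincos() in <math.h> under _GNU_SOURCE (which g++
  // defines by default).
  sincos(x, sin_out, cos_out);
#endif
}

int main() {
  double s = 0.0, c = 0.0;
  portable_sincos(1.0, &s, &c);
  std::printf("sin=%f cos=%f\n", s, c);
}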