From d7cb6d0a3febf7893f92a84ef53c82928faeafaf Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Fri, 19 Jul 2019 14:12:27 -0700 Subject: [PATCH] - Disable tuning on Apple - we don't want to use an in-order-tuned kernel on an Apple CPU. We shouldn't even with tuning, as Apple CPUs are out-of-order, but we don't want to risk the case of misdetection by the tuning nanobenchmark. - Whenever tuning is not enabled, have the tuning resolver just return without even the overhead of querying a timestamp. PiperOrigin-RevId: 259036253 --- tensorflow/lite/experimental/ruy/platform.h | 7 +++++++ tensorflow/lite/experimental/ruy/tune.cc | 13 ++++++------- tensorflow/lite/experimental/ruy/tune.h | 13 +++++++++++++ tensorflow/lite/experimental/ruy/tune_test.cc | 2 ++ 4 files changed, 28 insertions(+), 7 deletions(-) diff --git a/tensorflow/lite/experimental/ruy/platform.h b/tensorflow/lite/experimental/ruy/platform.h index 13eccf8acf6..29c0fc20784 100644 --- a/tensorflow/lite/experimental/ruy/platform.h +++ b/tensorflow/lite/experimental/ruy/platform.h @@ -49,4 +49,11 @@ limitations under the License. #define RUY_DONOTUSEDIRECTLY_NEON_64 \ (RUY_DONOTUSEDIRECTLY_NEON && RUY_DONOTUSEDIRECTLY_ARM_64) +// Detect APPLE +#ifdef __APPLE__ +#define RUY_DONOTUSEDIRECTLY_APPLE 1 +#else +#define RUY_DONOTUSEDIRECTLY_APPLE 0 +#endif + #endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PLATFORM_H_ diff --git a/tensorflow/lite/experimental/ruy/tune.cc b/tensorflow/lite/experimental/ruy/tune.cc index d2ca263e706..58a956e03cc 100644 --- a/tensorflow/lite/experimental/ruy/tune.cc +++ b/tensorflow/lite/experimental/ruy/tune.cc @@ -18,13 +18,11 @@ limitations under the License. #include #include -#include "tensorflow/lite/experimental/ruy/opt_set.h" -#include "tensorflow/lite/experimental/ruy/platform.h" #include "tensorflow/lite/experimental/ruy/time.h" namespace ruy { -#if RUY_PLATFORM(NEON_64) +#ifdef RUY_IMPLEMENT_TUNING namespace { @@ -131,7 +129,7 @@ Tuning TuningResolver::ResolveNow() { return is_probably_inorder ? Tuning::kInOrder : Tuning::kOutOfOrder; } -#else // not RUY_PLATFORM(NEON_64) +#else // not defined RUY_IMPLEMENT_TUNING float TuningResolver::EvalRatio() { return 0; } float TuningResolver::ThresholdRatio() { return 0; } @@ -146,9 +144,7 @@ TuningResolver::TuningResolver() : expiry_duration_(DurationFromSeconds(kExpirySecs)) {} Tuning TuningResolver::Resolve() { -#if !RUY_OPT_ENABLED(RUY_OPT_TUNING) - return Tuning::kOutOfOrder; -#endif +#ifdef RUY_IMPLEMENT_TUNING if (unresolved_tuning_ != Tuning::kAuto) { return unresolved_tuning_; } @@ -160,6 +156,9 @@ Tuning TuningResolver::Resolve() { last_resolved_timepoint_ = new_timepoint; last_resolved_tuning_ = ResolveNow(); return last_resolved_tuning_; +#else + return Tuning::kOutOfOrder; +#endif } } // namespace ruy diff --git a/tensorflow/lite/experimental/ruy/tune.h b/tensorflow/lite/experimental/ruy/tune.h index c1b95842b87..a1d0eb9ae40 100644 --- a/tensorflow/lite/experimental/ruy/tune.h +++ b/tensorflow/lite/experimental/ruy/tune.h @@ -74,8 +74,21 @@ limitations under the License. #include +#include "tensorflow/lite/experimental/ruy/opt_set.h" +#include "tensorflow/lite/experimental/ruy/platform.h" #include "tensorflow/lite/experimental/ruy/time.h" +// Tuning only implemented on NEON_64 at the moment (see assembly code +// in the nano-benchmark) and not on Apple (some Apple CPUs produce incorrect +// results on in-order-tuned kernels combining ARM and NEON load instructions +// and NEON `ins` instructions). +// +// When tuning is not implemented, we simply always use Tuning::kOutOfOrder. +#if RUY_OPT_ENABLED(RUY_OPT_TUNING) && RUY_PLATFORM(NEON_64) && \ + !RUY_PLATFORM(APPLE) +#define RUY_IMPLEMENT_TUNING +#endif + namespace ruy { enum class Tuning { diff --git a/tensorflow/lite/experimental/ruy/tune_test.cc b/tensorflow/lite/experimental/ruy/tune_test.cc index 571c2189e81..051c34910b6 100644 --- a/tensorflow/lite/experimental/ruy/tune_test.cc +++ b/tensorflow/lite/experimental/ruy/tune_test.cc @@ -33,6 +33,7 @@ TEST(TuneTest, TuneTest) { tuning_resolver.SetTuning(Tuning::kAuto); +#ifdef RUY_IMPLEMENT_TUNING for (auto tuning : {Tuning::kOutOfOrder, Tuning::kInOrder}) { tuning_resolver.SetTuning(tuning); ASSERT_TRUE(tuning_resolver.Resolve() == tuning); @@ -40,6 +41,7 @@ TEST(TuneTest, TuneTest) { std::this_thread::sleep_for(std::chrono::seconds(1)); ASSERT_TRUE(tuning_resolver.Resolve() == tuning); } +#endif } } // namespace