Fix performance regression (b/137615815) introduced by new platform
#defines - they were tested directly by #ifdef, and were being defined by path.h. As tune.cc did not #include path.h, it did not enable its platform-specific tuning code, resulting in a performance regression in cases relying on tuning for maximal performance --- in-order ARM. To prevent that from happening again, this moves the platform defines to a new platform.h and forces users to use a RUY_PLATFORM(X) function macro, so that if they fail to #include platform.h, they get a compilation error. PiperOrigin-RevId: 258372624
This commit is contained in:
parent
f2df2c2865
commit
8c1064d91a
@ -12,6 +12,11 @@ package(
|
||||
licenses = ["notice"], # Apache 2.0
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "platform",
|
||||
hdrs = ["platform.h"],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "check_macros",
|
||||
hdrs = ["check_macros.h"],
|
||||
@ -60,6 +65,7 @@ cc_library(
|
||||
],
|
||||
deps = [
|
||||
":opt_set",
|
||||
":platform",
|
||||
":time",
|
||||
],
|
||||
)
|
||||
@ -164,7 +170,10 @@ cc_library(
|
||||
name = "path",
|
||||
hdrs = ["path.h"],
|
||||
visibility = ruy_visibility(),
|
||||
deps = [":size_util"],
|
||||
deps = [
|
||||
":platform",
|
||||
":size_util",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
@ -238,6 +247,7 @@ cc_library(
|
||||
":matrix",
|
||||
":opt_set",
|
||||
":path",
|
||||
":platform",
|
||||
],
|
||||
)
|
||||
|
||||
@ -255,6 +265,7 @@ cc_library(
|
||||
":internal_matrix",
|
||||
":opt_set",
|
||||
":path",
|
||||
":platform",
|
||||
":size_util",
|
||||
":spec",
|
||||
":tune",
|
||||
@ -276,6 +287,7 @@ cc_library(
|
||||
":internal_matrix",
|
||||
":opt_set",
|
||||
":path",
|
||||
":platform",
|
||||
":spec",
|
||||
":tune",
|
||||
"@gemmlowp//:profiler",
|
||||
@ -371,6 +383,7 @@ cc_library(
|
||||
":ruy",
|
||||
":time",
|
||||
"@com_google_googletest//:gtest",
|
||||
":platform",
|
||||
] + ruy_test_ext_deps(),
|
||||
)
|
||||
|
||||
|
@ -26,8 +26,9 @@ limitations under the License.
|
||||
#include "tensorflow/lite/experimental/ruy/matrix.h"
|
||||
#include "tensorflow/lite/experimental/ruy/opt_set.h"
|
||||
#include "tensorflow/lite/experimental/ruy/path.h"
|
||||
#include "tensorflow/lite/experimental/ruy/platform.h"
|
||||
|
||||
#if ((defined RUY_NEON_64) || (defined RUY_NEON_32))
|
||||
#if (RUY_PLATFORM(NEON_64) || RUY_PLATFORM(NEON_32))
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
|
@ -25,6 +25,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/experimental/ruy/internal_matrix.h"
|
||||
#include "tensorflow/lite/experimental/ruy/opt_set.h"
|
||||
#include "tensorflow/lite/experimental/ruy/path.h"
|
||||
#include "tensorflow/lite/experimental/ruy/platform.h"
|
||||
#include "tensorflow/lite/experimental/ruy/size_util.h"
|
||||
#include "tensorflow/lite/experimental/ruy/spec.h"
|
||||
#include "tensorflow/lite/experimental/ruy/tune.h"
|
||||
@ -217,7 +218,7 @@ RUY_INHERIT_KERNEL(Path::kStandardCpp, Path::kNeon)
|
||||
RUY_INHERIT_KERNEL(Path::kNeon, Path::kNeonDotprod)
|
||||
|
||||
// KernelParams are shared across 32-bit and 64-bit NEON code.
|
||||
#if ((defined RUY_NEON_64) || (defined RUY_NEON_32)) && \
|
||||
#if (RUY_PLATFORM(NEON_64) || RUY_PLATFORM(NEON_32)) && \
|
||||
(RUY_OPT_ENABLED(RUY_OPT_ASM))
|
||||
|
||||
#define RUY_ASM_FLAG_HAS_BIAS 0x1
|
||||
@ -369,7 +370,7 @@ void Kernel8bitNeonInOrder(const KernelParams8bit<4, 4>& params);
|
||||
void Kernel8bitNeonDotprodOutOfOrder(const KernelParams8bit<8, 8>& params);
|
||||
void Kernel8bitNeonDotprodInOrder(const KernelParams8bit<8, 8>& params);
|
||||
|
||||
#ifdef RUY_NEON_64
|
||||
#if RUY_PLATFORM(NEON_64)
|
||||
template <typename DstScalar>
|
||||
struct Kernel<Path::kNeon, std::int8_t, std::int8_t, DstScalar,
|
||||
BasicSpec<std::int32_t, DstScalar>> {
|
||||
@ -489,7 +490,7 @@ void KernelFloatNeonInOrder(const KernelParamsFloat<8, 8>& params);
|
||||
void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params);
|
||||
void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params);
|
||||
|
||||
#ifdef RUY_NEON_64
|
||||
#if RUY_PLATFORM(NEON_64)
|
||||
// A Float kernel for ARM64 Neon.
|
||||
template <>
|
||||
struct Kernel<Path::kNeon, float, float, float, BasicSpec<float, float>> {
|
||||
@ -512,7 +513,7 @@ struct Kernel<Path::kNeon, float, float, float, BasicSpec<float, float>> {
|
||||
};
|
||||
#endif
|
||||
|
||||
#ifdef RUY_NEON_32
|
||||
#if RUY_PLATFORM(NEON_32)
|
||||
// A Float kernel for ARM32 Neon.
|
||||
template <>
|
||||
struct Kernel<Path::kNeon, float, float, float, BasicSpec<float, float>> {
|
||||
@ -559,7 +560,7 @@ struct Kernel<Path::kNeonDotprod, float, float, float,
|
||||
}
|
||||
};
|
||||
|
||||
#endif // ((defined RUY_NEON_64) || (defined RUY_NEON_32)) &&
|
||||
#endif // (RUY_PLATFORM(NEON_64) || RUY_PLATFORM(NEON_32)) &&
|
||||
// (RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
} // namespace ruy
|
||||
|
||||
|
@ -15,10 +15,11 @@ limitations under the License.
|
||||
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/experimental/ruy/kernel.h"
|
||||
#include "tensorflow/lite/experimental/ruy/platform.h"
|
||||
|
||||
namespace ruy {
|
||||
|
||||
#if (defined RUY_NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
#if RUY_PLATFORM(NEON_32) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
|
||||
#define RUY_ASM_LABEL_STORE_UINT8 91
|
||||
#define RUY_ASM_LABEL_STORE_INT8 92
|
||||
@ -539,5 +540,5 @@ void KernelFloat32NeonOutOfOrder(const KernelParamsFloat<8, 4>& params) {
|
||||
#undef RUY_OFFSET_RHS_BASE_PTR
|
||||
#undef RUY_OFFSET_DST_BASE_PTR
|
||||
|
||||
#endif // (defined RUY_NEON_32) && (RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
#endif // RUY_PLATFORM(NEON_32) && (RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
} // namespace ruy
|
||||
|
@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include "tensorflow/lite/experimental/ruy/kernel.h"
|
||||
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/experimental/ruy/kernel.h"
|
||||
#include "tensorflow/lite/experimental/ruy/platform.h"
|
||||
|
||||
namespace ruy {
|
||||
|
||||
#if (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
|
||||
#define RUY_ASM_LABEL_STORE_UINT8 91
|
||||
#define RUY_ASM_LABEL_STORE_INT8 92
|
||||
@ -6302,6 +6302,6 @@ void KernelFloatNeonDotprodInOrder(const KernelParamsFloat<8, 8>& params) {
|
||||
#undef RUY_OFFSET_RHS_BASE_PTR
|
||||
#undef RUY_OFFSET_DST_BASE_PTR
|
||||
|
||||
#endif // (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
#endif // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
|
||||
} // namespace ruy
|
||||
|
@ -15,9 +15,11 @@ limitations under the License.
|
||||
|
||||
#include "tensorflow/lite/experimental/ruy/pack.h"
|
||||
|
||||
#include "tensorflow/lite/experimental/ruy/platform.h"
|
||||
|
||||
namespace ruy {
|
||||
|
||||
#if (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
|
||||
void Pack8bitNeonOutOfOrder(const void* src_ptr0, const void* src_ptr1,
|
||||
const void* src_ptr2, const void* src_ptr3,
|
||||
@ -1329,6 +1331,6 @@ void PackFloatNeonInOrder(const float* src_ptr0, const float* src_ptr1,
|
||||
"v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21",
|
||||
"v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
|
||||
}
|
||||
#endif // (defined RUY_NEON64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
#endif // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
|
||||
} // namespace ruy
|
||||
|
@ -89,6 +89,7 @@ limitations under the License.
|
||||
#include "tensorflow/lite/experimental/ruy/common.h"
|
||||
#include "tensorflow/lite/experimental/ruy/internal_matrix.h"
|
||||
#include "tensorflow/lite/experimental/ruy/opt_set.h"
|
||||
#include "tensorflow/lite/experimental/ruy/platform.h"
|
||||
#include "tensorflow/lite/experimental/ruy/tune.h"
|
||||
|
||||
namespace ruy {
|
||||
@ -158,7 +159,7 @@ struct PackImpl<Path::kStandardCpp, FixedKernelLayout, Scalar, PackedScalar,
|
||||
RUY_INHERIT_PACK(Path::kStandardCpp, Path::kNeon)
|
||||
RUY_INHERIT_PACK(Path::kNeon, Path::kNeonDotprod)
|
||||
|
||||
#if (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
#if RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
|
||||
void Pack8bitNeonOutOfOrder(const void* src_ptr0, const void* src_ptr1,
|
||||
const void* src_ptr2, const void* src_ptr3,
|
||||
@ -384,7 +385,7 @@ struct PackImpl<Path::kNeon, FixedKernelLayout<Order::kRowMajor, 1, 8>, float,
|
||||
}
|
||||
};
|
||||
|
||||
#endif // (defined RUY_NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
#endif // RUY_PLATFORM(NEON_64) && RUY_OPT_ENABLED(RUY_OPT_ASM)
|
||||
|
||||
// Main entry point for packing.
|
||||
template <Path ThePath, typename FixedKernelLayout, typename Scalar,
|
||||
|
@ -18,29 +18,9 @@ limitations under the License.
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
#include "tensorflow/lite/experimental/ruy/platform.h"
|
||||
#include "tensorflow/lite/experimental/ruy/size_util.h"
|
||||
|
||||
// Detect ARM, 32-bit or 64-bit
|
||||
#ifdef __aarch64__
|
||||
#define RUY_ARM_64
|
||||
#elif defined(__arm__)
|
||||
#define RUY_ARM_32
|
||||
#endif
|
||||
|
||||
// Detect NEON.
|
||||
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
|
||||
#define RUY_NEON
|
||||
#endif
|
||||
|
||||
// Define 32bit ARM NEON and 64 bit ARM NEON
|
||||
#if defined(RUY_NEON) && defined(RUY_ARM_32)
|
||||
#define RUY_NEON_32
|
||||
#endif
|
||||
|
||||
#if defined(RUY_NEON) && defined(RUY_ARM_64)
|
||||
#define RUY_NEON_64
|
||||
#endif
|
||||
|
||||
namespace ruy {
|
||||
|
||||
// A Path is a choice of implementation path, e.g. between reference code
|
||||
@ -119,21 +99,21 @@ inline Path GetMostSignificantPath(Path path_mask) {
|
||||
// ruy::kAllPaths represents all Path's that make sense to on a given
|
||||
// base architecture.
|
||||
#ifdef __linux__
|
||||
#ifdef RUY_NEON_64
|
||||
#if RUY_PLATFORM(NEON_64)
|
||||
constexpr Path kAllPaths =
|
||||
Path::kReference | Path::kStandardCpp | Path::kNeon | Path::kNeonDotprod;
|
||||
#elif defined RUY_NEON_32
|
||||
#elif RUY_PLATFORM(NEON_32)
|
||||
constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp | Path::kNeon;
|
||||
#else
|
||||
constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp;
|
||||
#endif // RUY_NEON_64
|
||||
#endif
|
||||
#else // __linux__
|
||||
// We don't know how to do runtime dotprod detection outside of linux for now.
|
||||
#if defined(RUY_NEON_64) || defined(RUY_NEON_32)
|
||||
#if RUY_PLATFORM(NEON)
|
||||
constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp | Path::kNeon;
|
||||
#else
|
||||
constexpr Path kAllPaths = Path::kReference | Path::kStandardCpp;
|
||||
#endif // defined(RUY_NEON_64) || defined(RUY_NEON_32)
|
||||
#endif
|
||||
#endif // __linux__
|
||||
|
||||
} // namespace ruy
|
||||
|
52
tensorflow/lite/experimental/ruy/platform.h
Normal file
52
tensorflow/lite/experimental/ruy/platform.h
Normal file
@ -0,0 +1,52 @@
|
||||
/* Copyright 2019 Google LLC. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_LITE_EXPERIMENTAL_RUY_PLATFORM_H_
|
||||
#define TENSORFLOW_LITE_EXPERIMENTAL_RUY_PLATFORM_H_
|
||||
|
||||
#define RUY_PLATFORM(X) ((RUY_DONOTUSEDIRECTLY_##X) != 0)
|
||||
|
||||
// Detect ARM 32-bit
|
||||
#ifdef __arm__
|
||||
#define RUY_DONOTUSEDIRECTLY_ARM_32 1
|
||||
#else
|
||||
#define RUY_DONOTUSEDIRECTLY_ARM_32 0
|
||||
#endif
|
||||
|
||||
// Detect ARM 64-bit
|
||||
#ifdef __aarch64__
|
||||
#define RUY_DONOTUSEDIRECTLY_ARM_64 1
|
||||
#else
|
||||
#define RUY_DONOTUSEDIRECTLY_ARM_64 0
|
||||
#endif
|
||||
|
||||
// Detect NEON
|
||||
#if (defined __ARM_NEON) || (defined __ARM_NEON__)
|
||||
#define RUY_DONOTUSEDIRECTLY_NEON 1
|
||||
#else
|
||||
#define RUY_DONOTUSEDIRECTLY_NEON 0
|
||||
#endif
|
||||
|
||||
// Define ARM 32-bit NEON
|
||||
#define RUY_DONOTUSEDIRECTLY_NEON_32 \
|
||||
(RUY_DONOTUSEDIRECTLY_NEON && RUY_DONOTUSEDIRECTLY_ARM_32)
|
||||
|
||||
// Define ARM 64-bit NEON
|
||||
// Note: NEON is implied by ARM64, so this define is redundant.
|
||||
// It still allows some conveyance of intent.
|
||||
#define RUY_DONOTUSEDIRECTLY_NEON_64 \
|
||||
(RUY_DONOTUSEDIRECTLY_NEON && RUY_DONOTUSEDIRECTLY_ARM_64)
|
||||
|
||||
#endif // TENSORFLOW_LITE_EXPERIMENTAL_RUY_PLATFORM_H_
|
@ -29,6 +29,7 @@ limitations under the License.
|
||||
#include <vector>
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "tensorflow/lite/experimental/ruy/platform.h"
|
||||
#include "tensorflow/lite/experimental/ruy/pmu.h"
|
||||
#include "tensorflow/lite/experimental/ruy/ruy.h"
|
||||
#include "tensorflow/lite/experimental/ruy/ruy_advanced.h"
|
||||
@ -1651,7 +1652,7 @@ void TestSet<LhsScalar, RhsScalar, SpecType>::MakeResultPaths() {
|
||||
}
|
||||
// We link against a generic BLAS target that only maps to OpenBLAS on specific
|
||||
// architectures.
|
||||
#if defined RUY_ARM_64 || defined RUY_ARM_32
|
||||
#if RUY_PLATFORM(ARM_32) || RUY_PLATFORM(ARM_64)
|
||||
// OpenBLAS multi-threading is disabled, so avoid mixing single-threaded
|
||||
// and multi-threaded benchmark results.
|
||||
if (max_num_threads == 1) {
|
||||
|
@ -19,11 +19,12 @@ limitations under the License.
|
||||
#include <cstdint>
|
||||
|
||||
#include "tensorflow/lite/experimental/ruy/opt_set.h"
|
||||
#include "tensorflow/lite/experimental/ruy/platform.h"
|
||||
#include "tensorflow/lite/experimental/ruy/time.h"
|
||||
|
||||
namespace ruy {
|
||||
|
||||
#ifdef RUY_NEON_64
|
||||
#if RUY_PLATFORM(NEON_64)
|
||||
|
||||
namespace {
|
||||
|
||||
@ -130,7 +131,7 @@ Tuning TuningResolver::ResolveNow() {
|
||||
return is_probably_inorder ? Tuning::kInOrder : Tuning::kOutOfOrder;
|
||||
}
|
||||
|
||||
#else // not defined RUY_NEON_64
|
||||
#else // not RUY_PLATFORM(NEON_64)
|
||||
|
||||
float TuningResolver::EvalRatio() { return 0; }
|
||||
float TuningResolver::ThresholdRatio() { return 0; }
|
||||
|
Loading…
x
Reference in New Issue
Block a user