Checking for defined(USE_NEON) && defined(__aarch64__) is redundant: A64 always has NEON support.
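On AArch64 targets, Advanced SIMD (NEON) is part of the baseline toolchain configuration: GCC and Clang predefine __ARM_NEON for A64 unless the FP/SIMD unit is explicitly disabled (e.g. with -mgeneral-regs-only), and TFLite's USE_NEON is in effect derived from __ARM_NEON. A minimal sketch of why the combined guard cannot behave differently from the plain one, assuming a GCC/Clang-style compiler:

// Sketch: this #error should be unreachable on any ordinary A64 build,
// because __aarch64__ implies that NEON intrinsics are available.
#if defined(__aarch64__) && !defined(__ARM_NEON)
#error "A64 target without NEON: not expected with a standard toolchain."
#endif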

PiperOrigin-RevId: 254442846
Authored by A. Unique TensorFlower on 2019-06-21 12:01:55 -07:00; committed by TensorFlower Gardener
parent 4793f74ef5
commit f68778cb77
3 changed files with 12 additions and 29 deletions


@@ -505,7 +505,7 @@ struct KernelMacroBlock {
// implementation rather than conforming to style.
};
-#if defined(USE_NEON) && defined(__aarch64__)
+#if defined(__aarch64__)
// Experiments suggest that a modest performance improvement is seen, at least
// on 855 chipset big cores, with cache hints.
template <typename T>
@@ -532,7 +532,7 @@ inline void PreloadInputBlock(
row_ptr += input_height_stride;
}
}
-#endif // USE_NEON &&__aarch64__
+#endif // __aarch64__
} // namespace depthwise_conv
} // namespace optimized_ops
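The PreloadInputBlock above walks the input block row by row and issues cache hints ahead of the compute kernel. As a hedged illustration of the same idea (not the TFLite implementation: the function name, the 64-byte line size, and the use of the GCC/Clang __builtin_prefetch builtin, which lowers to PRFM on AArch64, are all assumptions here):

// Illustrative only: touch each cache line of an input block with a
// read prefetch so the data is resident before the kernel consumes it.
template <typename T>
inline void PreloadBlockSketch(const T* block, int rows, int row_bytes,
                               int height_stride_bytes) {
  const char* row_ptr = reinterpret_cast<const char*>(block);
  for (int r = 0; r < rows; ++r) {
    for (int b = 0; b < row_bytes; b += 64) {  // assumed 64-byte lines
      __builtin_prefetch(row_ptr + b, /*rw=*/0, /*locality=*/3);
    }
    row_ptr += height_stride_bytes;
  }
}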


@@ -27,7 +27,14 @@ namespace tflite {
namespace optimized_ops {
namespace depthwise_conv {
#ifdef USE_NEON
+#define STR(s) STR_UNEXPANDED(s)
+#define STR_UNEXPANDED(s) #s
+// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
+// Jetson TX-2. This compiler does not support the offsetof() macro.
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+#include <stddef.h>
// Lane operations are for clarity and convenience. We want to load and store
// 4 8-bit lanes together. So these are treated much like 32-bit loads and
// 32-bit stores. Stores require 32-bit alignment.
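A hedged, self-contained illustration of that four-8-bit-lanes-as-one-32-bit-lane trick, mirroring the vld1q_lane_s8x4 macro defined in the next hunk (the function name is invented; assumes <arm_neon.h> on an AArch64 target and 4-byte alignment of src):

#include <arm_neon.h>
#include <cstdint>

// Load 4 consecutive int8 values into the low 4 bytes of a 16-byte NEON
// register with a single 32-bit lane load.
inline int8x16_t LoadFourInt8Lanes(const int8_t* src, int8x16_t reg) {
  int32x4_t reg32 = vreinterpretq_s32_s8(reg);
  reg32 = vld1q_lane_s32(reinterpret_cast<const int32_t*>(src), reg32, 0);
  return vreinterpretq_s8_s32(reg32);
}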
@@ -53,14 +60,6 @@ namespace depthwise_conv {
vld1q_lane_s32(reinterpret_cast<const int32*>(src), reg, lane_num)
#define vld1q_dup_s8x4(src) vld1q_dup_s32(reinterpret_cast<const int32*>(src))
-#define STR(s) STR_UNEXPANDED(s)
-#define STR_UNEXPANDED(s) #s
-// Enable for arm64 except for the Nvidia Linux 4 Tegra (L4T) running on
-// Jetson TX-2. This compiler does not support the offsetof() macro.
-#if defined(__aarch64__) && !defined(GOOGLE_L4T)
-#include <stddef.h>
// Represents the number of bytes offset from the start of the
// DepthwiseConvParams struct. This is used in the asm to load parameters.
// Keep these values in sync with the static_asserts below.
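The offset constants, the static_asserts, and the STR/STR_UNEXPANDED pair work together: the asm hard-codes byte offsets into DepthwiseConvParams, the asserts keep those constants honest, and the two-level macro stringizes an offset's value into the instruction text. A minimal self-contained sketch, with a hypothetical two-field struct standing in for DepthwiseConvParams (all EXAMPLE_-prefixed names are invented):

#include <cstddef>  // offsetof
#include <cstdint>

// Hypothetical stand-in for DepthwiseConvParams.
struct ExampleParams {
  int64_t input_depth;
  int64_t input_row_size;
};

#define EXAMPLE_STR(s) EXAMPLE_STR_UNEXPANDED(s)
#define EXAMPLE_STR_UNEXPANDED(s) #s

// Byte offsets the asm would hard-code; the asserts break the build if
// the struct layout ever drifts out of sync with them.
#define EXAMPLE_OFFSET_INPUT_DEPTH 0
#define EXAMPLE_OFFSET_INPUT_ROW_SIZE 8
static_assert(offsetof(ExampleParams, input_depth) ==
                  EXAMPLE_OFFSET_INPUT_DEPTH, "");
static_assert(offsetof(ExampleParams, input_row_size) ==
                  EXAMPLE_OFFSET_INPUT_ROW_SIZE, "");

// Two-level expansion stringizes the value, not the macro name:
// EXAMPLE_STR(EXAMPLE_OFFSET_INPUT_ROW_SIZE) yields "8", so inline asm
// can embed it, e.g.
//   "ldr x0, [%[params], #" EXAMPLE_STR(EXAMPLE_OFFSET_INPUT_ROW_SIZE) "]\n"
// A single-level #s would produce the macro's name instead of "8".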
@@ -135,11 +134,7 @@ static_assert(offsetof(DepthwiseConvParams, output_width) ==
static_assert(offsetof(DepthwiseConvParams, output_height) ==
OFFSET_OUTPUT_HEIGHT,
"");
-#endif // __aarch64__
-#endif // ARM NEON
-#ifdef USE_NEON
-#if defined(__aarch64__) && !defined(GOOGLE_L4T)
// Dot product ops hard-coded
// Represents the number of bytes offset from the start of the
@@ -5757,15 +5752,13 @@ inline void DepthwiseConv3x3Filter(
}
#endif // __aarch64__
-#endif
// Perform any necessary cache hinting and pre-writing.
template <DepthwiseConvImplementation implementation>
struct WorkspacePrefetchWrite {
static inline void Run(int8 fill_data, int size, int8* workspace) {}
};
-#if defined(USE_NEON) && defined(__aarch64__)
+#if defined(__aarch64__)
// Encourage the processor to keep the workspace in cache. Both the cache hint
// and some memory writes are required.
//
@@ -5791,7 +5784,7 @@ struct WorkspacePrefetchWrite<
}
};
-#endif // USE_NEON &&__aarch64__
+#endif // __aarch64__
#if defined(__aarch64__) && !defined(GOOGLE_L4T)
// Dot product ops hard-coded
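The WorkspacePrefetchWrite specialization above pairs a cache hint with actual stores, since a hint alone does not guarantee the lines are allocated. A hedged sketch of the idea (illustrative only: the function name and 64-byte line size are assumptions, and __builtin_prefetch with rw=1 lowers to a prefetch-for-store hint on AArch64):

#include <cstdint>

// Touch each cache line of the workspace: hint it for writing, then
// store one byte so the line is resident before the kernel fills it.
inline void PrefetchWriteWorkspaceSketch(int8_t fill_data, int size,
                                         int8_t* workspace) {
  for (int i = 0; i < size; i += 64) {
    __builtin_prefetch(workspace + i, /*rw=*/1, /*locality=*/3);
    workspace[i] = fill_data;  // the write makes the hint stick
  }
}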


@@ -26,8 +26,6 @@ namespace tflite {
namespace optimized_ops {
namespace depthwise_conv {
-#ifdef USE_NEON
#define STR(s) STR_UNEXPANDED(s)
#define STR_UNEXPANDED(s) #s
@@ -106,12 +104,6 @@ static_assert(offsetof(DepthwiseConvParams, output_width) ==
static_assert(offsetof(DepthwiseConvParams, output_height) ==
OFFSET_OUTPUT_HEIGHT,
"");
-#endif // __aarch64__
-#endif // ARM NEON
-#ifdef USE_NEON
-#if defined(__aarch64__) && !defined(GOOGLE_L4T)
template <>
struct DepthwiseConvWindowPerChannel<DepthwiseConvOutputRounding::kUpward, 8, 1,
@@ -3080,8 +3072,6 @@ inline void DepthwiseConv3x3FilterPerChannel(
}
#endif // __aarch64__
-#endif
#undef STR
#undef STR_UNEXPANDED
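As a closing note on the dispatch pattern visible in DepthwiseConvWindowPerChannel above: a generic primary template supplies the portable path, and full specializations on the non-type and enum parameters select the hand-written kernels at compile time. A reduced, hypothetical sketch (names and parameters invented):

#include <cstdint>

enum class Rounding { kAwayFromZero, kUpward };

// Primary template: portable fallback.
template <Rounding rounding, int kDepth, int kStrideWidth>
struct ExampleWindow {
  static void Run(const int8_t* input, int8_t* output) {
    // ... plain C++ reference implementation ...
  }
};

// Full specialization: chosen automatically when the template arguments
// match the tuned configuration (here depth 8, stride 1, upward rounding).
template <>
struct ExampleWindow<Rounding::kUpward, 8, 1> {
  static void Run(const int8_t* input, int8_t* output) {
    // ... NEON / inline-asm fast path ...
  }
};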