No longer depend directly on the general gemmlowp library,
outside of cpu_backend_gemm and the legacy code. Still depend on small specialist sub-libraries gemmlowp:fixedpoint and gemmlowp:profiler. PiperOrigin-RevId: 247047844
This commit is contained in:
parent
2f345d145e
commit
60ac1b8adf
@ -196,7 +196,8 @@ cc_library(
|
||||
":tensor",
|
||||
":tensor_utils",
|
||||
"//third_party/eigen3",
|
||||
"@gemmlowp",
|
||||
"@gemmlowp//:fixedpoint",
|
||||
"@gemmlowp//:profiler",
|
||||
"//tensorflow/lite/c:c_api_internal",
|
||||
"//tensorflow/lite/kernels:cpu_backend_context",
|
||||
"//tensorflow/lite/kernels:cpu_backend_threadpool",
|
||||
@ -359,7 +360,8 @@ cc_library(
|
||||
":strided_slice_logic",
|
||||
":tensor",
|
||||
":types",
|
||||
"@gemmlowp",
|
||||
"@gemmlowp//:fixedpoint",
|
||||
"@gemmlowp//:profiler",
|
||||
"//tensorflow/lite/c:c_api_internal",
|
||||
"//tensorflow/lite/kernels:op_macros",
|
||||
] + select({
|
||||
@ -479,7 +481,8 @@ cc_library(
|
||||
"//tensorflow/lite/kernels:activation_functor",
|
||||
"//tensorflow/lite/kernels:op_macros",
|
||||
"@arm_neon_2_x86_sse",
|
||||
"@gemmlowp",
|
||||
"@gemmlowp//:fixedpoint",
|
||||
"@gemmlowp//:profiler",
|
||||
],
|
||||
)
|
||||
|
||||
@ -535,7 +538,7 @@ cc_library(
|
||||
"//tensorflow/lite/c:c_api_internal",
|
||||
"@arm_neon_2_x86_sse",
|
||||
"//tensorflow/lite/kernels:op_macros",
|
||||
"@gemmlowp",
|
||||
"@gemmlowp//:fixedpoint",
|
||||
] + select({
|
||||
":aarch64": [
|
||||
":neon_tensor_utils",
|
||||
@ -642,7 +645,6 @@ cc_test(
|
||||
":types",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_googletest//:gtest_main",
|
||||
"@gemmlowp",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -547,6 +547,121 @@ inline void NdArrayDescsForElementwiseBroadcast(
|
||||
}
|
||||
}
|
||||
|
||||
// Copied from gemmlowp::RoundDown when we dropped direct dependency on
// gemmlowp.
//
// Returns the runtime argument rounded down (towards negative infinity) to
// the nearest multiple of the fixed Modulus.
//
// Modulus is an `unsigned` template argument while `Integer` is usually a
// signed type. Mixing the two directly (`i % Modulus`) would convert a
// negative `i` to unsigned first and yield a value that is not a multiple of
// Modulus unless Modulus is a power of two. To avoid that, the arithmetic is
// done in `Wide`, the usual-arithmetic-conversion result of `Integer` and
// `long long`, which is signed whenever `Integer` fits in `long long`.
//
// Overflow of the result (e.g. rounding the most negative value down) is not
// checked.
template <unsigned Modulus, typename Integer>
Integer RoundDown(Integer i) {
  static_assert(Modulus > 0, "RoundDown requires a non-zero Modulus");
  using Wide = decltype(i + 0LL);
  const Wide modulus = static_cast<Wide>(Modulus);
  const Wide value = static_cast<Wide>(i);
  // C++ '%' truncates towards zero, so a negative dividend gives a
  // non-positive remainder; shift it into [0, modulus) to get floor
  // semantics.
  Wide remainder = value % modulus;
  if (remainder < 0) {
    remainder += modulus;
  }
  return static_cast<Integer>(value - remainder);
}

// Copied from gemmlowp::RoundUp when we dropped direct dependency on
// gemmlowp.
//
// Returns the runtime argument rounded up (towards positive infinity) to the
// nearest multiple of the fixed Modulus.
//
// The `Modulus - 1` bump is added in the same widened type that RoundDown
// uses, so a signed argument is never silently converted to `unsigned` before
// rounding. Overflow of `i + Modulus - 1` is not checked.
template <unsigned Modulus, typename Integer>
Integer RoundUp(Integer i) {
  static_assert(Modulus > 0, "RoundUp requires a non-zero Modulus");
  using Wide = decltype(i + 0LL);
  const Wide bumped = static_cast<Wide>(i) + static_cast<Wide>(Modulus - 1);
  return static_cast<Integer>(RoundDown<Modulus>(bumped));
}
|
||||
|
||||
// Copied from gemmlowp::CeilQuotient when we dropped direct dependency on
// gemmlowp.
//
// Returns the quotient a / b rounded up ('ceil') to the nearest integer.
//
// Implemented with the truncated quotient and remainder instead of the
// classic `(a + b - 1) / b`, because that sum overflows for large `a` and
// gives the wrong answer for negative dividends. The truncated quotient is
// bumped by one only when the division is inexact and the exact quotient is
// positive (i.e. the remainder and the divisor have the same sign).
//
// `b` must be non-zero; dividing by zero is undefined, as it was before.
template <typename Integer>
Integer CeilQuotient(Integer a, Integer b) {
  const Integer quotient = static_cast<Integer>(a / b);
  const Integer remainder = static_cast<Integer>(a % b);
  const bool inexact_and_positive =
      remainder != 0 && ((remainder > 0) == (b > 0));
  return inexact_and_positive ? static_cast<Integer>(quotient + 1) : quotient;
}
|
||||
|
||||
// This function is a copy of gemmlowp::HowManyThreads, copied when we dropped
// the direct dependency of internal/optimized/ on gemmlowp.
//
// Chooses how many threads to use for a workload described by `rows`, `cols`
// and `depth`, given that at most `max_num_threads` may be used and that the
// kernel consumes `KernelRows` rows at a time.
//
// The returned value is always at least 1 and never exceeds
// `max_num_threads` when that limit is positive. A non-positive limit, or any
// non-positive dimension, yields 1 so that callers can safely divide by the
// result.
//
// TODO(b/131910176): get rid of this function by switching each call site
// to its own more sensible logic for its own workload.
template <int KernelRows>
inline int LegacyHowManyThreads(int max_num_threads, int rows, int cols,
                                int depth) {
  // Early-exit in the default case where multi-threading is disabled. A
  // zero or negative limit is treated the same way instead of being
  // propagated as a non-positive thread count.
  if (max_num_threads <= 1) {
    return 1;
  }

  // An empty (or malformed) problem cannot keep more than one thread busy.
  if (rows <= 0 || cols <= 0 || depth <= 0) {
    return 1;
  }

  // Basic calculation: take into account max pool size, and
  // how many rows we have to feed our kernel.
  // The motivation for an absolute minimum number of rows per thread,
  // potentially higher than KernelRows, is that very thin thread workloads
  // currently defeat assumptions of the AddMod generator, resulting
  // in substantial bias in TestWithRealData on 24 threads.
  // Ideally, the AddMod generator should be aware of global (r,c) coordinates
  // so as to be independent of the number of threads.
  constexpr int kAbsoluteMinRowsPerThread = 16;
  constexpr int kMinRowsPerThread = KernelRows > kAbsoluteMinRowsPerThread
                                        ? KernelRows
                                        : kAbsoluteMinRowsPerThread;
  // ceil(rows / kMinRowsPerThread), written so that it cannot overflow.
  const int row_chunks = rows / kMinRowsPerThread +
                         (rows % kMinRowsPerThread != 0 ? 1 : 0);
  int thread_count = std::min(max_num_threads, row_chunks);

  // At this point for small products we already have thread_count==1 so
  // we can avoid doing more work; otherwise, we still want to check
  // that the cubic size (rows*cols*depth) is big enough to keep
  // the workers busy.
  if (thread_count > 1) {
    // Empirically determined value.
    constexpr std::uint64_t kMinCubicSizePerThread = 64 * 1024;
    constexpr std::uint64_t kUint64Max = ~std::uint64_t{0};

    // Two int-sized factors always fit in 64 bits; the third multiplication
    // may not, so saturate instead of wrapping around.
    const std::uint64_t plane_size =
        static_cast<std::uint64_t>(rows) * static_cast<std::uint64_t>(cols);
    const std::uint64_t depth_size = static_cast<std::uint64_t>(depth);
    const std::uint64_t cubic_size = plane_size > kUint64Max / depth_size
                                         ? kUint64Max
                                         : plane_size * depth_size;

    // Compare in 64 bits: narrowing the quotient to int first would truncate
    // large values and could wrongly force a huge workload onto one thread.
    const std::uint64_t size_limit = cubic_size / kMinCubicSizePerThread;
    if (size_limit < static_cast<std::uint64_t>(thread_count)) {
      thread_count = static_cast<int>(size_limit);
    }

    if (thread_count < 1) {
      thread_count = 1;
    }
  }

  assert(thread_count > 0 && thread_count <= max_num_threads);
  return thread_count;
}
|
||||
|
||||
// Issues a prefetch hint for the memory at `ptr`; produces no value.
//  - On __aarch64__ builds: emits a `prfm pldl1strm` instruction.
//  - Otherwise, on GCC-compatible compilers: calls __builtin_prefetch(ptr).
//  - On any other compiler: does nothing (the argument is only marked used).
template <typename T>
|
||||
void optimized_ops_preload_l1_stream(const T* ptr) {
|
||||
#ifdef __aarch64__
|
||||
// Aarch64 has very detailed prefetch instructions, that compilers
|
||||
// can't know how to map __builtin_prefetch to, and as a result, don't,
|
||||
// leaving __builtin_prefetch a no-op on this architecture.
|
||||
// This variant uses "pldl1strm" (not "pldl1keep"), meaning: "prefetch for
|
||||
// load, into L1 cache, streaming", i.e. the data is expected to be used once.
|
||||
asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
|
||||
#elif defined __GNUC__
|
||||
// builtin offered by GCC-compatible compilers including clang
|
||||
__builtin_prefetch(ptr);
|
||||
#else
|
||||
// No known prefetch facility: reference ptr so the parameter is not unused.
(void)ptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
// Issues a prefetch hint for the memory at `ptr`; produces no value.
//  - On __aarch64__ builds: emits a `prfm pldl1keep` instruction.
//  - Otherwise, on GCC-compatible compilers: calls __builtin_prefetch(ptr).
//  - On any other compiler: does nothing (the argument is only marked used).
template <typename T>
|
||||
void optimized_ops_preload_l1_keep(const T* ptr) {
|
||||
#ifdef __aarch64__
|
||||
// Aarch64 has very detailed prefetch instructions, that compilers
|
||||
// can't know how to map __builtin_prefetch to, and as a result, don't,
|
||||
// leaving __builtin_prefetch a no-op on this architecture.
|
||||
// For our purposes, "pldl1keep" is usually what we want, meaning:
|
||||
// "prefetch for load, into L1 cache, using each value multiple times".
|
||||
asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
|
||||
#elif defined __GNUC__
|
||||
// builtin offered by GCC-compatible compilers including clang
|
||||
__builtin_prefetch(ptr);
|
||||
#else
|
||||
// No known prefetch facility: reference ptr so the parameter is not unused.
(void)ptr;
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace tflite
|
||||
|
||||
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_COMMON_H_
|
||||
|
@ -15,7 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_3X3_FILTER_COMMON_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_3X3_FILTER_COMMON_H_
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
@ -15,7 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_FLOAT_H_
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
|
@ -15,8 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_UINT8_H_
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_context.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
@ -17,8 +17,7 @@ limitations under the License.
|
||||
|
||||
#include <memory>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_3x3_filter_common.h"
|
||||
#include "tensorflow/lite/kernels/internal/reference/depthwiseconv_uint8.h"
|
||||
|
@ -24,7 +24,6 @@ limitations under the License.
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/compatibility.h"
|
||||
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
|
||||
|
@ -15,7 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_IM2COL_UTILS_H_
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
namespace tflite {
|
||||
|
@ -15,7 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_ADD_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_ADD_H_
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
|
@ -15,6 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_CONV_H_
|
||||
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_context.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
|
||||
|
@ -15,8 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_DEPTHWISE_CONV_H_
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_context.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
@ -15,7 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_FULLY_CONNECTED_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_FULLY_CONNECTED_H_
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_context.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
|
||||
@ -25,22 +25,6 @@ limitations under the License.
|
||||
namespace tflite {
|
||||
namespace optimized_integer_ops {
|
||||
|
||||
// int8_t-only prefetch helper. When GEMMLOWP_ARM_64 is defined it emits a
// `prfm pldl1strm` instruction for `ptr`; otherwise it forwards to
// gemmlowp::Prefetch (defined outside this view).
inline void optimized_ops_preload_l1_stream(const int8_t* ptr) {
|
||||
#ifdef GEMMLOWP_ARM_64
|
||||
asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
|
||||
#else
|
||||
gemmlowp::Prefetch(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
// int8_t-only prefetch helper. When GEMMLOWP_ARM_64 is defined it emits a
// `prfm pldl1keep` instruction for `ptr`; otherwise it forwards to
// gemmlowp::Prefetch (defined outside this view).
inline void optimized_ops_preload_l1_keep(const int8_t* ptr) {
|
||||
#ifdef GEMMLOWP_ARM_64
|
||||
asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
|
||||
#else
|
||||
gemmlowp::Prefetch(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef USE_NEON
|
||||
inline void FullyConnectedAsGEMVWorkerImpl(
|
||||
const RuntimeShape& input_shape, const int8_t* input_data,
|
||||
@ -328,7 +312,7 @@ inline void FullyConnectedAsGEMV(
|
||||
const int output_rows = output_shape.Dims(output_dim_count - 1);
|
||||
const int input_size = FlatSizeSkipDim(input_shape, 0);
|
||||
static constexpr int kKernelRows = 4;
|
||||
const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
|
||||
const int thread_count = LegacyHowManyThreads<kKernelRows>(
|
||||
cpu_backend_context->max_num_threads(), output_rows, batches, input_size);
|
||||
if (thread_count == 1) {
|
||||
// Single-thread case: do the computation on the current thread, don't
|
||||
@ -347,8 +331,8 @@ inline void FullyConnectedAsGEMV(
|
||||
// TODO(b/131746020) don't create new heap allocations every time.
|
||||
// At least we make it a single heap allocation by using reserve().
|
||||
tasks.reserve(thread_count);
|
||||
const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>(
|
||||
gemmlowp::CeilQuotient(output_rows, thread_count));
|
||||
const int kRowsPerWorker =
|
||||
RoundUp<kKernelRows>(CeilQuotient(output_rows, thread_count));
|
||||
int row_start = 0;
|
||||
for (int i = 0; i < thread_count; ++i) {
|
||||
int row_end = std::min(output_rows, row_start + kRowsPerWorker);
|
||||
|
@ -15,7 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MUL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_MUL_H_
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
|
@ -18,6 +18,7 @@ limitations under the License.
|
||||
#include <assert.h>
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
@ -27,7 +28,7 @@ limitations under the License.
|
||||
#include <type_traits>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
|
@ -15,7 +15,8 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_SOFTMAX_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_INTEGER_OPS_SOFTMAX_H_
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
|
@ -18,6 +18,7 @@ limitations under the License.
|
||||
#include <stdint.h>
|
||||
#include <sys/types.h>
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
|
||||
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
|
||||
|
@ -27,9 +27,6 @@ limitations under the License.
|
||||
#include <tuple>
|
||||
#include <type_traits>
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
|
||||
|
||||
#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
|
||||
#include <Accelerate/Accelerate.h>
|
||||
#endif
|
||||
@ -37,9 +34,11 @@ limitations under the License.
|
||||
#include "third_party/eigen3/Eigen/Core"
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/c/c_api_internal.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_context.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_gemm.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_gemm_params.h"
|
||||
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/optimized/im2col_utils.h"
|
||||
@ -280,22 +279,6 @@ void Gemm(const Eigen::MatrixBase<Lhs>& lhs, const Eigen::MatrixBase<Rhs>& rhs,
|
||||
}
|
||||
}
|
||||
|
||||
// uint8-only prefetch helper. When GEMMLOWP_ARM_64 is defined it emits a
// `prfm pldl1strm` instruction for `ptr`; otherwise it forwards to
// gemmlowp::Prefetch (defined outside this view).
inline void optimized_ops_preload_l1_stream(const uint8* ptr) {
|
||||
#ifdef GEMMLOWP_ARM_64
|
||||
asm volatile("prfm pldl1strm, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
|
||||
#else
|
||||
gemmlowp::Prefetch(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
// uint8-only prefetch helper. When GEMMLOWP_ARM_64 is defined it emits a
// `prfm pldl1keep` instruction for `ptr`; otherwise it forwards to
// gemmlowp::Prefetch (defined outside this view).
inline void optimized_ops_preload_l1_keep(const uint8* ptr) {
|
||||
#ifdef GEMMLOWP_ARM_64
|
||||
asm volatile("prfm pldl1keep, [%[ptr]]\n" ::[ptr] "r"(ptr) :);
|
||||
#else
|
||||
gemmlowp::Prefetch(ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef GEMMLOWP_NEON
|
||||
// In the common case of batch size 1, a fully-connected node degenerates
|
||||
// to a matrix*vector product. LSTM cells contain a fully-connected node;
|
||||
@ -1112,7 +1095,7 @@ inline void FullyConnectedAsGEMV(
|
||||
const int output_rows = output_shape.Dims(output_dim_count - 1);
|
||||
const int input_size = FlatSizeSkipDim(input_shape, 0);
|
||||
static constexpr int kKernelRows = 4;
|
||||
const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
|
||||
const int thread_count = LegacyHowManyThreads<kKernelRows>(
|
||||
cpu_backend_context->max_num_threads(), output_rows, batches, input_size);
|
||||
if (thread_count == 1) {
|
||||
// Single-thread case: do the computation on the current thread, don't
|
||||
@ -1131,8 +1114,8 @@ inline void FullyConnectedAsGEMV(
|
||||
// TODO(b/131746020) don't create new heap allocations every time.
|
||||
// At least we make it a single heap allocation by using reserve().
|
||||
tasks.reserve(thread_count);
|
||||
const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>(
|
||||
gemmlowp::CeilQuotient(output_rows, thread_count));
|
||||
const int kRowsPerWorker =
|
||||
RoundUp<kKernelRows>(CeilQuotient(output_rows, thread_count));
|
||||
int row_start = 0;
|
||||
for (int i = 0; i < thread_count; ++i) {
|
||||
int row_end = std::min(output_rows, row_start + kRowsPerWorker);
|
||||
@ -1714,9 +1697,9 @@ inline void ShuffledFullyConnected(
|
||||
}
|
||||
|
||||
static constexpr int kKernelRows = 4;
|
||||
const int thread_count = gemmlowp::HowManyThreads<kKernelRows>(
|
||||
cpu_backend_context->max_num_threads(), output_depth, batches,
|
||||
accum_depth);
|
||||
const int thread_count =
|
||||
LegacyHowManyThreads<kKernelRows>(cpu_backend_context->max_num_threads(),
|
||||
output_depth, batches, accum_depth);
|
||||
if (thread_count == 1) {
|
||||
// Single-thread case: do the computation on the current thread, don't
|
||||
// use a threadpool
|
||||
@ -1733,8 +1716,8 @@ inline void ShuffledFullyConnected(
|
||||
// TODO(b/131746020) don't create new heap allocations every time.
|
||||
// At least we make it a single heap allocation by using reserve().
|
||||
tasks.reserve(thread_count);
|
||||
const int kRowsPerWorker = gemmlowp::RoundUp<kKernelRows>(
|
||||
gemmlowp::CeilQuotient(output_depth, thread_count));
|
||||
const int kRowsPerWorker =
|
||||
RoundUp<kKernelRows>(CeilQuotient(output_depth, thread_count));
|
||||
int row_start = 0;
|
||||
for (int i = 0; i < thread_count; i++) {
|
||||
int row_end = std::min(output_depth, row_start + kRowsPerWorker);
|
||||
|
@ -15,7 +15,6 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_FULLY_CONNECTED_H_
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/round.h"
|
||||
|
@ -16,7 +16,8 @@ limitations under the License.
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_ADD_H_
|
||||
|
||||
#include <limits>
|
||||
#include "public/gemmlowp.h"
|
||||
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/types.h"
|
||||
|
||||
|
@ -15,7 +15,7 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEPTHWISE_CONV_H_
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
|
@ -15,7 +15,8 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_MUL_H_
|
||||
|
||||
#include "public/gemmlowp.h"
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
|
@ -16,6 +16,8 @@ limitations under the License.
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_TANH_H_
|
||||
|
||||
#include <limits>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
|
||||
namespace tflite {
|
||||
|
@ -15,7 +15,6 @@ limitations under the License.
|
||||
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
|
||||
#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_POOLING_H_
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
#include "tensorflow/lite/kernels/internal/round.h"
|
||||
|
@ -27,7 +27,7 @@ limitations under the License.
|
||||
#include <type_traits>
|
||||
|
||||
#include "fixedpoint/fixedpoint.h"
|
||||
#include "public/gemmlowp.h"
|
||||
#include "profiling/instrumentation.h"
|
||||
#include "tensorflow/lite/c/c_api_internal.h"
|
||||
#include "tensorflow/lite/kernels/internal/common.h"
|
||||
#include "tensorflow/lite/kernels/internal/quantization_util.h"
|
||||
@ -1914,23 +1914,25 @@ inline void LstmCell(
|
||||
// aiming for 16-bit fixed-point quantization of these internal nodes here.
|
||||
//
|
||||
template <int StateIntegerBits>
|
||||
inline void LstmCell(
|
||||
const LstmCellParams& params, const RuntimeShape& unextended_input_shape,
|
||||
const uint8* input_data_uint8,
|
||||
const RuntimeShape& unextended_prev_activ_shape,
|
||||
const uint8* prev_activ_data_uint8, const RuntimeShape& weights_shape,
|
||||
const uint8* weights_data_uint8, const RuntimeShape& unextended_bias_shape,
|
||||
const int32* bias_data_int32,
|
||||
const RuntimeShape& unextended_prev_state_shape,
|
||||
const int16* prev_state_data_int16,
|
||||
const RuntimeShape& unextended_output_state_shape,
|
||||
int16* output_state_data_int16,
|
||||
const RuntimeShape& unextended_output_activ_shape,
|
||||
uint8* output_activ_data_uint8,
|
||||
const RuntimeShape& unextended_concat_temp_shape,
|
||||
uint8* concat_temp_data_uint8,
|
||||
const RuntimeShape& unextended_activ_temp_shape,
|
||||
int16* activ_temp_data_int16, gemmlowp::GemmContext* gemmlowp_context) {
|
||||
inline void LstmCell(const LstmCellParams& params,
|
||||
const RuntimeShape& unextended_input_shape,
|
||||
const uint8* input_data_uint8,
|
||||
const RuntimeShape& unextended_prev_activ_shape,
|
||||
const uint8* prev_activ_data_uint8,
|
||||
const RuntimeShape& weights_shape,
|
||||
const uint8* weights_data_uint8,
|
||||
const RuntimeShape& unextended_bias_shape,
|
||||
const int32* bias_data_int32,
|
||||
const RuntimeShape& unextended_prev_state_shape,
|
||||
const int16* prev_state_data_int16,
|
||||
const RuntimeShape& unextended_output_state_shape,
|
||||
int16* output_state_data_int16,
|
||||
const RuntimeShape& unextended_output_activ_shape,
|
||||
uint8* output_activ_data_uint8,
|
||||
const RuntimeShape& unextended_concat_temp_shape,
|
||||
uint8* concat_temp_data_uint8,
|
||||
const RuntimeShape& unextended_activ_temp_shape,
|
||||
int16* activ_temp_data_int16, void* gemmlowp_context) {
|
||||
(void)gemmlowp_context; // only used in optimized code.
|
||||
int32 weights_zero_point = params.weights_zero_point;
|
||||
int32 accum_multiplier = params.accum_multiplier;
|
||||
|
Loading…
Reference in New Issue
Block a user