STT-tensorflow/tensorflow/lite/kernels/internal/optimized/depthwiseconv_multithread.h
Benoit Jacob 0e43175921 Don't access CpuBackendContext concurrently from multiple threads.
This fixes a race condition that caused 'illegal instruction' crashes when
the dot-product detection went wrong.

When implementing dotprod detection in the depthwiseconv code based on
CpuBackendContext, I overlooked that CpuBackendContext must not be used
concurrently from multiple threads (a limitation it inherits
from the underlying gemmlowp / ruy contexts).

To avoid that, the dotprod detection is moved to the top-level op kernel
function called on the main thread, before the thread dispatch.

A new data structure was needed to hold the result of the dotprod detection
in a way that can be shared with the worker threads: that's CpuFlags
(sketched below). It lives in the existing cpu_check.h, but it cannot share
code with what was already there, because the detection it performs is not
covered by the OS features that the existing code relies on.

PiperOrigin-RevId: 250739702
2019-05-30 12:38:39 -07:00
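
Below is a minimal sketch of what the CpuFlags / GetCpuFlags pair described
above might look like, matching only the call sites visible in the file below;
the include paths, the neon_dotprod field name, the ruy_context() accessor and
the ruy-based detection are assumptions, not the verbatim contents of
cpu_check.h:

    #include "tensorflow/lite/experimental/ruy/context.h"  // assumed path
    #include "tensorflow/lite/kernels/cpu_backend_context.h"

    struct CpuFlags {
      // Assumed flag: whether ARM NEON dot-product instructions are usable.
      bool neon_dotprod = false;
    };

    // Must be called on the thread that owns the CpuBackendContext (the
    // top-level op kernel thread), never from a worker thread.
    inline void GetCpuFlags(CpuBackendContext* cpu_backend_context,
                            CpuFlags* cpu_flags) {
      // Assumed detection mechanism: ask the ruy context owned by the backend
      // context which instruction-set paths are enabled at runtime.
      ruy::Context* ruy_context = cpu_backend_context->ruy_context();
      cpu_flags->neon_dotprod =
          ruy_context != nullptr &&
          (ruy_context->GetRuntimeEnabledPaths() & ruy::Path::kNeonDotprod) !=
              ruy::Path::kNone;
    }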


/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_MULTITHREAD_H_
#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_MULTITHREAD_H_
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h"
#include "tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h"
namespace tflite {
namespace optimized_ops {
// TODO(luwa): add multithread to per-channel depthwise_conv
// DepthwiseConv can run with multiple threads along the dimension specified by
// thread_dim. Each thread processes output elements along dimension thread_dim
// in the half-open range [thread_start, thread_end).
// For example, with thread_start = 2, thread_end = 6, and thread_dim = 1, a
// thread calculates DepthwiseConv for output_data[:, 2:6, :, :].
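// Note: the task below stores references and raw pointers to its constructor
// arguments, so the caller must keep all of them (including the CpuFlags)
// alive until cpu_backend_threadpool::Execute() has returned.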
template <typename T, typename TS>
struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task {
  DepthwiseConvWorkerTask(const DepthwiseParams& params,
                          const RuntimeShape& input_shape, const T* input_data,
                          const RuntimeShape& filter_shape,
                          const T* filter_data, const RuntimeShape& bias_shape,
                          const TS* bias_data, const RuntimeShape& output_shape,
                          T* output_data, const CpuFlags& cpu_flags,
                          int thread_start, int thread_end, int thread_dim)
      : params_(params),
        input_shape_(input_shape),
        input_data_(input_data),
        filter_shape_(filter_shape),
        filter_data_(filter_data),
        bias_shape_(bias_shape),
        bias_data_(bias_data),
        output_shape_(output_shape),
        output_data_(output_data),
        cpu_flags_(cpu_flags),
        thread_start_(thread_start),
        thread_end_(thread_end),
        thread_dim_(thread_dim) {}

  void Run() override {
    DepthwiseConvImpl(params_, input_shape_, input_data_, filter_shape_,
                      filter_data_, bias_shape_, bias_data_, output_shape_,
                      output_data_, cpu_flags_, thread_start_, thread_end_,
                      thread_dim_);
  }

 private:
  const DepthwiseParams& params_;
  const RuntimeShape& input_shape_;
  const T* input_data_;
  const RuntimeShape& filter_shape_;
  const T* filter_data_;
  const RuntimeShape& bias_shape_;
  const TS* bias_data_;
  const RuntimeShape& output_shape_;
  T* output_data_;
  const CpuFlags& cpu_flags_;
  int thread_start_;
  int thread_end_;
  int thread_dim_;
};
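
// Computes a thread count proportional to the amount of work. For example
// (illustrative numbers only), a 1 x 32 x 32 x 16 output convolved with a
// 3 x 3 filter costs 16384 * 9 = 147456 scalar multiplications, so the raw
// thread_count below is 147456 / 8192 = 18, which the caller then clamps
// against the backend's maximum thread count.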
inline int HowManyConvThreads(const RuntimeShape& output_shape,
                              const RuntimeShape& filter_shape) {
  // How many scalar multiplications are needed to make it worth using one
  // more thread.
  static constexpr int kMinMulPerThread = 1 << 13;  // 8k
  const int filter_height = filter_shape.Dims(1);
  const int filter_width = filter_shape.Dims(2);
  const int num_muls = output_shape.FlatSize() * filter_height * filter_width;
  // Try to avoid real runtime divisions if possible by dividing by a
  // compile-time constant.
  int thread_count = std::max(1, num_muls / kMinMulPerThread);
  return thread_count;
}
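
// Decides whether to parallelize across the batch dimension (thread_dim = 0)
// or across output rows (thread_dim = 1). Illustrative outcomes for
// thread_count = 4: batches = 2 -> false (fewer batches than threads),
// batches = 8 -> true (at least 2 batches per thread), batches = 6 -> false
// (6 % 4 != 0, uneven split), batches = 4 -> true (exactly 1 batch each).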
inline bool MultithreadAlongBatches(int thread_count, int batches) {
  TFLITE_DCHECK_GE(thread_count, 2);
  // If there are fewer batch entries than the number of threads we want to
  // use, then it is better to do intra-batch-entry multithreading.
  if (batches < thread_count) {
    return false;
  }
  // If there are at least 2 batch entries to be handed to each thread, then
  // it's safe to proceed with batch-wise multithreading: each thread will have
  // approximately the same number of batch entries to handle, so the load
  // balancing will be reasonable, and the amount by which the load is not
  // perfectly balanced will be offset by the inherent advantages of
  // batch-wise multithreading (each thread is more efficient thanks to working
  // on larger buffers with less boundary-handling overhead).
  if (batches >= 2 * thread_count) {
    return true;
  }
  // In the limit case where there is at least 1 but not much more than 1
  // batch entry per thread, it may be a good idea to do per-batch
  // multithreading if the number of batch entries is a multiple of the number
  // of threads, so that each thread will have the same number of batch entries
  // to process.
  return ((batches % thread_count) == 0);
}
template <typename T, typename TS>
inline void DepthwiseConv(const DepthwiseParams& params,
                          const RuntimeShape& input_shape, const T* input_data,
                          const RuntimeShape& filter_shape,
                          const T* filter_data, const RuntimeShape& bias_shape,
                          const TS* bias_data, const RuntimeShape& output_shape,
                          T* output_data,
                          CpuBackendContext* cpu_backend_context) {
  gemmlowp::ScopedProfilingLabel label("DepthwiseConv");
  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);

  int thread_count = HowManyConvThreads(output_shape, filter_shape);
  const int max_threads = cpu_backend_context->max_num_threads();
  thread_count = std::max(1, std::min(thread_count, max_threads));
#ifndef TFLITE_WITH_RUY
  // Cap the number of threads to 2 for float path to avoid regression in
  // performance (b/132294857).
  if (std::is_floating_point<T>::value) {
    thread_count = std::min(thread_count, 2);
  }
#endif
  const int output_batches = output_shape.Dims(0);
  const int output_height = output_shape.Dims(1);
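  // Detect CPU features once here, on the calling thread: CpuBackendContext
  // is not safe to use concurrently from the worker threads, so the result is
  // captured in cpu_flags and passed to each DepthwiseConvWorkerTask instead.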
  CpuFlags cpu_flags;
  GetCpuFlags(cpu_backend_context, &cpu_flags);

  if (thread_count == 1) {
    DepthwiseConvImpl(params, input_shape, input_data, filter_shape,
                      filter_data, bias_shape, bias_data, output_shape,
                      output_data, cpu_flags, /*thread_start=*/0,
                      /*thread_end=*/output_height, /*thread_dim=*/1);
    return;
  }

  int thread_dim, thread_dim_size;
  if (MultithreadAlongBatches(thread_count, output_batches)) {
    thread_dim = 0;
    thread_dim_size = output_batches;
  } else {
    thread_dim = 1;
    thread_dim_size = output_height;
  }

  std::vector<DepthwiseConvWorkerTask<T, TS>> tasks;
  // TODO(b/131746020) don't create new heap allocations every time.
  // At least we make it a single heap allocation by using reserve().
  tasks.reserve(thread_count);
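  // Split [0, thread_dim_size) into thread_count contiguous, nearly equal
  // ranges, one per task. For example (illustrative numbers only),
  // thread_dim_size = 10 and thread_count = 4 yields [0, 2), [2, 4), [4, 7),
  // [7, 10).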
  int thread_start = 0;
  for (int i = 0; i < thread_count; ++i) {
    int thread_end =
        thread_start + (thread_dim_size - thread_start) / (thread_count - i);
    tasks.emplace_back(params, input_shape, input_data, filter_shape,
                       filter_data, bias_shape, bias_data, output_shape,
                       output_data, cpu_flags, thread_start, thread_end,
                       thread_dim);
    thread_start = thread_end;
  }
  cpu_backend_threadpool::Execute(tasks.size(), tasks.data(),
                                  cpu_backend_context);
}
} // namespace optimized_ops
} // namespace tflite
#endif // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_DEPTHWISECONV_MULTITHREAD_H_