ruy::ThreadPool: when there is only 1 task, don't even touch atomic counters.

Saves a store-release and a load-acquire (total ~100 cycles) per matmul.

PiperOrigin-RevId: 261321407
This commit is contained in:
Benoit Jacob 2019-08-02 07:59:08 -07:00 committed by TensorFlower Gardener
parent 4181b8b6fe
commit c6156aaaa1

View File

@@ -153,17 +153,23 @@ class Thread {
void ThreadPool::ExecuteImpl(int task_count, int stride, Task* tasks) { | void ThreadPool::ExecuteImpl(int task_count, int stride, Task* tasks) {
RUY_DCHECK_GE(task_count, 1); | RUY_DCHECK_GE(task_count, 1);
// Task #0 will be run on the current thread. | if (task_count > 1) {
CreateThreads(task_count - 1); | // Task #0 will be run on the current thread.
counter_to_decrement_when_ready_.Reset(task_count - 1); | CreateThreads(task_count - 1);
for (int i = 1; i < task_count; i++) { | counter_to_decrement_when_ready_.Reset(task_count - 1);
auto task_address = reinterpret_cast<std::uintptr_t>(tasks) + i * stride; | for (int i = 1; i < task_count; i++) {
threads_[i - 1]->StartWork(reinterpret_cast<Task*>(task_address)); | auto task_address = reinterpret_cast<std::uintptr_t>(tasks) + i * stride;
 | threads_[i - 1]->StartWork(reinterpret_cast<Task*>(task_address));
 | }
} | }
// Execute task #0 workload immediately on the current thread. |
 | // Execute task #0 immediately on the current thread.
(tasks + 0)->Run(); | (tasks + 0)->Run();
// Wait for the threads submitted above to finish. |
counter_to_decrement_when_ready_.Wait(); | if (task_count > 1) {
 | // Wait for the threads submitted above to finish.
 | counter_to_decrement_when_ready_.Wait();
 | }
} | }
// Ensures that the pool has at least the given count of threads. | // Ensures that the pool has at least the given count of threads.