ruy::ThreadPool: when there is only 1 task, don't even touch atomic counters.
Saves a store-release and a load-acquire (total ~100 cycles) per matmul.
PiperOrigin-RevId: 261321407
This commit is contained in:
parent
4181b8b6fe
commit
c6156aaaa1
@@ -153,17 +153,23 @@ class Thread {
|
|||||||
|
|
||||||
void ThreadPool::ExecuteImpl(int task_count, int stride, Task* tasks) {
|
void ThreadPool::ExecuteImpl(int task_count, int stride, Task* tasks) {
|
||||||
RUY_DCHECK_GE(task_count, 1);
|
RUY_DCHECK_GE(task_count, 1);
|
||||||
// Task #0 will be run on the current thread.
|
if (task_count > 1) {
|
||||||
CreateThreads(task_count - 1);
|
// Task #0 will be run on the current thread.
|
||||||
counter_to_decrement_when_ready_.Reset(task_count - 1);
|
CreateThreads(task_count - 1);
|
||||||
for (int i = 1; i < task_count; i++) {
|
counter_to_decrement_when_ready_.Reset(task_count - 1);
|
||||||
auto task_address = reinterpret_cast<std::uintptr_t>(tasks) + i * stride;
|
for (int i = 1; i < task_count; i++) {
|
||||||
threads_[i - 1]->StartWork(reinterpret_cast<Task*>(task_address));
|
auto task_address = reinterpret_cast<std::uintptr_t>(tasks) + i * stride;
|
||||||
|
threads_[i - 1]->StartWork(reinterpret_cast<Task*>(task_address));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
// Execute task #0 workload immediately on the current thread.
|
|
||||||
|
// Execute task #0 immediately on the current thread.
|
||||||
(tasks + 0)->Run();
|
(tasks + 0)->Run();
|
||||||
// Wait for the threads submitted above to finish.
|
|
||||||
counter_to_decrement_when_ready_.Wait();
|
if (task_count > 1) {
|
||||||
|
// Wait for the threads submitted above to finish.
|
||||||
|
counter_to_decrement_when_ready_.Wait();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Ensures that the pool has at least the given count of threads.
|
// Ensures that the pool has at least the given count of threads.
|
||||||
|
Loading…
Reference in New Issue
Block a user