Remove heuristic caps on parallelism that should now be handled by the cost model.

Adjust the cost model for FloatToBFloat16 and BFloat16ToFloat: they do not take 100 cycles per element.

This CL is a companion to cl/122779011, which makes the caps effective again even with the non-blocking threadpool.
Change: 123144919
A. Unique TensorFlower 2016-05-24 13:14:03 -08:00 committed by TensorFlower Gardener
parent 67ddfa5b34
commit 3669479261
3 changed files with 18 additions and 33 deletions
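
For context on the deletions below: the old code hard-coded thread counts (4 for the casts, 6 for Philox). With a per-unit cost estimate, the sharder can derive the shard count from the total amount of work instead. A minimal sketch of that idea follows; the kMinCostPerShard threshold is an assumption for illustration, and the real tensorflow::Shard (core/util/work_sharder.cc) differs in detail.

    #include <algorithm>
    #include <cstdint>
    #include <functional>

    // Sketch only: derive the shard count from estimated total cost,
    // capped by the thread count rather than by an ad-hoc constant.
    void ShardSketch(int max_parallelism, int64_t total, int64_t cost_per_unit,
                     const std::function<void(int64_t, int64_t)>& work) {
      constexpr int64_t kMinCostPerShard = 10000;  // assumed threshold
      const int64_t num_shards = std::max<int64_t>(
          1, std::min<int64_t>(max_parallelism,
                               total * cost_per_unit / kMinCostPerShard));
      const int64_t block = (total + num_shards - 1) / num_shards;
      for (int64_t start = 0; start < total; start += block) {
        // A real implementation enqueues each block on a thread pool.
        work(start, std::min<int64_t>(start + block, total));
      }
    }

Cheap or small workloads collapse to a single shard automatically, so the hand-tuned caps become unnecessary.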


@@ -150,19 +150,11 @@ class CpuCastOp : public CastOpBase {
     work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
       int64 N = out->NumElements();
       auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
-      int num_threads = static_cast<int>(std::min(
-          static_cast<int64>(std::min(4, worker_threads->num_threads)),
-          N / 4096));
-      if (num_threads < 1) {
-        BFloat16ToFloat(inp.flat<bfloat16>().data(),
-                        out->flat<float>().data(), N);
-      } else {
-        auto work = [&inp, &out](int64 start, int64 end) {
-          BFloat16ToFloat(inp.flat<bfloat16>().data() + start,
-                          out->flat<float>().data() + start, end - start);
-        };
-        Shard(num_threads, worker_threads->workers, N, 100, work);
-      }
+      auto work = [&inp, &out](int64 start, int64 end) {
+        BFloat16ToFloat(inp.flat<bfloat16>().data() + start,
+                        out->flat<float>().data() + start, end - start);
+      };
+      Shard(worker_threads->num_threads, worker_threads->workers, N, 2, work);
     };
     return Status::OK();
   }
@@ -170,19 +162,11 @@ class CpuCastOp : public CastOpBase {
     work_ = [](OpKernelContext* ctx, const Tensor& inp, Tensor* out) {
       int64 N = out->NumElements();
       auto worker_threads = ctx->device()->tensorflow_cpu_worker_threads();
-      int num_threads = static_cast<int>(std::min(
-          static_cast<int64>(std::min(4, worker_threads->num_threads)),
-          N / 4096));
-      if (num_threads < 1) {
-        FloatToBFloat16(inp.flat<float>().data(),
-                        out->flat<bfloat16>().data(), N);
-      } else {
-        auto work = [&inp, &out](int64 start, int64 end) {
-          FloatToBFloat16(inp.flat<float>().data() + start,
-                          out->flat<bfloat16>().data() + start, end - start);
-        };
-        Shard(num_threads, worker_threads->workers, N, 100, work);
-      }
+      auto work = [&inp, &out](int64 start, int64 end) {
+        FloatToBFloat16(inp.flat<float>().data() + start,
+                        out->flat<bfloat16>().data() + start, end - start);
+      };
+      Shard(worker_threads->num_threads, worker_threads->workers, N, 2, work);
     };
     return Status::OK();
   }
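
Why the per-element cost drops from 100 to 2: bfloat16 is simply the upper 16 bits of an IEEE-754 float32 (sign, 8-bit exponent, 7 mantissa bits), so each conversion is a shift and a 16-bit move. An illustrative sketch, not TensorFlow's exact implementation:

    #include <cstdint>
    #include <cstring>

    struct BF16 { uint16_t bits; };  // sign + 8-bit exponent + 7-bit mantissa

    inline BF16 FloatToBF16Sketch(float f) {
      uint32_t u;
      std::memcpy(&u, &f, sizeof(u));               // safe type pun
      return BF16{static_cast<uint16_t>(u >> 16)};  // truncate low mantissa bits
    }

    inline float BF16ToFloatSketch(BF16 b) {
      uint32_t u = static_cast<uint32_t>(b.bits) << 16;  // zero the low half
      float f;
      std::memcpy(&f, &u, sizeof(f));
      return f;
    }

A load, a shift, and a store cost on the order of 2 cycles per element, so the old estimate of 100 overstated the work by roughly 50x and skewed the sharding decision.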


@@ -123,7 +123,8 @@ void ConcatCPU(DeviceBase* d,
       }
     }
   };
-  Shard(num_threads, worker_threads->workers, output->size(), sizeof(T), work);
+  Shard(worker_threads->num_threads, worker_threads->workers, output->size(),
+        sizeof(T), work);
 }
 #define REGISTER(T) \
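
Here sizeof(T) serves as the per-element cost estimate: concatenation is a pure memory copy, so the work per element scales with its byte width rather than with any arithmetic.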


@@ -161,13 +161,11 @@ struct FillPhiloxRandom<CPUDevice, Distribution> {
     int64 total_group_count = (size + kGroupSize - 1) / kGroupSize;
-    // Limit to maximum six threads for now. The performance scaling is very
-    // sub-linear. Too many threads causes a much worse overall performance.
-    int num_workers = 6;
     const int kGroupCost =
         random::PhiloxRandom::kResultElementCount *
         (random::PhiloxRandom::kElementCost + Distribution::kElementCost);
-    Shard(num_workers, worker_threads.workers, total_group_count, kGroupCost,
+    Shard(worker_threads.num_threads, worker_threads.workers, total_group_count,
+          kGroupCost,
           [&gen, data, size, dist](int64 start_group, int64 limit_group) {
             FillPhiloxRandomTask<
                 Distribution,
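
To make kGroupCost concrete, a worked example with assumed values (the real constants live on random::PhiloxRandom and the Distribution types; the numbers below are illustrative, not from this diff):

    // Hypothetical values, for illustration only.
    constexpr int kResultElementCount = 4;  // elements per Philox invocation
    constexpr int kPhiloxElementCost = 10;  // rough cycles per raw element
    constexpr int kDistElementCost = 3;     // rough cycles for a cheap transform
    constexpr int kGroupCost =
        kResultElementCount * (kPhiloxElementCost + kDistElementCost);  // = 52

Filling 2^20 elements with a group size of 4 gives 262144 groups, so the sharder sees about 13.6M cycles of total work, easily enough to occupy every worker thread without the old six-thread limit.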
@@ -399,8 +397,10 @@ class MultinomialOp : public OpKernel {
                sizeof(int64) * num_samples);
       }
     };
-    Shard(std::min(batch_size, worker_threads.num_threads),
-          worker_threads.workers, batch_size, num_samples * num_classes * 2,
+    // Rough estimate, log2() takes from 58-680 cycles on Haswell.
+    // The functor here calls log twice for each element.
+    const int64 cost = 500 * num_samples * num_classes;
+    Shard(worker_threads.num_threads, worker_threads.workers, batch_size, cost,
           DoWork);
   }
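
A quick sanity check on the new multinomial estimate (numbers chosen for illustration): with num_samples = 64 and num_classes = 1000, cost = 500 * 64 * 1000 = 32,000,000 cycle-equivalents per batch element. The old estimate of num_samples * num_classes * 2 = 128,000 understated the same work by a factor of 250, which could lead the sharder to use too few threads; dropping the min(batch_size, worker_threads.num_threads) cap is safe because Shard never creates more shards than there are units of work.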