Internal change

PiperOrigin-RevId: 276815994
Change-Id: I61be890666a04db4e4ef8c1a7bfce59e79836650
Authored by Jeffrey A. Dean on 2019-10-25 21:31:33 -07:00, committed by TensorFlower Gardener
parent 1799941f59
commit dea39f6b3d
3 changed files with 94 additions and 20 deletions


@@ -45,6 +45,55 @@ struct MemCpyCopier<ResourceHandle> {
}
};
template <typename T>
int64 EstimateBytesPerElement(
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs) {
return sizeof(T);
}
// EstimateBytesPerElement for strings estimates the total bytes involved in
// concatenating the strings in the "inputs" matrices (higher-level code
// reshapes all the inputs to matrices), by sampling the lengths of the actual
// strings in the various tensors.
template <>
int64 EstimateBytesPerElement<std::string>(
const std::vector<
std::unique_ptr<typename TTypes<std::string, 2>::ConstMatrix>>&
inputs) {
// Sample a few of the input strings at fixed positions to get a sense of the
// average size of each element
int num_samples = 0;
int64 num_bytes_in_samples = 0;
for (const auto& input : inputs) {
const auto dim0 = input->dimension(0);
const auto dim1 = input->dimension(1);
const auto zero = dim0 - dim0; // Make type match
if (dim0 > 0 && dim1 > 0) {
// Draw 9 samples of string sizes from the input, in this sort of pattern
// ("*" is sample), to get an estimate of the lengths of each string
// element in the tensors:
//
// *...*...*
// .........
// *...*...*
// .........
// *...*...*
for (auto i : {zero, dim0 / 2, dim0 - 1}) {
for (auto j : {zero, dim1 / 2, dim1 - 1}) {
num_bytes_in_samples += (*input)(i, j).size();
num_samples++;
}
}
}
}
// We don't use sizeof(std::string) as the overhead, since that would
// overestimate the memory touched for copying a string.
int64 string_overhead = sizeof(char*) + sizeof(size_t);
return string_overhead +
((num_samples > 0) ? (num_bytes_in_samples / num_samples) : 0);
}
} // namespace
template <typename T>
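To make the estimate concrete, here is a standalone sketch of the same corner/midpoint/center sampling (illustrative only: it is not part of the commit and uses a plain std::vector matrix rather than TTypes<T, 2>::ConstMatrix). On a typical 64-bit build the overhead term sizeof(char*) + sizeof(size_t) is 16 bytes, so a matrix whose sampled strings average 100 bytes yields roughly 116 bytes per element, rather than the flat 100000 cost used before this change.

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Same sampling pattern as the specialization above: the four corners, the
// four edge midpoints, and the center of the matrix (9 samples total).
int64_t EstimateStringBytesPerElementSketch(
    const std::vector<std::vector<std::string>>& matrix) {
  int num_samples = 0;
  int64_t num_bytes_in_samples = 0;
  const int64_t dim0 = static_cast<int64_t>(matrix.size());
  const int64_t dim1 = dim0 > 0 ? static_cast<int64_t>(matrix[0].size()) : 0;
  if (dim0 > 0 && dim1 > 0) {
    for (int64_t i : {int64_t{0}, dim0 / 2, dim0 - 1}) {
      for (int64_t j : {int64_t{0}, dim1 / 2, dim1 - 1}) {
        num_bytes_in_samples += static_cast<int64_t>(matrix[i][j].size());
        ++num_samples;
      }
    }
  }
  const int64_t string_overhead = sizeof(char*) + sizeof(size_t);
  return string_overhead +
         (num_samples > 0 ? num_bytes_in_samples / num_samples : 0);
}

int main() {
  // 3x3 matrix of 100-byte strings: prints 116 on a 64-bit build.
  std::vector<std::vector<std::string>> m(
      3, std::vector<std::string>(3, std::string(100, 'x')));
  std::cout << EstimateStringBytesPerElementSketch(m) << "\n";
}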
@@ -53,13 +102,8 @@ void ConcatCPU(
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
inputs,
typename TTypes<T, 2>::Matrix* output) {
- if (std::is_same<T, string>::value) {
- // use a large cost here to force strings to be handled by separate threads
- ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output);
- } else {
- ConcatCPUImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */,
- MemCpyCopier<T>(), output);
- }
int64 cost_per_unit = EstimateBytesPerElement<T>(inputs);
ConcatCPUImpl<T>(d, inputs, cost_per_unit, MemCpyCopier<T>(), output);
}
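For non-string element types this call site ends up passing the same cost as before, since the generic EstimateBytesPerElement<T> simply returns sizeof(T); only the string path changes, replacing the hard-coded 100000 with the sampled estimate that the following file's ConcatCPUImpl converts into a total-cost-based thread count.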
#define REGISTER(T) \


@@ -45,14 +45,15 @@ void ConcatCPUImpl(
row_size += sizes.back();
}
// cost_per_unit is estimated bytes to copy per output array element (for
// strings this includes an estimate of the number of bytes of the actual
// string data, as well).
const int64 estimated_total_cost = output->size() * cost_per_unit;
auto worker_threads = d->tensorflow_cpu_worker_threads();
int num_threads = std::min(4, worker_threads->num_threads);
- // strings define a different amount of work (generally much more) compared
- // with standard POD, so we parallelize differently.
- if (!std::is_same<T, string>::value) {
- num_threads =
- static_cast<int>(std::min<int64>(num_threads, output->size() / 4096));
- }
num_threads = static_cast<int>(
std::min<int64>(num_threads, estimated_total_cost / 16384));
// Single threaded mode.
// TODO(dga): Deduplicate this code w.r.t. sharded code below.
if (num_threads == 0) {
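To see what the new heuristic does in practice, here is a standalone sketch with illustrative sizes (not code from the commit; the real path also caps the count by the device's worker-thread pool):

#include <algorithm>
#include <cstdint>
#include <iostream>

// Mirrors the num_threads computation above: at most 4 threads, and roughly
// 16 KB of estimated copying per thread before sharding is worthwhile.
int NumThreadsSketch(int64_t output_elements, int64_t cost_per_unit) {
  const int64_t estimated_total_cost = output_elements * cost_per_unit;
  int num_threads = 4;  // stand-in for std::min(4, worker_threads->num_threads)
  num_threads = static_cast<int>(
      std::min<int64_t>(num_threads, estimated_total_cost / 16384));
  return num_threads;
}

int main() {
  std::cout << NumThreadsSketch(1000000, 4) << "\n";  // floats, ~4 MB: 4 threads
  std::cout << NumThreadsSketch(1000, 4) << "\n";     // floats, ~4 KB: 0 (single-threaded)
  std::cout << NumThreadsSketch(1000, 116) << "\n";   // strings, ~116 KB: 4 threads
}

Previously, string concats always took the flat 100000 cost and skipped the size-based reduction, so even tiny string concats were dispatched to worker threads; with this change, small jobs of any element type fall into the single-threaded branch.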


@@ -35,10 +35,30 @@ limitations under the License.
namespace tensorflow {
namespace {
- // For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
- // in size, and concat them together along "concat_dimension"
template <typename T>
- static void ConcatHelper(int iters, int concat_dimension, int dim2) {
void FillTensorWithRandomValues(Tensor* t, int string_length, int64* bytes) {
t->flat<T>().setRandom();
*bytes = t->flat<T>().size() * sizeof(T);
}
template <>
void FillTensorWithRandomValues<std::string>(Tensor* t, int string_length,
int64* bytes) {
auto ts = t->flat<string>();
*bytes = 0;
for (int i = 0; i < ts.size(); i++) {
ts(i) = string(string_length, 'x');
*bytes += sizeof(ts(i)) + ts(i).size();
}
}
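Each string element's contribution counts both the in-object std::string footprint (sizeof(ts(i)), commonly 24-32 bytes on 64-bit standard libraries) and its character payload, so with string_length = 16 an element is on the order of 40-48 bytes. The benchmark below feeds these per-tensor totals to testing::BytesProcessed, so the reported throughput for the new string cases reflects the actual bytes touched rather than a sizeof(T)-based formula.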
// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim2'
// in size, and concat them together along "concat_dimension". If T is
// std::string, then the individual strings in the tensors will have length
// "string_length".
template <typename T>
static void ConcatHelper(int iters, int concat_dimension, int dim2,
int string_length = 0) {
testing::StopTiming();
Graph* g = new Graph(OpRegistry::Global());
@@ -47,9 +67,10 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2) {
Tensor concat_dim(DT_INT32, TensorShape({}));
concat_dim.scalar<int32>()() = concat_dimension;
Tensor in0(dt, TensorShape({kDim1, dim2}));
- in0.flat<T>().setRandom();
Tensor in1(dt, TensorShape({kDim1, dim2}));
- in1.flat<T>().setRandom();
int64 in0_bytes, in1_bytes;
FillTensorWithRandomValues<T>(&in0, string_length, &in0_bytes);
FillTensorWithRandomValues<T>(&in1, string_length, &in1_bytes);
Node* node;
TF_CHECK_OK(
@@ -60,8 +81,7 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2) {
.Attr("T", dt)
.Finalize(g, &node));
- testing::BytesProcessed(static_cast<int64>(iters) *
- ((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T));
testing::BytesProcessed(static_cast<int64>(iters) * (in0_bytes + in1_bytes));
testing::StartTiming();
test::Benchmark("cpu", g).Run(iters);
testing::UseRealTime();
@@ -78,6 +98,15 @@ static void BM_ConcatDim1Float(int iters, int dim2) {
BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
static void BM_ConcatDim0String(int iters, int dim2, int string_length) {
ConcatHelper<string>(iters, 0, dim2, string_length);
}
BENCHMARK(BM_ConcatDim0String)
->ArgPair(1, 16)
->ArgPair(1, 10000)
->ArgPair(100, 16);
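The two ArgPair values map to (dim2, string_length): (1, 16) concatenates two kDim1 x 1 tensors of 16-character strings, (1, 10000) covers few but very long strings, and (100, 16) covers many short strings per row.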
static void BM_ConcatDim1uint8(int iters, int dim2) {
ConcatHelper<uint8>(iters, 1, dim2);
}