Internal change
PiperOrigin-RevId: 276815994 Change-Id: I61be890666a04db4e4ef8c1a7bfce59e79836650
This commit is contained in:
parent
1799941f59
commit
dea39f6b3d
tensorflow/core/kernels
@ -45,6 +45,55 @@ struct MemCpyCopier<ResourceHandle> {
|
||||
}
|
||||
};
|
||||
|
||||
// Estimates the number of bytes touched per output element when
// concatenating the "inputs" matrices.  For plain-old-data element types
// the copy cost is exactly the element size, so the inputs themselves are
// not inspected (the parameter exists only to match the string
// specialization below).
template <typename T>
int64 EstimateBytesPerElement(
    const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
        inputs) {
  return sizeof(T);
}
|
||||
|
||||
// EstimateBytesPerElement for strings estimates the total bytes involved in
// concatenating the strings in the "inputs" matrices (higher-level code
// reshapes all the inputs to matrices), by sampling the lengths of the actual
// strings in the various tensors.
template <>
int64 EstimateBytesPerElement<std::string>(
    const std::vector<
        std::unique_ptr<typename TTypes<std::string, 2>::ConstMatrix>>&
        inputs) {
  // randomly sample a few input strings to get a sense of the average size
  // of each element
  int num_samples = 0;
  int64 num_bytes_in_samples = 0;
  for (const auto& input : inputs) {
    const auto dim0 = input->dimension(0);
    const auto dim1 = input->dimension(1);
    // "zero" carries the same deduced index type as dim0/dim1 so that the
    // brace-enclosed sample lists below deduce a single element type.
    const auto zero = dim0 - dim0;  // Make type match
    if (dim0 > 0 && dim1 > 0) {
      // Draw 9 samples of string sizes from the input, in this sort of pattern
      // ("*" is sample), to get an estimate of the lengths of each string
      // element in the tensors:
      //
      // *...*...*
      // .........
      // *...*...*
      // .........
      // *...*...*
      //
      // NOTE(review): when dim0 or dim1 is < 3 some sample coordinates
      // coincide, so fewer than 9 distinct elements are sampled (each is
      // still counted once per visit, which keeps the average unbiased).
      for (auto i : {zero, dim0 / 2, dim0 - 1}) {
        for (auto j : {zero, dim1 / 2, dim1 - 1}) {
          num_bytes_in_samples += (*input)(i, j).size();
          num_samples++;
        }
      }
    }
  }
  // We don't use sizeof(std::string) as the overhead, since that would
  // overestimate the memory touched for copying a string.
  int64 string_overhead = sizeof(char*) + sizeof(size_t);
  // Integer division: average sampled payload length, guarded against the
  // all-inputs-empty case where no samples were drawn.
  return string_overhead +
         ((num_samples > 0) ? (num_bytes_in_samples / num_samples) : 0);
}
|
||||
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
@ -53,13 +102,8 @@ void ConcatCPU(
|
||||
const std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>&
|
||||
inputs,
|
||||
typename TTypes<T, 2>::Matrix* output) {
|
||||
if (std::is_same<T, string>::value) {
|
||||
// use a large cost here to force strings to be handled by separate threads
|
||||
ConcatCPUImpl<T>(d, inputs, 100000, MemCpyCopier<T>(), output);
|
||||
} else {
|
||||
ConcatCPUImpl<T>(d, inputs, sizeof(T) /* cost_per_unit */,
|
||||
MemCpyCopier<T>(), output);
|
||||
}
|
||||
int64 cost_per_unit = EstimateBytesPerElement<T>(inputs);
|
||||
ConcatCPUImpl<T>(d, inputs, cost_per_unit, MemCpyCopier<T>(), output);
|
||||
}
|
||||
|
||||
#define REGISTER(T) \
|
||||
|
@ -45,14 +45,15 @@ void ConcatCPUImpl(
|
||||
row_size += sizes.back();
|
||||
}
|
||||
|
||||
// cost_per_unit is estimated bytes to copy per output array element (for
|
||||
// strings this includes an estimate of the number of bytes of the actual
|
||||
// string data, as well).
|
||||
const int64 estimated_total_cost = output->size() * cost_per_unit;
|
||||
|
||||
auto worker_threads = d->tensorflow_cpu_worker_threads();
|
||||
int num_threads = std::min(4, worker_threads->num_threads);
|
||||
// strings define a different amount of work (generally much more) compared
|
||||
// with standard POD, so we parallelize differently.
|
||||
if (!std::is_same<T, string>::value) {
|
||||
num_threads =
|
||||
static_cast<int>(std::min<int64>(num_threads, output->size() / 4096));
|
||||
}
|
||||
num_threads = static_cast<int>(
|
||||
std::min<int64>(num_threads, estimated_total_cost / 16384));
|
||||
// Single threaded mode.
|
||||
// TODO(dga): Deduplicate this code w.r.t. sharded code below.
|
||||
if (num_threads == 0) {
|
||||
|
@ -35,10 +35,30 @@ limitations under the License.
|
||||
namespace tensorflow {
|
||||
namespace {
|
||||
|
||||
// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
|
||||
// in size, and concat them together along "concat_dimension"
|
||||
template <typename T>
|
||||
static void ConcatHelper(int iters, int concat_dimension, int dim2) {
|
||||
void FillTensorWithRandomValues(Tensor* t, int string_length, int64* bytes) {
|
||||
t->flat<T>().setRandom();
|
||||
*bytes = t->flat<T>().size() * sizeof(T);
|
||||
}
|
||||
|
||||
template <>
|
||||
void FillTensorWithRandomValues<std::string>(Tensor* t, int string_length,
|
||||
int64* bytes) {
|
||||
auto ts = t->flat<string>();
|
||||
*bytes = 0;
|
||||
for (int i = 0; i < ts.size(); i++) {
|
||||
ts(i) = string(string_length, 'x');
|
||||
*bytes += sizeof(ts(i)) + ts(i).size();
|
||||
}
|
||||
}
|
||||
|
||||
// For the benchmark, we set up two 2-dimensional tensors, each kDim1 x 'dim'
|
||||
// in size, and concat them together along "concat_dimension". If T is
|
||||
// std::string, then the length of individual strings in the tensors will be
|
||||
// of length "string_length".
|
||||
template <typename T>
|
||||
static void ConcatHelper(int iters, int concat_dimension, int dim2,
|
||||
int string_length = 0) {
|
||||
testing::StopTiming();
|
||||
Graph* g = new Graph(OpRegistry::Global());
|
||||
|
||||
@ -47,9 +67,10 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2) {
|
||||
Tensor concat_dim(DT_INT32, TensorShape({}));
|
||||
concat_dim.scalar<int32>()() = concat_dimension;
|
||||
Tensor in0(dt, TensorShape({kDim1, dim2}));
|
||||
in0.flat<T>().setRandom();
|
||||
Tensor in1(dt, TensorShape({kDim1, dim2}));
|
||||
in1.flat<T>().setRandom();
|
||||
int64 in0_bytes, in1_bytes;
|
||||
FillTensorWithRandomValues<T>(&in0, string_length, &in0_bytes);
|
||||
FillTensorWithRandomValues<T>(&in1, string_length, &in1_bytes);
|
||||
|
||||
Node* node;
|
||||
TF_CHECK_OK(
|
||||
@ -60,8 +81,7 @@ static void ConcatHelper(int iters, int concat_dimension, int dim2) {
|
||||
.Attr("T", dt)
|
||||
.Finalize(g, &node));
|
||||
|
||||
testing::BytesProcessed(static_cast<int64>(iters) *
|
||||
((kDim1 * dim2) + (kDim1 * dim2)) * sizeof(T));
|
||||
testing::BytesProcessed(static_cast<int64>(iters) * (in0_bytes + in1_bytes));
|
||||
testing::StartTiming();
|
||||
test::Benchmark("cpu", g).Run(iters);
|
||||
testing::UseRealTime();
|
||||
@ -78,6 +98,15 @@ static void BM_ConcatDim1Float(int iters, int dim2) {
|
||||
// Register the float concat benchmarks over a range of inner dimensions
// (Arg is dim2, the size of the non-batch dimension of each input).
BENCHMARK(BM_ConcatDim0Float)->Arg(1000)->Arg(100000)->Arg(1000000);
BENCHMARK(BM_ConcatDim1Float)->Arg(1000)->Arg(100000)->Arg(1000000);
|
||||
|
||||
// Benchmarks concatenation of two string tensors along dimension 0, where
// every element is a string of `string_length` characters.
static void BM_ConcatDim0String(int iters, int dim2, int string_length) {
  ConcatHelper<string>(iters, 0, dim2, string_length);
}

// ArgPair is (dim2, string_length): narrow tensors of short and long
// strings, plus a wider tensor of short strings.
BENCHMARK(BM_ConcatDim0String)
    ->ArgPair(1, 16)
    ->ArgPair(1, 10000)
    ->ArgPair(100, 16);
|
||||
|
||||
// Benchmarks concatenation of two uint8 tensors along dimension 1.
static void BM_ConcatDim1uint8(int iters, int dim2) {
  ConcatHelper<uint8>(iters, 1, dim2);
}
|
||||
|
Loading…
Reference in New Issue
Block a user