Optimize OneHot op on CPU

PiperOrigin-RevId: 276122680
Change-Id: I17a72abf2818223dee4fb86517f94ebf5c045309
This commit is contained in:
Eugene Zhulenev 2019-10-22 12:44:51 -07:00 committed by TensorFlower Gardener
parent 6037196225
commit 13772b2e69
3 changed files with 162 additions and 0 deletions

View File

@ -2159,6 +2159,25 @@ tf_cc_test(
],
)
# Unit-test / benchmark target for the CPU OneHot kernel
# (micro-benchmarks live in one_hot_op_test.cc).
tf_cc_test(
name = "one_hot_op_test",
size = "small",
srcs = ["one_hot_op_test.cc"],
deps = [
# Kernel under test.
":one_hot_op",
":ops_testutil",
":ops_util",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
# NOTE(review): cudnn_plugin mirrors sibling kernel-test targets; the
# test itself only benchmarks the CPU path -- confirm this dep is needed.
"//tensorflow/stream_executor/cuda:cudnn_plugin",
],
)
tf_cc_test(
name = "reverse_op_test",
size = "small",

View File

@ -19,12 +19,18 @@ limitations under the License.
#define TENSORFLOW_CORE_KERNELS_ONE_HOT_OP_H_
// Generator definition for OneHotOp, must be compilable by nvcc.
#define EIGEN_USE_THREADS
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"
namespace tensorflow {
typedef Eigen::ThreadPoolDevice CPUDevice;
namespace generator {
template <typename T, typename TI>
@ -65,6 +71,53 @@ struct OneHot {
}
};
// CPU specialization: instead of evaluating a generator expression for every
// output coefficient, pre-fill the output with `off_value` and then write
// only the (few) `on_value` coefficients selected by `indices`.
template <typename T, typename TI>
struct OneHot<CPUDevice, T, TI> {
  EIGEN_ALWAYS_INLINE static void Compute(
      const CPUDevice& d, const typename TTypes<TI>::ConstMatrix& indices,
      const typename TTypes<T>::ConstScalar& on_value,
      const typename TTypes<T>::ConstScalar& off_value,
      typename TTypes<T, 3>::Tensor* output) {
    // Step 1: bulk-fill the whole output with `off_value`.
    output->device(d) = output->constant(off_value());

    // Output is viewed as [prefix, depth, suffix]; `indices` is
    // [prefix, suffix] and selects one coordinate along the depth dimension.
    const Eigen::Index prefix_size = output->dimensions()[0];
    const Eigen::Index depth_size = output->dimensions()[1];
    const Eigen::Index suffix_size = output->dimensions()[2];

    // Cost model for one `on_value` write: load + store of a single T.
    const Eigen::TensorOpCost cost(/*bytes_loaded=*/sizeof(T),
                                   /*bytes_stored=*/sizeof(T),
                                   /*compute_cycles=*/0.0);

    if (suffix_size == 1) {
      // Fast path: depth is the innermost dimension, so each row of
      // `indices` maps to exactly one output coefficient.
      d.parallelFor(
          prefix_size, cost,
          [&](Eigen::Index first, Eigen::Index last) {
            for (Eigen::Index row = first; row < last; ++row) {
              const TI depth = internal::SubtleMustCopy(indices(row, 0));
              // Out-of-range indices are left as `off_value`.
              if (FastBoundsCheck(depth, depth_size)) {
                (*output)(row, depth, 0) = on_value();
              }
            }
          });
    } else {
      // General path: shard over all prefix*suffix index entries and
      // recover the (outer, inner) coordinates from the flat index.
      d.parallelFor(
          prefix_size * suffix_size, cost * suffix_size,
          [&](Eigen::Index first, Eigen::Index last) {
            for (Eigen::Index i = first; i < last; ++i) {
              const Eigen::Index outer = i / suffix_size;
              const Eigen::Index inner = i - outer * suffix_size;
              const TI depth = internal::SubtleMustCopy(indices(outer, inner));
              if (FastBoundsCheck(depth, depth_size)) {
                (*output)(outer, depth, inner) = on_value();
              }
            }
          });
    }
  }
};
} // namespace functor
} // namespace tensorflow

View File

@ -0,0 +1,90 @@
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include <random>
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace tensorflow {
// Builds a graph containing a single OneHot op: `batch_size` random class
// ids in [0, num_classes) are one-hot encoded along `axis` with
// on_value = 1.0f and off_value = 0.0f. Caller owns the returned Graph.
static Graph* OneHot(int batch_size, int num_classes, int axis) {
  Graph* g = new Graph(OpRegistry::Global());

  Tensor indices(DT_INT32, TensorShape({batch_size}));
  // Fixed seed: benchmark inputs must be deterministic, otherwise results
  // from different runs are not comparable. (std::random_device would
  // reseed on every run.)
  std::mt19937 gen(42);
  std::uniform_int_distribution<> dist(0, num_classes - 1);
  auto indices_t = indices.flat<int32>();
  for (int i = 0; i < batch_size; ++i) {
    indices_t(i) = dist(gen);
  }

  Tensor depth(DT_INT32, TensorShape({}));
  depth.scalar<int32>()() = num_classes;

  Tensor on_value(DT_FLOAT, TensorShape({}));
  on_value.scalar<float>()() = 1.0f;

  Tensor off_value(DT_FLOAT, TensorShape({}));
  off_value.scalar<float>()() = 0.0f;

  test::graph::Multi(g, "OneHot",
                     {
                         test::graph::Constant(g, indices),
                         test::graph::Constant(g, depth),
                         test::graph::Constant(g, on_value),
                         test::graph::Constant(g, off_value),
                     })
      ->AddAttr("axis", axis);

  return g;
}
// Registers benchmark BM_OneHot_<batch>_<classes>_<axis>_<device>, which runs
// the OneHot graph built above and reports BATCH * CLASS items processed per
// iteration.
#define BM_OneHot(BATCH, CLASS, AXIS, DEVICE) \
static void BM_OneHot##_##BATCH##_##CLASS##_##AXIS##_##DEVICE(int iters) { \
testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS); \
test::Benchmark(#DEVICE, OneHot(BATCH, CLASS, AXIS)).Run(iters); \
} \
BENCHMARK(BM_OneHot##_##BATCH##_##CLASS##_##AXIS##_##DEVICE);
// CPU
// axis=1: depth becomes the innermost output dimension -- presumably hits
// the suffix_size == 1 fast path in one_hot_op.h; confirm.
BM_OneHot(32, 512, 1, cpu);
BM_OneHot(64, 512, 1, cpu);
BM_OneHot(128, 512, 1, cpu);
BM_OneHot(32, 1024, 1, cpu);
BM_OneHot(64, 1024, 1, cpu);
BM_OneHot(128, 1024, 1, cpu);
BM_OneHot(32, 10000, 1, cpu);
BM_OneHot(64, 10000, 1, cpu);
BM_OneHot(128, 10000, 1, cpu);
// axis=0: depth is the outer dimension -- presumably exercises the strided
// (suffix_size > 1) path; confirm.
BM_OneHot(32, 512, 0, cpu);
BM_OneHot(64, 512, 0, cpu);
BM_OneHot(128, 512, 0, cpu);
BM_OneHot(32, 1024, 0, cpu);
BM_OneHot(64, 1024, 0, cpu);
BM_OneHot(128, 1024, 0, cpu);
BM_OneHot(32, 10000, 0, cpu);
BM_OneHot(64, 10000, 0, cpu);
BM_OneHot(128, 10000, 0, cpu);
} // end namespace tensorflow