Optimize OneHot op on CPU
PiperOrigin-RevId: 276122680 Change-Id: I17a72abf2818223dee4fb86517f94ebf5c045309
This commit is contained in:
parent
6037196225
commit
13772b2e69
@ -2159,6 +2159,25 @@ tf_cc_test(
|
||||
],
|
||||
)
|
||||
|
||||
# Test/benchmark target for the OneHot CPU kernel (see one_hot_op_test.cc).
tf_cc_test(
    name = "one_hot_op_test",
    size = "small",
    srcs = ["one_hot_op_test.cc"],
    deps = [
        ":one_hot_op",
        ":ops_testutil",
        ":ops_util",
        "//tensorflow/core:core_cpu",
        "//tensorflow/core:framework",
        "//tensorflow/core:lib",
        "//tensorflow/core:protos_all_cc",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
        "//tensorflow/core:testlib",
        "//tensorflow/stream_executor/cuda:cudnn_plugin",
    ],
)
|
||||
|
||||
tf_cc_test(
|
||||
name = "reverse_op_test",
|
||||
size = "small",
|
||||
|
@ -19,12 +19,18 @@ limitations under the License.
|
||||
#define TENSORFLOW_CORE_KERNELS_ONE_HOT_OP_H_
|
||||
// Generator definition for OneHotOp, must be compilable by nvcc.
|
||||
|
||||
#define EIGEN_USE_THREADS
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "tensorflow/core/framework/bounds_check.h"
|
||||
#include "tensorflow/core/framework/tensor_types.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
typedef Eigen::ThreadPoolDevice CPUDevice;
|
||||
|
||||
namespace generator {
|
||||
|
||||
template <typename T, typename TI>
|
||||
@ -65,6 +71,53 @@ struct OneHot {
|
||||
}
|
||||
};
|
||||
|
||||
template <typename T, typename TI>
|
||||
struct OneHot<CPUDevice, T, TI> {
|
||||
EIGEN_ALWAYS_INLINE static void Compute(
|
||||
const CPUDevice& d, const typename TTypes<TI>::ConstMatrix& indices,
|
||||
const typename TTypes<T>::ConstScalar& on_value,
|
||||
const typename TTypes<T>::ConstScalar& off_value,
|
||||
typename TTypes<T, 3>::Tensor* output) {
|
||||
// Pre-fill output with `off_value`.
|
||||
output->device(d) = output->constant(off_value());
|
||||
|
||||
// Iterate through indices and update on_value elements in the output.
|
||||
Eigen::Index prefix_size = output->dimensions()[0];
|
||||
Eigen::Index depth_size = output->dimensions()[1];
|
||||
Eigen::Index suffix_size = output->dimensions()[2];
|
||||
|
||||
// Cost of setting one `on_value` coefficient.
|
||||
double bytes_loaded = sizeof(T);
|
||||
double bytes_stored = sizeof(T);
|
||||
double cycles = 0.0;
|
||||
const Eigen::TensorOpCost cost(bytes_loaded, bytes_stored, cycles);
|
||||
|
||||
if (suffix_size == 1) {
|
||||
const auto func = [&](Eigen::Index start, Eigen::Index end) -> void {
|
||||
for (Eigen::Index i = start; i < end; ++i) {
|
||||
const TI depth = internal::SubtleMustCopy(indices(i, 0));
|
||||
if (FastBoundsCheck(depth, depth_size)) {
|
||||
(*output)(i, depth, 0) = on_value();
|
||||
}
|
||||
}
|
||||
};
|
||||
d.parallelFor(prefix_size, cost, func);
|
||||
} else {
|
||||
const auto func = [&](Eigen::Index start, Eigen::Index end) -> void {
|
||||
for (Eigen::Index i = start; i < end; ++i) {
|
||||
const Eigen::Index d0 = i / suffix_size;
|
||||
const Eigen::Index d1 = i - (d0 * suffix_size);
|
||||
const TI depth = internal::SubtleMustCopy(indices(d0, d1));
|
||||
if (FastBoundsCheck(depth, depth_size)) {
|
||||
(*output)(d0, depth, d1) = on_value();
|
||||
}
|
||||
}
|
||||
};
|
||||
d.parallelFor(prefix_size * suffix_size, cost * suffix_size, func);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace functor
|
||||
|
||||
} // namespace tensorflow
|
||||
|
tensorflow/core/kernels/one_hot_op_test.cc — new file, 90 lines
@ -0,0 +1,90 @@
|
||||
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#include <random>
|
||||
|
||||
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
#include "tensorflow/core/platform/test.h"
|
||||
#include "tensorflow/core/platform/test_benchmark.h"
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
// Builds a graph holding a single OneHot node: `batch_size` random class ids
// drawn uniformly from [0, num_classes) are one-hot encoded along `axis`
// with on_value = 1.0f and off_value = 0.0f.
static Graph* OneHot(int batch_size, int num_classes, int axis) {
  Graph* graph = new Graph(OpRegistry::Global());

  // Unseeded RNG: index values are arbitrary, which is fine for a
  // throughput benchmark.
  std::random_device seed;
  std::mt19937 rng(seed());
  std::uniform_int_distribution<> pick_class(0, num_classes - 1);

  Tensor indices(DT_INT32, TensorShape({batch_size}));
  auto indices_flat = indices.flat<int32>();
  for (int i = 0; i < batch_size; ++i) {
    indices_flat(i) = pick_class(rng);
  }

  Tensor depth(DT_INT32, TensorShape({}));
  depth.scalar<int32>()() = num_classes;

  Tensor on_value(DT_FLOAT, TensorShape({}));
  on_value.scalar<float>()() = 1.0f;

  Tensor off_value(DT_FLOAT, TensorShape({}));
  off_value.scalar<float>()() = 0.0f;

  test::graph::Multi(graph, "OneHot",
                     {
                         test::graph::Constant(graph, indices),
                         test::graph::Constant(graph, depth),
                         test::graph::Constant(graph, on_value),
                         test::graph::Constant(graph, off_value),
                     })
      ->AddAttr("axis", axis);
  return graph;
}
|
||||
|
||||
// Defines and registers a benchmark named
// BM_OneHot_<BATCH>_<CLASS>_<AXIS>_<DEVICE> that runs the OneHot graph built
// above with BATCH indices, CLASS classes, and the one-hot dimension at AXIS.
// ItemsProcessed counts BATCH * CLASS output elements per iteration so the
// reporter can print elements/sec.
#define BM_OneHot(BATCH, CLASS, AXIS, DEVICE)                                 \
  static void BM_OneHot##_##BATCH##_##CLASS##_##AXIS##_##DEVICE(int iters) {  \
    testing::ItemsProcessed(static_cast<int64>(iters) * BATCH * CLASS);       \
    test::Benchmark(#DEVICE, OneHot(BATCH, CLASS, AXIS)).Run(iters);          \
  }                                                                           \
  BENCHMARK(BM_OneHot##_##BATCH##_##CLASS##_##AXIS##_##DEVICE);
|
||||
|
||||
// CPU
// One-hot dimension inserted at axis 1, for increasing batch and class sizes.
BM_OneHot(32, 512, 1, cpu);
BM_OneHot(64, 512, 1, cpu);
BM_OneHot(128, 512, 1, cpu);

BM_OneHot(32, 1024, 1, cpu);
BM_OneHot(64, 1024, 1, cpu);
BM_OneHot(128, 1024, 1, cpu);

BM_OneHot(32, 10000, 1, cpu);
BM_OneHot(64, 10000, 1, cpu);
BM_OneHot(128, 10000, 1, cpu);

// Same sizes with the one-hot dimension inserted at axis 0.
BM_OneHot(32, 512, 0, cpu);
BM_OneHot(64, 512, 0, cpu);
BM_OneHot(128, 512, 0, cpu);

BM_OneHot(32, 1024, 0, cpu);
BM_OneHot(64, 1024, 0, cpu);
BM_OneHot(128, 1024, 0, cpu);

BM_OneHot(32, 10000, 0, cpu);
BM_OneHot(64, 10000, 0, cpu);
BM_OneHot(128, 10000, 0, cpu);
|
||||
|
||||
} // end namespace tensorflow
|
Loading…
Reference in New Issue
Block a user