Add benchmark for the SpatialConvolution specialized gemm_pack_rhs.

PiperOrigin-RevId: 219839163
This commit is contained in:
Eugene Zhulenev 2018-11-02 12:09:37 -07:00 committed by TensorFlower Gardener
parent 3dce771c0e
commit eaa673fd2b
7 changed files with 184 additions and 10 deletions

View File

@ -2414,12 +2414,8 @@ tf_cc_tests(
],
deps = [
":eigen_helpers",
"//tensorflow/core:core_cpu",
"//tensorflow/core:framework",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
],
)

View File

@ -14,7 +14,6 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/eigen_activations.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/test.h"
namespace Eigen {

View File

@ -14,7 +14,6 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/eigen_attention.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/test.h"
namespace Eigen {

View File

@ -14,7 +14,6 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/eigen_backward_spatial_convolutions.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/eigen_backward_cuboid_convolutions.h"
#include "tensorflow/core/platform/test.h"

View File

@ -14,7 +14,6 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/eigen_pooling.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/test.h"
namespace Eigen {

View File

@ -14,7 +14,6 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/eigen_softmax.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/test.h"
namespace Eigen {

View File

@ -14,9 +14,9 @@ limitations under the License.
==============================================================================*/
#include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/eigen_cuboid_convolution.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/test_benchmark.h"
namespace Eigen {
@ -1373,4 +1373,187 @@ TEST(EigenSpatialConvolutionsTest, SpatialConvContractionMapper) {
EigenApprox(8.0f, direct(0, 1, 3, 0));
}
static void PackRhsHelper(int iters,
/* Input dimensions: */
int input_batches, int input_cols, int input_rows,
int input_depth,
/* Filter (kernel) dimensions: */
int filter_count, int filter_cols, int filter_rows) {
tensorflow::testing::UseRealTime();
tensorflow::testing::StopTiming();
using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
// Default Eigen::Tensor layout is column major, so we configure dimensions
// starting from the inner most (channels aka depth in this case).
Dimensions input_dims(input_depth, input_rows, input_cols, input_batches);
using Traits = typename Eigen::internal::gebp_traits<float, float>;
static const int packet_size = Eigen::internal::packet_traits<float>::size;
// Reshape dimensions.
using NewDimension = Eigen::array<Eigen::Index, 2>;
// Contraction dimensions.
using nocontract_t = Eigen::array<Eigen::Index, 1>;
using contract_t = Eigen::array<Eigen::Index, 1>;
// Input to the TensorImagePatchOp.
using ArgType = Tensor<float, 4>;
using Evaluator = TensorEvaluator<
const TensorReshapingOp<
NewDimension, const TensorImagePatchOp<Dynamic, Dynamic, ArgType>>,
Eigen::DefaultDevice>;
using InputMapper = Eigen::internal::TensorContractionInputMapper<
float, Index, Eigen::internal::Rhs, Evaluator, //
nocontract_t, contract_t, //
packet_size, //
/*inner_dim_contiguous*/ true, //
/*inner_dim_reordered*/ false, //
/*Alignment*/ 0>;
using SubMapper = Eigen::internal::TensorContractionSubMapper<
float, Index, Eigen::internal::Rhs, Evaluator, //
nocontract_t, contract_t, //
packet_size, //
/*inner_dim_contiguous*/ true, //
/*inner_dim_reordered*/ false, //
/*Alignment*/ 0>;
using PackRhsImpl =
Eigen::internal::gemm_pack_rhs<float, Eigen::Index, SubMapper, //
Traits::nr, //
ColMajor, //
/*Conjugate*/ false, //
/*PanelMode*/ false>;
Eigen::DefaultDevice device;
// Actual contract dimensions are not important.
const Eigen::Index not_important = -1234;
nocontract_t nocontract_dim = {not_important};
contract_t contract_dim = {not_important};
// We use tensor of the same dimensions to store packed data.
Tensor<float, 4> packed(input_dims);
// We generate multiple input tensors, around 512mb in total size to measure
// realistic workload when input data in not in L1-L3 cache.
size_t input_bytes = input_dims.TotalSize() * sizeof(float);
size_t mem_size_bytes = 1024 * 1024 * 512;
size_t num_inputs =
std::max(static_cast<size_t>(1), mem_size_bytes / input_bytes);
std::vector<Tensor<float, 4>> inputs;
std::vector<Evaluator> evaluators;
std::vector<InputMapper> input_mappers;
for (int i = 0; i < num_inputs; ++i) {
inputs.emplace_back(input_dims);
inputs[i].setRandom();
// 1. Extract image patches from input tensor. All strides are `1`.
const auto image_patch_op = TensorImagePatchOp<Dynamic, Dynamic, ArgType>(
inputs[i], //
filter_rows, filter_cols, //
/*row_strides=*/1, /*col_strides=*/1, //
/*in_row_strides=*/1, /*in_col_strides=*/1, //
/*row_inflate_strides=*/1, /*col_inflate_strides=*/1, //
Eigen::PADDING_SAME, /*padding_value=*/0.0);
// 2. Reshape extracted patches into "virtual" 2d tensor.
NewDimension reshape_dims = {
input_depth * filter_rows * filter_cols, // patch size
// PADDING_SAME: output {rows, cols} == input {rows, cols}
input_rows * input_cols * input_batches}; // num_patches
const auto reshape_op =
TensorReshapingOp<NewDimension, decltype(image_patch_op)>(
image_patch_op, reshape_dims);
evaluators.emplace_back(reshape_op, device);
input_mappers.emplace_back(evaluators[i], nocontract_dim, nocontract_dim,
contract_dim, contract_dim);
}
// We read properties of extracted image patches directly from evaluator.
const Index patch_depth = evaluators[0].impl().dimensions()[0];
const Index patch_rows = evaluators[0].impl().dimensions()[1];
const Index patch_cols = evaluators[0].impl().dimensions()[2];
// Number of patches is the same as the maximum column available through the
// InputMapper (SubMapper).
const Index num_patches = evaluators[0].impl().dimensions()[3];
// The size of a single patch, it's the same as the maximum depth available
// through the InputMapper (SubMapper).
const Index patch_size = patch_depth * patch_rows * patch_cols;
PackRhsImpl pack_rhs;
// This is the typical size of the rhs block used in Tensor contractions.
const Index default_depth = 320; // must be multiple of 8
const Index default_cols = 280;
const Index packed_total_size = input_dims.TotalSize();
tensorflow::testing::StartTiming();
for (int i = 0; i < iters; ++i) {
int input_idx =
num_inputs == 1 ? 1 : internal::random<int>(0, num_inputs - 1);
// Depth offset must be a multiple of 8 (float packet size with AVX2).
Index depth_offset = (internal::random<Index>(0, patch_size - 10) / 8) * 8;
Index col_offset = internal::random<Index>(0, num_patches - 10);
Index depth = std::min(default_depth, patch_size - depth_offset);
Index cols = std::min(default_cols, num_patches - col_offset);
// Write packed data to random memory location to emulate cold caches.
Index packed_size = depth * cols;
Index packed_offset =
internal::random<Index>(0, packed_total_size - packed_size - 1);
pack_rhs(packed.data() + packed_offset,
input_mappers[input_idx].getSubMapper(depth_offset, col_offset),
depth, cols);
}
tensorflow::testing::StopTiming();
std::ostringstream stringStream;
stringStream << "patch: depth=" << patch_depth << " rows=" << patch_rows
<< " cols=" << patch_cols << " num_patches=" << num_patches
<< " patch_size=" << patch_size << " num_inputs=" << num_inputs;
tensorflow::testing::SetLabel(stringStream.str());
}
// Builds the benchmark function identifier by token-pasting its arguments.
// For example BM_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5) expands to
// BM_PackRhs_32_64x64_IC32_FC64_5x5.
#define BM_NAME(prefix, N, H, W, C, FC, FH, FW) \
BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW
// Defines a benchmark function that forwards its arguments to PackRhsHelper
// and registers it with BENCHMARK. The expansion has no trailing semicolon,
// so each use site supplies its own.
// NOTE(review): arguments are forwarded positionally, so H lands in the
// helper's `input_cols` parameter and W in `input_rows` (likewise FH ->
// `filter_cols`, FW -> `filter_rows`). This is harmless for square shapes;
// confirm the intended order before adding non-square benchmarks.
#define BM_PackRhs(N, H, W, C, FC, FH, FW) \
static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW)(int iters) { \
PackRhsHelper(iters, N, H, W, C, FC, FH, FW); \
} \
BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW))
// In both benchmarks below the number of input channels (input depth) is
// equal to the number of patch channels (patch depth).
// NOTE: This is the most common case in Tensorflow models.

// Fast path: the input channel dimension is a multiple of the packet size.
BM_PackRhs(/*batch*/ 32, /*image*/ 64, 64, /*channels*/ 32,
           /*num_filters*/ 64, /*filter*/ 5, 5);

// Slow path: the input channel dimension is not a multiple of the packet size.
BM_PackRhs(/*batch*/ 32, /*image*/ 64, 64, /*channels*/ 30,
           /*num_filters*/ 64, /*filter*/ 5, 5);
} // namespace Eigen