|
|
|
@ -14,9 +14,9 @@ limitations under the License.
|
|
|
|
|
==============================================================================*/
|
|
|
|
|
|
|
|
|
|
#include "tensorflow/core/kernels/eigen_spatial_convolutions.h"
|
|
|
|
|
#include "tensorflow/core/framework/types.h"
|
|
|
|
|
#include "tensorflow/core/kernels/eigen_cuboid_convolution.h"
|
|
|
|
|
#include "tensorflow/core/platform/test.h"
|
|
|
|
|
#include "tensorflow/core/platform/test_benchmark.h"
|
|
|
|
|
|
|
|
|
|
namespace Eigen {
|
|
|
|
|
|
|
|
|
@ -1373,4 +1373,187 @@ TEST(EigenSpatialConvolutionsTest, SpatialConvContractionMapper) {
|
|
|
|
|
EigenApprox(8.0f, direct(0, 1, 3, 0));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void PackRhsHelper(int iters,
|
|
|
|
|
/* Input dimensions: */
|
|
|
|
|
int input_batches, int input_cols, int input_rows,
|
|
|
|
|
int input_depth,
|
|
|
|
|
/* Filter (kernel) dimensions: */
|
|
|
|
|
int filter_count, int filter_cols, int filter_rows) {
|
|
|
|
|
tensorflow::testing::UseRealTime();
|
|
|
|
|
tensorflow::testing::StopTiming();
|
|
|
|
|
|
|
|
|
|
using Dimensions = Eigen::DSizes<Eigen::Index, 4>;
|
|
|
|
|
|
|
|
|
|
// Default Eigen::Tensor layout is column major, so we configure dimensions
|
|
|
|
|
// starting from the inner most (channels aka depth in this case).
|
|
|
|
|
Dimensions input_dims(input_depth, input_rows, input_cols, input_batches);
|
|
|
|
|
|
|
|
|
|
using Traits = typename Eigen::internal::gebp_traits<float, float>;
|
|
|
|
|
static const int packet_size = Eigen::internal::packet_traits<float>::size;
|
|
|
|
|
|
|
|
|
|
// Reshape dimensions.
|
|
|
|
|
using NewDimension = Eigen::array<Eigen::Index, 2>;
|
|
|
|
|
|
|
|
|
|
// Contraction dimensions.
|
|
|
|
|
using nocontract_t = Eigen::array<Eigen::Index, 1>;
|
|
|
|
|
using contract_t = Eigen::array<Eigen::Index, 1>;
|
|
|
|
|
|
|
|
|
|
// Input to the TensorImagePatchOp.
|
|
|
|
|
using ArgType = Tensor<float, 4>;
|
|
|
|
|
|
|
|
|
|
using Evaluator = TensorEvaluator<
|
|
|
|
|
const TensorReshapingOp<
|
|
|
|
|
NewDimension, const TensorImagePatchOp<Dynamic, Dynamic, ArgType>>,
|
|
|
|
|
Eigen::DefaultDevice>;
|
|
|
|
|
|
|
|
|
|
using InputMapper = Eigen::internal::TensorContractionInputMapper<
|
|
|
|
|
float, Index, Eigen::internal::Rhs, Evaluator, //
|
|
|
|
|
nocontract_t, contract_t, //
|
|
|
|
|
packet_size, //
|
|
|
|
|
/*inner_dim_contiguous*/ true, //
|
|
|
|
|
/*inner_dim_reordered*/ false, //
|
|
|
|
|
/*Alignment*/ 0>;
|
|
|
|
|
|
|
|
|
|
using SubMapper = Eigen::internal::TensorContractionSubMapper<
|
|
|
|
|
float, Index, Eigen::internal::Rhs, Evaluator, //
|
|
|
|
|
nocontract_t, contract_t, //
|
|
|
|
|
packet_size, //
|
|
|
|
|
/*inner_dim_contiguous*/ true, //
|
|
|
|
|
/*inner_dim_reordered*/ false, //
|
|
|
|
|
/*Alignment*/ 0>;
|
|
|
|
|
|
|
|
|
|
using PackRhsImpl =
|
|
|
|
|
Eigen::internal::gemm_pack_rhs<float, Eigen::Index, SubMapper, //
|
|
|
|
|
Traits::nr, //
|
|
|
|
|
ColMajor, //
|
|
|
|
|
/*Conjugate*/ false, //
|
|
|
|
|
/*PanelMode*/ false>;
|
|
|
|
|
|
|
|
|
|
Eigen::DefaultDevice device;
|
|
|
|
|
|
|
|
|
|
// Actual contract dimensions are not important.
|
|
|
|
|
const Eigen::Index not_important = -1234;
|
|
|
|
|
nocontract_t nocontract_dim = {not_important};
|
|
|
|
|
contract_t contract_dim = {not_important};
|
|
|
|
|
|
|
|
|
|
// We use tensor of the same dimensions to store packed data.
|
|
|
|
|
Tensor<float, 4> packed(input_dims);
|
|
|
|
|
|
|
|
|
|
// We generate multiple input tensors, around 512mb in total size to measure
|
|
|
|
|
// realistic workload when input data in not in L1-L3 cache.
|
|
|
|
|
size_t input_bytes = input_dims.TotalSize() * sizeof(float);
|
|
|
|
|
size_t mem_size_bytes = 1024 * 1024 * 512;
|
|
|
|
|
size_t num_inputs =
|
|
|
|
|
std::max(static_cast<size_t>(1), mem_size_bytes / input_bytes);
|
|
|
|
|
|
|
|
|
|
std::vector<Tensor<float, 4>> inputs;
|
|
|
|
|
std::vector<Evaluator> evaluators;
|
|
|
|
|
std::vector<InputMapper> input_mappers;
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < num_inputs; ++i) {
|
|
|
|
|
inputs.emplace_back(input_dims);
|
|
|
|
|
inputs[i].setRandom();
|
|
|
|
|
|
|
|
|
|
// 1. Extract image patches from input tensor. All strides are `1`.
|
|
|
|
|
const auto image_patch_op = TensorImagePatchOp<Dynamic, Dynamic, ArgType>(
|
|
|
|
|
inputs[i], //
|
|
|
|
|
filter_rows, filter_cols, //
|
|
|
|
|
/*row_strides=*/1, /*col_strides=*/1, //
|
|
|
|
|
/*in_row_strides=*/1, /*in_col_strides=*/1, //
|
|
|
|
|
/*row_inflate_strides=*/1, /*col_inflate_strides=*/1, //
|
|
|
|
|
Eigen::PADDING_SAME, /*padding_value=*/0.0);
|
|
|
|
|
|
|
|
|
|
// 2. Reshape extracted patches into "virtual" 2d tensor.
|
|
|
|
|
NewDimension reshape_dims = {
|
|
|
|
|
input_depth * filter_rows * filter_cols, // patch size
|
|
|
|
|
// PADDING_SAME: output {rows, cols} == input {rows, cols}
|
|
|
|
|
input_rows * input_cols * input_batches}; // num_patches
|
|
|
|
|
const auto reshape_op =
|
|
|
|
|
TensorReshapingOp<NewDimension, decltype(image_patch_op)>(
|
|
|
|
|
image_patch_op, reshape_dims);
|
|
|
|
|
|
|
|
|
|
evaluators.emplace_back(reshape_op, device);
|
|
|
|
|
|
|
|
|
|
input_mappers.emplace_back(evaluators[i], nocontract_dim, nocontract_dim,
|
|
|
|
|
contract_dim, contract_dim);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// We read properties of extracted image patches directly from evaluator.
|
|
|
|
|
const Index patch_depth = evaluators[0].impl().dimensions()[0];
|
|
|
|
|
const Index patch_rows = evaluators[0].impl().dimensions()[1];
|
|
|
|
|
const Index patch_cols = evaluators[0].impl().dimensions()[2];
|
|
|
|
|
|
|
|
|
|
// Number of patches is the same as the maximum column available through the
|
|
|
|
|
// InputMapper (SubMapper).
|
|
|
|
|
const Index num_patches = evaluators[0].impl().dimensions()[3];
|
|
|
|
|
|
|
|
|
|
// The size of a single patch, it's the same as the maximum depth available
|
|
|
|
|
// through the InputMapper (SubMapper).
|
|
|
|
|
const Index patch_size = patch_depth * patch_rows * patch_cols;
|
|
|
|
|
|
|
|
|
|
PackRhsImpl pack_rhs;
|
|
|
|
|
|
|
|
|
|
// This is the typical size of the rhs block used in Tensor contractions.
|
|
|
|
|
const Index default_depth = 320; // must be multiple of 8
|
|
|
|
|
const Index default_cols = 280;
|
|
|
|
|
|
|
|
|
|
const Index packed_total_size = input_dims.TotalSize();
|
|
|
|
|
|
|
|
|
|
tensorflow::testing::StartTiming();
|
|
|
|
|
for (int i = 0; i < iters; ++i) {
|
|
|
|
|
int input_idx =
|
|
|
|
|
num_inputs == 1 ? 1 : internal::random<int>(0, num_inputs - 1);
|
|
|
|
|
|
|
|
|
|
// Depth offset must be a multiple of 8 (float packet size with AVX2).
|
|
|
|
|
Index depth_offset = (internal::random<Index>(0, patch_size - 10) / 8) * 8;
|
|
|
|
|
Index col_offset = internal::random<Index>(0, num_patches - 10);
|
|
|
|
|
|
|
|
|
|
Index depth = std::min(default_depth, patch_size - depth_offset);
|
|
|
|
|
Index cols = std::min(default_cols, num_patches - col_offset);
|
|
|
|
|
|
|
|
|
|
// Write packed data to random memory location to emulate cold caches.
|
|
|
|
|
Index packed_size = depth * cols;
|
|
|
|
|
Index packed_offset =
|
|
|
|
|
internal::random<Index>(0, packed_total_size - packed_size - 1);
|
|
|
|
|
|
|
|
|
|
pack_rhs(packed.data() + packed_offset,
|
|
|
|
|
input_mappers[input_idx].getSubMapper(depth_offset, col_offset),
|
|
|
|
|
depth, cols);
|
|
|
|
|
}
|
|
|
|
|
tensorflow::testing::StopTiming();
|
|
|
|
|
|
|
|
|
|
std::ostringstream stringStream;
|
|
|
|
|
stringStream << "patch: depth=" << patch_depth << " rows=" << patch_rows
|
|
|
|
|
<< " cols=" << patch_cols << " num_patches=" << num_patches
|
|
|
|
|
<< " patch_size=" << patch_size << " num_inputs=" << num_inputs;
|
|
|
|
|
tensorflow::testing::SetLabel(stringStream.str());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Builds a benchmark function identifier from a prefix and the benchmark
// arguments by token pasting. For example,
//   BM_NAME(PackRhs, 32, 64, 64, 32, 64, 5, 5)
// expands to
//   BM_PackRhs_32_64x64_IC32_FC64_5x5
// The macro body must directly follow the line continuation; nothing else may
// appear between the `\` and the pasted expression.
#define BM_NAME(prefix, N, H, W, C, FC, FH, FW) \
  BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW
|
|
|
|
|
|
|
|
|
|
// Defines a benchmark function named by BM_NAME(PackRhs, ...) that forwards
// its arguments to PackRhsHelper, and registers it with BENCHMARK.
//
// Arguments are forwarded positionally as (N, H, W, C, FC, FH, FW).
// NOTE(review): PackRhsHelper declares its parameters in the order
// (batches, cols, rows, depth, filter_count, filter_cols, filter_rows), so H
// lands in `input_cols` and W in `input_rows` -- confirm this is intended
// before adding non-square configurations.
#define BM_PackRhs(N, H, W, C, FC, FH, FW)                           \
  static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW)(int iters) { \
    PackRhsHelper(iters, N, H, W, C, FC, FH, FW);                   \
  }                                                                  \
  BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW))
|
|
|
|
|
|
|
|
|
|
// Number of input channels (input depth) is equal to the number of patch
// channels (patch depth).
|
|
|
|
|
|
|
|
|
|
// NOTE: This is the most common case in Tensorflow models.
|
|
|
|
|
// Fast path: input channel dimension is the multiple of the packet size.
|
|
|
|
|
BM_PackRhs(/*batch*/ 32, //
|
|
|
|
|
/*image*/ 64, 64, //
|
|
|
|
|
/*channels*/ 32, //
|
|
|
|
|
/*num_filters*/ 64, //
|
|
|
|
|
/*filter*/ 5, 5);
|
|
|
|
|
|
|
|
|
|
// Slow path: input channel dimension is not the multiple of the packet size.
|
|
|
|
|
BM_PackRhs(/*batch*/ 32, //
|
|
|
|
|
/*image*/ 64, 64, //
|
|
|
|
|
/*channels*/ 30, //
|
|
|
|
|
/*num_filters*/ 64, //
|
|
|
|
|
/*filter*/ 5, 5);
|
|
|
|
|
|
|
|
|
|
} // namespace Eigen
|
|
|
|
|