Merge pull request from Intel-tensorflow:sshiddib/dnnl_threadpool_ops2

PiperOrigin-RevId: 317315002
Change-Id: If80d7892810ed13c5406864c0ae6130ec6701ed6
This commit is contained in:
TensorFlower Gardener 2020-06-19 08:52:38 -07:00
commit 6f2be48cde
6 changed files with 40 additions and 16 deletions

View File

@ -178,6 +178,9 @@ class MklAddNOp : public OpKernel {
dnn_fmt = MklTensorFormatToMklDnnDataFormat(mkl_data_format);
}
std::shared_ptr<stream> fwd_cpu_stream;
fwd_cpu_stream.reset(CreateStream(ctx, cpu_engine));
// Create memory descriptor for MKL-DNN.
// If all input in Tensorflow format, create block memory descriptor,
// else convert TF format to MKL memory descriptor
@ -215,6 +218,7 @@ class MklAddNOp : public OpKernel {
srcs_pd.push_back(memory::primitive_desc(md, cpu_engine));
#endif
src.SetUsrMem(md, &src_tensor);
src.SetUsrMemDataHandle(&src_tensor, fwd_cpu_stream);
inputs.push_back(src.GetOpMem());
}
@ -240,11 +244,10 @@ class MklAddNOp : public OpKernel {
}
AllocateOutputSetMklShape(ctx, kOutputIdx, &dst_tensor, output_tf_shape,
output_mkl_shape);
dst.SetUsrMemDataHandle(dst_tensor);
dst.SetUsrMemDataHandle(dst_tensor, fwd_cpu_stream);
// Create Sum op, and submit net for execution.
std::vector<primitive> net;
stream* fwd_cpu_stream = CreateStream(ctx, cpu_engine);
#ifdef ENABLE_MKLDNN_V1
mkldnn::sum sum_op(sum_pd);
std::unordered_map<int, memory> net_args = {

View File

@ -281,11 +281,19 @@ class MklConcatFwdPrimitive : public MklPrimitive {
std::shared_ptr<stream> fwd_stream) {
DCHECK_EQ(in_data.size(), context_.data_mem.size());
for (size_t i = 0; i < concat_fwd_dims.num_inputs; i++) {
#ifdef ENABLE_MKLDNN_THREADPOOL
context_.data_mem_shdptr[i]->set_data_handle(
static_cast<void*>(in_data[i].get_data_handle()), *fwd_stream);
}
context_.dst_mem->set_data_handle(
static_cast<void*>(dst_data.get_data_handle()), *fwd_stream);
#else
context_.data_mem_shdptr[i]->set_data_handle(
static_cast<void*>(in_data[i].get_data_handle()));
}
context_.dst_mem->set_data_handle(
static_cast<void*>(dst_data.get_data_handle()));
#endif // ENABLE_MKLDNN_THREADPOOL
for (size_t i = 0; i < concat_fwd_dims.num_inputs; i++) {
context_.data_mem[i] = *context_.data_mem_shdptr[i];
@ -788,11 +796,13 @@ class MklConcatOp : public OpKernel {
dnn_shape_dst);
DCHECK(dst_tensor != nullptr) << "Output tensor pointer is NULL";
std::shared_ptr<stream> fwd_cpu_stream;
fwd_cpu_stream.reset(CreateStream(context, cpu_engine));
if (dnn_shape_dst.IsMklTensor())
dst_md = dnn_shape_dst.GetMklLayout();
dst.SetUsrMem(dst_md, dst_tensor);
std::shared_ptr<stream> fwd_cpu_stream;
fwd_cpu_stream.reset(CreateStream(context, cpu_engine));
dst.SetUsrMemDataHandle(dst_tensor, fwd_cpu_stream);
#ifdef ENABLE_MKLDNN_V1
auto concat_op = concat(concat_pd);
std::unordered_map<int, memory> net_args = {
@ -830,9 +840,10 @@ class MklConcatOp : public OpKernel {
dst_md = dnn_shape_dst.IsMklTensor() ? dnn_shape_dst.GetMklLayout()
: dst_md;
dst.SetUsrMem(dst_md, dst_tensor);
std::shared_ptr<stream> fwd_cpu_stream;
fwd_cpu_stream.reset(CreateStream(context, concat_fwd->GetEngine()));
dst.SetUsrMem(dst_md, dst_tensor);
dst.SetUsrMemDataHandle(dst_tensor, fwd_cpu_stream);
// Execute concat
concat_fwd->Execute(srcs_mem, dst.GetOpMem(), concat_fwd_dims,
fwd_cpu_stream);

View File

@ -75,6 +75,9 @@ class MklDequantizeOp : public OpKernel {
MklDnnData<T> src(&cpu_engine);
MklDnnData<float> dst(&cpu_engine);
std::shared_ptr<stream> reorder_stream;
reorder_stream.reset(CreateStream(ctx, cpu_engine));
// If input is in MKL layout, then simply grab input layout; otherwise,
// construct input TF layout. For TF layout, although input shape
// (src_dims) required is in MKL-DNN order, the layout is Tensorflow's
@ -85,6 +88,7 @@ class MklDequantizeOp : public OpKernel {
: memory::desc(src_dims, MklDnnType<T>(), MEMORY_FORMAT::nhwc);
src.SetUsrMem(src_md, &src_tensor);
src.SetUsrMemDataHandle(&src_tensor, reorder_stream);
Tensor* output_tensor = nullptr;
MklDnnShape output_mkl_shape;
@ -129,6 +133,7 @@ class MklDequantizeOp : public OpKernel {
AllocateOutputSetMklShape(ctx, 0, &output_tensor, output_tf_shape,
output_mkl_shape);
dst.SetUsrMem(dst_md, output_tensor);
dst.SetUsrMemDataHandle(output_tensor, reorder_stream);
// The quantization logic here for mode SCALED is similar to the logic
// in QuantizeAndDequantizeV2 and QuantizeAndDequantizeV3.
@ -155,8 +160,6 @@ class MklDequantizeOp : public OpKernel {
// Also it does not define round_nearest (enum).
attr.set_int_output_round_mode(mkldnn::round_mode::round_nearest);
#endif // !ENABLE_MKLDNN_V1
std::shared_ptr<stream> reorder_stream;
reorder_stream.reset(CreateStream(ctx, cpu_engine));
std::vector<primitive> net;
// Create reorder primitive and then execute.

View File

@ -137,6 +137,7 @@ class MklLRNOp : public OpKernel {
// that input is in NHWC layout with Channel being the last dimension.
src_dnn_data.SetUsrMem(src_md, &src_tensor);
src_dnn_data.SetOpMemDesc(input_dims, MEMORY_FORMAT::nhwc);
src_dnn_data.SetUsrMemDataHandle(&src_tensor, fwd_stream_);
// dst_dnn_data has the same shape as input.
dst_dnn_data.SetUsrMem(src_md);
@ -157,7 +158,7 @@ class MklLRNOp : public OpKernel {
&output_tensor);
OP_REQUIRES_OK(context, context->status());
DCHECK(output_tensor != nullptr);
dst_dnn_data.SetUsrMemDataHandle(output_tensor);
dst_dnn_data.SetUsrMemDataHandle(output_tensor, fwd_stream_);
// Handle workspace required for MKL-DNN.
AllocateWorkspaceTensor(context, lrn_prim_desc, &workspace_dnn_data);
@ -393,6 +394,7 @@ class MklLRNGradOp : public OpKernel {
orig_input_dnn_shape.GetSizesAsMklDnnDims();
orig_input_dnn_data.SetUsrMem(orig_input_md, &orig_input_tensor);
orig_input_dnn_data.SetOpMemDesc(orig_input_dims, MEMORY_FORMAT::nhwc);
orig_input_dnn_data.SetUsrMemDataHandle(&orig_input_tensor, bwd_stream_);
// output_dnn_data has the same shape as original input
output_dnn_data.SetUsrMem(orig_input_md);
@ -421,7 +423,7 @@ class MklLRNGradOp : public OpKernel {
orig_input_format, &output_tensor);
OP_REQUIRES_OK(context, context->status());
DCHECK(output_tensor != nullptr);
output_dnn_data.SetUsrMemDataHandle(output_tensor);
output_dnn_data.SetUsrMemDataHandle(output_tensor, bwd_stream_);
// Create LRN primitive and add it to the net
// At this point, workspace is enabled, so we don't need

View File

@ -137,6 +137,7 @@ Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor,
memory::dims out_strides =
ReorderStrides(CalculateTFStrides(out_dims), perm);
std::shared_ptr<stream> transpose_stream;
in.SetUsrMem(in_dims, in_strides, &in_tensor);
// Output dimensions are same as input dimensions. We adjust the layout
// using strides.
@ -144,16 +145,16 @@ Status MKLTransposeND(OpKernelContext* context, const Tensor& in_tensor,
std::vector<primitive> net;
#ifdef ENABLE_MKLDNN_V1
std::shared_ptr<stream> transpose_stream;
auto* prim = FindOrCreateReorder<T>(in.GetUsrMem(), out.GetUsrMem());
transpose_stream.reset(CreateStream(context, prim->GetEngine()));
in.SetUsrMemDataHandle(&in_tensor, transpose_stream);
out.SetUsrMemDataHandle(out_tensor, transpose_stream);
net.push_back(*(prim->GetPrimitive()));
std::vector<MemoryArgsMap> net_args;
net_args.push_back({{MKLDNN_ARG_FROM, *in.GetUsrMem()},
{MKLDNN_ARG_TO, *out.GetUsrMem()}});
execute_primitives(net, transpose_stream, net_args);
#else
std::shared_ptr<stream> transpose_stream;
transpose_stream.reset(new CPU_STREAM(cpu_engine));
net.push_back(FindOrCreateReorder<T>(in.GetUsrMem(), out.GetUsrMem()));
transpose_stream->submit(net).wait();

View File

@ -1543,17 +1543,21 @@ class MklDnnData {
}
/// Set function for data buffer of user memory primitive.
inline void SetUsrMemDataHandle(void* data_buffer) {
/// Points the user memory primitive at `data_buffer`.
///
/// @param data_buffer Non-null buffer to install as the data handle.
/// @param t_stream    Stream handed to set_data_handle() when
///                    ENABLE_MKLDNN_THREADPOOL is defined; ignored otherwise.
///                    It defaults to nullptr, but it is dereferenced in
///                    threadpool builds and therefore must be non-null there.
inline void SetUsrMemDataHandle(void* data_buffer,
                                std::shared_ptr<stream> t_stream = nullptr) {
  CHECK_NOTNULL(user_memory_);
  CHECK_NOTNULL(data_buffer);
#ifdef ENABLE_MKLDNN_THREADPOOL
  // Fail loudly instead of dereferencing a null stream when a caller relied
  // on the default argument.
  CHECK_NOTNULL(t_stream.get());
  user_memory_->set_data_handle(data_buffer, *t_stream);
#else
  user_memory_->set_data_handle(data_buffer);
#endif  // ENABLE_MKLDNN_THREADPOOL
}
/// Set function for data buffer of user memory primitive.
inline void SetUsrMemDataHandle(const Tensor* tensor) {
CHECK_NOTNULL(user_memory_);
CHECK_NOTNULL(tensor);
user_memory_->set_data_handle(GetTensorBuffer(tensor));
inline void SetUsrMemDataHandle(const Tensor* tensor,
std::shared_ptr<stream> t_stream = nullptr) {
SetUsrMemDataHandle(GetTensorBuffer(tensor), t_stream);
}
/// allocate function for data buffer