Make sure that kernels that are meant to run on GPU are scheduled inline. This saves a context switch.
As a result, when running on GPU, the inception and translation models are 2 to 3% faster, and the ptb_word_lm model is 10 to 20% faster while using up to 10% less CPU time.
Change: 128756911
This commit is contained in:
Benoit Steiner 2016-07-28 15:49:00 -08:00 committed by TensorFlower Gardener
parent 0105863ac7
commit f1acb3bd82
2 changed files with 8 additions and 1 deletions

View File

@ -93,6 +93,10 @@ OpKernel::OpKernel(OpKernelConstruction* context)
&output_name_map_));
OP_REQUIRES_OK(context, CheckOpDeprecation(context->op_def(),
context->graph_def_version()));
// Kernels executing on GPU tie very few resources on the CPU where the
// scheduler runs: we consider them as inexpensive.
expensive_ = context->device_type() != DeviceType(DEVICE_GPU);
}
OpKernel::~OpKernel() {}

View File

@ -104,7 +104,7 @@ class OpKernel {
// Returns true iff this op kernel is considered "expensive". The
// runtime may use this flag to optimize graph execution for example
// to "inline" inexpensive kernels.
virtual bool IsExpensive() { return true; }
virtual bool IsExpensive() { return expensive_; }
// Accessors.
const NodeDef& def() const { return def_; }
@ -160,6 +160,7 @@ class OpKernel {
const bool is_internal_; // True if this is an internal operation
NameRangeMap input_name_map_;
NameRangeMap output_name_map_;
bool expensive_;
TF_DISALLOW_COPY_AND_ASSIGN(OpKernel);
};
@ -179,6 +180,8 @@ class AsyncOpKernel : public OpKernel {
AsyncOpKernel* AsAsync() final { return this; }
void Compute(OpKernelContext* context) final;
bool IsExpensive() override { return true; }
};
// Wraps a tensor that is held by an Op across calls to Compute(). For