Make sure that kernels that are meant to run on GPU are scheduled inline. This
saves a context switch. As a result, when running on GPU, the inception and
translation models are 2 to 3% faster, and the ptb_word_lm model is 10 to 20%
faster while using up to 10% less CPU time.
Change: 128756911
This commit is contained in:
commit f1acb3bd82
parent 0105863ac7
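For context on the scheduling effect: the executor consults IsExpensive() for each ready kernel and, when it returns false, runs the kernel inline on the scheduler's own thread instead of handing it to a thread pool. The fragment below is a minimal sketch under that assumption, not TensorFlow's actual executor; DispatchReady, run, and the queue name are illustrative.

#include <functional>
#include <queue>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/threadpool.h"

// Hypothetical dispatch loop. With this change, GPU kernels report
// IsExpensive() == false, so they take the inline branch and avoid a
// thread-pool context switch.
void DispatchReady(std::queue<tensorflow::OpKernel*>* ready,
                   tensorflow::thread::ThreadPool* pool,
                   std::function<void(tensorflow::OpKernel*)> run) {
  while (!ready->empty()) {
    tensorflow::OpKernel* kernel = ready->front();
    ready->pop();
    if (kernel->IsExpensive()) {
      // Expensive kernel: hand it off, at the cost of a context switch.
      pool->Schedule([kernel, run]() { run(kernel); });
    } else {
      // Inexpensive kernel: run inline on the scheduler thread.
      run(kernel);
    }
  }
}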
@@ -93,6 +93,10 @@ OpKernel::OpKernel(OpKernelConstruction* context)
                                      &output_name_map_));
   OP_REQUIRES_OK(context, CheckOpDeprecation(context->op_def(),
                                              context->graph_def_version()));
+
+  // Kernels executing on GPU tie very few resources on the CPU where the
+  // scheduler runs: we consider them as inexpensive.
+  expensive_ = context->device_type() != DeviceType(DEVICE_GPU);
 }
 
 OpKernel::~OpKernel() {}
@@ -104,7 +104,7 @@ class OpKernel {
   // Returns true iff this op kernel is considered "expensive". The
   // runtime may use this flag to optimize graph execution for example
   // to "inline" inexpensive kernels.
-  virtual bool IsExpensive() { return true; }
+  virtual bool IsExpensive() { return expensive_; }
 
   // Accessors.
   const NodeDef& def() const { return def_; }
@@ -160,6 +160,7 @@ class OpKernel {
   const bool is_internal_;  // True if this is an internal operation
   NameRangeMap input_name_map_;
   NameRangeMap output_name_map_;
+  bool expensive_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernel);
 };
@@ -179,6 +180,8 @@ class AsyncOpKernel : public OpKernel {
   AsyncOpKernel* AsAsync() final { return this; }
 
   void Compute(OpKernelContext* context) final;
 
+  bool IsExpensive() override { return true; }
+
 };
 
 // Wraps a tensor that is held by an Op across calls to Compute(). For
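As a usage note, the override point remains available: a synchronous kernel whose Compute() does substantial host-side work can opt back into thread-pool scheduling by overriding IsExpensive(), regardless of device placement. The sketch below is hypothetical; the class name is illustrative and not part of this commit.

#include "tensorflow/core/framework/op_kernel.h"

// Hypothetical kernel that forces off-thread scheduling even on GPU,
// because its Compute() burns CPU time before enqueueing device work.
class HeavyHostPrepOp : public tensorflow::OpKernel {
 public:
  explicit HeavyHostPrepOp(tensorflow::OpKernelConstruction* ctx)
      : OpKernel(ctx) {}

  void Compute(tensorflow::OpKernelContext* ctx) override {
    // ... expensive host-side preprocessing, then device work ...
  }

  // Overrides the device-based default set in the OpKernel constructor.
  bool IsExpensive() override { return true; }
};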