Make sure that kernels that are meant to run on GPU are scheduled inline. This
saves a context switch. As a result, when running on GPU, the inception and
translation models are 2 to 3% faster, and the ptb_word_lm model is 10 to 20%
faster while using up to 10% less CPU time.
Change: 128756911
This commit is contained in:
commit f1acb3bd82
parent 0105863ac7
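For context on the scheduling effect: the executor consults IsExpensive() for each ready kernel and, when it returns false, runs the kernel inline on the scheduler's own thread instead of handing it to a thread pool. The fragment below is a minimal sketch under that assumption, not TensorFlow's actual executor; DispatchReady, run, and the queue name are illustrative.

#include <functional>
#include <queue>

#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/lib/core/threadpool.h"

// Hypothetical dispatch loop. With this change, GPU kernels report
// IsExpensive() == false, so they take the inline branch and avoid a
// thread-pool context switch.
void DispatchReady(std::queue<tensorflow::OpKernel*>* ready,
                   tensorflow::thread::ThreadPool* pool,
                   std::function<void(tensorflow::OpKernel*)> run) {
  while (!ready->empty()) {
    tensorflow::OpKernel* kernel = ready->front();
    ready->pop();
    if (kernel->IsExpensive()) {
      // Expensive kernel: hand it off, at the cost of a context switch.
      pool->Schedule([kernel, run]() { run(kernel); });
    } else {
      // Inexpensive kernel: run inline on the scheduler thread.
      run(kernel);
    }
  }
}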
@@ -93,6 +93,10 @@ OpKernel::OpKernel(OpKernelConstruction* context)
                                      &output_name_map_));
   OP_REQUIRES_OK(context, CheckOpDeprecation(context->op_def(),
                                              context->graph_def_version()));
+
+  // Kernels executing on GPU tie very few resources on the CPU where the
+  // scheduler runs: we consider them as inexpensive.
+  expensive_ = context->device_type() != DeviceType(DEVICE_GPU);
 }
 
 OpKernel::~OpKernel() {}
@@ -104,7 +104,7 @@ class OpKernel {
   // Returns true iff this op kernel is considered "expensive". The
   // runtime may use this flag to optimize graph execution for example
   // to "inline" inexpensive kernels.
-  virtual bool IsExpensive() { return true; }
+  virtual bool IsExpensive() { return expensive_; }
 
   // Accessors.
   const NodeDef& def() const { return def_; }
@@ -160,6 +160,7 @@ class OpKernel {
   const bool is_internal_;  // True if this is an internal operation
   NameRangeMap input_name_map_;
   NameRangeMap output_name_map_;
+  bool expensive_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(OpKernel);
 };
@@ -179,6 +180,8 @@ class AsyncOpKernel : public OpKernel {
   AsyncOpKernel* AsAsync() final { return this; }
 
   void Compute(OpKernelContext* context) final;
 
+  bool IsExpensive() override { return true; }
+
 };
 
 // Wraps a tensor that is held by an Op across calls to Compute(). For
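As a usage note, the override point remains available: a synchronous kernel whose Compute() does substantial host-side work can opt back into thread-pool scheduling by overriding IsExpensive(), regardless of device placement. The sketch below is hypothetical; the class name is illustrative and not part of this commit.

#include "tensorflow/core/framework/op_kernel.h"

// Hypothetical kernel that forces off-thread scheduling even on GPU,
// because its Compute() burns CPU time before enqueueing device work.
class HeavyHostPrepOp : public tensorflow::OpKernel {
 public:
  explicit HeavyHostPrepOp(tensorflow::OpKernelConstruction* ctx)
      : OpKernel(ctx) {}

  void Compute(tensorflow::OpKernelContext* ctx) override {
    // ... expensive host-side preprocessing, then device work ...
  }

  // Overrides the device-based default set in the OpKernel constructor.
  bool IsExpensive() override { return true; }
};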