Sped up construction of the BCast helper class for the very common case
where both shapes are the same, by using much more straightforward code
to initialize the instance variables.

Added a benchmark for this to bcast_test.cc.  This speeds up the
same-shape case by 65% (67 ns to 23 ns for a two-dimensional shape)
with no real effect on the different-shapes case.

Run on machine with (40 X 2801 MHz CPUs); 2016/01/28-11:12:26
CPU: Intel Ivybridge with HyperThreading (20 cores) dL1:32KB dL2:256KB dL3:25MB
Benchmark                          Base (ns)  New (ns) Improvement
------------------------------------------------------------------
BM_BCastSetup/0                          122       122     +0.0%
BM_BCastSetup/1                           67        23    +65.7%
Change: 113374076
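
For orientation, a minimal sketch (not part of the commit) of what the fast path produces for the benchmarked same-shape case. The accessor names (x_reshape(), result_shape(), and friends) are assumptions taken from bcast.h and worth verifying against the header:

#include "tensorflow/core/util/bcast.h"

// Sketch: for identical shapes, BCast collapses both operands into a single
// flattened dimension (1000 * 100 = 100000) and records no broadcasts.
void SameShapeFastPathSketch() {
  tensorflow::BCast b({1000, 100}, {1000, 100});
  // b.IsValid()           -> true
  // b.x_reshape()         -> {100000}     b.x_bcast() -> {1}
  // b.y_reshape()         -> {100000}     b.y_bcast() -> {1}
  // b.result_shape()      -> {100000}     b.output_shape() -> {1000, 100}
  // b.grad_x_reduce_idx() -> {}           b.grad_y_reduce_idx() -> {}
}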
commit 938902bade (parent 5dc0ab7565)
Author: A. Unique TensorFlower, 2016-01-29 10:20:54 -08:00
Committed by: Vijay Vasudevan
3 changed files with 148 additions and 106 deletions

tensorflow/core/util/bcast.cc

The constructor as it reads after this change (the pre-existing algorithm moves, re-indented but otherwise unchanged, into the else branch):

@@ -22,114 +22,132 @@ namespace tensorflow {

void BCast::Reverse(Vec* shape) { std::reverse(shape->begin(), shape->end()); }

BCast::BCast(const Vec& sx, const Vec& sy) {
  if (sx == sy) {
    // Fast path for common case of identical shapes for sx and sy
    int64 elements = 1;
    const int n = sx.size();
    output_.resize(n);
    for (int i = 0; i < n; i++) {
      int64 dim = sx[i];
      elements *= dim;
      output_[i] = dim;
    }
    x_reshape_.push_back(elements);
    y_reshape_.push_back(elements);
    x_bcast_.push_back(1);
    y_bcast_.push_back(1);
    result_.push_back(elements);
    // grad_x_reduce_ and grad_y_reduce_ are left as empty
  } else {
    // Reverse the shape of x and y for convenience.
    // After the reverse, 0-th is the inner-most dimension.
    Vec x = sx;
    Reverse(&x);
    Vec y = sy;
    Reverse(&y);

    // 1-extend and align x and y so that they are the same size.
    if (x.size() > y.size()) {
      y.resize(x.size(), 1);
    } else {
      x.resize(y.size(), 1);
    }

    // Going through each dimension starting from the inner-most
    // dimension, compares dimension of x and y. They are compatible if
    // they are equal or either is 1.
    enum State {
      UNKNOWN,
      SAME,
      X_ONE,
      Y_ONE,
    };
    State prev = UNKNOWN;
    const int64 n = x.size();
    for (int i = 0; i < n; ++i) {
      // Output shape.
      State curr = UNKNOWN;
      const int64 x_i = x[i];  // i-th dimension of x.
      CHECK_GE(x_i, 0);
      const int64 y_i = y[i];  // i-th dimension of y.
      CHECK_GE(y_i, 0);
      int64 o_i;   // i-th dimension of the output.
      int64 bx_i;  // i-th broadcast for x.
      int64 by_i;  // i-th broadcast for y.
      // Invariant:
      //   o_i = x_i * bx_i = y_i * by_i
      if (x_i == y_i) {
        // No broadcast.
        o_i = x_i;
        bx_i = 1;
        by_i = 1;
        curr = SAME;
      } else if (x_i == 1) {
        // x broadcast to y on this dimension.
        o_i = y_i;
        bx_i = y_i;
        by_i = 1;
        grad_x_reduce_idx_.push_back(n - 1 - i);
        curr = X_ONE;
      } else if (y_i == 1) {
        // y broadcast to x on this dimension.
        o_i = x_i;
        bx_i = 1;
        by_i = x_i;
        grad_y_reduce_idx_.push_back(n - 1 - i);
        curr = Y_ONE;
      } else {
        valid_ = false;
        return;
      }
      output_.push_back(o_i);
      // Reshape/broadcast.
      // Invariant:
      //   result[i] == x_reshape[i] * x_bcast[i] == y_reshape_[i] * y_bcast_[i]
      if (curr == SAME && x_i == 1) {
        // Both side are 1s.
        grad_x_reduce_idx_.push_back(n - 1 - i);
        grad_y_reduce_idx_.push_back(n - 1 - i);
        continue;
      } else if (prev == curr) {
        // It is a run of the same cases (no broadcast, x broadcast to
        // y, y broadcast to x). We can reshape the input so that fewer
        // dimensions are involved in the intermediate computation.
        result_.back() *= o_i;
        x_reshape_.back() *= x_i;
        x_bcast_.back() *= bx_i;
        y_reshape_.back() *= y_i;
        y_bcast_.back() *= by_i;
      } else {
        result_.push_back(o_i);
        x_reshape_.push_back(x_i);
        x_bcast_.push_back(bx_i);
        y_reshape_.push_back(y_i);
        y_bcast_.push_back(by_i);
      }
      prev = curr;
    }

    if (result_.empty()) {
      // Can happen when both x and y are effectively scalar.
      result_.push_back(1);
      x_reshape_.push_back(1);
      x_bcast_.push_back(1);
      y_reshape_.push_back(1);
      y_bcast_.push_back(1);
    }

    // Reverse all vectors since x and y were reversed at very
    // beginning.
    Reverse(&x_reshape_);
    Reverse(&x_bcast_);
    Reverse(&y_reshape_);
    Reverse(&y_bcast_);
    Reverse(&result_);
    Reverse(&output_);
    Reverse(&grad_x_reduce_idx_);
    Reverse(&grad_y_reduce_idx_);
  }
}
} // end namespace tensorflow
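
To make the run-merging in the else branch concrete, here is a hand-traced sketch (mine, not from the commit; accessor names assumed from bcast.h):

#include "tensorflow/core/util/bcast.h"

// Hand-traced: in BCast({2, 3, 5}, {5}), y aligns as {1, 1, 5}, so dimensions
// 2 and 3 are a run of "y broadcast to x" and merge into one dimension of 6.
void RunMergingSketch() {
  tensorflow::BCast b({2, 3, 5}, {5});
  // b.x_reshape()         -> {6, 5}       b.x_bcast() -> {1, 1}
  // b.y_reshape()         -> {1, 5}       b.y_bcast() -> {6, 1}
  // b.result_shape()      -> {6, 5}       b.output_shape() -> {2, 3, 5}
  // b.grad_y_reduce_idx() -> {0, 1}  (sum y's gradient over output dims 0, 1)
}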

tensorflow/core/util/bcast.h

@@ -67,7 +67,7 @@ namespace tensorflow {
 // TODO(zhifengc): Adds support for n-ary (n >= 2).
 class BCast {
  public:
-  // A vector of int32 representing the shape of tensor. The 0-th
+  // A vector of int64 representing the shape of tensor. The 0-th
   // element is the outer-most dimension and the last element is the
   // inner-most dimension. Note that we do not use TensorShape since
   // it's more convenient to manipulate Vec directly for this module.
@@ -104,7 +104,6 @@ class BCast {
   Vec grad_y_reduce_idx_;

   static void Reverse(Vec* shape);
-  static bool HasZero(const Vec& shape);

   TF_DISALLOW_COPY_AND_ASSIGN(BCast);
 };

tensorflow/core/util/bcast_test.cc

@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"

 namespace tensorflow {
 namespace {
@@ -57,6 +58,15 @@ TEST(BCastTest, Basic_SameShape) {
             "[][]");
 }

+TEST(BCastTest, Basic_SameShapeWithZeroDim) {
+  // Effectively no broadcast needed.
+  EXPECT_EQ(BCast({11, 7, 0, 3, 2}, {11, 7, 0, 3, 2}),
+            "[0][1][0][1]"
+            "[0]"
+            "[11,7,0,3,2]"
+            "[][]");
+}
+
 TEST(BCastTest, Basic_Scalar_Scalar) {
   // Effectively it's a scalar and a scalar.
   // [1, 1] [1]
@@ -237,5 +247,20 @@ TEST(BCastTest, TestZeroDimensionShape) {
             "[0,1,3][]");
 }

+static void BM_BCastSetup(int iters, int same_shape) {
+  if (same_shape) {
+    testing::SetLabel("same_shapes");
+    while (--iters > 0) {
+      class BCast b({1000, 100}, {1000, 100});
+    }
+  } else {
+    testing::SetLabel("different_shapes");
+    while (--iters > 0) {
+      class BCast b({3, 1, 5}, {2, 0, 3, 0, 5});
+    }
+  }
+}
+BENCHMARK(BM_BCastSetup)->Arg(0)->Arg(1);
+
 }  // namespace
 }  // namespace tensorflow
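
The bracketed expectation strings above compare against a string-returning BCast() helper defined near the top of bcast_test.cc; its presence is also why the benchmark writes "class BCast", to name the class rather than the helper. A sketch of its shape, reconstructed from the expected strings rather than copied from the file:

// Assumed reconstruction: joins the eight result vectors of a BCast, each as
// a bracketed, comma-separated list, or returns "invalid" for bad shapes.
std::string BCast(const tensorflow::BCast::Vec& x,
                  const tensorflow::BCast::Vec& y) {
  tensorflow::BCast b(x, y);
  if (!b.IsValid()) return "invalid";
  std::string ret;
  for (const auto* v : {&b.x_reshape(), &b.x_bcast(), &b.y_reshape(),
                        &b.y_bcast(), &b.result_shape(), &b.output_shape(),
                        &b.grad_x_reduce_idx(), &b.grad_y_reduce_idx()}) {
    strings::StrAppend(&ret, "[", str_util::Join(*v, ","), "]");
  }
  return ret;
}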