Quantiles TFT Analyzer and Mapper.

Added a Bucketize op suitable for working with TFT.

PiperOrigin-RevId: 163994906
A. Unique TensorFlower 2017-08-02 09:44:34 -07:00 committed by Benoit Steiner
parent 5951ab51a9
commit 6b3cb17b00
3 changed files with 146 additions and 0 deletions
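For reference, a minimal usage sketch of the new op, mirroring how the tests in this change exercise it. The import path for quantile_ops is assumed here (the tests reference the module only as quantile_ops); everything else follows testBucketizeWithInputBoundaries3 below.

import tensorflow as tf
# Assumed import path; not shown in this diff.
from tensorflow.contrib.boosted_trees.python.ops import quantile_ops

with tf.Session() as sess:
  # Boundaries arrive as a tensor, so they can be computed at run time
  # (e.g. from a quantile accumulator) instead of being a fixed attribute.
  boundaries = tf.placeholder(tf.float32)
  buckets = quantile_ops.bucketize_with_input_boundaries(
      input=[1, 2, 3, 4, 5], boundaries=boundaries)
  print(sess.run(buckets, feed_dict={boundaries: [2, 4]}))  # [0 1 1 2 2]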

@@ -874,4 +874,57 @@ class QuantilesOp : public OpKernel {

REGISTER_KERNEL_BUILDER(Name("Quantiles").Device(DEVICE_CPU), QuantilesOp);

template <typename T>
class BucketizeWithInputBoundariesOp : public OpKernel {
 public:
  explicit BucketizeWithInputBoundariesOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    // Read the bucket boundaries from the second input and check that they
    // are sorted.
    const Tensor& boundaries_tensor = context->input(1);
    VLOG(1) << "boundaries has shape: "
            << boundaries_tensor.shape().DebugString();
    auto boundaries = boundaries_tensor.flat<float>();
    boundaries_.clear();
    for (size_t i = 0; i < boundaries.size(); i++) {
      boundaries_.push_back(boundaries(i));
      VLOG(1) << "boundaries(" << i << ") : " << boundaries(i);
    }
    OP_REQUIRES(context,
                std::is_sorted(boundaries_.begin(), boundaries_.end()),
                errors::InvalidArgument("Expected sorted boundaries"));

    // Map every element of the input to its bucket index.
    const Tensor& input_tensor = context->input(0);
    auto input = input_tensor.flat<T>();

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
                                                     &output_tensor));
    auto output = output_tensor->template flat<int32>();

    for (size_t i = 0; i < input.size(); i++) {
      output(i) = CalculateBucketIndex(input(i));
    }
  }

 private:
  int32 CalculateBucketIndex(const T value) {
    // Index of the first boundary strictly greater than `value`, i.e. the
    // number of boundaries that are <= value.
    auto first_bigger_it =
        std::upper_bound(boundaries_.begin(), boundaries_.end(), value);
    return first_bigger_it - boundaries_.begin();
  }

  std::vector<T> boundaries_;
};

#define REGISTER_KERNEL(T)                                     \
  REGISTER_KERNEL_BUILDER(Name("BucketizeWithInputBoundaries") \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<T>("T"),         \
                          BucketizeWithInputBoundariesOp<T>);

REGISTER_KERNEL(int32);
REGISTER_KERNEL(int64);
REGISTER_KERNEL(float);
REGISTER_KERNEL(double);
#undef REGISTER_KERNEL

}  // namespace tensorflow

@@ -286,5 +286,33 @@ sparse_quantiles: Rank 1 tensors representing associated quantiles for each of
the sparse feature tensors.
)doc");
REGISTER_OP("BucketizeWithInputBoundaries")
    .Input("input: T")
    .Input("boundaries: float")
    .Output("output: int32")
    .Attr("T: {int32, int64, float, double}")
    .SetShapeFn(shape_inference::UnchangedShape)
    .Doc(R"doc(
Bucketizes 'input' based on 'boundaries'. This op is similar to the Bucketize
op in core math_ops, except that the boundaries are supplied as an input tensor
rather than as a fixed attribute of the op.

For example, if the inputs are
  boundaries = [0, 10, 100]
  input = [[-5, 10000]
           [150, 10]
           [5, 100]]

then the output will be
  output = [[0, 3]
            [3, 2]
            [1, 3]]

input: A Tensor of any shape with a numeric type.
boundaries: A rank-1 Tensor of sorted floats that specifies the boundaries of
  the buckets.
output: Same shape as 'input'; each value of 'input' is replaced with its
  corresponding bucket index.
)doc");
}  // namespace gtflow
}  // namespace tensorflow
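As a cross-check of the example in the op documentation above: CalculateBucketIndex returns the position of the first boundary strictly greater than the value (std::upper_bound), i.e. the number of boundaries that are less than or equal to it. A minimal Python sketch of the same rule (illustration only, not part of this change):

import bisect

def bucket_index(value, boundaries):
  # Number of boundaries <= value; mirrors std::upper_bound in the kernel.
  return bisect.bisect_right(boundaries, value)

boundaries = [0, 10, 100]
rows = [[-5, 10000], [150, 10], [5, 100]]
print([[bucket_index(v, boundaries) for v in row] for row in rows])
# [[0, 3], [3, 2], [1, 3]]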

@@ -81,6 +81,48 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
      self.assertAllEqual([1, 3, 5], dense_buckets[0].eval())
      self.assertAllEqual([2, 4, 6.], sparse_buckets[0].eval())

  def testStreamingQuantileBucketsWithVaryingBatch(self):
    """Sets up the quantile summary op test as follows.

    Creates batches of examples with a different number of inputs in each
    batch. The input values are dense in the range [1 ... N].
    The data looks like this:
    | Batch | Start | InputList
    |   1   |   1   | [1]
    |   2   |   2   | [2, 3]
    |   3   |   4   | [4, 5, 6]
    |   4   |   7   | [7, 8, 9, 10]
    |   5   |  11   | [11, 12, 13, 14, 15]
    |   6   |  16   | [16, 17, 18, 19, 20, 21]
    """
    with self.test_session() as sess:
      accumulator = quantile_ops.QuantileAccumulator(
          init_stamp_token=0, num_quantiles=3, epsilon=0.001, name="q1")
      resources.initialize_resources(resources.shared_resources()).run()
      input_column = array_ops.placeholder(dtypes.float32)
      weights = array_ops.placeholder(dtypes.float32)
      update = accumulator.add_summary(
          stamp_token=0,
          column=input_column,
          example_weights=weights)

    with self.test_session() as sess:
      for i in range(1, 23):
        # start = 1, 2, 4, 7, 11, 16 ... (see comment above)
        start = int((i * (i - 1) / 2) + 1)
        sess.run(update,
                 {input_column: range(start, start + i),
                  weights: [1] * i})

    with self.test_session() as sess:
      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
      are_ready_flush, buckets = (accumulator.get_buckets(stamp_token=1))
      buckets, are_ready_flush = (sess.run(
          [buckets, are_ready_flush]))
      self.assertEqual(True, are_ready_flush)
      self.assertAllEqual([1, 86., 170., 253.], buckets)

  def testStreamingQuantileBuckets(self):
    """Sets up the quantile summary op test as follows.

@@ -393,6 +435,29 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
      # Sparse feature 2
      self.assertAllEqual([0, 0], sparse_quantiles[2].eval())

  def testBucketizeWithInputBoundaries(self):
    with self.test_session():
      buckets = quantile_ops.bucketize_with_input_boundaries(
          input=[1, 2, 3, 4, 5],
          boundaries=[3])
      self.assertAllEqual([0, 0, 1, 1, 1], buckets.eval())

  def testBucketizeWithInputBoundaries2(self):
    with self.test_session():
      boundaries = constant_op.constant([3], dtype=dtypes.float32)
      buckets = quantile_ops.bucketize_with_input_boundaries(
          input=[1, 2, 3, 4, 5],
          boundaries=boundaries)
      self.assertAllEqual([0, 0, 1, 1, 1], buckets.eval())

  def testBucketizeWithInputBoundaries3(self):
    with self.test_session():
      b = array_ops.placeholder(dtypes.float32)
      buckets = quantile_ops.bucketize_with_input_boundaries(
          input=[1, 2, 3, 4, 5],
          boundaries=b)
      self.assertAllEqual([0, 1, 1, 2, 2],
                          buckets.eval(feed_dict={b: [2, 4]}))


if __name__ == "__main__":
  googletest.main()