From 6b3cb17b007422085c8c6ae7069a601897b2c5b5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Wed, 2 Aug 2017 09:44:34 -0700
Subject: [PATCH] Quantiles TFT Analyzer and Mapper.

Added a Bucketize op suitable for working with TFT.

PiperOrigin-RevId: 163994906
---
 .../boosted_trees/kernels/quantile_ops.cc     | 68 +++++++++++++++++++
 .../contrib/boosted_trees/ops/quantile_ops.cc | 28 ++++++++
 .../python/kernel_tests/quantile_ops_test.py  | 67 ++++++++++++++++++
 3 files changed, 163 insertions(+)

diff --git a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
index df6bf22571e..7ee3bab0cce 100644
--- a/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/quantile_ops.cc
@@ -874,4 +874,72 @@ class QuantilesOp : public OpKernel {
 
 REGISTER_KERNEL_BUILDER(Name("Quantiles").Device(DEVICE_CPU), QuantilesOp);
 
+// Maps every element of `input` (input 0) to the index of the bucket it falls
+// into, where the bucket edges are read from the `boundaries` tensor (input 1)
+// on each invocation rather than from a fixed attribute.
+//
+// Bucket i holds the values v with boundaries[i - 1] <= v < boundaries[i], so
+// the produced indices range over [0, number of boundaries].
+//
+// Boundaries arrive as float and are converted to T before any comparison.
+template <typename T>
+class BucketizeWithInputBoundariesOp : public OpKernel {
+ public:
+  explicit BucketizeWithInputBoundariesOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor& boundaries_tensor = context->input(1);
+    VLOG(1) << "boundaries has shape: "
+            << boundaries_tensor.shape().DebugString();
+    auto boundaries = boundaries_tensor.flat<float>();
+    // Keep the converted boundaries in a local: Compute() may run concurrently
+    // on the same kernel instance, so per-call state must not live in members.
+    std::vector<T> boundaries_vector;
+    boundaries_vector.reserve(boundaries.size());
+    for (size_t i = 0; i < boundaries.size(); i++) {
+      boundaries_vector.push_back(boundaries(i));
+      VLOG(1) << "boundaries(" << i << ") : " << boundaries(i);
+    }
+    OP_REQUIRES(
+        context,
+        std::is_sorted(boundaries_vector.begin(), boundaries_vector.end()),
+        errors::InvalidArgument("Expected sorted boundaries"));
+
+    const Tensor& input_tensor = context->input(0);
+    auto input = input_tensor.flat<T>();
+
+    Tensor* output_tensor = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor.shape(),
+                                                     &output_tensor));
+    auto output = output_tensor->template flat<int32>();
+
+    for (size_t i = 0; i < input.size(); i++) {
+      output(i) = CalculateBucketIndex(input(i), boundaries_vector);
+    }
+  }
+
+ private:
+  // Returns the position of the first boundary strictly greater than `value`,
+  // i.e. the number of boundaries that are <= `value`.
+  static int32 CalculateBucketIndex(const T value,
+                                    const std::vector<T>& boundaries_vector) {
+    auto first_bigger_it = std::upper_bound(boundaries_vector.begin(),
+                                            boundaries_vector.end(), value);
+    return static_cast<int32>(first_bigger_it - boundaries_vector.begin());
+  }
+};
+
+#define REGISTER_KERNEL(T)                                     \
+  REGISTER_KERNEL_BUILDER(Name("BucketizeWithInputBoundaries") \
+                              .Device(DEVICE_CPU)              \
+                              .TypeConstraint<T>("T"),         \
+                          BucketizeWithInputBoundariesOp<T>);
+
+REGISTER_KERNEL(int32);
+REGISTER_KERNEL(int64);
+REGISTER_KERNEL(float);
+REGISTER_KERNEL(double);
+#undef REGISTER_KERNEL
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
index 58d22a43986..0336008e861 100644
--- a/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/quantile_ops.cc
@@ -286,5 +286,33 @@ sparse_quantiles: Rank 1 tensors representing associated quantiles for each of
 the sparse feature tensors.
 )doc");
 
+REGISTER_OP("BucketizeWithInputBoundaries")
+    .Input("input: T")
+    .Input("boundaries: float")
+    .Output("output: int32")
+    .Attr("T: {int32, int64, float, double}")
+    .SetShapeFn(shape_inference::UnchangedShape)
+    .Doc(R"doc(
+Bucketizes 'input' based on 'boundaries'. This function is similar to Bucketize
+op in core math_ops, except that boundaries are specified using an input tensor,
+as compared with a fixed attribute in Bucketize().
+
+For example, if the inputs are
+    boundaries = [0, 10, 100]
+    input = [[-5, 10000]
+             [150, 10]
+             [5, 100]]
+
+then the output will be
+    output = [[0, 3]
+              [3, 2]
+              [1, 3]]
+
+input: Any shape of Tensor contains with numeric type.
+boundaries: A vector Tensor of sorted floats specifies the boundaries
+of the buckets.
+output: Same shape as 'input', where each value of input is replaced with its corresponding bucket index.
+)doc");
+
 }  // namespace gtflow
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
index 3871e8d76df..1513c11c33d 100644
--- a/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
+++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/quantile_ops_test.py
@@ -81,6 +81,47 @@ class QuantileBucketsOpTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([1, 3, 5], dense_buckets[0].eval())
       self.assertAllEqual([2, 4, 6.], sparse_buckets[0].eval())
 
+  def testStreamingQuantileBucketsWithVaryingBatch(self):
+    """Streams batches of growing size and checks the resulting buckets.
+
+    Batch i (for i in 1 ... 22) holds i consecutive values, so the inputs are
+    dense in the range [1 ... 253] and every example has weight 1.
+    The first batches look like this:
+    | Batch | Start | InputList
+    |   1   |   1   |  [1]
+    |   2   |   2   |  [2, 3]
+    |   3   |   4   |  [4, 5, 6]
+    |   4   |   7   |  [7, 8, 9, 10]
+    |   5   |   11  |  [11, 12, 13, 14, 15]
+    |   6   |   16  |  [16, 17, 18, 19, 20, 21]
+    """
+    num_batches = 22
+
+    with self.test_session() as sess:
+      accumulator = quantile_ops.QuantileAccumulator(
+          init_stamp_token=0, num_quantiles=3, epsilon=0.001, name="q1")
+      resources.initialize_resources(resources.shared_resources()).run()
+      input_column = array_ops.placeholder(dtypes.float32)
+      weights = array_ops.placeholder(dtypes.float32)
+      update = accumulator.add_summary(
+          stamp_token=0,
+          column=input_column,
+          example_weights=weights)
+
+      for i in range(1, num_batches + 1):
+        # start = 1, 2, 4, 7, 11, 16 ... (see table in the docstring). Integer
+        # division is exact because i * (i - 1) is always even.
+        start = i * (i - 1) // 2 + 1
+        sess.run(update,
+                 {input_column: list(range(start, start + i)),
+                  weights: [1] * i})
+
+      sess.run(accumulator.flush(stamp_token=0, next_stamp_token=1))
+      are_ready_flush, buckets = accumulator.get_buckets(stamp_token=1)
+      buckets, are_ready_flush = sess.run([buckets, are_ready_flush])
+      self.assertTrue(are_ready_flush)
+      self.assertAllEqual([1, 86., 170., 253.], buckets)
+
   def testStreamingQuantileBuckets(self):
     """Sets up the quantile summary op test as follows.
 
@@ -393,6 +434,32 @@ class QuantilesOpTest(test_util.TensorFlowTestCase):
       # Sparse feature 2
       self.assertAllEqual([0, 0], sparse_quantiles[2].eval())
 
+  def testBucketizeWithInputBoundaries(self):
+    """Boundaries supplied as a plain Python list."""
+    with self.test_session():
+      buckets = quantile_ops.bucketize_with_input_boundaries(
+          input=[1, 2, 3, 4, 5],
+          boundaries=[3])
+      self.assertAllEqual([0, 0, 1, 1, 1], buckets.eval())
+
+  def testBucketizeWithInputBoundaries2(self):
+    """Boundaries supplied as a float32 constant tensor."""
+    with self.test_session():
+      boundaries = constant_op.constant([3], dtype=dtypes.float32)
+      buckets = quantile_ops.bucketize_with_input_boundaries(
+          input=[1, 2, 3, 4, 5],
+          boundaries=boundaries)
+      self.assertAllEqual([0, 0, 1, 1, 1], buckets.eval())
+
+  def testBucketizeWithInputBoundaries3(self):
+    """Boundaries fed at run time through a float32 placeholder."""
+    with self.test_session():
+      b = array_ops.placeholder(dtypes.float32)
+      buckets = quantile_ops.bucketize_with_input_boundaries(
+          input=[1, 2, 3, 4, 5],
+          boundaries=b)
+      self.assertAllEqual([0, 1, 1, 2, 2],
+                          buckets.eval(feed_dict={b: [2, 4]}))
 
 if __name__ == "__main__":
   googletest.main()