From f19dc6c5e28694bf067d0bbc9698b57ec4068b4e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower"
Date: Thu, 3 Nov 2016 07:42:29 -0800
Subject: [PATCH] Add a GPU implementation of the
 ParameterizedTruncatedNormalOp kernel.

Benchmarks:

Benchmark                                   Time(ns)    CPU(ns)  Iterations
---------------------------------------------------------------------------
BM_PTruncatedNormal_gpu_1000_1000            4632369    5175938         100  184.251M items/s
BM_PTruncatedNormal_2SD_gpu_10000_100        2849437    3368804         206  283.090M items/s
BM_PTruncatedNormal_OneTail_gpu_10000_100    3300317    3905713         179  244.174M items/s

Change: 138074670
---
 .../parameterized_truncated_normal_op.cc           |  48 ++--
 .../parameterized_truncated_normal_op.h            |  23 +-
 ...arameterized_truncated_normal_op_gpu.cu.cc      | 214 ++++++++++++++++++
 .../parameterized_truncated_normal_op_test.cc      |   3 +
 tensorflow/python/kernel_tests/BUILD               |   2 +-
 .../parameterized_truncated_normal_op_test.py      |  58 +++--
 6 files changed, 299 insertions(+), 49 deletions(-)
 create mode 100644 tensorflow/core/kernels/parameterized_truncated_normal_op_gpu.cu.cc

diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
index 4d31edbb1a9..77c4b7a7299 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.cc
@@ -46,25 +46,6 @@ namespace functor {
 using random::PhiloxRandom;
 using random::SingleSampleAdapter;
 
-// Sample a truncated normal random variable, with mean, stddev, minval, and
-// maxval parameters for each batch. Uses two rejection sampling algorithms
-// described in http://rd.springer.com/article/10.1007/BF00143942.
-//
-// Either minval may be -infinity, or maxval may be +infinity. If the interval
-// (minval, maxval) is empty, the result is NaN. Large intervals which include
-// both tails may have reduced accuracy.
-template <typename Device, typename T>
-struct TruncatedNormalFunctor {
-  void operator()(OpKernelContext* ctx, const Device& d, int64 num_batches,
-                  int64 samples_per_batch, int64 num_elements,
-                  typename TTypes<T>::ConstFlat means,
-                  typename TTypes<T>::ConstFlat stddevs,
-                  typename TTypes<T>::ConstFlat minvals,
-                  typename TTypes<T>::ConstFlat maxvals,
-                  const random::PhiloxRandom& gen,
-                  typename TTypes<T>::Flat output);
-};
-
 template <typename T>
 struct TruncatedNormalFunctor<CPUDevice, T> {
   static const int kMaxIterations = 100;
@@ -96,8 +77,8 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
 
       // Vectorized intermediate calculations for uniform rejection sampling.
       // We always generate at most 4 samples.
-      tensorflow::random::Array<T, 4> z;
-      tensorflow::random::Array<T, 4> g;
+      Eigen::array<T, 4> z;
+      Eigen::array<T, 4> g;
 
       for (int64 b = start_batch; b < limit_batch; ++b) {
         // We are passed a flat array for each of the parameter tensors.
@@ -145,13 +126,7 @@ struct TruncatedNormalFunctor<CPUDevice, T> {
 
         if (diff < cutoff) {
           // Sample from a uniform distribution on [normMin, normMax].
-          T plusFactor;
-          if (normMin < T(0)) {
-            // normMax > 0 because it is flipped otherwise.
-            plusFactor = T(0);
-          } else {
-            plusFactor = normMin * normMin;
-          }
+          const T plusFactor = (normMin < T(0)) ? T(0) : normMin * normMin;
 
           while (sample < limit_sample) {
             const auto rand = dist(&gen_copy);
@@ -395,4 +370,21 @@ TF_CALL_double(REGISTER);
 
 #undef REGISTER
 
+#if GOOGLE_CUDA
+
+#define REGISTER(TYPE)                                         \
+  REGISTER_KERNEL_BUILDER(Name("ParameterizedTruncatedNormal") \
+                              .Device(DEVICE_GPU)              \
+                              .HostMemory("shape")             \
+                              .TypeConstraint<TYPE>("dtype"),  \
+                          ParameterizedTruncatedNormalOp<GPUDevice, TYPE>)
+
+TF_CALL_half(REGISTER);
+TF_CALL_float(REGISTER);
+TF_CALL_double(REGISTER);
+
+#undef REGISTER
+
+#endif  // GOOGLE_CUDA
+
 }  // end namespace tensorflow
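Note: the "diff < cutoff" branch above is plain uniform rejection sampling. A
minimal NumPy sketch of the same logic (illustrative names, bounds already
normalized to the standard normal; not TensorFlow API):

import numpy as np

def uniform_rejection_sample(norm_min, norm_max, rng, max_iters=100):
  # plusFactor in the kernel: the acceptance ratio exp((plus_factor - z^2)/2)
  # is the standard normal density at z divided by its maximum on the
  # interval, which is phi(0) when norm_min < 0 and phi(norm_min) otherwise.
  plus_factor = 0.0 if norm_min < 0 else norm_min * norm_min
  for _ in range(max_iters):
    z = rng.uniform(norm_min, norm_max)
    if rng.uniform() <= np.exp((plus_factor - z * z) / 2.0):
      break  # accept; at the iteration cap the last candidate is kept
  return z

rng = np.random.RandomState(1618)
print(uniform_rejection_sample(-1.0, 1.0, rng))
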
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op.h b/tensorflow/core/kernels/parameterized_truncated_normal_op.h
index a46bb1c9fa6..cc801eb8109 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op.h
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op.h
@@ -16,14 +16,35 @@ limitations under the License.
 #ifndef TENSORFLOW_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_
 #define TENSORFLOW_KERNELS_PARAMETERIZED_TRUNCATED_NORMAL_OP_H_
 
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/lib/random/random_distributions.h"
+
 namespace tensorflow {
 
 class OpKernelContext;
 
 namespace functor {
 
+// Sample a truncated normal random variable, with mean, stddev, minval, and
+// maxval parameters for each batch. Uses two rejection sampling algorithms
+// described in http://rd.springer.com/article/10.1007/BF00143942.
+//
+// Either minval may be -infinity, or maxval may be +infinity. If the interval
+// (minval, maxval) is empty, the result is NaN. Large intervals which include
+// both tails may have reduced accuracy.
 template <typename Device, typename T>
-struct TruncatedNormalFunctor;
+struct TruncatedNormalFunctor {
+  void operator()(OpKernelContext* ctx, const Device& d, int64 num_batches,
+                  int64 samples_per_batch, int64 num_elements,
+                  typename TTypes<T>::ConstFlat means,
+                  typename TTypes<T>::ConstFlat stddevs,
+                  typename TTypes<T>::ConstFlat minvals,
+                  typename TTypes<T>::ConstFlat maxvals,
+                  const random::PhiloxRandom& gen,
+                  typename TTypes<T>::Flat output);
+
+  static const int kMaxIterations = 100;
+};
 
 }  // namespace functor
 }  // namespace tensorflow
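For reference, the semantics documented in the comment above can be reproduced
with SciPy's truncnorm, the same library the Python tests below lean on. A
sketch with illustrative names (not part of the patch):

import numpy as np
import scipy.stats

def truncated_normal_reference(shape, mean, stddev, minval, maxval, seed=1618):
  # truncnorm takes bounds normalized to the standard normal, like
  # normMin/normMax in the kernels.
  a = (minval - mean) / stddev
  b = (maxval - mean) / stddev
  if not a < b:
    return np.full(shape, np.nan)  # empty interval -> NaN, as documented
  return scipy.stats.truncnorm.rvs(
      a, b, loc=mean, scale=stddev, size=shape,
      random_state=np.random.RandomState(seed))
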
+==============================================================================*/ + +#if GOOGLE_CUDA + +#define EIGEN_USE_GPU + +#include "tensorflow/core/kernels/parameterized_truncated_normal_op.h" + +#include +#include +#include + +#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" +#include "tensorflow/core/framework/tensor_types.h" +#include "tensorflow/core/lib/random/philox_random.h" +#include "tensorflow/core/lib/random/random_distributions.h" +#include "tensorflow/core/util/cuda_kernel_helper.h" + +#define UNROLL _Pragma("unroll") + +namespace tensorflow { + +class OpKernelContext; + +namespace functor { + +typedef Eigen::GpuDevice GPUDevice; + +template +__global__ void __launch_bounds__(1024) + TruncatedNormalKernel(random::PhiloxRandom gen, T* data, int64 num_batches, + int64 samples_per_batch, int64 num_elements, + const T* means, bool single_mean, const T* stddevs, + bool single_stddev, const T* minvals, + bool single_minval, const T* maxvals, + bool single_maxval, int64 kMaxIterations) { + const int32 max_samples_per_item = 2 * kMaxIterations; + // Initial offset as given by CUDA_1D_KERNEL_LOOP. + const int32 initial_offset = blockIdx.x * blockDim.x + threadIdx.x; + gen.Skip(max_samples_per_item * initial_offset); + typedef random::UniformDistribution Uniform; + Uniform dist; + const int kDistSize = Uniform::kResultElementCount; + const T quietNaN = Eigen::NumTraits::quiet_NaN(); + + // We skip the total number of threads to get to the next element. To produce + // deterministic results between devices, each element in the output array + // skips max_samples_per_item in the generator. Then after generating this + // item, we need to skip the samples for one element for every thread to get + // to the next element that we actually process. + const int32 samples_between_processed_elements = + max_samples_per_item * (gridDim.x * blockDim.x); + + CUDA_1D_KERNEL_LOOP(offset, num_elements) { + // Track how many more samples we need to skip before we process the next + // element. + int32 remaining_samples = samples_between_processed_elements; + + const int64 batch_id = offset / samples_per_batch; + T mean = means[single_mean ? 0 : batch_id]; + const T input_stddev = stddevs[single_stddev ? 0 : batch_id]; + T minval = minvals[single_minval ? 0 : batch_id]; + T maxval = maxvals[single_maxval ? 0 : batch_id]; + + // Flip the distribution if we can make the lower bound positive. + T stddev; + if (Eigen::numext::isinf(minval) || maxval < mean) { + // Reverse all calculations. normMin and normMax will be flipped. + // std::swap is a host function (not available in CUDA). + T temp = minval; + minval = maxval; + maxval = temp; + stddev = -input_stddev; + } else { + stddev = input_stddev; + } + + // Calculate normalized samples, then scale them. + const T normMin = (minval - mean) / stddev; + const T normMax = (maxval - mean) / stddev; + + // Determine the method to use. + const T sqrtFactor = Eigen::numext::sqrt((normMin * normMin) + T(4)); + const T cutoff = + T(2) * + Eigen::numext::exp(T(0.5) + (normMin * (normMin - sqrtFactor)) / T(4)) / + (normMin + sqrtFactor); + const T diff = normMax - normMin; + + // Validate the normalized min and max, because the originals may have been + // flipped already. + if (!(input_stddev > T(0) && normMin < normMax && + (Eigen::numext::isfinite(normMin) || + Eigen::numext::isfinite(normMax)))) { + data[offset] = quietNaN; + } else if (diff < cutoff) { + // Sample from a uniform distribution on [normMin, normMax]. 
+
+      // Vectorized intermediate calculations for uniform rejection sampling.
+      // We always generate at most 4 samples.
+      Eigen::array<T, 4> z;
+      Eigen::array<T, 4> g;
+
+      const T plusFactor = (normMin < T(0)) ? T(0) : normMin * normMin;
+
+      int numIterations = 0;
+      while (numIterations < kMaxIterations) {
+        const auto rand = dist(&gen);
+        remaining_samples -= gen.kResultElementCount;
+        UNROLL for (int i = 0; i < kDistSize; i++) {
+          z[i] = rand[i] * diff + normMin;
+        }
+        UNROLL for (int i = 0; i < kDistSize; i++) {
+          g[i] = (plusFactor - z[i] * z[i]) / 2.0;
+        }
+
+        const auto u = dist(&gen);
+        remaining_samples -= gen.kResultElementCount;
+        UNROLL for (int i = 0; i < kDistSize; i++) {
+          if (u[i] <= Eigen::numext::exp(g[i]) ||
+              numIterations + 1 >= kMaxIterations) {
+            // Accept the sample z.
+            // If we run out of iterations, just use the current uniform
+            // sample. Empirically, the probability of accepting each sample
+            // is at least 50% for typical inputs, so we will always accept
+            // by 100 iterations.
+            // This introduces a slight inaccuracy when at least one bound
+            // is large, minval is negative and maxval is positive.
+            data[offset] = z[i] * stddev + mean;
+            // Break out of the nested loop by updating numIterations.
+            numIterations = kMaxIterations;
+            break;
+          } else {
+            numIterations++;
+          }
+        }
+      }
+    } else {
+      // Sample from an exponential distribution with alpha maximizing
+      // acceptance probability, offset by normMin from the origin.
+      // Accept only if less than normMax.
+      const T alpha =
+          (normMin + Eigen::numext::sqrt((normMin * normMin) + T(4))) / T(2);
+      int numIterations = 0;
+      while (numIterations < kMaxIterations) {
+        auto rand = dist(&gen);
+        remaining_samples -= gen.kResultElementCount;
+        UNROLL for (int i = 0; i < kDistSize; i += 2) {
+          const T z = -Eigen::numext::log(rand[i]) / alpha + normMin;
+          const T x = normMin < alpha ? alpha - z : normMin - alpha;
+          const T g = Eigen::numext::exp(-x * x / 2.0);
+          const T u = rand[i + 1];
+          if ((u <= g && z < normMax) || numIterations + 1 >= kMaxIterations) {
+            data[offset] = z * stddev + mean;
+            // Break out of the nested loop by updating numIterations.
+            numIterations = kMaxIterations;
+            break;
+          } else {
+            numIterations++;
+          }
+        }
+      }
+    }
+
+    gen.Skip(remaining_samples);
+  }
+}
+
+// Partial specialization for GPU
+template <typename T>
+struct TruncatedNormalFunctor<GPUDevice, T> {
+  static const int kMaxIterations = 100;
+
+  void operator()(OpKernelContext* ctx, const GPUDevice& d, int64 num_batches,
+                  int64 samples_per_batch, int64 num_elements,
+                  typename TTypes<T>::ConstFlat means,
+                  typename TTypes<T>::ConstFlat stddevs,
+                  typename TTypes<T>::ConstFlat minvals,
+                  typename TTypes<T>::ConstFlat maxvals,
+                  const random::PhiloxRandom& gen,
+                  typename TTypes<T>::Flat output) {
+    const auto config = GetCudaLaunchConfig(num_elements, d);
+
+    TruncatedNormalKernel<
+        T><<<config.block_count, config.thread_per_block, 0, d.stream()>>>(
+        gen, output.data(), num_batches, samples_per_batch, num_elements,
+        means.data(), means.dimension(0) == 1, stddevs.data(),
+        stddevs.dimension(0) == 1, minvals.data(), minvals.dimension(0) == 1,
+        maxvals.data(), maxvals.dimension(0) == 1, kMaxIterations);
+  }
+};
+
+// Explicit instantiation of the GPU distributions functors
+template struct TruncatedNormalFunctor<GPUDevice, Eigen::half>;
+template struct TruncatedNormalFunctor<GPUDevice, float>;
+template struct TruncatedNormalFunctor<GPUDevice, double>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
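Note: complementing the uniform-rejection sketch earlier, a NumPy sketch of the
kernel's cutoff test and its one-tail exponential branch (assumes bounds were
already flipped so norm_min is the finite bound, as the kernel does; names are
illustrative):

import numpy as np

def should_use_uniform(norm_min, norm_max):
  # The kernels' cutoff test: narrow intervals use uniform rejection,
  # everything else falls through to the exponential proposal.
  sqrt_factor = np.sqrt(norm_min * norm_min + 4.0)
  cutoff = (2.0 * np.exp(0.5 + norm_min * (norm_min - sqrt_factor) / 4.0) /
            (norm_min + sqrt_factor))
  return (norm_max - norm_min) < cutoff

def one_tail_sample(norm_min, norm_max, rng, max_iters=100):
  # Exponential(alpha) candidates shifted to start at norm_min, accepted with
  # probability exp(-(z - alpha)^2 / 2) and only if z < norm_max.
  alpha = (norm_min + np.sqrt(norm_min * norm_min + 4.0)) / 2.0
  z = norm_min
  for _ in range(max_iters):
    z = norm_min - np.log(rng.uniform()) / alpha
    x = alpha - z if norm_min < alpha else norm_min - alpha
    if rng.uniform() <= np.exp(-x * x / 2.0) and z < norm_max:
      break  # accept; the kernel likewise keeps the last candidate at the cap
  return z

rng = np.random.RandomState(1618)
print(should_use_uniform(2.0, np.inf), one_tail_sample(2.0, np.inf, rng))
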
diff --git a/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc b/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc
index 13d1187f926..07f2f75ca5a 100644
--- a/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc
+++ b/tensorflow/core/kernels/parameterized_truncated_normal_op_test.cc
@@ -131,5 +131,8 @@ static Graph* PTruncatedNormalOneTail(int num_batches, int samples_per_batch) {
 BM_PTruncatedNormalDev(cpu, 1000, 1000);
 BM_PTruncatedNormalDev_2SD(cpu, 10000, 100);
 BM_PTruncatedNormalDev_OneTail(cpu, 10000, 100);
+BM_PTruncatedNormalDev(gpu, 1000, 1000);
+BM_PTruncatedNormalDev_2SD(gpu, 10000, 100);
+BM_PTruncatedNormalDev_OneTail(gpu, 10000, 100);
 
 }  // namespace tensorflow
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 24a1d8e7c97..0100e6b3268 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -220,7 +220,7 @@ cuda_py_test(
     additional_deps = ["//tensorflow:tensorflow_py"],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "parameterized_truncated_normal_op_test",
     size = "small",
     srcs = ["parameterized_truncated_normal_op_test.py"],
diff --git a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py b/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
index 8d41029c0b5..1c09949598a 100644
--- a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
+++ b/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
@@ -97,10 +97,10 @@ def z_test(real, expected, i, num_samples):
 
 class ParameterizedTruncatedNormalTest(tf.test.TestCase):
-  use_gpu = False
+  _use_gpu = False
   z_limit = 6.0
 
-  # Stop at moment 20 to avoid numerical errors in the theoretical moments.
+  # Stop at moment 10 to avoid numerical errors in the theoretical moments.
   max_moment = 10
 
   def validateMoments(self, shape, mean, stddev, minval, maxval, seed=1618):
@@ -109,9 +109,11 @@ class ParameterizedTruncatedNormalTest(tf.test.TestCase):
       # Give up early if we are unable to import it.
       import scipy.stats  # pylint: disable=g-import-not-at-top,unused-variable
       tf.set_random_seed(seed)
-      with self.test_session(use_gpu=self.use_gpu):
-        samples = random_ops.parameterized_truncated_normal(
-            shape, mean, stddev, minval, maxval).eval()
+      with self.test_session(use_gpu=self._use_gpu):
+        samples = random_ops.parameterized_truncated_normal(shape, mean,
+                                                            stddev, minval,
+                                                            maxval).eval()
+        assert (~np.isnan(samples)).all()
       moments = calculate_moments(samples, self.max_moment)
       expected_moments = TruncatedNormalMoments(mean, stddev, minval, maxval)
       num_samples = functools.reduce(lambda x, y: x * y, shape, 1)
@@ -131,9 +133,11 @@ class ParameterizedTruncatedNormalTest(tf.test.TestCase):
     try:
       import scipy.stats  # pylint: disable=g-import-not-at-top
       tf.set_random_seed(seed)
-      with self.test_session(use_gpu=self.use_gpu):
-        samples = random_ops.parameterized_truncated_normal(
-            shape, mean, stddev, minval, maxval).eval()
+      with self.test_session(use_gpu=self._use_gpu):
+        samples = random_ops.parameterized_truncated_normal(shape, mean,
+                                                            stddev, minval,
+                                                            maxval).eval()
+        assert (~np.isnan(samples)).all()
       minval = max(mean - stddev * 10, minval)
       maxval = min(mean + stddev * 10, maxval)
       dist = scipy.stats.norm(loc=mean, scale=stddev)
@@ -173,8 +177,12 @@ class ParameterizedTruncatedNormalTest(tf.test.TestCase):
     self.validateKolmogorovSmirnov([10**5], 0.0, 0.1, 0.05, 0.10)
 
 
+class ParameterizedTruncatedNormalGpuTest(ParameterizedTruncatedNormalTest):
+  _use_gpu = True
+
+
 # Benchmarking code
-def parameterized_vs_naive(shape, num_iters):
+def parameterized_vs_naive(shape, num_iters, use_gpu=False):
   np.random.seed(1618)  # Make it reproducible.
 
   # No CSE/CF.
@@ -183,17 +191,29 @@ def parameterized_vs_naive(shape, num_iters):
       graph_options=tf.GraphOptions(optimizer_options=optimizer_options))
 
   with tf.Session(config=config) as sess:
-    param_op = tf.group(random_ops.parameterized_truncated_normal(shape))
-    naive_op = tf.group(random_ops.truncated_normal(shape))
+    with tf.device("/cpu:0" if not use_gpu else None):
+      param_op = tf.group(random_ops.parameterized_truncated_normal(shape))
+      naive_op = tf.group(random_ops.truncated_normal(shape))
 
+    # Burn-in to avoid session setup costs in the timing.
+    sess.run(param_op)
+    sess.run(param_op)
     param_dt = timeit.timeit(lambda: sess.run(param_op), number=num_iters)
+    sess.run(naive_op)
+    sess.run(naive_op)
     naive_dt = timeit.timeit(lambda: sess.run(naive_op), number=num_iters)
     return param_dt, naive_dt
" "naive TruncatedNormalOp [%d iters]") % num_iters) @@ -201,16 +221,16 @@ class TruncatedNormalBenchmark(tf.test.Benchmark): for shape in [[10000, 100], [1000, 1000], [1000000], [100, 100, 100], [20, 20, 20, 20]]: - p_dt, n_dt = parameterized_vs_naive(shape, num_iters) + p_dt, n_dt = parameterized_vs_naive(shape, num_iters, use_gpu) print("%s\t%.3f\t%.3f\t%.2f" % (shape, p_dt, n_dt, p_dt / n_dt)) shape_str = "-".join(map(str, shape)) - self.report_benchmark(name="parameterized_shape" + shape_str, - iters=num_iters, - wall_time=p_dt) - self.report_benchmark(name="naive_shape" + shape_str, - iters=num_iters, - wall_time=n_dt) + self.report_benchmark( + name="parameterized_shape" + shape_str, + iters=num_iters, + wall_time=p_dt) + self.report_benchmark( + name="naive_shape" + shape_str, iters=num_iters, wall_time=n_dt) if __name__ == "__main__":