TensorFlow: upstream changes from Eigen to fix the build after changes in the last commit.

parent bf6b536bde
commit bb7a7a8858
@@ -486,6 +486,39 @@ struct functor_traits<scalar_cube_op<Scalar> >
 { enum { Cost = 2*NumTraits<Scalar>::MulCost, PacketAccess = packet_traits<Scalar>::HasMul }; };
 
 
+/** \internal
+  * \brief Template functor to compute the signum of a scalar
+  * \sa class CwiseUnaryOp, Cwise::sign()
+  */
+template<typename Scalar,bool iscpx=(NumTraits<Scalar>::IsComplex!=0) > struct scalar_sign_op;
+template<typename Scalar>
+struct scalar_sign_op<Scalar,false> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    return Scalar( (a>Scalar(0)) - (a<Scalar(0)) );
+  }
+};
+template<typename Scalar>
+struct scalar_sign_op<Scalar,true> {
+  EIGEN_EMPTY_STRUCT_CTOR(scalar_sign_op)
+  EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const
+  {
+    typename NumTraits<Scalar>::Real aa = std::abs(a);
+    return (aa==0) ? Scalar(0) : (a/aa);
+  }
+};
+template<typename Scalar>
+struct functor_traits<scalar_sign_op<Scalar> >
+{ enum {
+    Cost =
+        NumTraits<Scalar>::IsComplex
+        ? ( 8*NumTraits<Scalar>::MulCost ) // roughly
+        : ( 3*NumTraits<Scalar>::AddCost),
+    PacketAccess = false,
+  };
+};
+
 } // end namespace internal
 
 } // end namespace Eigen
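Note: for reference, a minimal host-side sketch (not part of the diff) of what the two scalar_sign_op specializations compute: the real branch returns -1, 0, or +1, and the complex branch returns a/|a| (a unit-modulus value), with 0 mapped to 0.

#include <cassert>
#include <cmath>
#include <complex>

// Real case: mirrors Scalar((a > Scalar(0)) - (a < Scalar(0))).
double sign_real(double a) { return static_cast<double>((a > 0.0) - (a < 0.0)); }

// Complex case: mirrors the second specialization, a / |a| with 0 mapped to 0.
std::complex<double> sign_cplx(std::complex<double> a) {
  const double aa = std::abs(a);
  return (aa == 0.0) ? std::complex<double>(0.0) : a / aa;
}

int main() {
  assert(sign_real(-3.5) == -1.0 && sign_real(0.0) == 0.0 && sign_real(2.0) == 1.0);
  assert(std::abs(std::abs(sign_cplx({3.0, 4.0})) - 1.0) < 1e-12);  // |sign(z)| == 1
  return 0;
}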
@@ -59,7 +59,7 @@
 #include <curand_kernel.h>
 #endif // defined(__CUDACC__)
 #else
-#include "perftools/gputools/executor/gcuda.h"
+#include "platforms/gpus/gcudacc/runtime/gcudacc_runtime.h"
 #ifdef __CUDACC__
 #include "third_party/gpus/cuda/curand_device/curand_kernel.h"
 #endif // defined(__CUDACC__)
@@ -88,6 +88,7 @@
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorTraits.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h"
+#include "unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h"
 #include "unsupported/Eigen/CXX11/src/Tensor/TensorBlock.h"
 
@@ -80,6 +80,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_opposite_op<Scalar>());
     }
 
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sign_op<Scalar>, const Derived>
+    sign() const {
+      return unaryExpr(internal::scalar_sign_op<Scalar>());
+    }
+
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sqrt_op<Scalar>, const Derived>
     sqrt() const {
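Note: with sign() added to TensorBase it composes like any other lazily evaluated cwise unary op (sqrt(), abs(), ...). A minimal usage sketch, assuming a build of the unsupported Tensor module that carries this patch:

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  Eigen::Tensor<float, 1> t(5);
  t.setValues({-2.f, -0.5f, 0.f, 0.5f, 2.f});
  Eigen::Tensor<float, 1> s = t.sign();  // lazily evaluated TensorCwiseUnaryOp
  std::cout << s << std::endl;           // expected: -1 -1 0 1 1
  return 0;
}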
@@ -757,11 +757,17 @@ static inline void setCudaSharedMemConfig(cudaSharedMemConfig cache_config) {
 }
 
 struct GpuDevice {
-  GpuDevice()
-      : stream_(perftools::gputools::MachineManager::singleton()->stream_for_device(0)),
-        allocator_(nullptr),
-        stream_exec_(stream_->parent()),
-        device_descr_(&(stream_exec_->GetDeviceDescription())) {}
+  // Default constructor: Get [cached] device 0 and its default stream.
+  GpuDevice() : allocator_(nullptr) {
+    perftools::gputools::Platform* platform =
+        perftools::gputools::MultiPlatformManager::PlatformWithName("cuda")
+            .ValueOrDie();
+    stream_exec_ = platform->ExecutorForDevice(0).ValueOrDie();
+    // TODO(rspringer): If we ever pull from an executor aside from 0, this will
+    // need to be preceded by a call to SetDevice(N);
+    stream_ = platforms::gpus::gcudacc::GetDefaultStream();
+    device_descr_ = &(stream_exec_->GetDeviceDescription());
+  }
 
   GpuDevice(perftools::gputools::Stream* stream,
             const Allocator* alloc = nullptr)
@@ -418,11 +418,13 @@ inline void TensorExecutor<Expression, GpuDevice, false, Tileable>::run(
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign) {
-    const int num_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() /
-                           device.maxCudaThreadsPerBlock();
     const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
     const Index size = array_prod(evaluator.dimensions());
+    // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+
     LAUNCH_CUDA_KERNEL(
         (EigenMetaKernel_NonVectorizable<TensorEvaluator<Expression, GpuDevice>,
                                          Index>),
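Note: the new launch configuration caps the grid at what the device can keep resident (max_blocks), sizes it to the workload, and clamps it to at least one block so zero-element tensors do not launch an empty grid. A standalone sketch of the same arithmetic (the device numbers below are made-up placeholders, not values queried from a real GpuDevice):

#include <algorithm>
#include <cassert>
#include <cstdint>

int compute_num_blocks(int64_t size, int block_size, int multiprocessors,
                       int threads_per_multiprocessor) {
  const int max_blocks = multiprocessors * threads_per_multiprocessor / block_size;
  const int wanted = static_cast<int>((size + block_size - 1) / block_size);
  return std::max(std::min(max_blocks, wanted), 1);  // never launch 0 blocks
}

int main() {
  assert(compute_num_blocks(0, 1024, 16, 2048) == 1);         // size-0 tensor still gets 1 block
  assert(compute_num_blocks(4096, 1024, 16, 2048) == 4);      // small tensor: ceil(size / block_size)
  assert(compute_num_blocks(1 << 24, 1024, 16, 2048) == 32);  // large tensor: capped at max_blocks
  return 0;
}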
@@ -438,11 +440,13 @@ inline void TensorExecutor<Expression, GpuDevice, true, Tileable>::run(
   TensorEvaluator<Expression, GpuDevice> evaluator(expr, device);
   const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL);
   if (needs_assign) {
-    const int num_blocks = device.getNumCudaMultiProcessors() *
-                           device.maxCudaThreadsPerMultiProcessor() /
-                           device.maxCudaThreadsPerBlock();
     const int block_size = device.maxCudaThreadsPerBlock();
+    const int max_blocks = device.getNumCudaMultiProcessors() *
+                           device.maxCudaThreadsPerMultiProcessor() / block_size;
     const Index size = array_prod(evaluator.dimensions());
+    // Create a least one block to ensure we won't crash when tensorflow calls with tensors of size 0.
+    const int num_blocks = numext::maxi<int>(numext::mini<int>(max_blocks, (size + block_size - 1) / block_size), 1);
+
     LAUNCH_CUDA_KERNEL(
         (EigenMetaKernel_Vectorizable<TensorEvaluator<Expression, GpuDevice>,
                                       Index>),
@@ -59,13 +59,8 @@ namespace {
 
 template <typename T>
 struct DividerTraits {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
   typedef typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
   static const int N = sizeof(T) * 8;
-#else
-  typedef uint32_t type;
-  static const int N = 32;
-#endif
 };
 
 
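Note: after this change DividerTraits always selects a 64-bit multiplier type for 8-byte index types (the old #else branch silently fell back to 32 bits under nvcc). A standalone analogue using std::conditional, purely to illustrate what the trait resolves to:

#include <cstdint>
#include <type_traits>

template <typename T>
struct DividerTraitsSketch {  // stand-in for Eigen's internal DividerTraits
  typedef typename std::conditional<sizeof(T) == 8, uint64_t, uint32_t>::type type;
  static const int N = sizeof(T) * 8;
};

static_assert(std::is_same<DividerTraitsSketch<int64_t>::type, uint64_t>::value,
              "64-bit indices get a 64-bit divider");
static_assert(DividerTraitsSketch<int32_t>::N == 32,
              "32-bit indices keep a 32-bit divider");

int main() { return 0; }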
@@ -78,40 +73,39 @@ namespace {
 #endif
 }
 
-#if defined(__CUDA_ARCH__)
-template <typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-  return __umul64hi(a, b);
-}
-#else
 template <typename T>
-EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
-#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t muluh(const uint64_t a, const T b) {
+#if defined(__CUDA_ARCH__)
+  return __umul64hi(a, b);
+#elif defined(__SIZEOF_INT128__)
   __uint128_t v = static_cast<__uint128_t>(a) * static_cast<__uint128_t>(b);
   return static_cast<uint64_t>(v >> 64);
 #else
-  EIGEN_STATIC_ASSERT(sizeof(T) == 4, YOU_MADE_A_PROGRAMMING_MISTAKE);
-  return (a * b) >> 32;
+  return (TensorUInt128<static_val<0>, uint64_t>(a) * TensorUInt128<static_val<0>, uint64_t>(b)).upper();
 #endif
 }
-#endif
 
 template <int N, typename T>
 struct DividerHelper {
-  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier (const int log_div, const T divider) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint32_t computeMultiplier(const int log_div, const T divider) {
     EIGEN_STATIC_ASSERT(N == 32, YOU_MADE_A_PROGRAMMING_MISTAKE);
     return (static_cast<uint64_t>(1) << (N+log_div)) / divider - (static_cast<uint64_t>(1) << N) + 1;
   }
 };
 
-#if defined(__SIZEOF_INT128__) && !defined(__CUDACC__)
 template <typename T>
 struct DividerHelper<64, T> {
-  static EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
+  static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE uint64_t computeMultiplier(const int log_div, const T divider) {
+#if defined(__SIZEOF_INT128__) && !defined(__CUDA_ARCH__)
     return ((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1);
+#else
+    const uint64_t shift = 1ULL << log_div;
+    TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1));
+    return static_cast<uint64_t>(result);
+#endif
   }
 };
-#endif
 }
 
 
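Note: computeMultiplier builds the classic "division by invariant multiplication" magic number, multiplier = 2^(N+log_div)/divider - 2^N + 1, and muluh supplies the high half of the wide product needed to apply it. A host-only sketch (assumes a compiler with __int128; the quotient-recovery shifts below follow the standard Granlund/Montgomery scheme and may not match TensorIntDiv.h's exact bookkeeping):

#include <cassert>
#include <cstdint>

// Stand-in for muluh(): high 64 bits of a 64 x 64 -> 128-bit product.
static uint64_t muluh(uint64_t a, uint64_t b) {
  return static_cast<uint64_t>((static_cast<__uint128_t>(a) * b) >> 64);
}

// Same expression as DividerHelper<32>::computeMultiplier above (N == 32).
static uint64_t compute_multiplier(int log_div, uint32_t divider) {
  return (static_cast<uint64_t>(1) << (32 + log_div)) / divider -
         (static_cast<uint64_t>(1) << 32) + 1;
}

int main() {
  const uint32_t d = 7;
  const int log_div = 3;  // ceil(log2(7))
  const uint64_t m = compute_multiplier(log_div, d);
  for (uint32_t n = 0; n < 100000; ++n) {
    const uint32_t t1 = static_cast<uint32_t>((m * n) >> 32);    // "muluh" at 32-bit width
    const uint32_t q = (t1 + ((n - t1) >> 1)) >> (log_div - 1);  // divide without '/'
    assert(q == n / d);
  }
  assert(muluh(1ULL << 63, 4) == 2);  // high half of 2^65
  return 0;
}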
@@ -141,6 +141,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         m_unshuffledInputStrides[i] =
             m_unshuffledInputStrides[i - 1] * input_dims[i - 1];
         m_outputStrides[i] = m_outputStrides[i - 1] * m_dimensions[i - 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     } else {
       m_unshuffledInputStrides[NumDims - 1] = 1;
@@ -149,6 +150,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
         m_unshuffledInputStrides[i] =
             m_unshuffledInputStrides[i + 1] * input_dims[i + 1];
         m_outputStrides[i] = m_outputStrides[i + 1] * m_dimensions[i + 1];
+        m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(m_outputStrides[i]);
       }
     }
 
@@ -319,14 +321,14 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
     Index inputIndex = 0;
     if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) {
       for (int i = NumDims - 1; i > 0; --i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
       return inputIndex + index * m_inputStrides[0];
     } else {
       for (int i = 0; i < NumDims - 1; ++i) {
-        const Index idx = index / m_outputStrides[i];
+        const Index idx = index / m_fastOutputStrides[i];
         inputIndex += idx * m_inputStrides[i];
         index -= idx * m_outputStrides[i];
       }
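Note: srcCoeff() de-linearizes an output index one dimension at a time, and each step divides by an output stride; precomputing TensorIntDivisor objects (m_fastOutputStrides) turns those per-element divisions into multiply/shift sequences. A sketch of the loop structure with plain '/' standing in for the fast divisor (the strides below are hypothetical, for a 2x3 column-major tensor shuffled with {1, 0}):

#include <array>
#include <cassert>
#include <cstdint>

template <size_t NumDims>
int64_t src_index(int64_t index,
                  const std::array<int64_t, NumDims>& output_strides,
                  const std::array<int64_t, NumDims>& input_strides) {
  int64_t input_index = 0;
  for (int i = static_cast<int>(NumDims) - 1; i > 0; --i) {
    const int64_t idx = index / output_strides[i];  // the division being replaced
    input_index += idx * input_strides[i];
    index -= idx * output_strides[i];
  }
  return input_index + index * input_strides[0];
}

int main() {
  // Input is 2x3 (column-major strides 1, 2); shuffle {1, 0} makes the output 3x2.
  const std::array<int64_t, 2> output_strides = {1, 3};  // strides of the 3x2 output
  const std::array<int64_t, 2> input_strides = {2, 1};   // input strides, reordered by the shuffle
  assert(src_index<2>(1, output_strides, input_strides) == 2);  // output (1,0) -> input (0,1)
  assert(src_index<2>(4, output_strides, input_strides) == 3);  // output (1,1) -> input (1,1)
  return 0;
}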
@@ -338,6 +340,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device>
   Dimensions m_dimensions;
   array<Index, NumDims> m_inverseShuffle;
   array<Index, NumDims> m_outputStrides;
+  array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides;
   array<Index, NumDims> m_inputStrides;
   array<Index, NumDims> m_unshuffledInputStrides;
   TensorEvaluator<ArgType, Device> m_impl;
third_party/eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h (new vendored file, 232 lines)
@@ -0,0 +1,232 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+#define EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
+
+namespace Eigen {
+namespace internal {
+
+template <uint64_t n>
+struct static_val {
+  static const uint64_t value = n;
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { }
+  template <typename T>
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) {
+    eigen_assert(v == n);
+  }
+};
+
+
+template <typename HIGH = uint64_t, typename LOW = uint64_t>
+struct TensorUInt128
+{
+  HIGH high;
+  LOW low;
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(int x) : high(0), low(x) {
+    eigen_assert(x >= 0);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(int64_t x) : high(0), low(x) {
+    eigen_assert(x >= 0);
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(uint64_t x) : high(0), low(x) { }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+  TensorUInt128(uint64_t y, uint64_t x) : high(y), low(x) { }
+
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator LOW() const {
+    return low;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE LOW lower() const {
+    return low;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE HIGH upper() const {
+    return high;
+  }
+};
+
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  return (lhs.high == rhs.high) & (lhs.low == rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  return (lhs.high != rhs.high) | (lhs.low != rhs.low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  if (lhs.high != rhs.high) {
+    return lhs.high > rhs.high;
+  }
+  return lhs.low >= rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  if (lhs.high != rhs.high) {
+    return lhs.high < rhs.high;
+  }
+  return lhs.low < rhs.low;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low);
+  if (result.low < rhs.low) {
+    result.high += 1;
+  }
+  return result;
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
+static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low);
+  if (result.low > lhs.low) {
+    result.high -= 1;
+  }
+  return result;
+}
+
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  // Split each 128-bit integer into 4 32-bit integers, and then do the
+  // multiplications by hand as follow:
+  //   lhs      a  b  c  d
+  //   rhs      e  f  g  h
+  //            -----------
+  //           ah bh ch dh
+  //              bg cg dg
+  //                 cf df
+  //                    de
+  // The result is stored in 2 64bit integers, high and low.
+
+  const uint64_t LOW = 0x00000000FFFFFFFFLL;
+  const uint64_t HIGH = 0xFFFFFFFF00000000LL;
+
+  uint64_t d = lhs.low & LOW;
+  uint64_t c = (lhs.low & HIGH) >> 32LL;
+  uint64_t b = lhs.high & LOW;
+  uint64_t a = (lhs.high & HIGH) >> 32LL;
+
+  uint64_t h = rhs.low & LOW;
+  uint64_t g = (rhs.low & HIGH) >> 32LL;
+  uint64_t f = rhs.high & LOW;
+  uint64_t e = (rhs.high & HIGH) >> 32LL;
+
+  // Compute the low 32 bits of low
+  uint64_t acc = d * h;
+  uint64_t low = acc & LOW;
+  // Compute the high 32 bits of low. Add a carry every time we wrap around
+  acc >>= 32LL;
+  uint64_t carry = 0;
+  uint64_t acc2 = acc + c * h;
+  if (acc2 < acc) {
+    carry++;
+  }
+  acc = acc2 + d * g;
+  if (acc < acc2) {
+    carry++;
+  }
+  low |= (acc << 32LL);
+
+  // Carry forward the high bits of acc to initiate the computation of the
+  // low 32 bits of high
+  acc2 = (acc >> 32LL) | (carry << 32LL);
+  carry = 0;
+
+  acc = acc2 + b * h;
+  if (acc < acc2) {
+    carry++;
+  }
+  acc2 = acc + c * g;
+  if (acc2 < acc) {
+    carry++;
+  }
+  acc = acc2 + d * f;
+  if (acc < acc2) {
+    carry++;
+  }
+  uint64_t high = acc & LOW;
+
+  // Start to compute the high 32 bits of high.
+  acc2 = (acc >> 32LL) | (carry << 32LL);
+
+  acc = acc2 + a * h;
+  acc2 = acc + b * g;
+  acc = acc2 + c * f;
+  acc2 = acc + d * e;
+  high |= (acc2 << 32LL);
+
+  return TensorUInt128<uint64_t, uint64_t>(high, low);
+}
+
+template <typename HL, typename LL, typename HR, typename LR>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+static TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs)
+{
+  if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) {
+    return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+  } else if (lhs < rhs) {
+    return TensorUInt128<uint64_t, uint64_t>(0);
+  } else {
+    // calculate the biggest power of 2 times rhs that's less than or equal to lhs
+    TensorUInt128<uint64_t, uint64_t> power2(1);
+    TensorUInt128<uint64_t, uint64_t> d(rhs);
+    TensorUInt128<uint64_t, uint64_t> tmp(lhs - d);
+    while (lhs >= d) {
+      tmp = tmp - d;
+      d = d + d;
+      power2 = power2 + power2;
+    }
+
+    tmp = TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low);
+    TensorUInt128<uint64_t, uint64_t> result(0);
+    while (power2 != TensorUInt128<static_val<0>, static_val<0> >(0)) {
+      if (tmp >= d) {
+        tmp = tmp - d;
+        result = result + power2;
+      }
+      // Shift right
+      power2 = TensorUInt128<uint64_t, uint64_t>(power2.high >> 1, (power2.low >> 1) | (power2.high << 63));
+      d = TensorUInt128<uint64_t, uint64_t>(d.high >> 1, (d.low >> 1) | (d.high << 63));
+    }
+
+    return result;
+  }
+}
+
+
+} // namespace internal
+} // namespace Eigen
+
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_UINT128_H
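Note: TensorUInt128 gives the divider code an unsigned 128-bit type that works on devices without __int128; its operator* does schoolbook multiplication over 32-bit limbs and its operator/ is a shift-and-subtract long division. A host-only sanity sketch of the same limb idea, restricted to 64x64 -> 128 (the case the muluh fallback needs) and checked against __uint128_t (assumes a compiler that provides it):

#include <cassert>
#include <cstdint>

// Schoolbook 64x64 -> 128-bit multiply over 32-bit limbs.
static void mul64x64(uint64_t a, uint64_t b, uint64_t* hi, uint64_t* lo) {
  const uint64_t a_lo = a & 0xFFFFFFFFu, a_hi = a >> 32;
  const uint64_t b_lo = b & 0xFFFFFFFFu, b_hi = b >> 32;

  const uint64_t p0 = a_lo * b_lo;  // bits [0, 64)
  const uint64_t p1 = a_lo * b_hi;  // bits [32, 96)
  const uint64_t p2 = a_hi * b_lo;  // bits [32, 96)
  const uint64_t p3 = a_hi * b_hi;  // bits [64, 128)

  // Sum the middle terms; 'mid' also captures the carry out of bit 64.
  const uint64_t mid = (p0 >> 32) + (p1 & 0xFFFFFFFFu) + (p2 & 0xFFFFFFFFu);
  *lo = (p0 & 0xFFFFFFFFu) | (mid << 32);
  *hi = p3 + (p1 >> 32) + (p2 >> 32) + (mid >> 32);
}

int main() {
  const uint64_t a = 0xDEADBEEFCAFEF00DULL;
  const uint64_t b = 0xFEEDFACE12345678ULL;
  uint64_t hi, lo;
  mul64x64(a, b, &hi, &lo);
  const __uint128_t ref = static_cast<__uint128_t>(a) * b;
  assert(lo == static_cast<uint64_t>(ref));
  assert(hi == static_cast<uint64_t>(ref >> 64));
  return 0;
}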