Upgraded to the latest version of Eigen, which speeds up full reductions on fp16
by about 3 orders of magnitude and some partial reductions by 30% when using CUDA 7.5 or above. Change: 122191448
This commit is contained in:
parent
6207603e93
commit
df7276a15c
@ -1,6 +1,6 @@
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
archive_dir = "eigen-eigen-50812b426b7c"
|
||||
archive_dir = "eigen-eigen-aaca054ed24d"
|
||||
|
||||
cc_library(
|
||||
name = "eigen",
|
||||
|
@ -7,7 +7,7 @@
|
||||
|
||||
include (ExternalProject)
|
||||
|
||||
set(eigen_archive_hash "50812b426b7c")
|
||||
set(eigen_archive_hash "aaca054ed24d")
|
||||
|
||||
set(eigen_INCLUDE_DIRS
|
||||
${CMAKE_CURRENT_BINARY_DIR}
|
||||
@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
|
||||
${tensorflow_source_dir}/third_party/eigen3
|
||||
)
|
||||
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
|
||||
set(eigen_HASH SHA256=fa95e425c379c2c7b8a49d9ef7bd0c5a8369171c987affd6dbae5de8a8911c1a)
|
||||
set(eigen_HASH SHA256=4abff4b7ba03316856aeece6de99abb74a1dfed453fdab85eed4da2b5fd2fc59)
|
||||
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
|
||||
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)
|
||||
|
||||
|
@ -122,14 +122,19 @@ class EigenAllocator : public ::Eigen::Allocator {
|
||||
#else
|
||||
class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
|
||||
public:
|
||||
EigenCudaStreamDevice() { Eigen::initializeDeviceProp(); }
|
||||
|
||||
EigenCudaStreamDevice() : scratch_(nullptr) { Eigen::initializeDeviceProp(); }
|
||||
~EigenCudaStreamDevice() {
|
||||
if (scratch_) {
|
||||
deallocate(scratch_);
|
||||
}
|
||||
}
|
||||
void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
|
||||
int gpu_id, ::tensorflow::Allocator* alloc) {
|
||||
if (LogMemory::IsEnabled()) {
|
||||
operation_ = context->op_kernel().name() + "/EigenAllocator";
|
||||
step_id_ = context->step_id();
|
||||
}
|
||||
assert(!scratch_);
|
||||
stream_ = cuda_stream;
|
||||
allocator_ = alloc;
|
||||
device_prop_ = &Eigen::m_deviceProperties[gpu_id];
|
||||
@ -163,6 +168,15 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
|
||||
CHECK_EQ(err, cudaSuccess);
|
||||
}
|
||||
|
||||
// Return a pointer to a per-stream scratchpad of 1024 bytes residing
|
||||
// in global memory.
|
||||
void* scratchpad() const {
|
||||
if (scratch_ == nullptr) {
|
||||
scratch_ = allocate(1024);
|
||||
}
|
||||
return scratch_;
|
||||
}
|
||||
|
||||
private:
|
||||
struct AsyncFreeData {
|
||||
AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
|
||||
@ -190,6 +204,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
|
||||
const cudaStream_t* stream_; // Not owned.
|
||||
const cudaDeviceProp* device_prop_; // Not owned.
|
||||
::tensorflow::Allocator* allocator_; // Not owned.
|
||||
mutable void* scratch_;
|
||||
|
||||
TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
|
||||
};
|
||||
|
@ -41,8 +41,8 @@ class XentTest(tf.test.TestCase):
|
||||
loss = tf.nn.softmax_cross_entropy_with_logits(np_features, np_labels)
|
||||
backprop = loss.op.outputs[1]
|
||||
tf_loss, tf_backprop = sess.run([loss, backprop])
|
||||
self.assertAllClose(np_loss, tf_loss)
|
||||
self.assertAllClose(np_backprop, tf_backprop)
|
||||
self.assertAllCloseAccordingToType(np_loss, tf_loss)
|
||||
self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
|
||||
|
||||
def _testAll(self, features, labels):
|
||||
self._testXent(features, labels, use_gpu=False)
|
||||
|
@ -13,8 +13,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
|
||||
|
||||
native.new_http_archive(
|
||||
name = "eigen_archive",
|
||||
url = "https://bitbucket.org/eigen/eigen/get/50812b426b7c.tar.gz",
|
||||
sha256 = "fa95e425c379c2c7b8a49d9ef7bd0c5a8369171c987affd6dbae5de8a8911c1a",
|
||||
url = "https://bitbucket.org/eigen/eigen/get/aaca054ed24d.tar.gz",
|
||||
sha256 = "4abff4b7ba03316856aeece6de99abb74a1dfed453fdab85eed4da2b5fd2fc59",
|
||||
build_file = path_prefix + "eigen.BUILD",
|
||||
)
|
||||
|
||||
|
2
third_party/eigen3/Eigen/Cholesky
vendored
2
third_party/eigen3/Eigen/Cholesky
vendored
@ -1 +1 @@
|
||||
#include "eigen-eigen-50812b426b7c/Eigen/Cholesky"
|
||||
#include "eigen-eigen-aaca054ed24d/Eigen/Cholesky"
|
||||
|
2
third_party/eigen3/Eigen/Core
vendored
2
third_party/eigen3/Eigen/Core
vendored
@ -1 +1 @@
|
||||
#include "eigen-eigen-50812b426b7c/Eigen/Core"
|
||||
#include "eigen-eigen-aaca054ed24d/Eigen/Core"
|
||||
|
2
third_party/eigen3/Eigen/Eigenvalues
vendored
2
third_party/eigen3/Eigen/Eigenvalues
vendored
@ -1 +1 @@
|
||||
#include "eigen-eigen-50812b426b7c/Eigen/Eigenvalues"
|
||||
#include "eigen-eigen-aaca054ed24d/Eigen/Eigenvalues"
|
||||
|
2
third_party/eigen3/Eigen/LU
vendored
2
third_party/eigen3/Eigen/LU
vendored
@ -1 +1 @@
|
||||
#include "eigen-eigen-50812b426b7c/Eigen/LU"
|
||||
#include "eigen-eigen-aaca054ed24d/Eigen/LU"
|
||||
|
2
third_party/eigen3/Eigen/QR
vendored
2
third_party/eigen3/Eigen/QR
vendored
@ -1 +1 @@
|
||||
#include "eigen-eigen-50812b426b7c/Eigen/QR"
|
||||
#include "eigen-eigen-aaca054ed24d/Eigen/QR"
|
||||
|
@ -1 +1 @@
|
||||
#include "eigen-eigen-50812b426b7c/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "eigen-eigen-aaca054ed24d/unsupported/Eigen/CXX11/Tensor"
|
||||
|
Loading…
x
Reference in New Issue
Block a user