Upgraded to the latest version of Eigen that speeds up full reductions on fp16

by about 3 orders of magnitude, as well as some partial reductions by 30%, when using CUDA 7.5 or above.
Change: 122191448
This commit is contained in:
Benoit Steiner 2016-05-12 12:32:46 -08:00 committed by TensorFlower Gardener
parent 6207603e93
commit df7276a15c
11 changed files with 30 additions and 15 deletions

View File

@ -1,6 +1,6 @@
package(default_visibility = ["//visibility:public"])
archive_dir = "eigen-eigen-50812b426b7c"
archive_dir = "eigen-eigen-aaca054ed24d"
cc_library(
name = "eigen",

View File

@ -7,7 +7,7 @@
include (ExternalProject)
set(eigen_archive_hash "50812b426b7c")
set(eigen_archive_hash "aaca054ed24d")
set(eigen_INCLUDE_DIRS
${CMAKE_CURRENT_BINARY_DIR}
@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
${tensorflow_source_dir}/third_party/eigen3
)
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
set(eigen_HASH SHA256=fa95e425c379c2c7b8a49d9ef7bd0c5a8369171c987affd6dbae5de8a8911c1a)
set(eigen_HASH SHA256=4abff4b7ba03316856aeece6de99abb74a1dfed453fdab85eed4da2b5fd2fc59)
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)

View File

@ -122,14 +122,19 @@ class EigenAllocator : public ::Eigen::Allocator {
#else
class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
public:
EigenCudaStreamDevice() { Eigen::initializeDeviceProp(); }
EigenCudaStreamDevice() : scratch_(nullptr) { Eigen::initializeDeviceProp(); }
// Hands the scratchpad buffer back through deallocate() when scratch_ is
// non-null; does nothing otherwise.
~EigenCudaStreamDevice() {
  if (scratch_ != nullptr) deallocate(scratch_);
}
void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
int gpu_id, ::tensorflow::Allocator* alloc) {
if (LogMemory::IsEnabled()) {
operation_ = context->op_kernel().name() + "/EigenAllocator";
step_id_ = context->step_id();
}
assert(!scratch_);
stream_ = cuda_stream;
allocator_ = alloc;
device_prop_ = &Eigen::m_deviceProperties[gpu_id];
@ -163,6 +168,15 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
CHECK_EQ(err, cudaSuccess);
}
// Returns the per-stream scratchpad: a 1024-byte buffer obtained from
// allocate() (described by the original author as residing in global memory).
// The buffer is requested on the first call only; the pointer is cached in
// scratch_ and handed back unchanged on every later call.
void* scratchpad() const {
  // Fast path: the buffer already exists.
  if (scratch_ != nullptr) return scratch_;
  // First use: scratch_ is declared mutable, so it can be filled in from
  // this const accessor.
  scratch_ = allocate(1024);
  return scratch_;
}
private:
struct AsyncFreeData {
AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
@ -190,6 +204,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
const cudaStream_t* stream_; // Not owned.
const cudaDeviceProp* device_prop_; // Not owned.
::tensorflow::Allocator* allocator_; // Not owned.
mutable void* scratch_;
TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
};

View File

@ -41,8 +41,8 @@ class XentTest(tf.test.TestCase):
loss = tf.nn.softmax_cross_entropy_with_logits(np_features, np_labels)
backprop = loss.op.outputs[1]
tf_loss, tf_backprop = sess.run([loss, backprop])
self.assertAllClose(np_loss, tf_loss)
self.assertAllClose(np_backprop, tf_backprop)
self.assertAllCloseAccordingToType(np_loss, tf_loss)
self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
def _testAll(self, features, labels):
self._testXent(features, labels, use_gpu=False)

View File

@ -13,8 +13,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
native.new_http_archive(
name = "eigen_archive",
url = "https://bitbucket.org/eigen/eigen/get/50812b426b7c.tar.gz",
sha256 = "fa95e425c379c2c7b8a49d9ef7bd0c5a8369171c987affd6dbae5de8a8911c1a",
url = "https://bitbucket.org/eigen/eigen/get/aaca054ed24d.tar.gz",
sha256 = "4abff4b7ba03316856aeece6de99abb74a1dfed453fdab85eed4da2b5fd2fc59",
build_file = path_prefix + "eigen.BUILD",
)

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/Cholesky"
#include "eigen-eigen-aaca054ed24d/Eigen/Cholesky"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/Core"
#include "eigen-eigen-aaca054ed24d/Eigen/Core"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/Eigenvalues"
#include "eigen-eigen-aaca054ed24d/Eigen/Eigenvalues"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/LU"
#include "eigen-eigen-aaca054ed24d/Eigen/LU"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/QR"
#include "eigen-eigen-aaca054ed24d/Eigen/QR"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/unsupported/Eigen/CXX11/Tensor"
#include "eigen-eigen-aaca054ed24d/unsupported/Eigen/CXX11/Tensor"