Upgraded to the latest version of Eigen that speeds up full reductions on fp16

by about 3 orders of magnitude, as well as some partial reductions by 30%, when using CUDA 7.5 or above
Change: 122191448
This commit is contained in:
Benoit Steiner 2016-05-12 12:32:46 -08:00 committed by TensorFlower Gardener
parent 6207603e93
commit df7276a15c
11 changed files with 30 additions and 15 deletions

View File

@ -1,6 +1,6 @@
package(default_visibility = ["//visibility:public"]) package(default_visibility = ["//visibility:public"])
archive_dir = "eigen-eigen-50812b426b7c" archive_dir = "eigen-eigen-aaca054ed24d"
cc_library( cc_library(
name = "eigen", name = "eigen",

View File

@ -7,7 +7,7 @@
include (ExternalProject) include (ExternalProject)
set(eigen_archive_hash "50812b426b7c") set(eigen_archive_hash "aaca054ed24d")
set(eigen_INCLUDE_DIRS set(eigen_INCLUDE_DIRS
${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}
@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
${tensorflow_source_dir}/third_party/eigen3 ${tensorflow_source_dir}/third_party/eigen3
) )
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz) set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
set(eigen_HASH SHA256=fa95e425c379c2c7b8a49d9ef7bd0c5a8369171c987affd6dbae5de8a8911c1a) set(eigen_HASH SHA256=4abff4b7ba03316856aeece6de99abb74a1dfed453fdab85eed4da2b5fd2fc59)
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen) set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install) set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)

View File

@ -122,14 +122,19 @@ class EigenAllocator : public ::Eigen::Allocator {
#else #else
class EigenCudaStreamDevice : public ::Eigen::StreamInterface { class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
public: public:
EigenCudaStreamDevice() { Eigen::initializeDeviceProp(); } EigenCudaStreamDevice() : scratch_(nullptr) { Eigen::initializeDeviceProp(); }
// Hands the scratch buffer back through deallocate() when one is held;
// there is nothing to release while scratch_ is still null.
~EigenCudaStreamDevice() {
  if (scratch_ != nullptr) deallocate(scratch_);
}
void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream, void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
int gpu_id, ::tensorflow::Allocator* alloc) { int gpu_id, ::tensorflow::Allocator* alloc) {
if (LogMemory::IsEnabled()) { if (LogMemory::IsEnabled()) {
operation_ = context->op_kernel().name() + "/EigenAllocator"; operation_ = context->op_kernel().name() + "/EigenAllocator";
step_id_ = context->step_id(); step_id_ = context->step_id();
} }
assert(!scratch_);
stream_ = cuda_stream; stream_ = cuda_stream;
allocator_ = alloc; allocator_ = alloc;
device_prop_ = &Eigen::m_deviceProperties[gpu_id]; device_prop_ = &Eigen::m_deviceProperties[gpu_id];
@ -163,6 +168,15 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
CHECK_EQ(err, cudaSuccess); CHECK_EQ(err, cudaSuccess);
} }
// Returns the per stream scratchpad of 1024 bytes residing in global memory.
// The buffer is obtained from allocate() on the first call and cached in
// scratch_, so subsequent calls return the same pointer. The method is const
// and can still populate the cache because scratch_ is declared mutable.
void* scratchpad() const {
  if (scratch_ != nullptr) {
    return scratch_;
  }
  scratch_ = allocate(1024);
  return scratch_;
}
private: private:
struct AsyncFreeData { struct AsyncFreeData {
AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o, AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
@ -190,6 +204,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
const cudaStream_t* stream_; // Not owned. const cudaStream_t* stream_; // Not owned.
const cudaDeviceProp* device_prop_; // Not owned. const cudaDeviceProp* device_prop_; // Not owned.
::tensorflow::Allocator* allocator_; // Not owned. ::tensorflow::Allocator* allocator_; // Not owned.
mutable void* scratch_;
TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice); TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
}; };

View File

@ -41,8 +41,8 @@ class XentTest(tf.test.TestCase):
loss = tf.nn.softmax_cross_entropy_with_logits(np_features, np_labels) loss = tf.nn.softmax_cross_entropy_with_logits(np_features, np_labels)
backprop = loss.op.outputs[1] backprop = loss.op.outputs[1]
tf_loss, tf_backprop = sess.run([loss, backprop]) tf_loss, tf_backprop = sess.run([loss, backprop])
self.assertAllClose(np_loss, tf_loss) self.assertAllCloseAccordingToType(np_loss, tf_loss)
self.assertAllClose(np_backprop, tf_backprop) self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
def _testAll(self, features, labels): def _testAll(self, features, labels):
self._testXent(features, labels, use_gpu=False) self._testXent(features, labels, use_gpu=False)

View File

@ -13,8 +13,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
native.new_http_archive( native.new_http_archive(
name = "eigen_archive", name = "eigen_archive",
url = "https://bitbucket.org/eigen/eigen/get/50812b426b7c.tar.gz", url = "https://bitbucket.org/eigen/eigen/get/aaca054ed24d.tar.gz",
sha256 = "fa95e425c379c2c7b8a49d9ef7bd0c5a8369171c987affd6dbae5de8a8911c1a", sha256 = "4abff4b7ba03316856aeece6de99abb74a1dfed453fdab85eed4da2b5fd2fc59",
build_file = path_prefix + "eigen.BUILD", build_file = path_prefix + "eigen.BUILD",
) )

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/Cholesky" #include "eigen-eigen-aaca054ed24d/Eigen/Cholesky"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/Core" #include "eigen-eigen-aaca054ed24d/Eigen/Core"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/Eigenvalues" #include "eigen-eigen-aaca054ed24d/Eigen/Eigenvalues"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/LU" #include "eigen-eigen-aaca054ed24d/Eigen/LU"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/Eigen/QR" #include "eigen-eigen-aaca054ed24d/Eigen/QR"

View File

@ -1 +1 @@
#include "eigen-eigen-50812b426b7c/unsupported/Eigen/CXX11/Tensor" #include "eigen-eigen-aaca054ed24d/unsupported/Eigen/CXX11/Tensor"