Upgraded to the latest version of Eigen, which speeds up full reductions on fp16
by about 3 orders of magnitude and some partial reductions by 30% when using
CUDA 7.5 or above.
Change: 122191448
This commit is contained in:
parent
6207603e93
commit
df7276a15c
@ -1,6 +1,6 @@
|
|||||||
package(default_visibility = ["//visibility:public"])
|
package(default_visibility = ["//visibility:public"])
|
||||||
|
|
||||||
archive_dir = "eigen-eigen-50812b426b7c"
|
archive_dir = "eigen-eigen-aaca054ed24d"
|
||||||
|
|
||||||
cc_library(
|
cc_library(
|
||||||
name = "eigen",
|
name = "eigen",
|
||||||
|
@ -7,7 +7,7 @@
|
|||||||
|
|
||||||
include (ExternalProject)
|
include (ExternalProject)
|
||||||
|
|
||||||
set(eigen_archive_hash "50812b426b7c")
|
set(eigen_archive_hash "aaca054ed24d")
|
||||||
|
|
||||||
set(eigen_INCLUDE_DIRS
|
set(eigen_INCLUDE_DIRS
|
||||||
${CMAKE_CURRENT_BINARY_DIR}
|
${CMAKE_CURRENT_BINARY_DIR}
|
||||||
@ -16,7 +16,7 @@ set(eigen_INCLUDE_DIRS
|
|||||||
${tensorflow_source_dir}/third_party/eigen3
|
${tensorflow_source_dir}/third_party/eigen3
|
||||||
)
|
)
|
||||||
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
|
set(eigen_URL https://bitbucket.org/eigen/eigen/get/${eigen_archive_hash}.tar.gz)
|
||||||
set(eigen_HASH SHA256=fa95e425c379c2c7b8a49d9ef7bd0c5a8369171c987affd6dbae5de8a8911c1a)
|
set(eigen_HASH SHA256=4abff4b7ba03316856aeece6de99abb74a1dfed453fdab85eed4da2b5fd2fc59)
|
||||||
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
|
set(eigen_BUILD ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen)
|
||||||
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)
|
set(eigen_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/eigen/install)
|
||||||
|
|
||||||
|
@ -122,14 +122,19 @@ class EigenAllocator : public ::Eigen::Allocator {
|
|||||||
#else
|
#else
|
||||||
class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
|
class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
|
||||||
public:
|
public:
|
||||||
EigenCudaStreamDevice() { Eigen::initializeDeviceProp(); }
|
EigenCudaStreamDevice() : scratch_(nullptr) { Eigen::initializeDeviceProp(); }
|
||||||
|
~EigenCudaStreamDevice() {
|
||||||
|
if (scratch_) {
|
||||||
|
deallocate(scratch_);
|
||||||
|
}
|
||||||
|
}
|
||||||
void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
|
void Reinitialize(OpKernelContext* context, const cudaStream_t* cuda_stream,
|
||||||
int gpu_id, ::tensorflow::Allocator* alloc) {
|
int gpu_id, ::tensorflow::Allocator* alloc) {
|
||||||
if (LogMemory::IsEnabled()) {
|
if (LogMemory::IsEnabled()) {
|
||||||
operation_ = context->op_kernel().name() + "/EigenAllocator";
|
operation_ = context->op_kernel().name() + "/EigenAllocator";
|
||||||
step_id_ = context->step_id();
|
step_id_ = context->step_id();
|
||||||
}
|
}
|
||||||
|
assert(!scratch_);
|
||||||
stream_ = cuda_stream;
|
stream_ = cuda_stream;
|
||||||
allocator_ = alloc;
|
allocator_ = alloc;
|
||||||
device_prop_ = &Eigen::m_deviceProperties[gpu_id];
|
device_prop_ = &Eigen::m_deviceProperties[gpu_id];
|
||||||
@ -163,6 +168,15 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
|
|||||||
CHECK_EQ(err, cudaSuccess);
|
CHECK_EQ(err, cudaSuccess);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return a pointer to a per stream scratchpad of 1024 bytes residing
|
||||||
|
// in global memory.
|
||||||
|
void* scratchpad() const {
|
||||||
|
if (scratch_ == nullptr) {
|
||||||
|
scratch_ = allocate(1024);
|
||||||
|
}
|
||||||
|
return scratch_;
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
struct AsyncFreeData {
|
struct AsyncFreeData {
|
||||||
AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
|
AsyncFreeData(::tensorflow::Allocator* a, void* p, const string& o,
|
||||||
@ -190,6 +204,7 @@ class EigenCudaStreamDevice : public ::Eigen::StreamInterface {
|
|||||||
const cudaStream_t* stream_; // Not owned.
|
const cudaStream_t* stream_; // Not owned.
|
||||||
const cudaDeviceProp* device_prop_; // Not owned.
|
const cudaDeviceProp* device_prop_; // Not owned.
|
||||||
::tensorflow::Allocator* allocator_; // Not owned.
|
::tensorflow::Allocator* allocator_; // Not owned.
|
||||||
|
mutable void* scratch_;
|
||||||
|
|
||||||
TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
|
TF_DISALLOW_COPY_AND_ASSIGN(EigenCudaStreamDevice);
|
||||||
};
|
};
|
||||||
|
@ -41,8 +41,8 @@ class XentTest(tf.test.TestCase):
|
|||||||
loss = tf.nn.softmax_cross_entropy_with_logits(np_features, np_labels)
|
loss = tf.nn.softmax_cross_entropy_with_logits(np_features, np_labels)
|
||||||
backprop = loss.op.outputs[1]
|
backprop = loss.op.outputs[1]
|
||||||
tf_loss, tf_backprop = sess.run([loss, backprop])
|
tf_loss, tf_backprop = sess.run([loss, backprop])
|
||||||
self.assertAllClose(np_loss, tf_loss)
|
self.assertAllCloseAccordingToType(np_loss, tf_loss)
|
||||||
self.assertAllClose(np_backprop, tf_backprop)
|
self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
|
||||||
|
|
||||||
def _testAll(self, features, labels):
|
def _testAll(self, features, labels):
|
||||||
self._testXent(features, labels, use_gpu=False)
|
self._testXent(features, labels, use_gpu=False)
|
||||||
|
@ -13,8 +13,8 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
|
|||||||
|
|
||||||
native.new_http_archive(
|
native.new_http_archive(
|
||||||
name = "eigen_archive",
|
name = "eigen_archive",
|
||||||
url = "https://bitbucket.org/eigen/eigen/get/50812b426b7c.tar.gz",
|
url = "https://bitbucket.org/eigen/eigen/get/aaca054ed24d.tar.gz",
|
||||||
sha256 = "fa95e425c379c2c7b8a49d9ef7bd0c5a8369171c987affd6dbae5de8a8911c1a",
|
sha256 = "4abff4b7ba03316856aeece6de99abb74a1dfed453fdab85eed4da2b5fd2fc59",
|
||||||
build_file = path_prefix + "eigen.BUILD",
|
build_file = path_prefix + "eigen.BUILD",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
2
third_party/eigen3/Eigen/Cholesky
vendored
2
third_party/eigen3/Eigen/Cholesky
vendored
@ -1 +1 @@
|
|||||||
#include "eigen-eigen-50812b426b7c/Eigen/Cholesky"
|
#include "eigen-eigen-aaca054ed24d/Eigen/Cholesky"
|
||||||
|
2
third_party/eigen3/Eigen/Core
vendored
2
third_party/eigen3/Eigen/Core
vendored
@ -1 +1 @@
|
|||||||
#include "eigen-eigen-50812b426b7c/Eigen/Core"
|
#include "eigen-eigen-aaca054ed24d/Eigen/Core"
|
||||||
|
2
third_party/eigen3/Eigen/Eigenvalues
vendored
2
third_party/eigen3/Eigen/Eigenvalues
vendored
@ -1 +1 @@
|
|||||||
#include "eigen-eigen-50812b426b7c/Eigen/Eigenvalues"
|
#include "eigen-eigen-aaca054ed24d/Eigen/Eigenvalues"
|
||||||
|
2
third_party/eigen3/Eigen/LU
vendored
2
third_party/eigen3/Eigen/LU
vendored
@ -1 +1 @@
|
|||||||
#include "eigen-eigen-50812b426b7c/Eigen/LU"
|
#include "eigen-eigen-aaca054ed24d/Eigen/LU"
|
||||||
|
2
third_party/eigen3/Eigen/QR
vendored
2
third_party/eigen3/Eigen/QR
vendored
@ -1 +1 @@
|
|||||||
#include "eigen-eigen-50812b426b7c/Eigen/QR"
|
#include "eigen-eigen-aaca054ed24d/Eigen/QR"
|
||||||
|
@ -1 +1 @@
|
|||||||
#include "eigen-eigen-50812b426b7c/unsupported/Eigen/CXX11/Tensor"
|
#include "eigen-eigen-aaca054ed24d/unsupported/Eigen/CXX11/Tensor"
|
||||||
|
Loading…
x
Reference in New Issue
Block a user