Merge changes from github.
END_PUBLIC --- Commit1e1b3d902
authored by Pete Warden<pete@petewarden.com> Committed by gunan<gunan@google.com>: Changed output directory for Pi CI build to fix permissions problem with nightlies (#13257) * Fix for RTLD_GLOBAL breakage of Pi builds, and removed Eigen version change for Pi that's no longer needed * Fixed Pi Zero OpenBLAS build problems and tidied up directories used * More robust checks in Pi build script * Changed output directory for Pi CI build to fix permissions problem --- Commitfe3a2e65c
authored by Yan Facai (???)<facai.yan@gmail.com> Committed by drpngx<drpngx@users.noreply.github.com>: check invalid string type for dest_nodes in extract_sub_graph (#13057) * BUG: check str type * TST: add unit test * CLN: remove list check * CLN: use warning * CLN: 2 indent * CLN: raise TypeError if not list * CLN: check string only --- Commit225ab7629
authored by Jean Wanka<jm.wanka@gmail.com> Committed by Jean Wanka<jm.wanka@gmail.com>: Fix polynomial decay with cycle for global step=0 For polynomial decay with cycle=True the learning rate at step 0 becomes NaN, because in the process of calculating it we devide by 0. This change should fix it, by setting the multiplier for the decay steps to one for global_step=0. --- Commit286f57061
authored by Bjarke Hammersholt Roune<broune@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Make Service::TransferToClient not attempt to manipulate the literal when the transfer failed, preventing a crash and allowing the caller to see the reason for the failed transfer. PiperOrigin-RevId: 169770126 --- Commite0501bc4d
authored by Yong Tang<yong.tang.github@outlook.com> Committed by Shanqing Cai<cais@google.com>: Fix GRUBlockCell parameter naming inconsistency (#13153) * Fix GRUBlockCell parameter naming inconsistency This fix tries to fix the issue in 13137 where parameter `cell_size` is used instead of `num_units`. This is inconsistent with other RNN cells. This fix adds support of `num_units` while at the same time maintains backward compatiblility for `cell_size`. This fix fixes 13137. Signed-off-by: Yong Tang <yong.tang.github@outlook.com> * Add `@deprecated_args` for 'cell_size' in `GRUBlockCell` This commit adds `@deprecated_args` for 'cell_size' in `GRUBlockCell` Signed-off-by: Yong Tang <yong.tang.github@outlook.com> * Address review comment Signed-off-by: Yong Tang <yong.tang.github@outlook.com> --- Commit02a2eba05
authored by Pete Warden<pete@petewarden.com> Committed by gunan<gunan@google.com>: Fix for RTLD_GLOBAL breakage of Pi builds, and removed Eigen version change that's no longer needed (#13251) * Fix for RTLD_GLOBAL breakage of Pi builds, and removed Eigen version change for Pi that's no longer needed * Fixed Pi Zero OpenBLAS build problems and tidied up directories used * More robust checks in Pi build script --- Commit8ef722253
authored by Sanjoy Das<sanjoy@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Remove a redundant setName. The EmitComputation should have emitted a function with the right name, so use a CHECK instead. PiperOrigin-RevId: 169764856 --- Commit1b94147dc
authored by Neal Wu<wun@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Fix broken GitHub links in tensorflow and tensorflow_models resulting from The Great Models Move (a.k.a. the research subfolder) PiperOrigin-RevId: 169763373 --- Commitb1ada5f0c
authored by Justine Tunney<jart@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Fix TensorBoard python -m invoke in docs PiperOrigin-RevId: 169758752 --- Commit2957cd894
authored by Mustafa Ispir<ispir@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: Local run option of estimator training. PiperOrigin-RevId: 169756384 --- Commit1dc2fe7ac
authored by Gunhan Gulsoy<gunan@google.com> Committed by TensorFlower Gardener<gardener@tensorflow.org>: BEGIN_PUBLIC Automated g4 rollback of changelist 166264198 PiperOrigin-RevId: 169998124
This commit is contained in:
parent
df22044be9
commit
e2e3a943c0
README.mdconfigure.py
tensorflow
BUILD
cc/framework
compiler
tf2xla/kernels
xla/service/gpu
contrib
BUILD
android/java/org/tensorflow/contrib/android
boosted_trees/lib/trees
cmake
crf
fused_conv/kernels
gdr
labeled_tensor/python/ops
layers
learn
makefile
mpi_collectives
BUILDREADME.md__init__.pympi_allgather_test.pympi_allreduce_test.pympi_message.protompi_ops.ccmpi_ops.pympi_ops_test.pyring.ccring.cu.ccring.h
pi_examples
rnn/python/ops
s3
tpu/ops
core
framework
grappler
kernels
BUILDconv_grad_filter_ops.ccconv_grad_input_ops.ccconv_grad_ops_3d.ccconv_ops.ccconv_ops_3d.cceigen_attention.hfused_batch_norm_op.ccfused_batch_norm_op.cu.ccfused_batch_norm_op.hmkl_aggregate_ops.ccpooling_ops_3d_sycl.h
lib/jpeg
ops
platform
public
docs_src
api_guides/python
extend
get_started/mnist
install
examples
go
java/src
14
README.md
14
README.md
@ -40,16 +40,16 @@ People who are a little more adventurous can also try our nightly binaries:
|
||||
* We are pleased to announce that TensorFlow now offers nightly pip packages
|
||||
under the [tf-nightly](https://pypi.python.org/pypi/tf-nightly) project on pypi.
|
||||
Simply run `pip install tf-nightly` in a clean environment to install the nightly
|
||||
tensorflow build. We currently only support CPU-only packages on Linux and Mac.
|
||||
GPU packages on all platforms and Windows CPU-only packages will arrive soon!
|
||||
tensorflow build. We currently only support CPU packages on Linux, Mac, and Windows.
|
||||
GPU packages on all platforms will arrive soon!
|
||||
|
||||
|
||||
**Individual whl files**
|
||||
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-python35-linux-cpu/))
|
||||
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.4.0dev-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.4.0dev-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow_gpu-1.4.0dev-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-linux-gpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
|
||||
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tensorflow-1.4.0dev-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-matrix-cpu/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
|
||||
* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.4.0dev-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow-1.4.0dev-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows,PY=36/))
|
||||
* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.4.0dev-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tensorflow_gpu-1.4.0dev-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-win/M=windows-gpu,PY=36/))
|
||||
* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/))
|
||||
* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/))
|
||||
* Mac CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/))
|
||||
* Windows CPU-only: [Python 3.5 64-bit](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/))
|
||||
* Windows GPU: Coming soon!
|
||||
* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/)
|
||||
([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/))
|
||||
|
||||
|
@ -251,7 +251,7 @@ def reset_tf_configure_bazelrc():
|
||||
if not os.path.exists('.bazelrc'):
|
||||
if os.path.exists(os.path.join(home, '.bazelrc')):
|
||||
with open('.bazelrc', 'a') as f:
|
||||
f.write('import %s/.bazelrc\n' % home)
|
||||
f.write('import %s/.bazelrc\n' % home.replace('\\', '/'))
|
||||
else:
|
||||
open('.bazelrc', 'w').close()
|
||||
|
||||
|
@ -381,6 +381,7 @@ filegroup(
|
||||
"//tensorflow/contrib/losses:all_files",
|
||||
"//tensorflow/contrib/meta_graph_transform:all_files",
|
||||
"//tensorflow/contrib/metrics:all_files",
|
||||
"//tensorflow/contrib/mpi_collectives:all_files",
|
||||
"//tensorflow/contrib/ndlstm:all_files",
|
||||
"//tensorflow/contrib/nearest_neighbor:all_files",
|
||||
"//tensorflow/contrib/nn:all_files",
|
||||
@ -391,6 +392,7 @@ filegroup(
|
||||
"//tensorflow/contrib/remote_fused_graph/pylib:all_files",
|
||||
"//tensorflow/contrib/resampler:all_files",
|
||||
"//tensorflow/contrib/rnn:all_files",
|
||||
"//tensorflow/contrib/s3:all_files",
|
||||
"//tensorflow/contrib/saved_model:all_files",
|
||||
"//tensorflow/contrib/saved_model/cc/saved_model:all_files",
|
||||
"//tensorflow/contrib/seq2seq:all_files",
|
||||
|
@ -175,8 +175,14 @@ Status SymbolicGradientBuilder::Initialize() {
|
||||
"Must specify a gradient input for each output.");
|
||||
}
|
||||
std::vector<bool> reachable_nodes = GetReachableNodes();
|
||||
// TODO(theflofly) Check that inputs_ are reachable from
|
||||
// outputs_ using reachable_nodes
|
||||
for (const Output& input : inputs_) {
|
||||
if (!reachable_nodes[input.node()->id()]) {
|
||||
return errors::InvalidArgument(
|
||||
"Cannot compute the partial derivative for node '",
|
||||
input.node()->name(),
|
||||
"' as it's unreachable from the output node(s).");
|
||||
}
|
||||
}
|
||||
grad_outputs_->clear();
|
||||
grad_outputs_->resize(inputs_.size());
|
||||
// Populate `output_nodes_` from node ids in `outputs_`.
|
||||
|
@ -48,9 +48,9 @@ class GradientsTest : public ::testing::Test {
|
||||
Scope scope_test_;
|
||||
};
|
||||
|
||||
// EX.
|
||||
// Example:
|
||||
// ^ ^
|
||||
// dy| dx| // MatMul Gradient Graph
|
||||
// dy| dx| (MatMul Gradient Graph)
|
||||
// | |
|
||||
// MatMul_1 MatMul_2
|
||||
// ^ ^ ^ ^
|
||||
@ -61,7 +61,7 @@ class GradientsTest : public ::testing::Test {
|
||||
// | Const_3 |
|
||||
// | |
|
||||
// | ^ |
|
||||
// | z| | // MatMul Forward Graph
|
||||
// | z| | (MatMul Forward Graph)
|
||||
// | | |
|
||||
// | MatMul_0 |
|
||||
// | / \ |
|
||||
@ -373,24 +373,22 @@ TEST_F(GradientsTest, UnreachableEdgeGradOneOutput) {
|
||||
auto y_const = Const(scope_test_, {{1.0}, {2.0}, {3.0}});
|
||||
auto y_assign = Assign(scope_test_, y, y_const);
|
||||
|
||||
auto m1 = MatMul(scope_test_, x, y);
|
||||
auto m = MatMul(scope_test_, x, y);
|
||||
|
||||
auto z = Variable(scope_test_, {1, 3}, DT_DOUBLE);
|
||||
auto z_const = Const(scope_test_, {{9.0, 10.0, 11.0}});
|
||||
auto z_assign = Assign(scope_test_, z, z_const);
|
||||
|
||||
auto m2 = MatMul(scope_test_, y, z);
|
||||
|
||||
auto dm1 = Const(scope_test_, {{0.5}, {0.5}});
|
||||
auto diff_m = Const(scope_test_, {{0.5}, {0.5}});
|
||||
|
||||
std::vector<Output> grad_outputs;
|
||||
TF_ASSERT_OK(
|
||||
AddSymbolicGradients(scope_test_, {m1}, {y}, {dm1}, &grad_outputs));
|
||||
AddSymbolicGradients(scope_test_, {m}, {y}, {diff_m}, &grad_outputs));
|
||||
|
||||
std::vector<Tensor> outputs;
|
||||
test::GetTensors(scope_test_, {x_assign, y_assign, z_assign},
|
||||
{grad_outputs[0]}, &outputs);
|
||||
// dz/dy = xT * dm1
|
||||
// dz/dy = xT * diff_m
|
||||
test::ExpectTensorNear<double>(
|
||||
outputs[0], test::AsTensor<double>({2.5, 3.5, 4.5}, {3, 1}), 1e-5);
|
||||
}
|
||||
@ -424,13 +422,37 @@ TEST_F(GradientsTest, UnreachableEdgeGradTwoOutputs) {
|
||||
test::GetTensors(scope_test_, {x_assign, y_assign, z_assign},
|
||||
{grad_outputs[0]}, &outputs);
|
||||
|
||||
// the gradients from m1 and m2 will be summed to compute the gradient
|
||||
// w.r.t y
|
||||
// The gradients from m1 and m2 will be summed to compute the gradient
|
||||
// w.r.t y:
|
||||
// dz/dy = xT * dm1 + dm2 * zT
|
||||
test::ExpectTensorNear<double>(
|
||||
outputs[0], test::AsTensor<double>({17.5, 24.7, 26.8}, {3, 1}), 1e-5);
|
||||
}
|
||||
|
||||
TEST_F(GradientsTest, UnreachableInput) {
|
||||
auto x = Const(scope_test_, {{1.0, 2.0, 3.0}, {4.0, 5.0, 6.0}});
|
||||
auto y = Const(scope_test_, {{1.0}, {2.0}, {3.0}});
|
||||
auto z = Const(scope_test_.WithOpName("z"), {{9.0, 10.0, 11.0}});
|
||||
|
||||
auto m1 = MatMul(scope_test_, x, y);
|
||||
auto m2 = MatMul(scope_test_, y, z);
|
||||
auto dm1 = Const(scope_test_, {{0.5}, {0.5}});
|
||||
|
||||
// From m1, z is unreachable, so an error status should be returned.
|
||||
// m2 m1
|
||||
// | |
|
||||
// * *
|
||||
// / \ / \
|
||||
// z y x
|
||||
std::vector<Output> grad_outputs;
|
||||
Status status =
|
||||
AddSymbolicGradients(scope_test_, {m1}, {z}, {dm1}, &grad_outputs);
|
||||
EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
|
||||
EXPECT_EQ(status.error_message(),
|
||||
"Cannot compute the partial derivative"
|
||||
" for node 'z' as it's unreachable from the output node(s).");
|
||||
}
|
||||
|
||||
// StopGradientSingleOutputMultiEdgeTest tests combinations of valid and
|
||||
// 'NoGradient' (induced by StopGradient op) returned along multiple edges from
|
||||
// a single nodes output.
|
||||
|
@ -85,6 +85,10 @@ class FusedBatchNormGradOp : public XlaOpKernel {
|
||||
string data_format;
|
||||
OP_REQUIRES_OK(ctx, ctx->GetAttr("epsilon", &epsilon_));
|
||||
OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
|
||||
bool is_training;
|
||||
OP_REQUIRES_OK(ctx, ctx->GetAttr("is_training", &is_training));
|
||||
CHECK(is_training) << "FusedBatchNormGradOp with is_training=False cannot "
|
||||
"be used with XLA for now!";
|
||||
TensorFormat tensor_format;
|
||||
if (ctx->GetAttr("data_format", &data_format).ok()) {
|
||||
OP_REQUIRES(ctx, FormatFromString(data_format, &tensor_format),
|
||||
|
@ -33,6 +33,7 @@ using se::dnn::ConvolutionDescriptor;
|
||||
using se::dnn::DataLayout;
|
||||
using se::dnn::FilterDescriptor;
|
||||
using se::dnn::FilterLayout;
|
||||
using se::dnn::AlgorithmDesc;
|
||||
|
||||
ConvolveScratchAllocator::ConvolveScratchAllocator(
|
||||
int device_ordinal, DeviceMemoryAllocator* memory_allocator)
|
||||
@ -251,12 +252,13 @@ tensorflow::Status ConvolutionThunk::Convolve(
|
||||
"Unable to launch convolution for thunk %p with type %s and algorithm "
|
||||
"(%lld, %lld)",
|
||||
this, ConvolutionKindToString(convolution_kind_).c_str(),
|
||||
algorithm_config.algorithm(), algorithm_config.algorithm_no_scratch());
|
||||
algorithm_config.algorithm().algo_id(),
|
||||
algorithm_config.algorithm_no_scratch().algo_id());
|
||||
}
|
||||
|
||||
std::vector<se::dnn::AlgorithmType> ConvolutionThunk::GetAlgorithms(
|
||||
std::vector<AlgorithmDesc::Index> ConvolutionThunk::GetAlgorithms(
|
||||
se::StreamExecutor* stream_exec) const {
|
||||
std::vector<se::dnn::AlgorithmType> algorithms;
|
||||
std::vector<AlgorithmDesc::Index> algorithms;
|
||||
// TODO(yangzihao): Currently disable the use of winograd nonfused in XLA
|
||||
// by default. Should send in conv parameters and enable it when
|
||||
// ShouldIncludeWinogradNonfusedAlgo() returns true.
|
||||
@ -286,7 +288,7 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
|
||||
const ConvolutionDescriptor& convolution_descriptor,
|
||||
const BufferAllocations& buffer_allocations, se::Stream* stream) {
|
||||
// TODO(b/29126320): Try cudnn v5's new auto-tuner when it's rolled out.
|
||||
if (best_algorithm_.algorithm() == se::dnn::kDefaultAlgorithm) {
|
||||
if (best_algorithm_.algorithm().is_default()) {
|
||||
// Auto-tuning either is disabled or only happens in the first run of this
|
||||
// function.
|
||||
VLOG(2) << "Profiling for best convolution algorithm used for "
|
||||
@ -295,26 +297,32 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
|
||||
|
||||
se::dnn::ProfileResult best_result;
|
||||
se::dnn::ProfileResult best_result_without_scratch;
|
||||
for (se::dnn::AlgorithmType algorithm : GetAlgorithms(stream->parent())) {
|
||||
ConvolveScratchAllocator scratch_allocator(
|
||||
buffer_allocations.device_ordinal(),
|
||||
buffer_allocations.memory_allocator());
|
||||
se::dnn::ProfileResult profile_result;
|
||||
bool launch_ok =
|
||||
Convolve(input_descriptor, input_data, filter_descriptor, filter_data,
|
||||
output_descriptor, output_data, convolution_descriptor,
|
||||
se::dnn::AlgorithmConfig(algorithm, algorithm), stream,
|
||||
&scratch_allocator, &profile_result)
|
||||
.ok();
|
||||
if (launch_ok && profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalAllocatedBytes() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_without_scratch.elapsed_time_in_ms()) {
|
||||
best_result_without_scratch = profile_result;
|
||||
std::vector<AlgorithmDesc::Index> algorithms =
|
||||
GetAlgorithms(stream->parent());
|
||||
for (bool use_tensor_ops : {false, true}) {
|
||||
for (auto algo_index : algorithms) {
|
||||
AlgorithmDesc algorithm(algo_index, use_tensor_ops);
|
||||
ConvolveScratchAllocator scratch_allocator(
|
||||
buffer_allocations.device_ordinal(),
|
||||
buffer_allocations.memory_allocator());
|
||||
se::dnn::ProfileResult profile_result;
|
||||
bool launch_ok =
|
||||
Convolve(input_descriptor, input_data, filter_descriptor,
|
||||
filter_data, output_descriptor, output_data,
|
||||
convolution_descriptor,
|
||||
se::dnn::AlgorithmConfig(algorithm, algorithm), stream,
|
||||
&scratch_allocator, &profile_result)
|
||||
.ok();
|
||||
if (launch_ok && profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalAllocatedBytes() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_without_scratch.elapsed_time_in_ms()) {
|
||||
best_result_without_scratch = profile_result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -324,7 +332,7 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
|
||||
} else {
|
||||
LOG(ERROR) << "No convolution algorithm works with profiling. Fall back "
|
||||
"to the default algorithm.";
|
||||
best_algorithm_.set_algorithm(se::dnn::kDefaultAlgorithm);
|
||||
best_algorithm_.set_algorithm(AlgorithmDesc());
|
||||
}
|
||||
|
||||
if (best_result_without_scratch.is_valid()) {
|
||||
@ -334,13 +342,14 @@ tensorflow::Status ConvolutionThunk::ConvolveWithTune(
|
||||
LOG(ERROR) << "No convolution algorithm without scratch works with "
|
||||
"profiling. Fall back "
|
||||
"to the default algorithm.";
|
||||
best_algorithm_.set_algorithm_no_scratch(se::dnn::kDefaultAlgorithm);
|
||||
best_algorithm_.set_algorithm_no_scratch(AlgorithmDesc());
|
||||
}
|
||||
}
|
||||
|
||||
{
|
||||
VLOG(2) << "Using convolution algorithm (" << best_algorithm_.algorithm()
|
||||
<< ", " << best_algorithm_.algorithm_no_scratch()
|
||||
VLOG(2) << "Using convolution algorithm ("
|
||||
<< best_algorithm_.algorithm().algo_id() << ", "
|
||||
<< best_algorithm_.algorithm_no_scratch().algo_id()
|
||||
<< ") for ConvolutionThunk: " << this;
|
||||
ConvolveScratchAllocator scratch_allocator(
|
||||
buffer_allocations.device_ordinal(),
|
||||
|
@ -115,7 +115,9 @@ class ConvolutionThunk : public Thunk {
|
||||
perftools::gputools::dnn::ProfileResult* profile_result);
|
||||
|
||||
// Returns the convolve algorithms that can be used for this ConvolutionThunk.
|
||||
std::vector<perftools::gputools::dnn::AlgorithmType> GetAlgorithms(
|
||||
// TODO(nluehr) GetAlgorithms should return AlgorithmDesc including both
|
||||
// tensor-op and non-tensor-op variants.
|
||||
std::vector<perftools::gputools::dnn::AlgorithmDesc::Index> GetAlgorithms(
|
||||
perftools::gputools::StreamExecutor* stream_exec) const;
|
||||
|
||||
// Fastest cuDNN convolution algorithm for this thunk learned from
|
||||
|
@ -544,9 +544,9 @@ bool AreShapesForTranspose021(const Shape& a, const Shape& b) {
|
||||
|
||||
// Emits a tiled 0-2-1 transpose, assuming both input and output lain out from
|
||||
// major to minor. The x- and y- dimensions are tiled in square tiles of edge
|
||||
// length `tile_size`. Each thread block of `tile_size` threads transposes one
|
||||
// tile: each thread copies a row from the input to a shared memory tile, then
|
||||
// copies a column from the shared memory tile to the output.
|
||||
// length `tile_size`. Each thread block of `tile_size` x `num_rows` threads
|
||||
// transposes one tile: each thread copies a row from the input to a shared
|
||||
// memory tile, then copies a column from the shared memory tile to the output.
|
||||
//
|
||||
// `tile_size` should usually be same as warp size.
|
||||
//
|
||||
@ -557,9 +557,10 @@ bool AreShapesForTranspose021(const Shape& a, const Shape& b) {
|
||||
// in any case, the number of blocks we can launch is limited.
|
||||
//
|
||||
// This is the same algorithm in CUDA:
|
||||
// https://github.com/tensorflow/tensorflow/blob/6172351b81af76d0b819fea6bb478cbd4016d6c2/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc#L183
|
||||
// https://github.com/tensorflow/tensorflow/blob/d2693c8a70567cc78b2e8a9ac8020d321620ca83/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc#L189
|
||||
int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
|
||||
const int64 tile_size, llvm::IRBuilder<>* builder) {
|
||||
const int64 tile_size, const int64 num_rows,
|
||||
llvm::IRBuilder<>* builder) {
|
||||
// Adds `addend` to the given `dim` of `index`.
|
||||
auto offset_dim = [builder](llvm_ir::IrArray::Index index,
|
||||
llvm::Value* addend, int64 dim) {
|
||||
@ -590,18 +591,29 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
|
||||
// let x = threadIdx.x
|
||||
llvm::Value* x = llvm_ir::EmitCallToIntrinsic(
|
||||
llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, builder);
|
||||
llvm_ir::AddRangeMetadata(0, tile_size, static_cast<llvm::Instruction*>(x));
|
||||
llvm_ir::AddRangeMetadata(0, num_rows * tile_size,
|
||||
static_cast<llvm::Instruction*>(x));
|
||||
x = builder->CreateIntCast(x, builder->getInt64Ty(), /*isSigned=*/true,
|
||||
"thread.id.x");
|
||||
|
||||
// computing logical thread ids
|
||||
// logical_x = x % tile_size
|
||||
auto logical_x = builder->CreateURem(x, builder->getInt64(tile_size));
|
||||
|
||||
// logical_y = x / tile_size
|
||||
auto logical_y = builder->CreateUDiv(x, builder->getInt64(tile_size));
|
||||
|
||||
// `emit_cp` emits equivalent to following pseudocode:
|
||||
// if (tile_size == tile_width && tile_size == tile_height) {
|
||||
// unroll for (y in 0..tile_size) {
|
||||
// emit_cp_element(index + {0, y, 0}, y);
|
||||
// unroll for (i in range(0, tile_size, num_rows)) {
|
||||
// emit_cp_element(index + {0, i, 0}, y + logical_y);
|
||||
// }
|
||||
// } else if (x < tile_width) {
|
||||
// for (y in 0..tile_height) {
|
||||
// emit_cp_element(index + {0, y, 0}, y);
|
||||
// tile_height_upperbound = ceil(tile_height / num_rows) * num_rows;
|
||||
// for (i in range(0, tile_height_upperbound, num_rows)) {
|
||||
// y_loc = i + logical_y;
|
||||
// if (y_loc < tile_height)
|
||||
// emit_cp_element(index + {0, i, 0}, y_loc);
|
||||
// }
|
||||
// }
|
||||
//
|
||||
@ -615,32 +627,50 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
|
||||
// tile, whether which is row or column is a function of whether we're copying
|
||||
// from input or to output, and `index` is the index into the input or output
|
||||
// array.
|
||||
auto emit_cp_tile = [builder, tile_size, x, &offset_dim](
|
||||
std::function<void(const llvm_ir::IrArray::Index&, llvm::Value*)>
|
||||
emit_cp_element,
|
||||
llvm::Value* tile_width, llvm::Value* tile_height,
|
||||
const llvm_ir::IrArray::Index& index, const string& loop_name) {
|
||||
auto emit_cp_tile = [builder, tile_size, &offset_dim, num_rows, logical_x,
|
||||
logical_y](
|
||||
std::function<void(const llvm_ir::IrArray::Index&,
|
||||
llvm::Value*)>
|
||||
emit_cp_element,
|
||||
llvm::Value* tile_width, llvm::Value* tile_height,
|
||||
const llvm_ir::IrArray::Index& index,
|
||||
const string& loop_name) {
|
||||
llvm_ir::LlvmIfData if_not_last_row = llvm_ir::EmitIfThenElse(
|
||||
builder->CreateAnd(
|
||||
builder->CreateICmpEQ(builder->getInt64(tile_size), tile_width),
|
||||
builder->CreateICmpEQ(builder->getInt64(tile_size), tile_height)),
|
||||
"not_last_row", builder);
|
||||
builder->SetInsertPoint(if_not_last_row.true_block->getTerminator());
|
||||
for (int64 i = 0; i < tile_size; ++i) {
|
||||
emit_cp_element(offset_dim(index, builder->getInt64(i), /*dim=*/1),
|
||||
builder->getInt64(i));
|
||||
for (int64 i = 0; i < tile_size; i += num_rows) {
|
||||
auto source_idx = offset_dim(index, builder->getInt64(i), /*dim=*/1);
|
||||
auto y_loc = builder->CreateAdd(builder->getInt64(i), logical_y);
|
||||
emit_cp_element(source_idx, y_loc);
|
||||
}
|
||||
builder->SetInsertPoint(if_not_last_row.false_block->getTerminator());
|
||||
llvm_ir::LlvmIfData if_in_tile = llvm_ir::EmitIfThenElse(
|
||||
builder->CreateICmpULT(x, tile_width), "in_tile", builder);
|
||||
builder->CreateICmpULT(logical_x, tile_width), "x_in_tile", builder);
|
||||
builder->SetInsertPoint(if_in_tile.true_block->getTerminator());
|
||||
auto loop = llvm_ir::ForLoop::EmitForLoop(loop_name, builder->getInt64(0),
|
||||
tile_height, builder->getInt64(1),
|
||||
builder);
|
||||
|
||||
// tile_height_upper_bound = ceil(tile_height / num_rows) * num_rows
|
||||
auto tile_height_upper_bound = builder->CreateMul(
|
||||
builder->CreateUDiv(
|
||||
builder->CreateAdd(tile_height, builder->getInt64(num_rows - 1)),
|
||||
builder->getInt64(num_rows)),
|
||||
builder->getInt64(num_rows));
|
||||
|
||||
auto loop = llvm_ir::ForLoop::EmitForLoop(
|
||||
loop_name, builder->getInt64(0), tile_height_upper_bound,
|
||||
builder->getInt64(num_rows), builder);
|
||||
llvm_ir::SetToFirstInsertPoint(loop->GetHeaderBasicBlock(), builder);
|
||||
builder->SetInsertPoint(loop->GetBodyBasicBlock()->getTerminator());
|
||||
|
||||
auto y_loc = builder->CreateAdd(loop->GetIndVarValue(), logical_y);
|
||||
auto if_y_in_tile = llvm_ir::EmitIfThenElse(
|
||||
builder->CreateICmpULT(y_loc, tile_height), "y_in_tile", builder);
|
||||
builder->SetInsertPoint(if_y_in_tile.true_block->getTerminator());
|
||||
|
||||
emit_cp_element(offset_dim(index, loop->GetIndVarValue(), /*dim=*/1),
|
||||
loop->GetIndVarValue());
|
||||
y_loc);
|
||||
builder->SetInsertPoint(if_not_last_row.after_block->getTerminator());
|
||||
};
|
||||
|
||||
@ -673,7 +703,8 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
|
||||
index;
|
||||
});
|
||||
const llvm_ir::IrArray::Index input_index =
|
||||
offset_dim(input_tile_origin, x, /*dim=*/2);
|
||||
offset_dim(offset_dim(input_tile_origin, logical_x, /*dim=*/2), logical_y,
|
||||
/*dim=*/1);
|
||||
std::vector<llvm::Value*> tile_dims(input_shape.dimensions().size());
|
||||
// Only last row or column may not have full size.
|
||||
for (int i = 1; i < 3; ++i) {
|
||||
@ -688,11 +719,11 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
|
||||
// Load data from input memory to shared memory tile.
|
||||
emit_cp_tile(
|
||||
// tile[y, x] = input_array[index]
|
||||
[builder, tile, x, &input](const llvm_ir::IrArray::Index& index,
|
||||
llvm::Value* y) {
|
||||
[builder, tile, &input, logical_x](const llvm_ir::IrArray::Index& index,
|
||||
llvm::Value* y) {
|
||||
builder->CreateStore(
|
||||
input.EmitReadArrayElement(index, builder, "input_element"),
|
||||
builder->CreateGEP(tile, {builder->getInt64(0), y, x}));
|
||||
builder->CreateGEP(tile, {builder->getInt64(0), y, logical_x}));
|
||||
},
|
||||
tile_dims[2], tile_dims[1], input_index, "input");
|
||||
|
||||
@ -706,17 +737,18 @@ int64 EmitTranspose021Tiled(llvm_ir::IrArray input, llvm_ir::IrArray output,
|
||||
const llvm_ir::IrArray::Index output_tile_origin(
|
||||
Permute({0, 2, 1}, input_tile_origin.multidim()));
|
||||
const llvm_ir::IrArray::Index output_index =
|
||||
offset_dim(output_tile_origin, x, /*dim=*/2);
|
||||
offset_dim(offset_dim(output_tile_origin, logical_x, /*dim=*/2),
|
||||
logical_y, /*dim=*/1);
|
||||
|
||||
// Store data from shared memory tile to output memory.
|
||||
emit_cp_tile(
|
||||
// output_array[index] = tile[x, y]
|
||||
[builder, tile, x, &output](const llvm_ir::IrArray::Index& index,
|
||||
llvm::Value* y) {
|
||||
[builder, tile, &output, logical_x](const llvm_ir::IrArray::Index& index,
|
||||
llvm::Value* y) {
|
||||
output.EmitWriteArrayElement(
|
||||
index,
|
||||
builder->CreateLoad(
|
||||
builder->CreateGEP(tile, {builder->getInt64(0), x, y}),
|
||||
builder->CreateGEP(tile, {builder->getInt64(0), logical_x, y}),
|
||||
"output_element"),
|
||||
builder);
|
||||
},
|
||||
@ -742,13 +774,14 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
|
||||
thunk_sequence_->emplace_back(BuildKernelThunk(copy));
|
||||
VLOG(3) << "Emitting tiled 0-2-1 transposition";
|
||||
constexpr int64 tile_size = 32;
|
||||
constexpr int64 num_rows = 8;
|
||||
int64 num_tiles = EmitTranspose021Tiled(
|
||||
GetIrArray(*(copy->operand(0)))
|
||||
.CastToShape(reduced_input_shape, &ir_builder_),
|
||||
GetIrArray(*copy).CastToShape(reduced_output_shape, &ir_builder_),
|
||||
tile_size, &ir_builder_);
|
||||
UpdateLaunchDimensions(LaunchDimensions(num_tiles, tile_size), LastThunk(),
|
||||
ir_emitter_context_->llvm_module());
|
||||
tile_size, num_rows, &ir_builder_);
|
||||
UpdateLaunchDimensions(LaunchDimensions(num_tiles, num_rows * tile_size),
|
||||
LastThunk(), ir_emitter_context_->llvm_module());
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -5,6 +5,8 @@ licenses(["notice"]) # Apache 2.0
|
||||
|
||||
package(default_visibility = ["//tensorflow:__subpackages__"])
|
||||
|
||||
load("//third_party/mpi:mpi.bzl", "if_mpi")
|
||||
|
||||
py_library(
|
||||
name = "contrib_py",
|
||||
srcs = glob(["**/*.py"]),
|
||||
@ -85,7 +87,7 @@ py_library(
|
||||
"//tensorflow/contrib/tpu",
|
||||
"//tensorflow/contrib/training:training_py",
|
||||
"//tensorflow/contrib/util:util_py",
|
||||
],
|
||||
] + if_mpi(["//tensorflow/contrib/mpi_collectives:mpi_ops_py"]),
|
||||
)
|
||||
|
||||
cc_library(
|
||||
|
@ -159,6 +159,22 @@ public class TensorFlowInferenceInterface {
|
||||
throw new RuntimeException("Failed to load model from the input stream", e);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Construct a TensorFlowInferenceInterface with provided Graph
|
||||
*
|
||||
* @param g The Graph to use to construct this interface.
|
||||
*/
|
||||
public TensorFlowInferenceInterface(Graph g) {
|
||||
prepareNativeRuntime();
|
||||
|
||||
// modelName is redundant here, here is for
|
||||
// avoiding error in initialization as modelName is marked final.
|
||||
this.modelName = "";
|
||||
this.g = g;
|
||||
this.sess = new Session(g);
|
||||
this.runner = sess.runner();
|
||||
}
|
||||
|
||||
/**
|
||||
* Runs inference between the previously registered input nodes (via feed*) and the requested
|
||||
|
@ -15,6 +15,8 @@
|
||||
#include "tensorflow/contrib/boosted_trees/lib/trees/decision_tree.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
|
||||
#include <algorithm>
|
||||
|
||||
namespace tensorflow {
|
||||
namespace boosted_trees {
|
||||
namespace trees {
|
||||
|
@ -52,6 +52,7 @@ else()
|
||||
CONFIGURE_COMMAND
|
||||
${farmhash_BUILD}/configure
|
||||
--prefix=${farmhash_INSTALL}
|
||||
--libdir=${farmhash_INSTALL}/lib
|
||||
--enable-shared=yes
|
||||
CXXFLAGS=-fPIC)
|
||||
|
||||
|
1
tensorflow/contrib/cmake/external/gif.cmake
vendored
1
tensorflow/contrib/cmake/external/gif.cmake
vendored
@ -66,6 +66,7 @@ else()
|
||||
${CMAKE_CURRENT_BINARY_DIR}/gif/src/gif/configure
|
||||
--with-pic
|
||||
--prefix=${gif_INSTALL}
|
||||
--libdir=${gif_INSTALL}/lib
|
||||
--enable-shared=yes
|
||||
)
|
||||
|
||||
|
4
tensorflow/contrib/cmake/external/grpc.cmake
vendored
4
tensorflow/contrib/cmake/external/grpc.cmake
vendored
@ -28,7 +28,8 @@ else()
|
||||
set(grpc_STATIC_LIBRARIES
|
||||
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
|
||||
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
|
||||
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
|
||||
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a
|
||||
${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/libcares.a)
|
||||
endif()
|
||||
|
||||
ExternalProject_Add(grpc
|
||||
@ -42,6 +43,7 @@ ExternalProject_Add(grpc
|
||||
# on "grpc" from the "grpc++_unsecure" rule.
|
||||
PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/grpc/CMakeLists.txt ${GRPC_BUILD}
|
||||
BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc++_unsecure
|
||||
COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc_cpp_plugin
|
||||
INSTALL_COMMAND ""
|
||||
CMAKE_CACHE_ARGS
|
||||
-DCMAKE_BUILD_TYPE:STRING=Release
|
||||
|
1
tensorflow/contrib/cmake/external/jpeg.cmake
vendored
1
tensorflow/contrib/cmake/external/jpeg.cmake
vendored
@ -74,6 +74,7 @@ else()
|
||||
CONFIGURE_COMMAND
|
||||
${jpeg_BUILD}/configure
|
||||
--prefix=${jpeg_INSTALL}
|
||||
--libdir=${jpeg_INSTALL}/lib
|
||||
--enable-shared=yes
|
||||
CFLAGS=-fPIC
|
||||
)
|
||||
|
13
tensorflow/contrib/cmake/external/sqlite.cmake
vendored
13
tensorflow/contrib/cmake/external/sqlite.cmake
vendored
@ -23,7 +23,7 @@ set(sqlite_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/sqlite/install)
|
||||
if(WIN32)
|
||||
set(sqlite_STATIC_LIBRARIES ${sqlite_INSTALL}/lib/sqlite.lib)
|
||||
else()
|
||||
set(sqlite_STATIC_LIBRARIES ${sqlite_INSTALL}/lib/sqlite.a)
|
||||
set(sqlite_STATIC_LIBRARIES ${sqlite_INSTALL}/lib/libsqlite.a)
|
||||
endif()
|
||||
|
||||
set(sqlite_HEADERS
|
||||
@ -49,11 +49,14 @@ else()
|
||||
PREFIX sqlite
|
||||
URL ${sqlite_URL}
|
||||
URL_HASH ${sqlite_HASH}
|
||||
PATCH_COMMAND ${CMAKE_COMMAND} -E copy_if_different ${CMAKE_CURRENT_SOURCE_DIR}/patches/sqlite/CMakeLists.txt ${sqlite_BUILD}
|
||||
INSTALL_DIR ${sqlite_INSTALL}
|
||||
DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
|
||||
BUILD_COMMAND $(MAKE)
|
||||
INSTALL_COMMAND $(MAKE) install
|
||||
CFLAGS=-fPIC
|
||||
CMAKE_CACHE_ARGS
|
||||
-DCMAKE_BUILD_TYPE:STRING=Release
|
||||
-DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
|
||||
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
|
||||
-DCMAKE_INSTALL_PREFIX:STRING=${sqlite_INSTALL}
|
||||
)
|
||||
|
||||
endif()
|
||||
@ -69,4 +72,4 @@ add_custom_target(sqlite_copy_headers_to_destination
|
||||
foreach(header_file ${sqlite_HEADERS})
|
||||
add_custom_command(TARGET sqlite_copy_headers_to_destination PRE_BUILD
|
||||
COMMAND ${CMAKE_COMMAND} -E copy_if_different ${header_file} ${sqlite_INCLUDE_DIR})
|
||||
endforeach()
|
||||
endforeach()
|
||||
|
@ -49,6 +49,44 @@ function(RELATIVE_PROTOBUF_GENERATE_CPP SRCS HDRS ROOT_DIR)
|
||||
set(${HDRS} ${${HDRS}} PARENT_SCOPE)
|
||||
endfunction()
|
||||
|
||||
if(NOT WIN32)
|
||||
function(RELATIVE_PROTOBUF_GENERATE_GRPC_CPP SRCS HDRS ROOT_DIR)
|
||||
if(NOT ARGN)
|
||||
message(SEND_ERROR "Error: RELATIVE_PROTOBUF_GENERATE_GRPC_CPP() called without any proto files")
|
||||
return()
|
||||
endif()
|
||||
|
||||
set(${SRCS})
|
||||
set(${HDRS})
|
||||
foreach(FIL ${ARGN})
|
||||
set(ABS_FIL ${ROOT_DIR}/${FIL})
|
||||
get_filename_component(FIL_WE ${FIL} NAME_WE)
|
||||
get_filename_component(FIL_DIR ${ABS_FIL} PATH)
|
||||
file(RELATIVE_PATH REL_DIR ${ROOT_DIR} ${FIL_DIR})
|
||||
|
||||
list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.grpc.pb.cc")
|
||||
list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.grpc.pb.h")
|
||||
list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.pb.cc")
|
||||
list(APPEND ${HDRS} "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.pb.h")
|
||||
|
||||
add_custom_command(
|
||||
OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.grpc.pb.cc"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.grpc.pb.h"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.pb.cc"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/${REL_DIR}/${FIL_WE}.pb.h"
|
||||
COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
|
||||
ARGS --grpc_out ${CMAKE_CURRENT_BINARY_DIR} --cpp_out ${CMAKE_CURRENT_BINARY_DIR} --plugin protoc-gen-grpc=${GRPC_BUILD}/grpc_cpp_plugin -I ${ROOT_DIR} ${ABS_FIL} -I ${PROTOBUF_INCLUDE_DIRS}
|
||||
DEPENDS ${ABS_FIL} protobuf grpc
|
||||
COMMENT "Running C++ protocol buffer grpc compiler on ${FIL}"
|
||||
VERBATIM )
|
||||
endforeach()
|
||||
|
||||
set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
|
||||
set(${SRCS} ${${SRCS}} PARENT_SCOPE)
|
||||
set(${HDRS} ${${HDRS}} PARENT_SCOPE)
|
||||
endfunction()
|
||||
endif()
|
||||
|
||||
function(RELATIVE_PROTOBUF_TEXT_GENERATE_CPP SRCS HDRS ROOT_DIR)
|
||||
if(NOT ARGN)
|
||||
message(SEND_ERROR "Error: RELATIVE_PROTOBUF_TEXT_GENERATE_CPP() called without any proto files")
|
||||
@ -93,6 +131,7 @@ RELATIVE_PROTOBUF_GENERATE_CPP(PROTO_SRCS PROTO_HDRS
|
||||
${tensorflow_source_dir} ${tf_protos_cc_srcs}
|
||||
)
|
||||
|
||||
|
||||
set(PROTO_TEXT_EXE "proto_text")
|
||||
set(tf_proto_text_srcs
|
||||
"tensorflow/core/example/example.proto"
|
||||
@ -133,7 +172,17 @@ RELATIVE_PROTOBUF_TEXT_GENERATE_CPP(PROTO_TEXT_SRCS PROTO_TEXT_HDRS
|
||||
${tensorflow_source_dir} ${tf_proto_text_srcs}
|
||||
)
|
||||
|
||||
add_library(tf_protos_cc ${PROTO_SRCS} ${PROTO_HDRS})
|
||||
if(WIN32)
|
||||
add_library(tf_protos_cc ${PROTO_SRCS} ${PROTO_HDRS})
|
||||
else()
|
||||
file(GLOB_RECURSE tf_protos_grpc_cc_srcs RELATIVE ${tensorflow_source_dir}
|
||||
"${tensorflow_source_dir}/tensorflow/core/debug/*.proto"
|
||||
)
|
||||
RELATIVE_PROTOBUF_GENERATE_GRPC_CPP(PROTO_GRPC_SRCS PROTO_GRPC_HDRS
|
||||
${tensorflow_source_dir} ${tf_protos_grpc_cc_srcs}
|
||||
)
|
||||
add_library(tf_protos_cc ${PROTO_GRPC_SRCS} ${PROTO_GRPC_HDRS} ${PROTO_SRCS} ${PROTO_HDRS})
|
||||
endif()
|
||||
|
||||
########################################################
|
||||
# tf_core_lib library
|
||||
|
@ -46,31 +46,25 @@ with tf.Graph().as_default():
|
||||
log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
|
||||
unary_scores, y_t, sequence_lengths_t)
|
||||
|
||||
# Compute the viterbi sequence and score.
|
||||
viterbi_sequence, viterbi_score = tf.contrib.crf.crf_decode(
|
||||
unary_scores, transition_params, sequence_lengths_t)
|
||||
|
||||
# Add a training op to tune the parameters.
|
||||
loss = tf.reduce_mean(-log_likelihood)
|
||||
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
|
||||
|
||||
# Train for a fixed number of iterations.
|
||||
session.run(tf.global_variables_initializer())
|
||||
|
||||
mask = (np.expand_dims(np.arange(num_words), axis=0) <
|
||||
np.expand_dims(sequence_lengths, axis=1))
|
||||
total_labels = np.sum(sequence_lengths)
|
||||
|
||||
# Train for a fixed number of iterations.
|
||||
for i in range(1000):
|
||||
tf_unary_scores, tf_transition_params, _ = session.run(
|
||||
[unary_scores, transition_params, train_op])
|
||||
tf_viterbi_sequence, _ = session.run([viterbi_sequence, train_op])
|
||||
if i % 100 == 0:
|
||||
correct_labels = 0
|
||||
total_labels = 0
|
||||
for tf_unary_scores_, y_, sequence_length_ in zip(tf_unary_scores, y,
|
||||
sequence_lengths):
|
||||
# Remove padding from the scores and tag sequence.
|
||||
tf_unary_scores_ = tf_unary_scores_[:sequence_length_]
|
||||
y_ = y_[:sequence_length_]
|
||||
|
||||
# Compute the highest scoring sequence.
|
||||
viterbi_sequence, _ = tf.contrib.crf.viterbi_decode(
|
||||
tf_unary_scores_, tf_transition_params)
|
||||
|
||||
# Evaluate word-level accuracy.
|
||||
correct_labels += np.sum(np.equal(viterbi_sequence, y_))
|
||||
total_labels += sequence_length_
|
||||
correct_labels = np.sum((y == tf_viterbi_sequence) * mask)
|
||||
accuracy = 100.0 * correct_labels / float(total_labels)
|
||||
print("Accuracy: %.2f%%" % accuracy)
|
||||
```
|
||||
|
@ -493,37 +493,42 @@ void LaunchFusedConv2DBiasActivationOp<GPUDevice, T, BiasType, ScaleType>::
|
||||
dnn::AlgorithmConfig algorithm_config;
|
||||
if (cudnn_use_autotune && !AutoTuneConvBiasActivation::GetInstance()->Find(
|
||||
fused_conv_parameters, &algorithm_config)) {
|
||||
std::vector<dnn::AlgorithmType> algorithms;
|
||||
std::vector<dnn::AlgorithmDesc::Index> algorithms;
|
||||
CHECK(stream->parent()->GetConvolveAlgorithms(
|
||||
fused_conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(),
|
||||
&algorithms));
|
||||
dnn::ProfileResult best_result;
|
||||
dnn::ProfileResult best_result_no_scratch;
|
||||
for (auto profile_algorithm : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
|
||||
dnn::ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenFusedConvolveWithAlgorithm(
|
||||
conv_input_desc, conv_input_ptr, conv_input_scale,
|
||||
filter_desc, filter_ptr, conv_desc, side_input_ptr,
|
||||
side_input_scale, bias_desc, bias_ptr,
|
||||
dnn::ActivationMode::kRelu, output_desc, &output_ptr,
|
||||
&scratch_allocator, dnn::AlgorithmConfig(profile_algorithm),
|
||||
&profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
|
||||
// if it's not enabled.
|
||||
for (bool use_tensor_ops : {false, true}) {
|
||||
for (auto algo_index : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
dnn::AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
|
||||
dnn::ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenFusedConvolveWithAlgorithm(
|
||||
conv_input_desc, conv_input_ptr, conv_input_scale,
|
||||
filter_desc, filter_ptr, conv_desc, side_input_ptr,
|
||||
side_input_scale, bias_desc, bias_ptr,
|
||||
dnn::ActivationMode::kRelu, output_desc, &output_ptr,
|
||||
&scratch_allocator, dnn::AlgorithmConfig(profile_algorithm),
|
||||
&profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -125,13 +125,15 @@ class GdrMemoryManager : public RemoteMemoryManager {
|
||||
|
||||
virtual void Stop() override;
|
||||
|
||||
virtual Status TransportOptionsFromTensor(
|
||||
virtual void TransportOptionsFromTensor(
|
||||
::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
|
||||
Device* device, DeviceContext* device_context, bool on_host) override;
|
||||
Device* device, DeviceContext* device_context, bool on_host,
|
||||
StatusCallback done) override;
|
||||
|
||||
virtual Status TensorFromTransportOptions(
|
||||
virtual void TensorFromTransportOptions(
|
||||
Tensor* tensor, const ::google::protobuf::Any& transport_options,
|
||||
Device* device, DeviceContext* device_context, bool on_host) override;
|
||||
Device* device, DeviceContext* device_context, bool on_host,
|
||||
StatusCallback done) override;
|
||||
|
||||
protected:
|
||||
Status CreateEndpoint(const string& host, const string& port,
|
||||
@ -145,10 +147,6 @@ class GdrMemoryManager : public RemoteMemoryManager {
|
||||
|
||||
void InsertMemoryRegion(void* addr, size_t length);
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
void InsertCUDAMemoryRegion(void* addr, size_t length);
|
||||
#endif
|
||||
|
||||
void EvictMemoryRegion(void* addr, size_t length);
|
||||
|
||||
private:
|
||||
@ -415,45 +413,74 @@ void GdrMemoryManager::Run() {
|
||||
|
||||
void GdrMemoryManager::Stop() { stopped_ = true; }
|
||||
|
||||
Status GdrMemoryManager::TransportOptionsFromTensor(
|
||||
void GdrMemoryManager::TransportOptionsFromTensor(
|
||||
::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
|
||||
Device* device, DeviceContext* device_context, bool on_host) {
|
||||
Device* device, DeviceContext* device_context, bool on_host,
|
||||
StatusCallback done) {
|
||||
auto buffer = DMAHelper::buffer(&tensor);
|
||||
void* addr = buffer->data();
|
||||
size_t length = buffer->size();
|
||||
if (length == 0) {
|
||||
return errors::Unavailable("Cannot register tensor buffer of size 0");
|
||||
done(errors::Unavailable("Cannot register tensor buffer of size 0"));
|
||||
return;
|
||||
}
|
||||
|
||||
ibv_mr* mr = FindMemoryRegion(addr, length);
|
||||
|
||||
Tensor host_copy;
|
||||
#if GOOGLE_CUDA
|
||||
if (!on_host && mr != nullptr) {
|
||||
TF_RETURN_IF_ERROR(GPUUtil::Sync(device));
|
||||
} else if (!on_host) {
|
||||
if (!on_host) {
|
||||
Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
|
||||
host_copy = Tensor(alloc, tensor.dtype(), tensor.shape());
|
||||
Status s;
|
||||
Notification n;
|
||||
GPUUtil::CopyGPUTensorToCPU(device, device_context, &tensor, &host_copy,
|
||||
[&s, &n](const Status& status) {
|
||||
s.Update(status);
|
||||
n.Notify();
|
||||
});
|
||||
n.WaitForNotification();
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
}
|
||||
buffer = DMAHelper::buffer(&host_copy);
|
||||
addr = buffer->data();
|
||||
length = buffer->size();
|
||||
mr = FindMemoryRegion(addr, length);
|
||||
Tensor* host_copy = new Tensor(alloc, tensor.dtype(), tensor.shape());
|
||||
GPUUtil::CopyGPUTensorToCPU(
|
||||
device, device_context, &tensor, host_copy,
|
||||
[done, host_copy, mutable_transport_options, this](const Status& s) {
|
||||
if (!s.ok()) {
|
||||
done(s);
|
||||
delete host_copy;
|
||||
return;
|
||||
}
|
||||
auto buffer = DMAHelper::buffer(host_copy);
|
||||
void* addr = buffer->data();
|
||||
size_t length = buffer->size();
|
||||
ibv_mr* mr = FindMemoryRegion(addr, length);
|
||||
|
||||
if (mr == nullptr) {
|
||||
done(errors::Unavailable("Cannot find pinned memory region"));
|
||||
delete host_copy;
|
||||
return;
|
||||
}
|
||||
|
||||
buffer->Ref();
|
||||
TensorKey tensor_key = next_key_++;
|
||||
{
|
||||
mutex_lock l(server_mu_);
|
||||
tensor_buffers_.insert(std::make_pair(tensor_key, buffer));
|
||||
}
|
||||
|
||||
uint64_t checksum = 0;
|
||||
if (VLOG_IS_ON(2)) {
|
||||
checksum = GPUUtil::Checksum(*host_copy);
|
||||
}
|
||||
|
||||
RemoteMemoryRegion remote_mr;
|
||||
remote_mr.set_host(host_);
|
||||
remote_mr.set_port(port_);
|
||||
remote_mr.set_addr(reinterpret_cast<uint64_t>(addr));
|
||||
remote_mr.set_rkey(mr->rkey);
|
||||
remote_mr.set_tensor_key(tensor_key);
|
||||
remote_mr.set_checksum(checksum);
|
||||
mutable_transport_options->PackFrom(remote_mr);
|
||||
|
||||
done(Status::OK());
|
||||
delete host_copy;
|
||||
});
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
if (mr == nullptr) {
|
||||
return errors::Unavailable("Cannot find pinned memory region");
|
||||
done(errors::Unavailable("Cannot find pinned memory region"));
|
||||
return;
|
||||
}
|
||||
|
||||
buffer->Ref();
|
||||
@ -466,12 +493,8 @@ Status GdrMemoryManager::TransportOptionsFromTensor(
|
||||
uint64_t checksum = 0;
|
||||
if (VLOG_IS_ON(2)) {
|
||||
#ifdef GOOGLE_CUDA
|
||||
if (device->tensorflow_gpu_device_info() && (!on_host)) {
|
||||
if (host_copy.NumElements() > 0) {
|
||||
checksum = GPUUtil::Checksum(device, device_context, host_copy);
|
||||
} else {
|
||||
checksum = GPUUtil::Checksum(device, device_context, tensor);
|
||||
}
|
||||
if (!on_host) {
|
||||
checksum = GPUUtil::Checksum(device, device_context, tensor);
|
||||
} else {
|
||||
checksum = GPUUtil::Checksum(tensor);
|
||||
}
|
||||
@ -487,15 +510,17 @@ Status GdrMemoryManager::TransportOptionsFromTensor(
|
||||
remote_mr.set_checksum(checksum);
|
||||
mutable_transport_options->PackFrom(remote_mr);
|
||||
|
||||
return Status::OK();
|
||||
done(Status::OK());
|
||||
}
|
||||
|
||||
Status GdrMemoryManager::TensorFromTransportOptions(
|
||||
void GdrMemoryManager::TensorFromTransportOptions(
|
||||
Tensor* tensor, const ::google::protobuf::Any& transport_options,
|
||||
Device* device, DeviceContext* device_context, bool on_host) {
|
||||
Device* device, DeviceContext* device_context, bool on_host,
|
||||
StatusCallback done) {
|
||||
RemoteMemoryRegion remote_mr;
|
||||
if (!transport_options.UnpackTo(&remote_mr)) {
|
||||
return errors::NotFound("No RDMA transport options found");
|
||||
done(errors::NotFound("No RDMA transport options found"));
|
||||
return;
|
||||
}
|
||||
|
||||
auto buffer = DMAHelper::buffer(tensor);
|
||||
@ -505,9 +530,7 @@ Status GdrMemoryManager::TensorFromTransportOptions(
|
||||
|
||||
Tensor host_copy;
|
||||
#if GOOGLE_CUDA
|
||||
if (!on_host && mr != nullptr) {
|
||||
TF_RETURN_IF_ERROR(GPUUtil::Sync(device));
|
||||
} else if (!on_host) {
|
||||
if (mr == nullptr && !on_host) {
|
||||
Allocator* alloc = ProcessState::singleton()->GetCUDAHostAllocator(0);
|
||||
host_copy = Tensor(alloc, tensor->dtype(), tensor->shape());
|
||||
buffer = DMAHelper::buffer(&host_copy);
|
||||
@ -518,7 +541,8 @@ Status GdrMemoryManager::TensorFromTransportOptions(
|
||||
#endif // GOOGLE_CUDA
|
||||
|
||||
if (mr == nullptr) {
|
||||
return errors::Unavailable("Cannot find pinned memory region");
|
||||
done(errors::Unavailable("Cannot find pinned memory region"));
|
||||
return;
|
||||
}
|
||||
|
||||
decltype(clients_)::iterator iter;
|
||||
@ -529,8 +553,12 @@ Status GdrMemoryManager::TensorFromTransportOptions(
|
||||
std::make_pair(std::make_pair(remote_mr.host(), remote_mr.port()),
|
||||
RdmaEndpointPtr(nullptr, EndpointDeleter)));
|
||||
if (success || iter->second.get() == nullptr) {
|
||||
TF_RETURN_IF_ERROR(
|
||||
CreateEndpoint(remote_mr.host(), remote_mr.port(), iter->second));
|
||||
Status s =
|
||||
CreateEndpoint(remote_mr.host(), remote_mr.port(), iter->second);
|
||||
if (!s.ok()) {
|
||||
done(s);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
rdma_cm_id* id = iter->second.get();
|
||||
@ -539,37 +567,57 @@ Status GdrMemoryManager::TensorFromTransportOptions(
|
||||
|
||||
if (rdma_post_read(id, nullptr, buffer->data(), buffer->size(), mr, 0,
|
||||
remote_mr.addr(), remote_mr.rkey())) {
|
||||
return errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed");
|
||||
done(errors::Unavailable(strerror(errno), ": ", "rdma_post_read failed"));
|
||||
return;
|
||||
}
|
||||
|
||||
ibv_send_wr wr = {};
|
||||
wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
|
||||
wr.imm_data = htonl(remote_mr.tensor_key());
|
||||
wr.send_flags = IBV_SEND_FENCE | IBV_SEND_SIGNALED;
|
||||
wr.send_flags = IBV_SEND_SIGNALED;
|
||||
ibv_send_wr* bad_wr;
|
||||
if (ibv_post_send(id->qp, &wr, &bad_wr)) {
|
||||
return errors::Unavailable(strerror(errno), ": ", "ibv_post_send failed");
|
||||
done(errors::Unavailable(strerror(errno), ": ", "ibv_post_send failed"));
|
||||
return;
|
||||
}
|
||||
|
||||
ibv_wc wc = {};
|
||||
int ret = rdma_get_send_comp(id, &wc);
|
||||
int ret;
|
||||
while ((ret = ibv_poll_cq(id->send_cq, 1, &wc)) == 0)
|
||||
;
|
||||
if (ret < 0 || wc.status) {
|
||||
return errors::Unavailable(ibv_wc_status_str(wc.status));
|
||||
done(errors::Unavailable(ibv_wc_status_str(wc.status)));
|
||||
return;
|
||||
}
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
if (host_copy.NumElements() > 0) {
|
||||
Status s;
|
||||
Notification n;
|
||||
GPUUtil::CopyCPUTensorToGPU(&host_copy, device_context, device, tensor,
|
||||
[&s, &n](const Status& status) {
|
||||
s.Update(status);
|
||||
n.Notify();
|
||||
});
|
||||
n.WaitForNotification();
|
||||
if (!s.ok()) {
|
||||
return s;
|
||||
uint64_t checksum = 0;
|
||||
if (VLOG_IS_ON(2)) {
|
||||
checksum = GPUUtil::Checksum(host_copy);
|
||||
CHECK(checksum == remote_mr.checksum())
|
||||
<< "Checksum mismatch: " << checksum << "!=" << remote_mr.checksum();
|
||||
}
|
||||
Tensor* ref = new Tensor;
|
||||
std::swap(host_copy, *ref);
|
||||
GPUUtil::CopyCPUTensorToGPU(
|
||||
ref, device_context, device, tensor,
|
||||
[ref, done, buffer, remote_mr, start](const Status& s) {
|
||||
if (!s.ok()) {
|
||||
done(s);
|
||||
delete ref;
|
||||
return;
|
||||
}
|
||||
uint64_t end = Env::Default()->NowMicros();
|
||||
|
||||
VLOG(2) << "RDMA from remote memory region " << remote_mr.rkey()
|
||||
<< " of size " << buffer->size() << " with tensor key "
|
||||
<< remote_mr.tensor_key() << " took " << (end - start)
|
||||
<< " micros";
|
||||
done(Status::OK());
|
||||
delete ref;
|
||||
});
|
||||
return;
|
||||
}
|
||||
#endif // GOOGLE_CUDA
|
||||
|
||||
@ -583,11 +631,7 @@ Status GdrMemoryManager::TensorFromTransportOptions(
|
||||
if (VLOG_IS_ON(2)) {
|
||||
#ifdef GOOGLE_CUDA
|
||||
if (device->tensorflow_gpu_device_info() && (!on_host)) {
|
||||
if (host_copy.NumElements() > 0) {
|
||||
checksum = GPUUtil::Checksum(device, device_context, host_copy);
|
||||
} else {
|
||||
checksum = GPUUtil::Checksum(device, device_context, *tensor);
|
||||
}
|
||||
checksum = GPUUtil::Checksum(device, device_context, *tensor);
|
||||
} else {
|
||||
checksum = GPUUtil::Checksum(*tensor);
|
||||
}
|
||||
@ -595,7 +639,7 @@ Status GdrMemoryManager::TensorFromTransportOptions(
|
||||
<< "!=" << remote_mr.checksum();
|
||||
#endif
|
||||
}
|
||||
return Status::OK();
|
||||
done(Status::OK());
|
||||
}
|
||||
|
||||
Status GdrMemoryManager::CreateEndpoint(const string& host, const string& port,
|
||||
|
@ -39,15 +39,17 @@ class RemoteMemoryManager {
|
||||
|
||||
// Encodes the tensor information to an arbitrary protocol buffer
|
||||
// The protocol buffer needs to be transmitted via some other channel
|
||||
virtual Status TransportOptionsFromTensor(
|
||||
virtual void TransportOptionsFromTensor(
|
||||
::google::protobuf::Any* mutable_transport_options, const Tensor& tensor,
|
||||
Device* device, DeviceContext* device_context, bool on_host) = 0;
|
||||
Device* device, DeviceContext* device_context, bool on_host,
|
||||
StatusCallback done) = 0;
|
||||
|
||||
// Retrieve the tensor from the encoded protocol buffer
|
||||
// Note that the tensor has to be allocated, but not initialized
|
||||
virtual Status TensorFromTransportOptions(
|
||||
virtual void TensorFromTransportOptions(
|
||||
Tensor* tensor, const ::google::protobuf::Any& transport_options,
|
||||
Device* device, DeviceContext* device_context, bool on_host) = 0;
|
||||
Device* device, DeviceContext* device_context, bool on_host,
|
||||
StatusCallback done) = 0;
|
||||
};
|
||||
|
||||
RemoteMemoryManager* CreateRemoteMemoryManager(const string& host,
|
||||
|
@ -61,16 +61,20 @@ class GdrRecvTensorCall : public BaseRecvTensorCall {
|
||||
const bool on_host =
|
||||
(dst_device_->tensorflow_gpu_device_info() == nullptr) ||
|
||||
recv_args_.alloc_attrs.on_host();
|
||||
Status s = remote_memory_manager_->TensorFromTransportOptions(
|
||||
remote_memory_manager_->TensorFromTransportOptions(
|
||||
const_cast<Tensor*>(&tensor()), transport_options, dst_device_,
|
||||
recv_args_.device_context, on_host);
|
||||
if (!s.ok()) {
|
||||
mutex_lock l(mu_);
|
||||
status_.Update(s);
|
||||
LOG(ERROR)
|
||||
<< "Cannot find pinned memory region from allocator "
|
||||
<< dst_device_->GetAllocator(recv_args_.alloc_attrs)->Name();
|
||||
}
|
||||
recv_args_.device_context, on_host,
|
||||
[this, recv_done](const Status& s) {
|
||||
if (!s.ok()) {
|
||||
mutex_lock l(mu_);
|
||||
status_.Update(s);
|
||||
LOG(ERROR) << "Cannot find pinned memory region from allocator "
|
||||
<< dst_device_->GetAllocator(recv_args_.alloc_attrs)
|
||||
->Name();
|
||||
}
|
||||
recv_done();
|
||||
});
|
||||
return;
|
||||
}
|
||||
if (!s.ok()) {
|
||||
mutex_lock l(mu_);
|
||||
|
@ -86,24 +86,25 @@ void GdrWorker::GrpcRecvTensorAsync(CallOptions* opts,
|
||||
if (val.TotalBytes() > 0 && (!is_dead) &&
|
||||
DMAHelper::CanUseDMA(&val) && dma_ok) {
|
||||
// DMA cases.
|
||||
RecvTensorResponse proto;
|
||||
auto transport_options = proto.mutable_transport_options();
|
||||
Status s = remote_memory_manager_->TransportOptionsFromTensor(
|
||||
RecvTensorResponse* proto = new RecvTensorResponse;
|
||||
proto->set_is_dead(is_dead);
|
||||
proto->set_send_start_micros(Env::Default()->NowMicros());
|
||||
TensorProto* tensor_proto = proto->mutable_tensor();
|
||||
tensor_proto->set_dtype(val.dtype());
|
||||
val.shape().AsProto(tensor_proto->mutable_tensor_shape());
|
||||
auto transport_options = proto->mutable_transport_options();
|
||||
remote_memory_manager_->TransportOptionsFromTensor(
|
||||
transport_options, val, src_dev, send_args.device_context,
|
||||
on_host);
|
||||
if (s.ok()) {
|
||||
proto.set_is_dead(is_dead);
|
||||
proto.set_send_start_micros(Env::Default()->NowMicros());
|
||||
TensorProto* tensor_proto = proto.mutable_tensor();
|
||||
tensor_proto->set_dtype(val.dtype());
|
||||
val.shape().AsProto(tensor_proto->mutable_tensor_shape());
|
||||
grpc::EncodeRecvTensorResponseToByteBuffer(proto, response);
|
||||
done(Status::OK());
|
||||
return;
|
||||
} else {
|
||||
done(s);
|
||||
return;
|
||||
}
|
||||
on_host, [proto, done, response](const Status& s) {
|
||||
if (s.ok()) {
|
||||
grpc::EncodeRecvTensorResponseToByteBuffer(*proto,
|
||||
response);
|
||||
done(Status::OK());
|
||||
} else {
|
||||
done(s);
|
||||
}
|
||||
delete proto;
|
||||
});
|
||||
} else {
|
||||
// Non-DMA cases.
|
||||
if (src_dev->tensorflow_gpu_device_info() && (!on_host)) {
|
||||
|
@ -278,7 +278,7 @@ class LabeledTensor(object):
|
||||
@tc.accepts(object, ops.Tensor,
|
||||
tc.Union(Axes, tc.Collection(tc.Union(string_types, AxisLike))))
|
||||
def __init__(self, tensor, axes):
|
||||
"""Construct a LabeledTenor.
|
||||
"""Construct a LabeledTensor.
|
||||
|
||||
Args:
|
||||
tensor: The underlying tensor containing the data.
|
||||
|
@ -91,6 +91,7 @@ See the @{$python/contrib.layers} guide.
|
||||
@@sparse_column_with_hash_bucket
|
||||
@@sparse_column_with_integerized_feature
|
||||
@@sparse_column_with_keys
|
||||
@@sparse_column_with_vocabulary_file
|
||||
@@weighted_sparse_column
|
||||
@@weighted_sum_from_feature_columns
|
||||
@@infer_real_valued_columns
|
||||
|
@ -2600,7 +2600,7 @@ def spatial_softmax(features,
|
||||
|
||||
Read more here:
|
||||
"Learning visual feature spaces for robotic manipulation with
|
||||
deep spatial autoencoders." Finn et. al, http://arxiv.org/abs/1509.06113.
|
||||
deep spatial autoencoders." Finn et al., http://arxiv.org/abs/1509.06113.
|
||||
|
||||
Args:
|
||||
features: A `Tensor` of size [batch_size, W, H, num_channels]; the
|
||||
|
@ -768,7 +768,7 @@ py_test(
|
||||
":learn",
|
||||
"//tensorflow/contrib/layers:layers_py",
|
||||
"//tensorflow/contrib/session_bundle:exporter",
|
||||
"//tensorflow/contrib/session_bundle:manifest_proto_py_pb2",
|
||||
"//tensorflow/contrib/session_bundle:manifest_proto_py",
|
||||
"//tensorflow/python:array_ops",
|
||||
"//tensorflow/python:client",
|
||||
"//tensorflow/python:client_testlib",
|
||||
|
@ -418,6 +418,7 @@ class BaseEstimator(
|
||||
"model_dir are set both in constructor and RunConfig, but with "
|
||||
"different values. In constructor: '{}', in RunConfig: "
|
||||
"'{}' ".format(model_dir, self._config.model_dir))
|
||||
# pylint: enable=g-doc-exception
|
||||
|
||||
self._model_dir = model_dir or self._config.model_dir
|
||||
if self._model_dir is None:
|
||||
|
@ -327,15 +327,20 @@ class Experiment(object):
|
||||
# Otherwise, the servers will wait to connect to each other before starting
|
||||
# to train. We might as well start as soon as we can.
|
||||
config = self._estimator.config
|
||||
if (config.cluster_spec and config.master and
|
||||
config.environment == run_config.Environment.LOCAL):
|
||||
logging.warn("ClusterSpec and master are provided, but environment is "
|
||||
"set to 'local'. Set environment to 'cloud' if you intend "
|
||||
"to use the distributed runtime.")
|
||||
if (config.environment != run_config.Environment.LOCAL and
|
||||
config.environment != run_config.Environment.GOOGLE and
|
||||
config.cluster_spec and config.master):
|
||||
self._start_server()
|
||||
if isinstance(config, run_config.RunConfig):
|
||||
if (config.cluster_spec and config.master and
|
||||
config.environment == run_config.Environment.LOCAL):
|
||||
logging.warn("ClusterSpec and master are provided, but environment is "
|
||||
"set to 'local'. Set environment to 'cloud' if you intend "
|
||||
"to use the distributed runtime.")
|
||||
if (config.environment != run_config.Environment.LOCAL and
|
||||
config.environment != run_config.Environment.GOOGLE and
|
||||
config.cluster_spec and config.master):
|
||||
self._start_server()
|
||||
elif config.cluster_spec and config.master:
|
||||
raise ValueError('For distributed runtime, Experiment class only works with '
|
||||
'tf.contrib.learn.RunConfig for now, but provided {}'
|
||||
.format(type(config)))
|
||||
|
||||
extra_hooks = []
|
||||
if delay_secs is None:
|
||||
|
@ -42,9 +42,6 @@ import time
|
||||
import numpy as np
|
||||
import six
|
||||
|
||||
from tensorflow.contrib.framework import deprecated
|
||||
from tensorflow.contrib.framework.python.ops import variables as contrib_variables
|
||||
from tensorflow.contrib.learn.python.learn.summary_writer_cache import SummaryWriterCache
|
||||
from tensorflow.core.framework.summary_pb2 import Summary
|
||||
from tensorflow.core.util.event_pb2 import SessionLog
|
||||
from tensorflow.python.estimator import estimator as core_estimator
|
||||
@ -883,7 +880,7 @@ class GraphDump(BaseMonitor):
|
||||
class ExportMonitor(EveryN):
|
||||
"""Monitor that exports Estimator every N steps."""
|
||||
|
||||
@deprecated("2017-03-25",
|
||||
@deprecation.deprecated("2017-03-25",
|
||||
"ExportMonitor is deprecated. Please pass an "
|
||||
"ExportStrategy to Experiment instead.")
|
||||
def __init__(self,
|
||||
|
@ -14,4 +14,5 @@ RUN apt-get install -y \
|
||||
make \
|
||||
python \
|
||||
unzip \
|
||||
wget \
|
||||
zlib1g-dev
|
||||
|
@ -512,7 +512,6 @@ $(wildcard tensorflow/core/grappler/clusters/single_machine.*)
|
||||
# Filter out all the excluded files.
|
||||
TF_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
|
||||
# Add in any extra files that don't fit the patterns easily
|
||||
TF_CC_SRCS += tensorflow/core/platform/default/gpu_tracer.cc
|
||||
TF_CC_SRCS += tensorflow/contrib/makefile/downloads/fft2d/fftsg.c
|
||||
# Also include the op and kernel definitions.
|
||||
TF_CC_SRCS += $(shell cat $(MAKEFILE_DIR)/tf_op_files.txt)
|
||||
|
@ -25,30 +25,7 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
||||
WORKSPACE="${SCRIPT_DIR}/../../../"
|
||||
cd ${WORKSPACE} || exit 1
|
||||
|
||||
DOCKER_IMG_NAME="tf-make-base"
|
||||
DOCKER_CONTEXT_PATH="${WORKSPACE}tensorflow/contrib/makefile/"
|
||||
DOCKERFILE_PATH="${DOCKER_CONTEXT_PATH}Dockerfile"
|
||||
|
||||
# Build the docker image.
|
||||
echo "Building image ${DOCKER_IMG_NAME}..."
|
||||
docker build -t ${DOCKER_IMG_NAME} \
|
||||
-f "${DOCKERFILE_PATH}" "${DOCKER_CONTEXT_PATH}"
|
||||
|
||||
# Check docker build command status.
|
||||
if [[ $? != "0" ]]; then
|
||||
echo "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
COMMAND="tensorflow/contrib/makefile/build_all_linux.sh"
|
||||
|
||||
# Run the command inside the container.
|
||||
echo "Running ${COMMAND} inside ${DOCKER_IMG_NAME}..."
|
||||
# By default we cleanup - remove the container once it finish running (--rm)
|
||||
# and share the PID namespace (--pid=host) so the process inside does not have
|
||||
# pid 1 and SIGKILL is propagated to the process inside (jenkins can kill it).
|
||||
docker run --rm --pid=host \
|
||||
-v ${WORKSPACE}:/workspace \
|
||||
-w /workspace \
|
||||
"${DOCKER_IMG_NAME}" \
|
||||
${COMMAND}
|
||||
echo "Running ${COMMAND} inside Android docker image..."
|
||||
tensorflow/tools/ci_build/ci_build.sh android ${COMMAND}
|
||||
|
@ -286,7 +286,7 @@ for arch in $archs; do
|
||||
|
||||
if [ ! -d "$nsync_platform_dir" ]; then
|
||||
mkdir "$nsync_platform_dir"
|
||||
echo "$makefile" | sed 's,^[ \t]*,,' > "$nsync_platform_dir/Makefile"
|
||||
echo "$makefile" | sed $'s,^[ \t]*,,' > "$nsync_platform_dir/Makefile"
|
||||
touch "$nsync_platform_dir/dependfile"
|
||||
fi
|
||||
if (cd "$nsync_platform_dir" && make depend nsync.a >&2); then
|
||||
|
@ -20,11 +20,11 @@ DOWNLOADS_DIR=tensorflow/contrib/makefile/downloads
|
||||
BZL_FILE_PATH=tensorflow/workspace.bzl
|
||||
|
||||
EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
|
||||
GEMMLOWP_URL="$(grep -o 'http.*github.com/google/gemmlowp/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
|
||||
GEMMLOWP_URL="$(grep -o 'http://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
|
||||
GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
|
||||
NSYNC_URL="$(grep -o 'http.*github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
|
||||
PROTOBUF_URL="$(grep -o 'http.*github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
|
||||
RE2_URL="$(grep -o 'http.*github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
|
||||
NSYNC_URL="$(grep -o 'http://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
|
||||
PROTOBUF_URL="$(grep -o 'http://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
|
||||
RE2_URL="$(grep -o 'http://mirror.bazel.build/github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
|
||||
FFT2D_URL="$(grep -o 'http.*fft\.tgz' "${BZL_FILE_PATH}" | grep -v bazel-mirror | head -n1)"
|
||||
|
||||
# TODO(petewarden): Some new code in Eigen triggers a clang bug with iOS arm64,
|
||||
@ -49,7 +49,18 @@ download_and_extract() {
|
||||
local dir="${2:?${usage}}"
|
||||
echo "downloading ${url}" >&2
|
||||
mkdir -p "${dir}"
|
||||
curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
|
||||
if [[ "${url}" == *gz ]]; then
|
||||
curl -Ls "${url}" | tar -C "${dir}" --strip-components=1 -xz
|
||||
elif [[ "${url}" == *zip ]]; then
|
||||
tempdir=$(mktemp -d)
|
||||
tempdir2=$(mktemp -d)
|
||||
wget ${url} -P ${tempdir}
|
||||
unzip ${tempdir}/* -d ${tempdir2}
|
||||
# unzip has no strip components, so unzip to a temp dir, and move the files
|
||||
# we want from the tempdir to destination.
|
||||
cp -R ${tempdir2}/*/* ${dir}/
|
||||
rm -rf ${tempdir2} ${tempdir}
|
||||
fi
|
||||
|
||||
# Delete any potential BUILD files, which would interfere with Bazel builds.
|
||||
find "${dir}" -type f -name '*BUILD' -delete
|
||||
|
New file: tensorflow/contrib/mpi_collectives/BUILD (80 lines)
@ -0,0 +1,80 @@
|
||||
# Ops that communicate with other processes via MPI.
|
||||
|
||||
package(default_visibility = [
|
||||
"//tensorflow:__subpackages__",
|
||||
])
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
filegroup(
|
||||
name = "all_files",
|
||||
srcs = glob(
|
||||
["**/*"],
|
||||
exclude = [
|
||||
"**/METADATA",
|
||||
"**/OWNERS",
|
||||
],
|
||||
),
|
||||
visibility = ["//tensorflow:__subpackages__"],
|
||||
)
|
||||
|
||||
load(
|
||||
"//tensorflow/core:platform/default/build_config.bzl",
|
||||
"tf_proto_library_cc",
|
||||
)
|
||||
|
||||
tf_proto_library_cc(
|
||||
name = "mpi_message_proto",
|
||||
srcs = ["mpi_message.proto"],
|
||||
cc_api_version = 2,
|
||||
protodeps = ["//tensorflow/core:protos_all"],
|
||||
visibility = [
|
||||
"//tensorflow:__subpackages__",
|
||||
],
|
||||
)
|
||||
|
||||
load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
|
||||
load("//tensorflow:tensorflow.bzl", "tf_py_test")
|
||||
|
||||
tf_custom_op_library(
|
||||
name = "mpi_collectives.so",
|
||||
srcs = [
|
||||
"mpi_ops.cc",
|
||||
"ring.cc",
|
||||
"ring.h",
|
||||
],
|
||||
gpu_srcs = [
|
||||
"ring.cu.cc",
|
||||
"ring.h",
|
||||
],
|
||||
deps = [
|
||||
":mpi_message_proto_cc",
|
||||
"//third_party/mpi",
|
||||
],
|
||||
)
|
||||
|
||||
tf_py_test(
|
||||
name = "mpi_ops_test",
|
||||
srcs = ["mpi_ops_test.py"],
|
||||
additional_deps = [
|
||||
"//tensorflow:tensorflow_py",
|
||||
"//tensorflow/python:platform",
|
||||
],
|
||||
data = [
|
||||
":mpi_collectives.so",
|
||||
],
|
||||
tags = ["manual"],
|
||||
)
|
||||
|
||||
py_library(
|
||||
name = "mpi_ops_py",
|
||||
srcs = [
|
||||
"__init__.py",
|
||||
"mpi_ops.py",
|
||||
],
|
||||
data = [
|
||||
":mpi_collectives.so",
|
||||
],
|
||||
srcs_version = "PY2AND3",
|
||||
visibility = ["//visibility:public"],
|
||||
)
|
New file: tensorflow/contrib/mpi_collectives/README.md (5 lines)
@ -0,0 +1,5 @@
|
||||
# MPI TensorFlow integration
|
||||
|
||||
Tensorflow MPI integration allows communicating between different TensorFlow
|
||||
processes using MPI. This enables training across multiple nodes and GPUs
|
||||
using high-speed interconnects.
|
New file: tensorflow/contrib/mpi_collectives/__init__.py (273 lines)
@ -0,0 +1,273 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
# pylint: disable=g-short-docstring-punctuation
|
||||
"""## Communicating Between Processes with MPI
|
||||
|
||||
TensorFlow natively provides inter-device communication through send and
|
||||
receive ops and inter-node communication through Distributed TensorFlow, based
|
||||
on the same send and receive abstractions. On HPC clusters where Infiniband or
|
||||
other high-speed node interconnects are available, these can end up being
|
||||
insufficient for synchronous data-parallel training (without asynchronous
|
||||
gradient descent). This module implements a variety of MPI ops which can take
|
||||
advantage of hardware-specific MPI libraries for efficient communication.
|
||||
|
||||
In order to use this module, TensorFlow must be built with an MPI library,
|
||||
which can be provided to the `./configure` script at build time. As a user of
|
||||
TensorFlow, you will need to build TensorFlow yourself to select the MPI
|
||||
library to use; to do so, follow the [instructions for building TensorFlow from
|
||||
source](https://www.tensorflow.org/get_started/os_setup#installing_from_sources).
|
||||
|
||||
### Utility Ops
|
||||
|
||||
In addition to reductions and gathers, this module provides utility operations
|
||||
for detecting the running MPI configuration.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
from tensorflow.contrib import mpi
|
||||
|
||||
# Use `mpi.Session` instead of `tf.Session`
|
||||
with mpi.Session() as session:
|
||||
rank = session.run(mpi.rank())
|
||||
print("My MPI Rank:", rank)
|
||||
|
||||
if rank == 0:
|
||||
print("MPI Size:", session.run(mpi.size()))
|
||||
```
|
||||
|
||||
@@rank
|
||||
@@size
|
||||
|
||||
### Ring Allreduce and Allgather
|
||||
|
||||
When summing or averaging tensors across many processes, communication can
|
||||
easily become a bottleneck. A naive implementation will send all the tensor
|
||||
values to the same process, perform the reduction, and then broadcast the
|
||||
values back to all other processes, effectively creating a synchronous
|
||||
parameter server in one process. However, the process responsible for
|
||||
performing the reduction will have to receive and send a massive amount of data
|
||||
which scales with the number of processes *and* the number of parameters in the
|
||||
model.
|
||||
|
||||
Instead of centralizing the reduction and having one primary reducer, we can
|
||||
implement a distributed allreduce or allgather. A bandwidth-optimal allreduce
|
||||
will end up sending 2(N - 1) values for every value in the input tensor,
|
||||
and can be implemented with a ring allreduce [1]. (Intuitively, a linear reduce
|
||||
requires at least (N - 1) sends between the different nodes, and a broadcast of
|
||||
the result also requires (N - 1) sends, for a total of 2 (N - 1); these two
|
||||
steps cannot be combined in a clever way to reduce the number of required
|
||||
sends.) This module implements bandwidth-optimal ring allreduce and ring
|
||||
allgather operations using MPI; by choosing a hardware-appropriate MPI
|
||||
implementation (such as OpenMPI with CUDA-IPC support), you can train large
|
||||
models with synchronous gradient descent with minimal communication overhead.
|
||||
|
||||
In addition to the `allreduce` and `allgather` functions, a convenience
|
||||
`DistributedOptimizer` wrapper is provided to simplify using these functions
|
||||
for reducing model gradients.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
import tensorflow as tf
|
||||
from tensorflow.contrib import mpi_collectives as mpi
|
||||
|
||||
# Construct a simple linear regression model to optimize
|
||||
W = tf.get_variable("W", shape=[20, 1], dtype=tf.float32)
|
||||
B = tf.get_variable("B", shape=[1, 1], dtype=tf.float32)
|
||||
inputs = tf.placeholder("Inputs", shape=[None, 20])
|
||||
outputs = tf.placeholder("Outputs", shape=[None, 1])
|
||||
loss = tf.nn.l2_loss(tf.matmul(inputs, W) + B - outputs)
|
||||
|
||||
# Training using MPI allreduce with DistributedOptimizer
|
||||
optimizer = mpi.DistributedOptimizer(tf.train.AdamOptimizer())
|
||||
train = optimizer.minimize(loss)
|
||||
|
||||
# Average loss over all ranks, for printing.
|
||||
# Do not pass this to an optimizer!
|
||||
avg_loss = mpi.allreduce(loss)
|
||||
|
||||
# On different ranks, feed different input data.
|
||||
with mpi.Session() as session:
|
||||
rank = session.run(mpi.rank())
|
||||
batch_inputs, batch_outputs = construct_batch_for_rank(rank)
|
||||
feed_dict = {inputs: batch_inputs, outputs: batch_outputs}
|
||||
_, l = session.run([train, avg_loss], feed_dict=feed_dict)
|
||||
print("Average Loss:", l)
|
||||
```
|
||||
|
||||
[1] Patarasuk, Pitch and Yuan, Xin. "Bandwidth Optimal All-reduce Algorithms
|
||||
for Clusters of Workstations".
|
||||
|
||||
@@Session
|
||||
@@DistributedOptimizer
|
||||
@@allreduce
|
||||
@@allgather
|
||||
"""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from tensorflow.contrib.mpi_collectives.mpi_ops import size
|
||||
from tensorflow.contrib.mpi_collectives.mpi_ops import rank
|
||||
from tensorflow.contrib.mpi_collectives.mpi_ops import local_rank
|
||||
from tensorflow.contrib.mpi_collectives.mpi_ops import allgather
|
||||
from tensorflow.contrib.mpi_collectives.mpi_ops import _allreduce
|
||||
from tensorflow.contrib.mpi_collectives.mpi_ops import init
|
||||
|
||||
|
||||
def allreduce(tensor, average=True):
|
||||
"""Perform an MPI allreduce on a tf.Tensor or tf.IndexedSlices.
|
||||
|
||||
Arguments:
|
||||
tensor: tf.Tensor, tf.Variable, or tf.IndexedSlices to reduce.
|
||||
The shape of the input must be identical across all ranks.
|
||||
average: If True, computes the average over all ranks.
|
||||
Otherwise, computes the sum over all ranks.
|
||||
|
||||
This function performs a bandwidth-optimal ring allreduce on the input
|
||||
tensor. If the input is an tf.IndexedSlices, the function instead does an
|
||||
allgather on the values and the indices, effectively doing an allreduce on
|
||||
the represented tensor.
|
||||
"""
|
||||
if isinstance(tensor, tf.IndexedSlices):
|
||||
# For IndexedSlices, do two allgathers instead of an allreduce.
|
||||
mpi_size = tf.cast(size(), tensor.values.dtype)
|
||||
values = allgather(tensor.values)
|
||||
indices = allgather(tensor.indices)
|
||||
|
||||
# To make this operation into an average, divide all gathered values by
|
||||
# the MPI size.
|
||||
new_values = tf.div(values, mpi_size) if average else values
|
||||
return tf.IndexedSlices(new_values, indices,
|
||||
dense_shape=tensor.dense_shape)
|
||||
else:
|
||||
mpi_size = tf.cast(size(), tensor.dtype)
|
||||
summed_tensor = _allreduce(tensor)
|
||||
new_tensor = (tf.div(summed_tensor, mpi_size)
|
||||
if average else summed_tensor)
|
||||
return new_tensor
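As a hedged usage sketch of the wrapper above (assuming the package is importable as `tensorflow.contrib.mpi_collectives` and the script is launched under `mpirun`, so `mpi.Session()` can initialize MPI), both the dense path and the `tf.IndexedSlices` path can be exercised like this, mirroring the pattern used in the new tests:

```python
import tensorflow as tf
import tensorflow.contrib.mpi_collectives as mpi

dense = tf.constant([[1.0, 2.0], [3.0, 4.0]])
sparse = tf.IndexedSlices(tf.constant([[1.0, 1.0]]), tf.constant([0]))

# Dense tensors take the ring-allreduce path; IndexedSlices are
# allgathered (values and indices), as described in the docstring above.
avg_dense = mpi.allreduce(dense, average=True)
summed_sparse = mpi.allreduce(sparse, average=False)

with mpi.Session() as sess:
    print(sess.run(avg_dense))
    print(sess.run([summed_sparse.values, summed_sparse.indices]))
```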
|
||||
|
||||
|
||||
class DistributedOptimizer(tf.train.Optimizer):
|
||||
"""An optimizer that wraps another tf.Optimizer, using an MPI allreduce to
|
||||
average gradient values before applying gradients to model weights."""
|
||||
|
||||
def __init__(self, optimizer, name=None, use_locking=False):
|
||||
"""Construct a new DistributedOptimizer, which uses another optimizer
|
||||
under the hood for computing single-process gradient values and
|
||||
applying gradient updates after the gradient values have been averaged
|
||||
across all the MPI ranks.
|
||||
|
||||
Args:
|
||||
optimizer: Optimizer to use for computing gradients and applying updates.
|
||||
name: Optional name prefix for the operations created when applying
|
||||
gradients. Defaults to "Distributed" followed by the provided
|
||||
optimizer type.
|
||||
use_locking: Whether to use locking when updating variables. See
|
||||
Optimizer.__init__ for more info.
|
||||
"""
|
||||
if name is None:
|
||||
name = "Distributed{}".format(type(optimizer).__name__)
|
||||
|
||||
self._optimizer = optimizer
|
||||
super(DistributedOptimizer, self).__init__(
|
||||
name=name, use_locking=use_locking)
|
||||
|
||||
def compute_gradients(self, *args, **kwargs):
|
||||
"""Compute gradients of all trainable variables.
|
||||
|
||||
See Optimizer.compute_gradients() for more info.
|
||||
|
||||
In DistributedOptimizer, compute_gradients() is overridden to also
|
||||
allreduce the gradients before returning them.
|
||||
"""
|
||||
gradients = (super(DistributedOptimizer, self)
|
||||
.compute_gradients(*args, **kwargs))
|
||||
return [(allreduce(gradient), var) for (gradient, var) in gradients]
|
||||
|
||||
def _apply_dense(self, *args, **kwargs):
|
||||
"""Calls this same method on the underlying optimizer."""
|
||||
return self._optimizer._apply_dense(*args, **kwargs)
|
||||
|
||||
def _apply_sparse(self, *args, **kwargs):
|
||||
"""Calls this same method on the underlying optimizer."""
|
||||
return self._optimizer._apply_sparse(*args, **kwargs)
|
||||
|
||||
def _apply_sparse_duplicate_indices(self, *args, **kwargs):
|
||||
"""Calls this same method on the underlying optimizer."""
|
||||
return self._optimizer._apply_sparse_duplicate_indices(*args,
|
||||
**kwargs)
|
||||
|
||||
def _prepare(self, *args, **kwargs):
|
||||
"""Calls this same method on the underlying optimizer."""
|
||||
return self._optimizer._prepare(*args, **kwargs)
|
||||
|
||||
def _create_slots(self, *args, **kwargs):
|
||||
"""Calls this same method on the underlying optimizer."""
|
||||
return self._optimizer._create_slots(*args, **kwargs)
|
||||
|
||||
def _valid_dtypes(self, *args, **kwargs):
|
||||
"""Calls this same method on the underlying optimizer."""
|
||||
return self._optimizer._valid_dtypes(*args, **kwargs)
|
||||
|
||||
def _finish(self, *args, **kwargs):
|
||||
"""Calls this same method on the underlying optimizer."""
|
||||
return self._optimizer._finish(*args, **kwargs)
|
||||
|
||||
|
||||
class Session(tf.Session):
|
||||
"""A class for running TensorFlow operations, with copies of the same graph
|
||||
running distributed across different MPI nodes.
|
||||
|
||||
The primary difference between `tf.Session` and
|
||||
`tf.contrib.mpi_collectives.Session` is that the MPI `Session` ensures that
|
||||
the `Session` options are correct for use with `tf.contrib.mpi`, and
|
||||
initializes MPI immediately upon the start of the session.
|
||||
"""
|
||||
|
||||
def __init__(self, target='', graph=None, config=None):
|
||||
"""Creates a new TensorFlow MPI session.
|
||||
|
||||
Unlike a normal `tf.Session`, an MPI Session may only use a single GPU,
|
||||
which must be specified in advance before the session is initialized.
|
||||
In addition, it only uses a single graph evaluation thread, and
|
||||
initializes MPI immediately upon starting.
|
||||
|
||||
If no `graph` argument is specified when constructing the session,
|
||||
the default graph will be launched in the session. If you are
|
||||
using more than one graph (created with `tf.Graph()`) in the same
|
||||
process, you will have to use different sessions for each graph,
|
||||
but each graph can be used in multiple sessions. In this case, it
|
||||
is often clearer to pass the graph to be launched explicitly to
|
||||
the session constructor.
|
||||
|
||||
Args:
|
||||
target: (Optional.) The execution engine to connect to.
|
||||
graph: (Optional.) The `Graph` to be launched (described above).
|
||||
config: (Optional.) A `ConfigProto` protocol buffer with configuration
|
||||
options for the session.
|
||||
"""
|
||||
super(Session, self).__init__(target, graph, config=config)
|
||||
|
||||
# Initialize MPI on the relevant device.
|
||||
# TODO: Move this to library load and eliminate mpi.Session()
|
||||
if graph is None:
|
||||
graph = tf.get_default_graph()
|
||||
with graph.as_default():
|
||||
self.run(init())
|
New file: tensorflow/contrib/mpi_collectives/mpi_allgather_test.py (114 lines)
@ -0,0 +1,114 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.mpi_collectives as mpi
|
||||
from tensorflow.python.platform import test
|
||||
|
||||
|
||||
average_allgather = False
|
||||
|
||||
|
||||
class AllgatherTest(test.TestCase):
|
||||
def checkAllgather(self, num_ranks, all_gathered, local_gathered):
|
||||
# Ensure that indices match.
|
||||
all_gat_ind = np.sort(all_gathered.indices)
|
||||
loc_gat_ind = np.sort(local_gathered.indices)
|
||||
assert(len(loc_gat_ind) == len(all_gat_ind))
|
||||
for i in range(len(loc_gat_ind)):
|
||||
assert(loc_gat_ind[i] == all_gat_ind[i])
|
||||
|
||||
# For each index, verify same values.
|
||||
local_checked = []
|
||||
for i in range(len(local_gathered.indices)):
|
||||
local_checked.append(False)
|
||||
for i in range(len(all_gathered.indices)):
|
||||
all_index = all_gathered.indices[i]
|
||||
# TODO(jthestness): Make this lookup quicker using sorting.
|
||||
loc_index = -1
|
||||
for j in range(len(local_gathered.indices)):
|
||||
if local_gathered.indices[j] == all_index and not local_checked[j]:
|
||||
loc_index = j
|
||||
local_checked[j] = True
|
||||
break
|
||||
assert(loc_index >= 0)
|
||||
correct_output = local_gathered.values[loc_index][0]
|
||||
if average_allgather:
|
||||
correct_output = correct_output / float(num_ranks)
|
||||
assert(all_gathered.values[i][0] == correct_output)
|
||||
|
||||
|
||||
def test_mpi_allgather(self):
|
||||
# Get MPI rank
|
||||
my_rank = int(os.environ['PMI_RANK'])
|
||||
num_ranks = int(os.environ['PMI_SIZE'])
|
||||
|
||||
indices_per_rank = 100
|
||||
tensor_width = 10
|
||||
|
||||
# Create IndexedSlices for each rank, some with overlapping indices.
|
||||
to_gather_indices = []
|
||||
to_gather_values = []
|
||||
to_gather = []
|
||||
for rank_id in range(num_ranks):
|
||||
indices = []
|
||||
values = []
|
||||
my_multiple = rank_id + 1
|
||||
current_index = my_multiple
|
||||
for i in range(indices_per_rank):
|
||||
indices.append(current_index)
|
||||
ones_tensor = tf.ones([tensor_width])
|
||||
values.append(tf.multiply(ones_tensor,
|
||||
tf.fill(ones_tensor.get_shape(),
|
||||
float(current_index))))
|
||||
current_index += my_multiple
|
||||
concat_ind = tf.stack(indices)
|
||||
concat_vals = tf.stack(values)
|
||||
to_gather_indices.append(concat_ind)
|
||||
to_gather_values.append(concat_vals)
|
||||
to_gather.append(tf.IndexedSlices(concat_vals, concat_ind))
|
||||
|
||||
# Collect the local IndexedSlices (indices and values) to create
|
||||
# correct IndexedSlices output.
|
||||
correct_gather_indices = tf.concat(to_gather_indices, 0)
|
||||
correct_gather_values = tf.concat(to_gather_values, 0)
|
||||
correct_gather = tf.IndexedSlices(correct_gather_values,
|
||||
correct_gather_indices)
|
||||
|
||||
all_gather = mpi.allreduce(to_gather[my_rank], average_allgather)
|
||||
|
||||
# NOTE: This assumes that device IDs are numbered the same as ranks.
|
||||
gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
|
||||
config = tf.ConfigProto(gpu_options=gpu_options)
|
||||
|
||||
# MPI Session to test allgather.
|
||||
with mpi.Session(config=config) as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
|
||||
all_gathered, local_gathered = sess.run([all_gather, correct_gather])
|
||||
|
||||
# Compare all_gathered with local_gathered.
|
||||
self.checkAllgather(num_ranks, all_gathered, local_gathered)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test.main()
|
New file: tensorflow/contrib/mpi_collectives/mpi_allreduce_test.py (153 lines)
@ -0,0 +1,153 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ==============================================================================
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import tensorflow as tf
|
||||
import tensorflow.contrib.mpi_collectives as mpi
|
||||
from tensorflow.python.platform import test
|
||||
|
||||
|
||||
average_allreduce = False
|
||||
max_wrong_count = -1
|
||||
|
||||
|
||||
class AllreduceTest(test.TestCase):
|
||||
def dumpFailure(self, my_rank, out_loc_red, my_correct, out_all_red,
|
||||
our_correct):
|
||||
# Find reduced/allreduced indices that are wrong and print all the
|
||||
# values from output, slices, reduced, allreduced, so we can debug
|
||||
# which is incorrect:
|
||||
wrong_count = 0
|
||||
red_dims = out_loc_red.shape
|
||||
assert(len(red_dims) == 2)
|
||||
for i in range(red_dims[0]):
|
||||
for j in range(red_dims[1]):
|
||||
suffix = ""
|
||||
if out_loc_red[i][j] != my_correct[i][j] or \
|
||||
out_all_red[i][j] != our_correct[i][j]:
|
||||
suffix = "WRONG"
|
||||
wrong_count += 1
|
||||
print("{}\t{}\t{}\t{}\t{}\t{}"
|
||||
.format(my_rank, i, j, out_loc_red[i][j],
|
||||
out_all_red[i][j], suffix), flush=True)
|
||||
if max_wrong_count > 0 and wrong_count >= max_wrong_count:
|
||||
return
|
||||
|
||||
def test_mpi_allreduce(self):
|
||||
# Get MPI rank
|
||||
my_rank = int(os.environ['PMI_RANK'])
|
||||
num_ranks = int(os.environ['PMI_SIZE'])
|
||||
|
||||
stages = 13
|
||||
batch_size = 1331
|
||||
hidden_size = batch_size
|
||||
out_size = batch_size
|
||||
|
||||
# Input placeholder (batch_size x hidden) - init to 1s
|
||||
inputs = tf.placeholder(tf.float32, shape=(batch_size, hidden_size),
|
||||
name="Input")
|
||||
|
||||
# Large matrices (hidden x out_dim) - init random
|
||||
weights = []
|
||||
for i in range(stages):
|
||||
initer = tf.constant_initializer(pow(2.0, i + 1.0))
|
||||
weights.append(tf.get_variable("weights_{}".format(i),
|
||||
shape=(hidden_size, out_size),
|
||||
dtype=tf.float32,
|
||||
initializer=initer))
|
||||
|
||||
# Calculate output through dependent allreduces
|
||||
stage_input = inputs
|
||||
for i in range(stages):
|
||||
inter_output = tf.add(stage_input, weights[i],
|
||||
name="add_red_{}".format(i))
|
||||
stage_input = mpi.allreduce(inter_output,
|
||||
average=average_allreduce)
|
||||
|
||||
all_reduced = stage_input
|
||||
|
||||
# Local reduced output for verification
|
||||
local_input = inputs
|
||||
for i in range(stages):
|
||||
inter_output = tf.add(local_input, weights[i],
|
||||
name="addin_loc_{}".format(i))
|
||||
my_reducer = tf.Variable(initial_value=np.ones((hidden_size, out_size)),
|
||||
dtype=tf.float32, name="loc_redr_{}".format(i))
|
||||
for r in range(num_ranks):
|
||||
my_reducer = tf.add(my_reducer, inter_output,
|
||||
name="add_loc_{}_{}".format(i, r))
|
||||
if average_allreduce:
|
||||
local_input = tf.div(my_reducer, num_ranks,
|
||||
name="div_loc_{}".format(i))
|
||||
else:
|
||||
local_input = my_reducer
|
||||
|
||||
local_reduced = local_input
|
||||
|
||||
# NOTE: This assumes that device IDs are numbered the same as ranks
|
||||
gpu_options = tf.GPUOptions(visible_device_list=str(my_rank))
|
||||
config = tf.ConfigProto(gpu_options=gpu_options)
|
||||
|
||||
# MPI Session to test allreduce
|
||||
with mpi.Session(config=config) as sess:
|
||||
sess.run(tf.global_variables_initializer())
|
||||
|
||||
input_feed = np.ones((batch_size, hidden_size), dtype=np.float32)
|
||||
our_output = input_feed[0][0]
|
||||
spread_var = 100
|
||||
input_feed = input_feed + my_rank * spread_var
|
||||
my_output = input_feed[0][0]
|
||||
for i in range(stages):
|
||||
curr_feed = my_output + pow(2.0, i + 1.0)
|
||||
my_output = curr_feed * num_ranks + 1
|
||||
curr_our_feed = our_output + pow(2.0, i + 1.0)
|
||||
if i == 0:
|
||||
sum_ranks = num_ranks * (num_ranks - 1) / 2
|
||||
our_output = curr_our_feed * num_ranks + \
|
||||
spread_var * sum_ranks
|
||||
else:
|
||||
our_output = curr_our_feed * num_ranks
|
||||
|
||||
print("rank {}: My output is {}".format(my_rank, my_output))
|
||||
my_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
|
||||
my_correct = my_correct + my_output
|
||||
print("rank {}: Our output is {}".format(my_rank, our_output))
|
||||
our_correct = np.zeros((batch_size, hidden_size), dtype=np.float32)
|
||||
our_correct = our_correct + our_output
|
||||
|
||||
for i in range(1000):
|
||||
if i % 100 == 0:
|
||||
print("{}: iter {}".format(my_rank, i), flush=True)
|
||||
feed_dict = {inputs: input_feed}
|
||||
out_all_red, out_loc_red \
|
||||
= sess.run([all_reduced, local_reduced],
|
||||
feed_dict=feed_dict)
|
||||
|
||||
if not np.allclose(out_loc_red, my_correct) or \
|
||||
not np.allclose(out_all_red, our_correct):
|
||||
print("Test incorrect on iter {}".format(i), flush=True)
|
||||
self.dumpFailure(my_rank, out_loc_red, my_correct, out_all_red,
|
||||
our_correct)
|
||||
assert(np.allclose(out_loc_red, my_correct) and
|
||||
np.allclose(out_all_red, our_correct))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
test.main()
|
New file: tensorflow/contrib/mpi_collectives/mpi_message.proto (64 lines)
@ -0,0 +1,64 @@
|
||||
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
syntax = "proto3";
|
||||
|
||||
package tensorflow.contrib.mpi;
|
||||
|
||||
import "tensorflow/core/framework/tensor_shape.proto";
|
||||
import "tensorflow/core/framework/types.proto";
|
||||
|
||||
// An MPIRequest is a message sent from a rank greater than zero to the
|
||||
// coordinator (rank zero), informing the coordinator of an operation that
|
||||
// the rank wants to do and the tensor that it wants to apply the operation to.
|
||||
message MPIRequest {
|
||||
enum RequestType {
|
||||
ALLREDUCE = 0;
|
||||
ALLGATHER = 1;
|
||||
}
|
||||
|
||||
// The request rank is necessary to create a consistent ordering of results,
|
||||
// for example in the allgather where the order of outputs should be sorted
|
||||
// by rank.
|
||||
int32 request_rank = 1;
|
||||
RequestType request_type = 2;
|
||||
DataType tensor_type = 3;
|
||||
string tensor_name = 4;
|
||||
TensorShapeProto tensor_shape = 5;
|
||||
};
|
||||
|
||||
// An MPIResponse is a message sent from the coordinator (rank zero) to a rank
|
||||
// greater than zero, informing the rank that an operation should be performed
|
||||
// now. If the operation requested would result in an error (for example, due
|
||||
// to a type or shape mismatch), then the MPIResponse can contain an error and
|
||||
// an error message instead. Finally, an MPIResponse can be a DONE message (if
|
||||
// there are no more tensors to reduce on this tick of the background loop) or
|
||||
// SHUTDOWN if all MPI processes should shut down.
|
||||
message MPIResponse {
|
||||
enum ResponseType {
|
||||
ALLREDUCE = 0;
|
||||
ALLGATHER = 1;
|
||||
ERROR = 2;
|
||||
DONE = 3;
|
||||
SHUTDOWN = 4;
|
||||
}
|
||||
|
||||
// Empty if the type is DONE or SHUTDOWN.
|
||||
ResponseType response_type = 1;
|
||||
string tensor_name = 2;
|
||||
|
||||
// Empty unless response_type is ERROR.
|
||||
string error_message = 3;
|
||||
};
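To make the request/response flow described in the comments above more concrete, here is an illustrative Python sketch of one coordination round. This is an assumption about the message flow only, not the actual logic in mpi_ops.cc; plain dicts stand in for the MPIRequest / MPIResponse protos.

```python
def coordinator_tick(pending, num_ranks):
    """pending: dict mapping tensor_name -> list of MPIRequest-like dicts
    received so far (one per requesting rank). Returns the MPIResponse-like
    dicts the coordinator would send on this tick of the background loop."""
    responses = []
    for name, requests in list(pending.items()):
        if len(requests) < num_ranks:
            continue  # Some rank has not asked for this tensor yet.
        if len({r["tensor_type"] for r in requests}) > 1:
            responses.append({"response_type": "ERROR", "tensor_name": name,
                              "error_message": "mismatched tensor types"})
        else:
            # ALLREDUCE or ALLGATHER, echoing the requested operation back.
            responses.append({"response_type": requests[0]["request_type"],
                              "tensor_name": name})
        del pending[name]
    if not pending:
        # Nothing left to reduce on this tick.
        responses.append({"response_type": "DONE", "tensor_name": ""})
    return responses
```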
|
New file: tensorflow/contrib/mpi_collectives/mpi_ops.cc (1236 lines; diff too large to display)
New file: tensorflow/contrib/mpi_collectives/mpi_ops.py (165 lines)
@ -0,0 +1,165 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# =============================================================================
|
||||
"""Inter-process communication using MPI."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
from tensorflow.python.framework import errors
|
||||
from tensorflow.python.framework import load_library
|
||||
from tensorflow.python.framework import ops
|
||||
from tensorflow.python.platform import resource_loader
|
||||
from tensorflow.python.platform import tf_logging as logging
|
||||
|
||||
|
||||
def _load_library(name, op_list=None):
|
||||
"""Loads a .so file containing the specified operators.
|
||||
|
||||
Args:
|
||||
name: The name of the .so file to load.
|
||||
op_list: A list of names of operators that the library should have. If None
|
||||
then the .so file's contents will not be verified.
|
||||
|
||||
Raises:
|
||||
NameError if one of the required ops is missing.
|
||||
"""
|
||||
try:
|
||||
filename = resource_loader.get_path_to_datafile(name)
|
||||
library = load_library.load_op_library(filename)
|
||||
for expected_op in (op_list or []):
|
||||
for lib_op in library.OP_LIST.op:
|
||||
if lib_op.name == expected_op:
|
||||
break
|
||||
else:
|
||||
raise NameError(
|
||||
'Could not find operator %s in dynamic library %s' %
|
||||
(expected_op, name))
|
||||
return library
|
||||
except errors.NotFoundError:
|
||||
logging.warning('%s file could not be loaded.', name)
|
||||
|
||||
|
||||
MPI_LIB = _load_library('mpi_collectives.so', ['MPISize', 'MPIRank',
|
||||
'MPILocalRank', 'MPIAllgather',
|
||||
'MPIAllreduce'])
|
||||
|
||||
|
||||
def size(name=None):
|
||||
"""An op which returns the number of MPI processes.
|
||||
|
||||
This is equivalent to running `MPI_Comm_size(MPI_COMM_WORLD, ...)` to get the
|
||||
size of the global communicator.
|
||||
|
||||
Returns:
|
||||
An integer scalar containing the number of MPI processes.
|
||||
"""
|
||||
return MPI_LIB.mpi_size(name=name)
|
||||
|
||||
|
||||
ops.NotDifferentiable('MPISize')
|
||||
|
||||
|
||||
def rank(name=None):
|
||||
"""An op which returns the MPI rank of the calling process.
|
||||
|
||||
This is equivalent to running `MPI_Comm_rank(MPI_COMM_WORLD, ...)` to get the
|
||||
rank of the current process in the global communicator.
|
||||
|
||||
Returns:
|
||||
An integer scalar with the MPI rank of the calling process.
|
||||
"""
|
||||
return MPI_LIB.mpi_rank(name=name)
|
||||
|
||||
|
||||
ops.NotDifferentiable('MPIRank')
|
||||
|
||||
|
||||
def init(name=None):
|
||||
"""An op which initializes MPI on the device on which it is run.
|
||||
|
||||
All future MPI ops must be run on the same device that the `init` op was run
|
||||
on.
|
||||
"""
|
||||
return MPI_LIB.mpi_init(name=name)
|
||||
|
||||
|
||||
ops.NotDifferentiable('MPIInit')
|
||||
|
||||
|
||||
def local_rank(name=None):
|
||||
"""An op which returns the local MPI rank of the calling process, within the
|
||||
node that it is running on. For example, if there are seven processes running
|
||||
on a node, their local ranks will be zero through six, inclusive.
|
||||
|
||||
This is equivalent to running `MPI_Comm_rank(...)` on a new communicator
|
||||
which only includes processes on the same node.
|
||||
|
||||
Returns:
|
||||
An integer scalar with the local MPI rank of the calling process.
|
||||
"""
|
||||
return MPI_LIB.mpi_local_rank(name=name)
|
||||
|
||||
|
||||
ops.NotDifferentiable('MPILocalRank')
|
||||
|
||||
|
||||
def _allreduce(tensor, name=None):
|
||||
"""An op which sums an input tensor over all the MPI processes.
|
||||
|
||||
The reduction operation is keyed by the name of the op. The tensor type and
|
||||
shape must be the same on all MPI processes for a given name. The reduction
|
||||
will not start until all processes are ready to send and receive the tensor.
|
||||
|
||||
Returns:
|
||||
A tensor of the same shape and type as `tensor`, summed across all
|
||||
processes.
|
||||
"""
|
||||
return MPI_LIB.mpi_allreduce(tensor, name=name)
|
||||
|
||||
|
||||
ops.NotDifferentiable('MPIAllreduce')
|
||||
|
||||
|
||||
def allgather(tensor, name=None):
|
||||
"""An op which concatenates the input tensor with the same input tensor on
|
||||
all other MPI processes.
|
||||
|
||||
The concatenation is done on the first dimension, so the input tensors on the
|
||||
different processes must have the same rank and shape, except for the first
|
||||
dimension, which is allowed to be different.
|
||||
|
||||
Returns:
|
||||
A tensor of the same type as `tensor`, concatenated on dimension zero
|
||||
across all processes. The shape is identical to the input shape, except for
|
||||
the first dimension, which may be greater and is the sum of all first
|
||||
dimensions of the tensors in different MPI processes.
|
||||
"""
|
||||
# Specify that first allgather is to collect the tensor gather sizes,
|
||||
# indicated by passing in a scalar (0-D tensor) of value 0
|
||||
sizes_flag = tf.constant(0, dtype=tf.int64, name="size_flag_const")
|
||||
my_size = tf.slice(tf.shape(tensor, out_type=tf.int64), [0], [1], name="size_slice")
|
||||
if name is None:
|
||||
name = "allgather"
|
||||
sizing_name = "{}_sizing".format(name)
|
||||
sizes = MPI_LIB.mpi_allgather(my_size, sizes_flag, name=sizing_name)
|
||||
return MPI_LIB.mpi_allgather(tensor, sizes, name=name)
|
||||
|
||||
|
||||
ops.NotDifferentiable('MPIAllgather')
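A small, hedged usage sketch of `allgather` (assuming the package import path `tensorflow.contrib.mpi_collectives` and an `mpirun` launch that sets `PMI_RANK`, as the new tests in this change assume): rank r contributes r + 1 rows, and every rank receives the concatenation along dimension zero.

```python
import os

import tensorflow as tf
import tensorflow.contrib.mpi_collectives as mpi

my_rank = int(os.environ.get('PMI_RANK', 0))

# Each rank contributes a different number of rows; only the first
# dimension may differ across ranks.
local_part = tf.fill([my_rank + 1, 3], float(my_rank))
gathered = mpi.allgather(local_part)

with mpi.Session() as sess:
    # Every rank sees the same result: sum(r + 1 over all ranks) rows of 3.
    print(sess.run(gathered))
```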
|
||||
|
||||
|
New file: tensorflow/contrib/mpi_collectives/mpi_ops_test.py (296 lines)
@ -0,0 +1,296 @@
|
||||
# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# =============================================================================
|
||||
|
||||
"""Tests for tensorflow.contrib.mpi_collectives.mpi_ops."""
|
||||
|
||||
from __future__ import absolute_import
|
||||
from __future__ import division
|
||||
from __future__ import print_function
|
||||
|
||||
import os.path
|
||||
import itertools
|
||||
|
||||
import tensorflow as tf
|
||||
|
||||
import tensorflow.contrib.mpi_collectives as mpi
|
||||
|
||||
|
||||
def mpi_env_rank_and_size():
|
||||
"""Get MPI rank and size from environment variables and return them as a
|
||||
tuple of integers.
|
||||
|
||||
Most MPI implementations have an `mpirun` or `mpiexec` command that will
|
||||
run an MPI executable and set up all communication necessary between the
|
||||
different processors. As part of that set up, they will set environment
|
||||
variables that contain the rank and size of the MPI_COMM_WORLD
|
||||
communicator. We can read those environment variables from Python in order
|
||||
to ensure that `mpi.rank()` and `mpi.size()` return the expected values.
|
||||
|
||||
Since MPI is just a standard, not an implementation, implementations
|
||||
typically choose their own environment variable names. This function tries
|
||||
to support several different implementations, but really it only needs to
|
||||
support whatever implementation we want to use for the TensorFlow test
|
||||
suite.
|
||||
|
||||
If this is not running under MPI, then defaults of rank zero and size one
|
||||
are returned. (This is appropriate because when you call MPI_Init in an
|
||||
application not started with mpirun, it will create a new independent
|
||||
communicator with only one process in it.)
|
||||
"""
|
||||
rank_env = "PMI_RANK OMPI_COMM_WORLD_RANK".split()
|
||||
size_env = "PMI_SIZE OMPI_COMM_WORLD_SIZE".split()
|
||||
|
||||
for rank_var, size_var in zip(rank_env, size_env):
|
||||
rank = os.environ.get(rank_var)
|
||||
size = os.environ.get(size_var)
|
||||
if rank is not None and size is not None:
|
||||
return int(rank), int(size)
|
||||
|
||||
# Default to rank zero and size one if there are no environment variables
|
||||
return 0, 1
|
||||
|
||||
|
||||
class MPITests(tf.test.TestCase):
|
||||
"""
|
||||
Tests for MPI ops in tensorflow.contrib.mpi_collectives.
|
||||
"""
|
||||
|
||||
def test_mpi_rank(self):
|
||||
"""Test that the rank returned by mpi.rank() is correct."""
|
||||
true_rank, _ = mpi_env_rank_and_size()
|
||||
with self.test_session() as session:
|
||||
rank = session.run(mpi.rank())
|
||||
self.assertEqual(true_rank, rank)
|
||||
|
||||
def test_mpi_size(self):
|
||||
"""Test that the size returned by mpi.size() is correct."""
|
||||
_, true_size = mpi_env_rank_and_size()
|
||||
with self.test_session() as session:
|
||||
size = session.run(mpi.size())
|
||||
self.assertEqual(true_size, size)
|
||||
|
||||
def test_mpi_allreduce_cpu(self):
|
||||
"""Test on CPU that the allreduce correctly sums 1D, 2D, 3D tensors."""
|
||||
with self.test_session() as session:
|
||||
size = session.run(mpi.size())
|
||||
|
||||
dtypes = [tf.int32, tf.float32]
|
||||
dims = [1, 2, 3]
|
||||
for dtype, dim in itertools.product(dtypes, dims):
|
||||
tf.set_random_seed(1234)
|
||||
tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype)
|
||||
summed = mpi.allreduce(tensor, average=False)
|
||||
multiplied = tensor * size
|
||||
max_difference = tf.reduce_max(tf.abs(summed - multiplied))
|
||||
|
||||
# Threshold for floating point equality depends on number of
|
||||
# ranks, since we're comparing against precise multiplication.
|
||||
if size <= 3:
|
||||
threshold = 0
|
||||
elif size < 10:
|
||||
threshold = 1e-4
|
||||
elif size < 15:
|
||||
threshold = 5e-4
|
||||
else:
|
||||
break
|
||||
|
||||
diff = session.run(max_difference)
|
||||
self.assertTrue(diff <= threshold,
|
||||
"mpi.allreduce produces incorrect results")
|
||||
|
||||
def test_mpi_allreduce_gpu(self):
|
||||
"""Test that the allreduce works on GPUs.
|
||||
|
||||
This test will crash badly if used with an MPI implementation that does
|
||||
not support GPU memory transfers directly, as it will call MPI_Send on
|
||||
a GPU data pointer."""
|
||||
# Only do this test if there are GPUs available.
|
||||
if not tf.test.is_gpu_available(cuda_only=True):
|
||||
return
|
||||
|
||||
no_gpus = tf.GPUOptions(visible_device_list="")
|
||||
cpu_config = tf.ConfigProto(gpu_options=no_gpus)
|
||||
with self.test_session(config=cpu_config) as session:
|
||||
local_rank = session.run(mpi.local_rank())
|
||||
|
||||
one_gpu = tf.GPUOptions(visible_device_list=str(local_rank))
|
||||
gpu_config = tf.ConfigProto(gpu_options=one_gpu)
|
||||
with self.test_session(config=gpu_config) as session:
|
||||
size = session.run(mpi.size())
|
||||
|
||||
dtype = tf.float32
|
||||
dim = 3
|
||||
with tf.device("/gpu:0"):
|
||||
tf.set_random_seed(1234)
|
||||
tensor = tf.random_uniform([17] * dim, -100, 100, dtype=dtype)
|
||||
summed = mpi.allreduce(tensor, average=False)
|
||||
multiplied = tensor * size
|
||||
max_difference = tf.reduce_max(tf.abs(summed - multiplied))
|
||||
|
||||
# Threshold for floating point equality depends on number of
|
||||
# ranks, since we're comparing against precise multiplication.
|
||||
if size <= 3:
|
||||
threshold = 0
|
||||
elif size < 10:
|
||||
threshold = 1e-4
|
||||
elif size < 15:
|
||||
threshold = 5e-4
|
||||
else:
|
||||
return
|
||||
|
||||
diff = session.run(max_difference)
|
||||
self.assertTrue(diff <= threshold,
|
||||
"mpi.allreduce on GPU produces incorrect results")
|
||||
|
||||
def test_mpi_allreduce_error(self):
|
||||
"""Test that the allreduce raises an error if different ranks try to
|
||||
send tensors of different rank or dimension."""
|
||||
with self.test_session() as session:
|
||||
rank = session.run(mpi.rank())
|
||||
size = session.run(mpi.size())
|
||||
|
||||
# This test does not apply if there is only one worker.
|
||||
if size == 1:
|
||||
return
|
||||
|
||||
# Same rank, different dimension
|
||||
tf.set_random_seed(1234)
|
||||
dims = [17 + rank] * 3
|
||||
tensor = tf.random_uniform(dims, -1.0, 1.0)
|
||||
with self.assertRaises(tf.errors.FailedPreconditionError):
|
||||
session.run(mpi.allreduce(tensor))
|
||||
|
||||
# Same number of elements, different rank
|
||||
tf.set_random_seed(1234)
|
||||
if rank == 0:
|
||||
dims = [17, 23 * 57]
|
||||
else:
|
||||
dims = [17, 23, 57]
|
||||
tensor = tf.random_uniform(dims, -1.0, 1.0)
|
||||
with self.assertRaises(tf.errors.FailedPreconditionError):
|
||||
session.run(mpi.allreduce(tensor))
|
||||
|
||||
def test_mpi_allreduce_type_error(self):
|
||||
"""Test that the allreduce raises an error if different ranks try to
|
||||
send tensors of different type."""
|
||||
with self.test_session() as session:
|
||||
rank = session.run(mpi.rank())
|
||||
size = session.run(mpi.size())
|
||||
|
||||
# This test does not apply if there is only one worker.
|
||||
if size == 1:
|
||||
return
|
||||
|
||||
# Same dimensions, different data type
|
||||
dims = [17] * 3
|
||||
tensor = tf.ones(dims, dtype=tf.int32 if rank % 2 == 0 else tf.float32)
|
||||
with self.assertRaises(tf.errors.FailedPreconditionError):
|
||||
session.run(mpi.allreduce(tensor))
|
||||
|
||||
def test_mpi_allgather(self):
|
||||
"""Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
|
||||
with self.test_session() as session:
|
||||
size = session.run(mpi.size())
|
||||
rank = session.run(mpi.rank())
|
||||
|
||||
dtypes = tf.int32, tf.float32
|
||||
dims = 1, 2, 3
|
||||
for dtype, dim in itertools.product(dtypes, dims):
|
||||
tensor = tf.ones([17] * dim, dtype=dtype) * rank
|
||||
gathered = mpi.allgather(tensor)
|
||||
|
||||
gathered_tensor = session.run(gathered)
|
||||
self.assertEqual(list(gathered_tensor.shape),
|
||||
[17 * size] + [17] * (dim - 1))
|
||||
|
||||
for i in range(size):
|
||||
rank_tensor = tf.slice(gathered_tensor, [i * 17] + [0] * (dim - 1),
|
||||
[17] + [-1] * (dim - 1))
|
||||
self.assertEqual(list(rank_tensor.shape), [17] * dim)
|
||||
self.assertTrue(session.run(tf.reduce_all(tf.equal(rank_tensor, i))),
|
||||
"mpi.allgather produces incorrect gathered tensor")
|
||||
|
||||
def test_mpi_allgather_variable_size(self):
|
||||
"""Test that the allgather correctly gathers 1D, 2D, 3D tensors,
|
||||
even if those tensors have different sizes along the first dim."""
|
||||
with self.test_session() as session:
|
||||
size = session.run(mpi.size())
|
||||
rank = session.run(mpi.rank())
|
||||
|
||||
dtypes = tf.int32, tf.float32
|
||||
dims = 1, 2, 3
|
||||
for dtype, dim in itertools.product(dtypes, dims):
|
||||
# Support tests up to MPI Size of 35
|
||||
if size > 35:
|
||||
break
|
||||
|
||||
tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
|
||||
tensor_sizes = tensor_sizes[:size]
|
||||
|
||||
tensor = tf.ones([tensor_sizes[rank]] + [17] * (dim - 1),
|
||||
dtype=dtype) * rank
|
||||
gathered = mpi.allgather(tensor)
|
||||
|
||||
gathered_tensor = session.run(gathered)
|
||||
expected_size = sum(tensor_sizes)
|
||||
self.assertEqual(list(gathered_tensor.shape),
|
||||
[expected_size] + [17] * (dim - 1))
|
||||
|
||||
for i in range(size):
|
||||
rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
|
||||
rank_tensor = tf.slice(gathered,
|
||||
[sum(tensor_sizes[:i])] + [0] * (dim - 1),
|
||||
rank_size)
|
||||
self.assertEqual(list(rank_tensor.shape), rank_size)
|
||||
self.assertTrue(session.run(tf.reduce_all(tf.equal(rank_tensor, i))),
|
||||
"mpi.allgather produces incorrect gathered tensor")
|
||||
|
||||
def test_mpi_allgather_error(self):
|
||||
"""Test that the allgather returns an error if any dimension besides
|
||||
the first is different among the tensors being gathered."""
|
||||
with self.test_session() as session:
|
||||
rank = session.run(mpi.rank())
|
||||
size = session.run(mpi.size())
|
||||
|
||||
# This test does not apply if there is only one worker.
|
||||
if size == 1:
|
||||
return
|
||||
|
||||
tensor_size = [17] * 3
|
||||
tensor_size[1] = 10 * (rank + 1)
|
||||
tensor = tf.ones(tensor_size, dtype=tf.float32) * rank
|
||||
with self.assertRaises(tf.errors.FailedPreconditionError):
|
||||
session.run(mpi.allgather(tensor))
|
||||
|
||||
def test_mpi_allgather_type_error(self):
|
||||
"""Test that the allgather returns an error if the types being gathered
|
||||
differ among the processes"""
|
||||
with self.test_session() as session:
|
||||
rank = session.run(mpi.rank())
|
||||
size = session.run(mpi.size())
|
||||
|
||||
# This test does not apply if there is only one worker.
|
||||
if size == 1:
|
||||
return
|
||||
|
||||
tensor_size = [17] * 3
|
||||
dtype = tf.int32 if rank % 2 == 0 else tf.float32
|
||||
tensor = tf.ones(tensor_size, dtype=dtype) * rank
|
||||
with self.assertRaises(tf.errors.FailedPreconditionError):
|
||||
session.run(mpi.allgather(tensor))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
tf.test.main()
|
tensorflow/contrib/mpi_collectives/ring.cc (new file, 80 lines)
@@ -0,0 +1,80 @@
/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifdef TENSORFLOW_USE_MPI
|
||||
|
||||
#define EIGEN_USE_THREADS
|
||||
|
||||
#include "tensorflow/contrib/mpi_collectives/ring.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace contrib {
|
||||
namespace mpi {
|
||||
|
||||
using CPUDevice = Eigen::ThreadPoolDevice;
|
||||
|
||||
extern template MPI_Datatype MPIType<float>();
|
||||
extern template MPI_Datatype MPIType<int>();
|
||||
extern template MPI_Datatype MPIType<long long>();
|
||||
extern template DataType TensorFlowDataType<float>();
|
||||
extern template DataType TensorFlowDataType<int>();
|
||||
extern template DataType TensorFlowDataType<long long>();
|
||||
|
||||
// Generate all necessary specializations for RingAllreduce.
|
||||
template Status RingAllreduce<CPUDevice, int>(OpKernelContext*, const Tensor*,
|
||||
Tensor*, Tensor*);
|
||||
template Status RingAllreduce<CPUDevice, long long>(OpKernelContext*,
|
||||
const Tensor*, Tensor*,
|
||||
Tensor*);
|
||||
template Status RingAllreduce<CPUDevice, float>(OpKernelContext*, const Tensor*,
|
||||
Tensor*, Tensor*);
|
||||
|
||||
// Generate all necessary specializations for RingAllgather.
|
||||
template Status RingAllgather<CPUDevice, int>(OpKernelContext*, const Tensor*,
|
||||
const std::vector<size_t>&,
|
||||
Tensor*);
|
||||
template Status RingAllgather<CPUDevice, long long>(OpKernelContext*,
|
||||
const Tensor*,
|
||||
const std::vector<size_t>&,
|
||||
Tensor*);
|
||||
template Status RingAllgather<CPUDevice, float>(OpKernelContext*, const Tensor*,
|
||||
const std::vector<size_t>&,
|
||||
Tensor*);
|
||||
|
||||
// Copy data on a CPU using a straight-forward memcpy.
|
||||
template <>
|
||||
void CopyTensorData<CPUDevice>(void* dst, void* src, size_t size) {
|
||||
std::memcpy(dst, src, size);
|
||||
};
|
||||
|
||||
// Accumulate values on a CPU.
|
||||
#define GENERATE_ACCUMULATE(type) \
|
||||
template <> \
|
||||
void AccumulateTensorData<CPUDevice, type>(type * dst, type * src, \
|
||||
size_t size) { \
|
||||
for (unsigned int i = 0; i < size; i++) { \
|
||||
dst[i] += src[i]; \
|
||||
} \
|
||||
};
|
||||
GENERATE_ACCUMULATE(int);
|
||||
GENERATE_ACCUMULATE(long long);
|
||||
GENERATE_ACCUMULATE(float);
|
||||
#undef GENERATE_ACCUMULATE
|
||||
|
||||
} // namespace mpi
|
||||
} // namespace contrib
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_USE_MPI
|
tensorflow/contrib/mpi_collectives/ring.cu.cc (new file, 117 lines)
@@ -0,0 +1,117 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifdef TENSORFLOW_USE_MPI
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
|
||||
#define EIGEN_USE_GPU
|
||||
|
||||
#include "tensorflow/contrib/mpi_collectives/ring.h"
|
||||
|
||||
namespace tensorflow {
|
||||
namespace contrib {
|
||||
namespace mpi {
|
||||
|
||||
using CPUDevice = Eigen::ThreadPoolDevice;
|
||||
|
||||
template <>
|
||||
MPI_Datatype MPIType<float>() {
|
||||
return MPI_FLOAT;
|
||||
};
|
||||
template <>
|
||||
MPI_Datatype MPIType<int>() {
|
||||
return MPI_INT;
|
||||
};
|
||||
template <>
|
||||
MPI_Datatype MPIType<long long>() {
|
||||
return MPI_LONG_LONG;
|
||||
};
|
||||
|
||||
template <>
|
||||
DataType TensorFlowDataType<float>() {
|
||||
return DT_FLOAT;
|
||||
};
|
||||
template <>
|
||||
DataType TensorFlowDataType<int>() {
|
||||
return DT_INT32;
|
||||
};
|
||||
template <>
|
||||
DataType TensorFlowDataType<long long>() {
|
||||
return DT_INT64;
|
||||
};
|
||||
|
||||
// Generate all necessary specializations for RingAllreduce.
|
||||
template Status RingAllreduce<GPUDevice, int>(OpKernelContext*, const Tensor*,
|
||||
Tensor*, Tensor*);
|
||||
template Status RingAllreduce<GPUDevice, long long>(OpKernelContext*,
|
||||
const Tensor*, Tensor*,
|
||||
Tensor*);
|
||||
template Status RingAllreduce<GPUDevice, float>(OpKernelContext*, const Tensor*,
|
||||
Tensor*, Tensor*);
|
||||
|
||||
// Generate all necessary specializations for RingAllgather.
|
||||
template Status RingAllgather<GPUDevice, int>(OpKernelContext*, const Tensor*,
|
||||
const std::vector<size_t>&,
|
||||
Tensor*);
|
||||
template Status RingAllgather<GPUDevice, long long>(OpKernelContext*,
|
||||
const Tensor*,
|
||||
const std::vector<size_t>&,
|
||||
Tensor*);
|
||||
template Status RingAllgather<GPUDevice, float>(OpKernelContext*, const Tensor*,
|
||||
const std::vector<size_t>&,
|
||||
Tensor*);
|
||||
|
||||
// Synchronously copy data on the GPU, using a different stream than the default
|
||||
// and than TensorFlow to avoid synchronizing on operations unrelated to the
|
||||
// allreduce.
|
||||
template <>
|
||||
void CopyTensorData<GPUDevice>(void* dst, void* src, size_t size) {
|
||||
auto stream = CudaStreamForMPI();
|
||||
cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, stream);
|
||||
cudaStreamSynchronize(stream);
|
||||
};
|
||||
|
||||
// Elementwise accumulation kernel for GPU.
|
||||
template <typename T>
|
||||
__global__ void elemwise_accum(T* out, const T* in, const size_t N) {
|
||||
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
|
||||
i += blockDim.x * gridDim.x) {
|
||||
out[i] += in[i];
|
||||
}
|
||||
}
|
||||
|
||||
// Synchronously accumulate tensors on the GPU, using a different stream than
|
||||
// the default and than TensorFlow to avoid synchronizing on operations
|
||||
// unrelated to the allreduce.
|
||||
#define GENERATE_ACCUMULATE(type) \
|
||||
template <> \
|
||||
void AccumulateTensorData<GPUDevice, type>(type * dst, type * src, \
|
||||
size_t size) { \
|
||||
auto stream = CudaStreamForMPI(); \
|
||||
elemwise_accum<type><<<32, 256, 0, stream>>>(dst, src, size); \
|
||||
cudaStreamSynchronize(stream); \
|
||||
};
|
||||
GENERATE_ACCUMULATE(int);
|
||||
GENERATE_ACCUMULATE(long long);
|
||||
GENERATE_ACCUMULATE(float);
|
||||
#undef GENERATE_ACCUMULATE
|
||||
|
||||
} // namespace mpi
|
||||
} // namespace contrib
|
||||
} // namespace tensorflow
|
||||
#endif // GOOGLE_CUDA
|
||||
|
||||
#endif // TENSORFLOW_USE_MPI
|
tensorflow/contrib/mpi_collectives/ring.h (new file, 327 lines)
@@ -0,0 +1,327 @@
/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
|
||||
#ifndef TENSORFLOW_CONTRIB_MPI_H_
|
||||
#define TENSORFLOW_CONTRIB_MPI_H_
|
||||
|
||||
#ifdef TENSORFLOW_USE_MPI
|
||||
|
||||
#include "tensorflow/core/framework/op.h"
|
||||
#include "tensorflow/core/framework/op_kernel.h"
|
||||
#include "tensorflow/core/framework/shape_inference.h"
|
||||
|
||||
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
|
||||
#include "tensorflow/core/framework/tensor_types.h"
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#include "cuda_runtime.h"
|
||||
#endif
|
||||
|
||||
// Needed to avoid header issues with C++-supporting MPI implementations
|
||||
#define OMPI_SKIP_MPICXX
|
||||
#include "third_party/mpi/mpi.h"
|
||||
|
||||
#define TAG_TENSOR 12
|
||||
|
||||
namespace tensorflow {
|
||||
namespace contrib {
|
||||
namespace mpi {
|
||||
|
||||
using CPUDevice = Eigen::ThreadPoolDevice;
|
||||
using GPUDevice = Eigen::GpuDevice;
|
||||
|
||||
// Convert from templated types to values we can pass to MPI.
|
||||
template <typename T>
|
||||
MPI_Datatype MPIType();
|
||||
|
||||
// Convert from templated types to TensorFlow data types.
|
||||
template <typename T>
|
||||
DataType TensorFlowDataType();
|
||||
|
||||
#define MPI_REQUIRES_OK(MPI_STATUS) \
|
||||
if ((MPI_STATUS) != MPI_SUCCESS) { \
|
||||
return errors::Unknown("MPI operation failed unexpectedly."); \
|
||||
}
|
||||
|
||||
// Copy data from one tensor to another tensor.
|
||||
// This uses a custom CUDA stream on GPU, which is necessary to overlay the
|
||||
// backpropagation computations with the allreduce.
|
||||
template <typename Device>
|
||||
void CopyTensorData(void* destination, void* source, size_t size);
|
||||
|
||||
// Add a tensor into another tensor, accumulating in place.
|
||||
// This uses a custom CUDA stream on GPU, which is necessary to overlay the
|
||||
// backpropagation computations with the allreduce.
|
||||
template <typename Device, typename T>
|
||||
void AccumulateTensorData(T* destination, T* source, size_t size);
|
||||
|
||||
// We need to get the right stream for doing CUDA memory transfers and
|
||||
// operations, which is possibly different from the standard TensorFlow stream.
|
||||
#if GOOGLE_CUDA
|
||||
cudaStream_t CudaStreamForMPI();
|
||||
#endif
|
||||
|
||||
/* Perform a ring allreduce on the data. Allocate the necessary output tensor
|
||||
* and store it in the output parameter.
|
||||
*
|
||||
* Assumes that all MPI processes are doing an allreduce of the same tensor,
|
||||
* with the same dimensions.
|
||||
*
|
||||
* A ring allreduce is a bandwidth-optimal way to do an allreduce. To do the
|
||||
* allreduce, the nodes involved are arranged in a ring:
|
||||
*
|
||||
*    .--0--.
*   /       \
*  3         1
*   \       /
*    *--2--*
|
||||
*
|
||||
* Each node always sends to the next clockwise node in the ring, and receives
|
||||
* from the previous one.
|
||||
*
|
||||
* The allreduce is done in two parts: a scatter-reduce and an allgather. In
|
||||
* the scatter reduce, a reduction is done, so that each node ends up with a
|
||||
* chunk of the final output tensor which has contributions from all other
|
||||
* nodes. In the allgather, those chunks are distributed among all the nodes,
|
||||
* so that all nodes have the entire output tensor.
|
||||
*
|
||||
* Both of these operations are done by dividing the input tensor into N
|
||||
* evenly sized chunks (where N is the number of nodes in the ring).
|
||||
*
|
||||
* The scatter-reduce is done in N-1 steps. In the ith step, node j will send
|
||||
* the (j - i)th chunk and receive the (j - i - 1)th chunk, adding it into
|
||||
* its existing data for that chunk. For example, in the first iteration with
|
||||
* the ring depicted above, you will have the following transfers:
|
||||
*
|
||||
* Segment 0: Node 0 --> Node 1
|
||||
* Segment 1: Node 1 --> Node 2
|
||||
* Segment 2: Node 2 --> Node 3
|
||||
* Segment 3: Node 3 --> Node 0
|
||||
*
|
||||
* In the second iteration, you'll have the following transfers:
|
||||
*
|
||||
* Segment 0: Node 1 --> Node 2
|
||||
* Segment 1: Node 2 --> Node 3
|
||||
* Segment 2: Node 3 --> Node 0
|
||||
* Segment 3: Node 0 --> Node 1
|
||||
*
|
||||
* After this iteration, Node 2 has 3 of the four contributions to Segment 0.
|
||||
* The last iteration has the following transfers:
|
||||
*
|
||||
* Segment 0: Node 2 --> Node 3
|
||||
* Segment 1: Node 3 --> Node 0
|
||||
* Segment 2: Node 0 --> Node 1
|
||||
* Segment 3: Node 1 --> Node 2
|
||||
*
|
||||
* After this iteration, Node 3 has the fully accumulated Segment 0; Node 0
|
||||
* has the fully accumulated Segment 1; and so on. The scatter-reduce is
|
||||
* complete.
|
||||
*
|
||||
* Next, the allgather distributes these fully accumulated chunks across all
|
||||
* nodes. Communication proceeds in the same ring, once again in N-1 steps. At
|
||||
* the ith step, node j will send chunk (j - i + 1) and receive chunk (j - i).
|
||||
* For example, at the first iteration, the following transfers will occur:
|
||||
*
|
||||
* Segment 0: Node 3 --> Node 0
|
||||
* Segment 1: Node 0 --> Node 1
|
||||
* Segment 2: Node 1 --> Node 2
|
||||
* Segment 3: Node 2 --> Node 3
|
||||
*
|
||||
* After the first iteration, Node 0 will have a fully accumulated Segment 0
|
||||
* (from Node 3) and Segment 1. In the next iteration, Node 0 will send its
|
||||
* just-received Segment 0 onward to Node 1, and receive Segment 3 from Node 3.
|
||||
* After this has continued for N - 1 iterations, all nodes will have the
|
||||
* fully accumulated tensor.
|
||||
*
|
||||
* Each node will do (N-1) sends for the scatter-reduce and (N-1) sends for the
|
||||
* allgather. Each send will contain K / N bytes, if there are K bytes in the
|
||||
* original tensor on every node. Thus, each node sends and receives 2K(N - 1)/N
|
||||
* bytes of data, and the performance of the allreduce (assuming no latency in
|
||||
* connections) is constrained by the slowest interconnect between the nodes.
|
||||
*
|
||||
*/
|
||||
template <typename Device, typename T>
|
||||
Status RingAllreduce(OpKernelContext* context, const Tensor* input,
|
||||
Tensor* temp, Tensor* output) {
|
||||
// Acquire MPI size and rank
|
||||
int n, r;
|
||||
MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n));
|
||||
MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r));
|
||||
|
||||
T* buffer = (T*)output->tensor_data().data();
|
||||
|
||||
CopyTensorData<Device>((void*)buffer, (void*)input->tensor_data().data(),
|
||||
output->tensor_data().size());
|
||||
|
||||
// Calculate segment sizes and segment ends
|
||||
const size_t elements_to_reduce = input->NumElements();
|
||||
const size_t segment_size = elements_to_reduce / n;
|
||||
std::vector<size_t> segment_sizes(n, segment_size);
|
||||
|
||||
const size_t residual = elements_to_reduce % n;
|
||||
for (size_t i = 0; i < residual; ++i) {
|
||||
segment_sizes[i]++;
|
||||
}
|
||||
|
||||
std::vector<size_t> segment_starts(n);
|
||||
segment_starts[0] = 0;
|
||||
for (size_t i = 1; i < segment_starts.size(); ++i) {
|
||||
segment_starts[i] = segment_starts[i - 1] + segment_sizes[i - 1];
|
||||
}
|
||||
|
||||
assert(segment_starts[n - 1] + segment_sizes[n - 1] == elements_to_reduce);
|
||||
|
||||
T* segment_recv = (T*)temp->tensor_data().data();
|
||||
|
||||
// Receive from your left neighbor with wrap-around
|
||||
const size_t recv_from = ((r - 1) + n) % n;
|
||||
|
||||
// Send to your right neighbor with wrap-around
|
||||
const size_t send_to = (r + 1) % n;
|
||||
|
||||
MPI_Status recv_status;
|
||||
MPI_Request recv_req;
|
||||
|
||||
// Now start ring. At every step, for every rank, we iterate through
|
||||
// segments with wraparound and send and recv from our neighbors and reduce
|
||||
// locally. At the i'th iteration, rank r, sends segment (r-i) and receives
|
||||
// segment (r-i-1).
|
||||
for (int i = 0; i < n - 1; i++) {
|
||||
const size_t send_seg_id = ((r - i) + n) % n;
|
||||
const size_t recv_seg_id = ((r - i - 1) + n) % n;
|
||||
|
||||
T* segment_send = &(buffer[segment_starts[send_seg_id]]);
|
||||
|
||||
MPI_REQUIRES_OK(MPI_Irecv(segment_recv, segment_sizes[recv_seg_id],
|
||||
MPIType<T>(), recv_from, TAG_TENSOR,
|
||||
MPI_COMM_WORLD, &recv_req));
|
||||
|
||||
MPI_REQUIRES_OK(MPI_Send(segment_send, segment_sizes[send_seg_id],
|
||||
MPIType<T>(), send_to, TAG_TENSOR,
|
||||
MPI_COMM_WORLD));
|
||||
|
||||
T* segment_update = &(buffer[segment_starts[recv_seg_id]]);
|
||||
|
||||
// Wait for recv to complete before reduction
|
||||
MPI_REQUIRES_OK(MPI_Wait(&recv_req, &recv_status));
|
||||
|
||||
const size_t recv_seg_size = segment_sizes[recv_seg_id];
|
||||
AccumulateTensorData<Device, T>(segment_update, segment_recv,
|
||||
recv_seg_size);
|
||||
}
|
||||
|
||||
// Now start pipelined ring allgather. At every step, for every rank, we
|
||||
// iterate through segments with wraparound and send and recv from our
|
||||
// neighbors. At the i'th iteration, rank r, sends segment (r-i+1) and
|
||||
// receives segment (r-i).
|
||||
for (size_t i = 0; i < n - 1; ++i) {
|
||||
const size_t send_seg_id = ((r - i + 1) + n) % n;
|
||||
const size_t recv_seg_id = ((r - i) + n) % n;
|
||||
|
||||
// Segment to send - at every iteration we send segment (r-i+1)
|
||||
T* segment_send = &(buffer[segment_starts[send_seg_id]]);
|
||||
|
||||
// Segment to recv - at every iteration we receive segment (r-i)
|
||||
T* segment_recv = &(buffer[segment_starts[recv_seg_id]]);
|
||||
|
||||
MPI_REQUIRES_OK(MPI_Sendrecv(
|
||||
segment_send, segment_sizes[send_seg_id], MPIType<T>(), send_to,
|
||||
TAG_TENSOR, segment_recv, segment_sizes[recv_seg_id], MPIType<T>(),
|
||||
recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
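
The scatter-reduce and allgather schedule documented in the comment above can be checked with a small standalone NumPy simulation. This is an illustration only: the helper name and input data are made up, plain arrays stand in for tensors, and chunk indices follow the (r - i) convention from the comment.

import numpy as np

def ring_allreduce_sim(inputs):
    """Simulate the scatter-reduce + allgather ring schedule on a list of
    equal-length 1-D arrays, one per simulated node."""
    n = len(inputs)
    # Split every node's buffer into n contiguous chunks.
    buffers = [np.array_split(x.astype(np.float64), n) for x in inputs]

    # Scatter-reduce: at step i, node r sends chunk (r - i) to its right
    # neighbour and accumulates chunk (r - i - 1) received from the left.
    for i in range(n - 1):
        sends = [buffers[r][(r - i) % n].copy() for r in range(n)]
        for r in range(n):
            left = (r - 1) % n
            buffers[r][(r - i - 1) % n] += sends[left]

    # Allgather: at step i, node r sends chunk (r - i + 1) and overwrites
    # chunk (r - i) with the fully reduced copy received from the left.
    for i in range(n - 1):
        sends = [buffers[r][(r - i + 1) % n].copy() for r in range(n)]
        for r in range(n):
            left = (r - 1) % n
            buffers[r][(r - i) % n] = sends[left]

    return [np.concatenate(chunks) for chunks in buffers]

# Every node should end up with the element-wise sum of all inputs.
data = [np.arange(8) * (r + 1) for r in range(4)]
out = ring_allreduce_sim(data)
assert all(np.array_equal(o, sum(data)) for o in out)
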
|
||||
|
||||
// Perform a ring allgather on a Tensor. Other ranks may allgather with a
|
||||
// tensor which differs in the first dimension only; all other dimensions must
|
||||
// be the same.
|
||||
//
|
||||
// For more information on the ring allgather, read the documentation for the
|
||||
// ring allreduce, which includes a ring allgather.
|
||||
template <typename Device, typename T>
|
||||
Status RingAllgather(OpKernelContext* context, const Tensor* input,
|
||||
const std::vector<size_t>& sizes, Tensor* output) {
|
||||
// Acquire MPI size and rank
|
||||
int n, r;
|
||||
MPI_REQUIRES_OK(MPI_Comm_size(MPI_COMM_WORLD, &n));
|
||||
MPI_REQUIRES_OK(MPI_Comm_rank(MPI_COMM_WORLD, &r));
|
||||
|
||||
assert(sizes.size() == n);
|
||||
assert(input->dim_size(0) == sizes[r]);
|
||||
|
||||
// Compute number of elements in every "row". We can't compute number of
|
||||
// elements in every chunk, because those chunks are variable length.
|
||||
size_t elements_per_row = 1;
|
||||
for (int i = 1; i < input->shape().dims(); i++) {
|
||||
elements_per_row *= input->dim_size(i);
|
||||
}
|
||||
|
||||
// Copy data from input tensor to correct place in output tensor.
|
||||
std::vector<size_t> segment_starts(n);
|
||||
segment_starts[0] = 0;
|
||||
for (int i = 1; i < n; i++) {
|
||||
segment_starts[i] = segment_starts[i - 1] + elements_per_row * sizes[i - 1];
|
||||
}
|
||||
size_t offset = segment_starts[r];
|
||||
|
||||
// Copy data to the right offset for this rank.
|
||||
T* buffer = (T*)output->tensor_data().data();
|
||||
CopyTensorData<Device>((void*)(buffer + offset),
|
||||
(void*)input->tensor_data().data(),
|
||||
elements_per_row * sizes[r] * sizeof(T));
|
||||
|
||||
// Receive from your left neighbor with wrap-around
|
||||
const size_t recv_from = ((r - 1) + n) % n;
|
||||
|
||||
// Send to your right neighbor with wrap-around
|
||||
const size_t send_to = (r + 1) % n;
|
||||
|
||||
// Perform a ring allgather. At every step, for every rank, we iterate
|
||||
// through segments with wraparound and send and recv from our neighbors.
|
||||
// At the i'th iteration, rank r, sends segment (r-i) and receives segment
|
||||
// (r-1-i).
|
||||
MPI_Status recv_status;
|
||||
for (size_t i = 0; i < n - 1; ++i) {
|
||||
const size_t send_seg_id = ((r - i) + n) % n;
|
||||
const size_t recv_seg_id = ((r - i - 1) + n) % n;
|
||||
|
||||
// Segment to send - at every iteration we send segment (r-i)
|
||||
size_t offset_send = segment_starts[send_seg_id];
|
||||
size_t rows_send = sizes[send_seg_id];
|
||||
T* segment_send = &(buffer[offset_send]);
|
||||
|
||||
// Segment to recv - at every iteration we receive segment (r-1-i)
|
||||
size_t offset_recv = segment_starts[recv_seg_id];
|
||||
size_t rows_recv = sizes[recv_seg_id];
|
||||
T* segment_recv = &(buffer[offset_recv]);
|
||||
|
||||
MPI_REQUIRES_OK(MPI_Sendrecv(
|
||||
segment_send, elements_per_row * rows_send, MPIType<T>(), send_to,
|
||||
TAG_TENSOR, segment_recv, elements_per_row * rows_recv, MPIType<T>(),
|
||||
recv_from, TAG_TENSOR, MPI_COMM_WORLD, &recv_status));
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
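
A small sketch of the offset bookkeeping used by RingAllgather above; the per-rank sizes and row width below are hypothetical example values:

# Per-rank first-dimension sizes and the per-"row" element count.
sizes = [3, 1, 2]          # rows contributed by ranks 0, 1, 2
elements_per_row = 4       # product of the remaining dimensions

segment_starts = [0]
for i in range(1, len(sizes)):
    segment_starts.append(segment_starts[i - 1] + elements_per_row * sizes[i - 1])

print(segment_starts)                                      # [0, 12, 16]: element offset of each rank's slab
print(segment_starts[-1] + elements_per_row * sizes[-1])   # 24 = total elements in the gathered output
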
|
||||
|
||||
} // namespace mpi
|
||||
} // namespace contrib
|
||||
} // namespace tensorflow
|
||||
|
||||
#endif // TENSORFLOW_USE_MPI
|
||||
|
||||
#undef TENSORFLOW_CONTRIB_MPI_H_
|
||||
#endif // TENSORFLOW_CONTRIB_MPI_H_
|
@@ -43,13 +43,13 @@ INCLUDES := \
|
||||
-I$(PROTOGENDIR) \
|
||||
-I$(PBTGENDIR)
|
||||
LIBS := \
|
||||
-lstdc++ \
|
||||
-lprotobuf \
|
||||
-lv4l2 \
|
||||
-Wl,--allow-multiple-definition \
|
||||
-Wl,--whole-archive \
|
||||
-ltensorflow-core \
|
||||
-Wl,--no-whole-archive \
|
||||
-lstdc++ \
|
||||
-lprotobuf \
|
||||
-lv4l2 \
|
||||
-ldl \
|
||||
-lpthread \
|
||||
-lm \
|
||||
|
@@ -43,12 +43,12 @@ INCLUDES := \
|
||||
-I$(PROTOGENDIR) \
|
||||
-I$(PBTGENDIR)
|
||||
LIBS := \
|
||||
-lstdc++ \
|
||||
-lprotobuf \
|
||||
-Wl,--allow-multiple-definition \
|
||||
-Wl,--whole-archive \
|
||||
-ltensorflow-core \
|
||||
-Wl,--no-whole-archive \
|
||||
-lstdc++ \
|
||||
-lprotobuf \
|
||||
-ldl \
|
||||
-lpthread \
|
||||
-lm \
|
||||
|
@@ -27,6 +27,7 @@ from tensorflow.python.ops import nn_ops
|
||||
from tensorflow.python.ops import rnn_cell_impl
|
||||
from tensorflow.python.ops import variable_scope as vs
|
||||
from tensorflow.python.platform import resource_loader
|
||||
from tensorflow.python.util.deprecation import deprecated_args
|
||||
|
||||
_gru_ops_so = loader.load_op_library(
|
||||
resource_loader.get_path_to_datafile("_gru_ops.so"))
|
||||
@@ -129,13 +130,24 @@ class GRUBlockCell(rnn_cell_impl.RNNCell):
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, cell_size):
|
||||
@deprecated_args(None, "cell_size is deprecated, use num_units instead",
|
||||
"cell_size")
|
||||
def __init__(self, num_units=None, cell_size=None):
|
||||
"""Initialize the Block GRU cell.
|
||||
|
||||
Args:
|
||||
cell_size: int, GRU cell size.
|
||||
num_units: int, The number of units in the GRU cell.
|
||||
cell_size: int, The old (deprecated) name for `num_units`.
|
||||
|
||||
Raises:
|
||||
ValueError: if both cell_size and num_units are not None;
|
||||
or both are None.
|
||||
"""
|
||||
self._cell_size = cell_size
|
||||
if (cell_size is None) == (num_units is None):
|
||||
raise ValueError("Exactly one of num_units or cell_size must be provided.")
|
||||
if num_units is None:
|
||||
num_units = cell_size
|
||||
self._cell_size = num_units
|
||||
|
||||
@property
|
||||
def state_size(self):
|
||||
|
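
The argument handling introduced in this hunk can be illustrated with a standalone sketch of the same validation logic; `_resolve_num_units` is a hypothetical helper re-implemented here outside TensorFlow for clarity:

def _resolve_num_units(num_units=None, cell_size=None):
    # Mirrors the GRUBlockCell constructor: exactly one of the two may be given,
    # and the deprecated cell_size is folded into num_units.
    if (cell_size is None) == (num_units is None):
        raise ValueError("Exactly one of num_units or cell_size must be provided.")
    return cell_size if num_units is None else num_units

print(_resolve_num_units(num_units=128))   # 128
print(_resolve_num_units(cell_size=128))   # 128, via the deprecated spelling
# Passing neither argument, or both, raises ValueError.
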
tensorflow/contrib/s3/BUILD (new file, 102 lines)
@@ -0,0 +1,102 @@
# Description:
|
||||
# S3 support for TensorFlow.
|
||||
|
||||
package(default_visibility = ["//visibility:public"])
|
||||
|
||||
licenses(["notice"]) # Apache 2.0
|
||||
|
||||
exports_files(["LICENSE"])
|
||||
|
||||
load(
|
||||
"//tensorflow:tensorflow.bzl",
|
||||
"tf_cc_test",
|
||||
)
|
||||
|
||||
filegroup(
|
||||
name = "all_files",
|
||||
srcs = glob(
|
||||
["**/*"],
|
||||
exclude = [
|
||||
"**/METADATA",
|
||||
"**/OWNERS",
|
||||
],
|
||||
),
|
||||
visibility = ["//tensorflow:__subpackages__"],
|
||||
)
|
||||
|
||||
cc_binary(
|
||||
name = "s3_file_system.so",
|
||||
srcs = [
|
||||
"s3_crypto.cc",
|
||||
"s3_crypto.h",
|
||||
"s3_file_system.cc",
|
||||
"s3_file_system.h",
|
||||
],
|
||||
copts = ["-Wno-sign-compare"],
|
||||
defines = select({
|
||||
"//conditions:default": [
|
||||
"ENABLE_CURL_CLIENT",
|
||||
"ENABLE_NO_ENCRYPTION",
|
||||
],
|
||||
}),
|
||||
linkshared = 1,
|
||||
deps = [
|
||||
"//tensorflow/core:framework_headers_lib",
|
||||
"@aws//:aws",
|
||||
"@curl//:curl",
|
||||
"@protobuf_archive//:protobuf_headers",
|
||||
],
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "s3_crypto",
|
||||
srcs = [
|
||||
"s3_crypto.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"s3_crypto.h",
|
||||
],
|
||||
deps = [
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:lib_internal",
|
||||
"@aws//:aws",
|
||||
"@boringssl//:crypto",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
cc_library(
|
||||
name = "s3_file_system",
|
||||
srcs = [
|
||||
"s3_file_system.cc",
|
||||
],
|
||||
hdrs = [
|
||||
"s3_file_system.h",
|
||||
],
|
||||
deps = [
|
||||
":s3_crypto",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:lib_internal",
|
||||
"@aws//:aws",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
||||
tf_cc_test(
|
||||
name = "s3_file_system_test",
|
||||
size = "small",
|
||||
srcs = [
|
||||
"s3_file_system_test.cc",
|
||||
],
|
||||
tags = [
|
||||
"manual",
|
||||
],
|
||||
deps = [
|
||||
":s3_file_system",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:lib_internal",
|
||||
"//tensorflow/core:test",
|
||||
"//tensorflow/core:test_main",
|
||||
"@aws//:aws",
|
||||
],
|
||||
)
|
tensorflow/contrib/s3/s3_crypto.cc (new file, 113 lines)
@@ -0,0 +1,113 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#include "tensorflow/contrib/s3/s3_crypto.h"
|
||||
#include <openssl/hmac.h>
|
||||
#include <openssl/sha.h>
|
||||
|
||||
#include <aws/core/utils/crypto/HashResult.h>
|
||||
#include <aws/s3/S3Client.h>
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
class S3Sha256HMACOpenSSLImpl : public Aws::Utils::Crypto::HMAC {
|
||||
public:
|
||||
S3Sha256HMACOpenSSLImpl() {}
|
||||
|
||||
virtual ~S3Sha256HMACOpenSSLImpl() = default;
|
||||
|
||||
virtual Aws::Utils::Crypto::HashResult Calculate(
|
||||
const Aws::Utils::ByteBuffer& toSign,
|
||||
const Aws::Utils::ByteBuffer& secret) override {
|
||||
unsigned int length = SHA256_DIGEST_LENGTH;
|
||||
Aws::Utils::ByteBuffer digest(length);
|
||||
memset(digest.GetUnderlyingData(), 0, length);
|
||||
|
||||
HMAC_CTX ctx;
|
||||
HMAC_CTX_init(&ctx);
|
||||
|
||||
HMAC_Init_ex(&ctx, secret.GetUnderlyingData(),
|
||||
static_cast<int>(secret.GetLength()), EVP_sha256(), NULL);
|
||||
HMAC_Update(&ctx, toSign.GetUnderlyingData(), toSign.GetLength());
|
||||
HMAC_Final(&ctx, digest.GetUnderlyingData(), &length);
|
||||
HMAC_CTX_cleanup(&ctx);
|
||||
|
||||
return Aws::Utils::Crypto::HashResult(std::move(digest));
|
||||
}
|
||||
};
|
||||
|
||||
class S3Sha256OpenSSLImpl : public Aws::Utils::Crypto::Hash {
|
||||
public:
|
||||
S3Sha256OpenSSLImpl() {}
|
||||
|
||||
virtual ~S3Sha256OpenSSLImpl() = default;
|
||||
|
||||
virtual Aws::Utils::Crypto::HashResult Calculate(
|
||||
const Aws::String& str) override {
|
||||
SHA256_CTX sha256;
|
||||
SHA256_Init(&sha256);
|
||||
SHA256_Update(&sha256, str.data(), str.size());
|
||||
|
||||
Aws::Utils::ByteBuffer hash(SHA256_DIGEST_LENGTH);
|
||||
SHA256_Final(hash.GetUnderlyingData(), &sha256);
|
||||
|
||||
return Aws::Utils::Crypto::HashResult(std::move(hash));
|
||||
}
|
||||
|
||||
virtual Aws::Utils::Crypto::HashResult Calculate(
|
||||
Aws::IStream& stream) override {
|
||||
SHA256_CTX sha256;
|
||||
SHA256_Init(&sha256);
|
||||
|
||||
auto currentPos = stream.tellg();
|
||||
if (currentPos == -1) {
|
||||
currentPos = 0;
|
||||
stream.clear();
|
||||
}
|
||||
|
||||
stream.seekg(0, stream.beg);
|
||||
|
||||
char streamBuffer
|
||||
[Aws::Utils::Crypto::Hash::INTERNAL_HASH_STREAM_BUFFER_SIZE];
|
||||
while (stream.good()) {
|
||||
stream.read(streamBuffer,
|
||||
Aws::Utils::Crypto::Hash::INTERNAL_HASH_STREAM_BUFFER_SIZE);
|
||||
auto bytesRead = stream.gcount();
|
||||
|
||||
if (bytesRead > 0) {
|
||||
SHA256_Update(&sha256, streamBuffer, static_cast<size_t>(bytesRead));
|
||||
}
|
||||
}
|
||||
|
||||
stream.clear();
|
||||
stream.seekg(currentPos, stream.beg);
|
||||
|
||||
Aws::Utils::ByteBuffer hash(SHA256_DIGEST_LENGTH);
|
||||
SHA256_Final(hash.GetUnderlyingData(), &sha256);
|
||||
|
||||
return Aws::Utils::Crypto::HashResult(std::move(hash));
|
||||
}
|
||||
};
|
||||
|
||||
std::shared_ptr<Aws::Utils::Crypto::Hash>
|
||||
S3SHA256Factory::CreateImplementation() const {
|
||||
return Aws::MakeShared<S3Sha256OpenSSLImpl>(S3CryptoAllocationTag);
|
||||
}
|
||||
|
||||
std::shared_ptr<Aws::Utils::Crypto::HMAC>
|
||||
S3SHA256HmacFactory::CreateImplementation() const {
|
||||
return Aws::MakeShared<S3Sha256HMACOpenSSLImpl>(S3CryptoAllocationTag);
|
||||
}
|
||||
|
||||
} // namespace tensorflow
|
tensorflow/contrib/s3/s3_crypto.h (new file, 35 lines)
@@ -0,0 +1,35 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#include <aws/core/Aws.h>
|
||||
#include <aws/core/utils/crypto/Factories.h>
|
||||
#include <aws/core/utils/crypto/HMAC.h>
|
||||
#include <aws/core/utils/crypto/Hash.h>
|
||||
|
||||
namespace tensorflow {
|
||||
static const char* S3CryptoAllocationTag = "S3CryptoAllocation";
|
||||
|
||||
class S3SHA256Factory : public Aws::Utils::Crypto::HashFactory {
|
||||
public:
|
||||
std::shared_ptr<Aws::Utils::Crypto::Hash> CreateImplementation()
|
||||
const override;
|
||||
};
|
||||
|
||||
class S3SHA256HmacFactory : public Aws::Utils::Crypto::HMACFactory {
|
||||
public:
|
||||
std::shared_ptr<Aws::Utils::Crypto::HMAC> CreateImplementation()
|
||||
const override;
|
||||
};
|
||||
|
||||
} // namespace tensorflow
|
tensorflow/contrib/s3/s3_file_system.cc (new file, 575 lines)
@@ -0,0 +1,575 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
==============================================================================*/
|
||||
#include "tensorflow/contrib/s3/s3_file_system.h"
|
||||
#include "tensorflow/contrib/s3/s3_crypto.h"
|
||||
#include "tensorflow/core/lib/io/path.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
|
||||
#include <aws/core/Aws.h>
|
||||
#include <aws/core/utils/FileSystemUtils.h>
|
||||
#include <aws/s3/S3Client.h>
|
||||
#include <aws/s3/S3Errors.h>
|
||||
#include <aws/s3/model/CopyObjectRequest.h>
|
||||
#include <aws/s3/model/DeleteObjectRequest.h>
|
||||
#include <aws/s3/model/GetObjectRequest.h>
|
||||
#include <aws/s3/model/HeadBucketRequest.h>
|
||||
#include <aws/s3/model/HeadObjectRequest.h>
|
||||
#include <aws/s3/model/ListObjectsRequest.h>
|
||||
#include <aws/s3/model/PutObjectRequest.h>
|
||||
|
||||
#include <cstdlib>
|
||||
|
||||
namespace tensorflow {
|
||||
|
||||
static const char* kS3FileSystemAllocationTag = "S3FileSystemAllocation";
|
||||
static const size_t kS3ReadAppendableFileBufferSize = 1024 * 1024;
|
||||
static const int kS3GetChildrenMaxKeys = 100;
|
||||
|
||||
Aws::Client::ClientConfiguration& GetDefaultClientConfig() {
|
||||
static mutex cfg_lock;
|
||||
static bool init(false);
|
||||
static Aws::Client::ClientConfiguration cfg;
|
||||
|
||||
std::lock_guard<mutex> lock(cfg_lock);
|
||||
|
||||
if (!init) {
|
||||
const char* endpoint = getenv("S3_ENDPOINT");
|
||||
if (endpoint) {
|
||||
cfg.endpointOverride = Aws::String(endpoint);
|
||||
}
|
||||
const char* region = getenv("S3_REGION");
|
||||
if (region) {
|
||||
cfg.region = Aws::String(region);
|
||||
}
|
||||
const char* use_https = getenv("S3_USE_HTTPS");
|
||||
if (use_https) {
|
||||
if (use_https[0] == '0') {
|
||||
cfg.scheme = Aws::Http::Scheme::HTTP;
|
||||
} else {
|
||||
cfg.scheme = Aws::Http::Scheme::HTTPS;
|
||||
}
|
||||
}
|
||||
const char* verify_ssl = getenv("S3_VERIFY_SSL");
|
||||
if (verify_ssl) {
|
||||
if (verify_ssl[0] == '0') {
|
||||
cfg.verifySSL = false;
|
||||
} else {
|
||||
cfg.verifySSL = true;
|
||||
}
|
||||
}
|
||||
|
||||
init = true;
|
||||
}
|
||||
|
||||
return cfg;
|
||||
};
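
Since the client configuration above is driven entirely by environment variables, a hypothetical setup for a non-AWS, HTTP-only endpoint might look like the sketch below; only the variable names come from GetDefaultClientConfig(), the values are illustrative:

import os

os.environ["S3_ENDPOINT"] = "localhost:9000"   # endpointOverride (illustrative value)
os.environ["S3_REGION"] = "us-east-1"          # region
os.environ["S3_USE_HTTPS"] = "0"               # "0" -> plain HTTP, anything else -> HTTPS
os.environ["S3_VERIFY_SSL"] = "0"              # "0" -> skip certificate verification

Note that the configuration is read once and cached behind the static `init` flag above, so these must be set before the first s3:// access in the process.
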
|
||||
|
||||
Status ParseS3Path(const string& fname, bool empty_object_ok, string* bucket,
|
||||
string* object) {
|
||||
if (!bucket || !object) {
|
||||
return errors::Internal("bucket and object cannot be null.");
|
||||
}
|
||||
StringPiece scheme, bucketp, objectp;
|
||||
io::ParseURI(fname, &scheme, &bucketp, &objectp);
|
||||
if (scheme != "s3") {
|
||||
return errors::InvalidArgument("S3 path doesn't start with 's3://': ",
|
||||
fname);
|
||||
}
|
||||
*bucket = bucketp.ToString();
|
||||
if (bucket->empty() || *bucket == ".") {
|
||||
return errors::InvalidArgument("S3 path doesn't contain a bucket name: ",
|
||||
fname);
|
||||
}
|
||||
objectp.Consume("/");
|
||||
*object = objectp.ToString();
|
||||
if (!empty_object_ok && object->empty()) {
|
||||
return errors::InvalidArgument("S3 path doesn't contain an object name: ",
|
||||
fname);
|
||||
}
|
||||
return Status::OK();
|
||||
}
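
For reference, the rules above accept paths of the form s3://bucket/object. A rough standalone Python re-implementation of the same splitting (using urllib rather than io::ParseURI; `parse_s3_path` is a hypothetical name for this sketch):

from urllib.parse import urlparse

def parse_s3_path(fname, empty_object_ok=False):
    # Rough Python equivalent of ParseS3Path() above.
    u = urlparse(fname)
    if u.scheme != "s3":
        raise ValueError("S3 path doesn't start with 's3://': " + fname)
    bucket, obj = u.netloc, u.path.lstrip("/")
    if not bucket or bucket == ".":
        raise ValueError("S3 path doesn't contain a bucket name: " + fname)
    if not empty_object_ok and not obj:
        raise ValueError("S3 path doesn't contain an object name: " + fname)
    return bucket, obj

print(parse_s3_path("s3://my-bucket/some/key.txt"))            # ('my-bucket', 'some/key.txt')
print(parse_s3_path("s3://my-bucket", empty_object_ok=True))   # ('my-bucket', '')
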
|
||||
|
||||
class S3RandomAccessFile : public RandomAccessFile {
|
||||
public:
|
||||
S3RandomAccessFile(const string& bucket, const string& object)
|
||||
: bucket_(bucket), object_(object) {}
|
||||
|
||||
Status Read(uint64 offset, size_t n, StringPiece* result,
|
||||
char* scratch) const override {
|
||||
Aws::S3::S3Client s3Client(GetDefaultClientConfig());
|
||||
Aws::S3::Model::GetObjectRequest getObjectRequest;
|
||||
getObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
|
||||
string bytes = strings::StrCat("bytes=", offset, "-", offset + n - 1);
|
||||
getObjectRequest.SetRange(bytes.c_str());
|
||||
getObjectRequest.SetResponseStreamFactory([]() {
|
||||
return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag);
|
||||
});
|
||||
auto getObjectOutcome = s3Client.GetObject(getObjectRequest);
|
||||
if (!getObjectOutcome.IsSuccess()) {
|
||||
n = 0;
|
||||
*result = StringPiece(scratch, n);
|
||||
return Status(error::OUT_OF_RANGE, "Read less bytes than requested");
|
||||
}
|
||||
n = getObjectOutcome.GetResult().GetContentLength();
|
||||
std::stringstream ss;
|
||||
ss << getObjectOutcome.GetResult().GetBody().rdbuf();
|
||||
ss.read(scratch, n);
|
||||
|
||||
*result = StringPiece(scratch, n);
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
string bucket_;
|
||||
string object_;
|
||||
};
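
Read() above maps an (offset, n) request onto an HTTP ranged GET; the range string it builds is simply an inclusive byte range, as in this small sketch (`s3_range_header` is a hypothetical helper name):

def s3_range_header(offset, n):
    # Same formatting as the strings::StrCat call in S3RandomAccessFile::Read:
    # n bytes starting at offset, expressed as an inclusive byte range.
    return "bytes={}-{}".format(offset, offset + n - 1)

print(s3_range_header(0, 1024))      # bytes=0-1023
print(s3_range_header(4096, 512))    # bytes=4096-4607
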
|
||||
|
||||
class S3WritableFile : public WritableFile {
|
||||
public:
|
||||
S3WritableFile(const string& bucket, const string& object)
|
||||
: bucket_(bucket),
|
||||
object_(object),
|
||||
sync_needed_(true),
|
||||
outfile_(Aws::MakeShared<Aws::Utils::TempFile>(
|
||||
kS3FileSystemAllocationTag, "/tmp/s3_filesystem_XXXXXX",
|
||||
std::ios_base::binary | std::ios_base::trunc | std::ios_base::in |
|
||||
std::ios_base::out)) {}
|
||||
|
||||
Status Append(const StringPiece& data) override {
|
||||
if (!outfile_) {
|
||||
return errors::FailedPrecondition(
|
||||
"The internal temporary file is not writable.");
|
||||
}
|
||||
sync_needed_ = true;
|
||||
outfile_->write(data.data(), data.size());
|
||||
if (!outfile_->good()) {
|
||||
return errors::Internal(
|
||||
"Could not append to the internal temporary file.");
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Close() override {
|
||||
if (outfile_) {
|
||||
TF_RETURN_IF_ERROR(Sync());
|
||||
outfile_.reset();
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status Flush() override { return Sync(); }
|
||||
|
||||
Status Sync() override {
|
||||
if (!outfile_) {
|
||||
return errors::FailedPrecondition(
|
||||
"The internal temporary file is not writable.");
|
||||
}
|
||||
if (!sync_needed_) {
|
||||
return Status::OK();
|
||||
}
|
||||
Aws::Client::ClientConfiguration clientConfig = GetDefaultClientConfig();
|
||||
clientConfig.connectTimeoutMs = 300000;
|
||||
clientConfig.requestTimeoutMs = 600000;
|
||||
Aws::S3::S3Client s3Client(clientConfig);
|
||||
Aws::S3::Model::PutObjectRequest putObjectRequest;
|
||||
putObjectRequest.WithBucket(bucket_.c_str()).WithKey(object_.c_str());
|
||||
long offset = outfile_->tellp();
|
||||
outfile_->seekg(0);
|
||||
putObjectRequest.SetBody(outfile_);
|
||||
putObjectRequest.SetContentLength(offset);
|
||||
auto putObjectOutcome = s3Client.PutObject(putObjectRequest);
|
||||
outfile_->clear();
|
||||
outfile_->seekp(offset);
|
||||
if (!putObjectOutcome.IsSuccess()) {
|
||||
string error = strings::StrCat(
|
||||
putObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
|
||||
putObjectOutcome.GetError().GetMessage().c_str());
|
||||
return errors::Internal(error);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
private:
|
||||
string bucket_;
|
||||
string object_;
|
||||
bool sync_needed_;
|
||||
std::shared_ptr<Aws::Utils::TempFile> outfile_;
|
||||
};
|
||||
|
||||
class S3ReadOnlyMemoryRegion : public ReadOnlyMemoryRegion {
|
||||
public:
|
||||
S3ReadOnlyMemoryRegion(std::unique_ptr<char[]> data, uint64 length)
|
||||
: data_(std::move(data)), length_(length) {}
|
||||
const void* data() override { return reinterpret_cast<void*>(data_.get()); }
|
||||
uint64 length() override { return length_; }
|
||||
|
||||
private:
|
||||
std::unique_ptr<char[]> data_;
|
||||
uint64 length_;
|
||||
};
|
||||
|
||||
S3FileSystem::S3FileSystem() {
|
||||
Aws::SDKOptions options;
|
||||
options.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Info;
|
||||
options.cryptoOptions.sha256Factory_create_fn = []() {
|
||||
return Aws::MakeShared<S3SHA256Factory>(S3CryptoAllocationTag);
|
||||
};
|
||||
options.cryptoOptions.sha256HMACFactory_create_fn = []() {
|
||||
return Aws::MakeShared<S3SHA256HmacFactory>(S3CryptoAllocationTag);
|
||||
};
|
||||
Aws::InitAPI(options);
|
||||
}
|
||||
|
||||
S3FileSystem::~S3FileSystem() {
|
||||
Aws::SDKOptions options;
|
||||
options.loggingOptions.logLevel = Aws::Utils::Logging::LogLevel::Info;
|
||||
Aws::ShutdownAPI(options);
|
||||
}
|
||||
|
||||
Status S3FileSystem::NewRandomAccessFile(
|
||||
const string& fname, std::unique_ptr<RandomAccessFile>* result) {
|
||||
string bucket, object;
|
||||
TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
|
||||
result->reset(new S3RandomAccessFile(bucket, object));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status S3FileSystem::NewWritableFile(const string& fname,
|
||||
std::unique_ptr<WritableFile>* result) {
|
||||
string bucket, object;
|
||||
TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
|
||||
result->reset(new S3WritableFile(bucket, object));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status S3FileSystem::NewAppendableFile(const string& fname,
|
||||
std::unique_ptr<WritableFile>* result) {
|
||||
std::unique_ptr<RandomAccessFile> reader;
|
||||
TF_RETURN_IF_ERROR(NewRandomAccessFile(fname, &reader));
|
||||
std::unique_ptr<char[]> buffer(new char[kS3ReadAppendableFileBufferSize]);
|
||||
Status status;
|
||||
uint64 offset = 0;
|
||||
StringPiece read_chunk;
|
||||
|
||||
string bucket, object;
|
||||
TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));
|
||||
result->reset(new S3WritableFile(bucket, object));
|
||||
|
||||
while (true) {
|
||||
status = reader->Read(offset, kS3ReadAppendableFileBufferSize, &read_chunk,
|
||||
buffer.get());
|
||||
if (status.ok()) {
|
||||
(*result)->Append(read_chunk);
|
||||
offset += kS3ReadAppendableFileBufferSize;
|
||||
} else if (status.code() == error::OUT_OF_RANGE) {
|
||||
(*result)->Append(read_chunk);
|
||||
break;
|
||||
} else {
|
||||
(*result).reset();
|
||||
return status;
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status S3FileSystem::NewReadOnlyMemoryRegionFromFile(
|
||||
const string& fname, std::unique_ptr<ReadOnlyMemoryRegion>* result) {
|
||||
uint64 size;
|
||||
TF_RETURN_IF_ERROR(GetFileSize(fname, &size));
|
||||
std::unique_ptr<char[]> data(new char[size]);
|
||||
|
||||
std::unique_ptr<RandomAccessFile> file;
|
||||
TF_RETURN_IF_ERROR(NewRandomAccessFile(fname, &file));
|
||||
|
||||
StringPiece piece;
|
||||
TF_RETURN_IF_ERROR(file->Read(0, size, &piece, data.get()));
|
||||
|
||||
result->reset(new S3ReadOnlyMemoryRegion(std::move(data), size));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status S3FileSystem::FileExists(const string& fname) {
|
||||
FileStatistics stats;
|
||||
TF_RETURN_IF_ERROR(this->Stat(fname, &stats));
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status S3FileSystem::GetChildren(const string& dir,
|
||||
std::vector<string>* result) {
|
||||
string bucket, prefix;
|
||||
TF_RETURN_IF_ERROR(ParseS3Path(dir, false, &bucket, &prefix));
|
||||
|
||||
if (prefix.back() != '/') {
|
||||
prefix.push_back('/');
|
||||
}
|
||||
|
||||
Aws::S3::S3Client s3Client(GetDefaultClientConfig());
|
||||
Aws::S3::Model::ListObjectsRequest listObjectsRequest;
|
||||
listObjectsRequest.WithBucket(bucket.c_str())
|
||||
.WithPrefix(prefix.c_str())
|
||||
.WithMaxKeys(kS3GetChildrenMaxKeys)
|
||||
.WithDelimiter("/");
|
||||
listObjectsRequest.SetResponseStreamFactory(
|
||||
[]() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
|
||||
|
||||
Aws::S3::Model::ListObjectsResult listObjectsResult;
|
||||
do {
|
||||
auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
|
||||
if (!listObjectsOutcome.IsSuccess()) {
|
||||
string error = strings::StrCat(
|
||||
listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
|
||||
listObjectsOutcome.GetError().GetMessage().c_str());
|
||||
return errors::Internal(error);
|
||||
}
|
||||
|
||||
listObjectsResult = listObjectsOutcome.GetResult();
|
||||
for (const auto& object : listObjectsResult.GetCommonPrefixes()) {
|
||||
Aws::String s = object.GetPrefix();
|
||||
s.erase(s.length() - 1);
|
||||
Aws::String entry = s.substr(strlen(prefix.c_str()));
|
||||
if (entry.length() > 0) {
|
||||
result->push_back(entry.c_str());
|
||||
}
|
||||
}
|
||||
for (const auto& object : listObjectsResult.GetContents()) {
|
||||
Aws::String s = object.GetKey();
|
||||
Aws::String entry = s.substr(strlen(prefix.c_str()));
|
||||
if (entry.length() > 0) {
|
||||
result->push_back(entry.c_str());
|
||||
}
|
||||
}
|
||||
listObjectsRequest.SetMarker(listObjectsResult.GetNextMarker());
|
||||
} while (listObjectsResult.GetIsTruncated());
|
||||
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
Status S3FileSystem::Stat(const string& fname, FileStatistics* stats) {
|
||||
string bucket, object;
|
||||
TF_RETURN_IF_ERROR(ParseS3Path(fname, true, &bucket, &object));
|
||||
|
||||
Aws::S3::S3Client s3Client(GetDefaultClientConfig());
|
||||
if (object.empty()) {
|
||||
Aws::S3::Model::HeadBucketRequest headBucketRequest;
|
||||
headBucketRequest.WithBucket(bucket.c_str());
|
||||
auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
|
||||
if (!headBucketOutcome.IsSuccess()) {
|
||||
string error = strings::StrCat(
|
||||
headBucketOutcome.GetError().GetExceptionName().c_str(), ": ",
|
||||
headBucketOutcome.GetError().GetMessage().c_str());
|
||||
return errors::Internal(error);
|
||||
}
|
||||
stats->length = 0;
|
||||
stats->is_directory = 1;
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
bool found = false;
|
||||
|
||||
  Aws::S3::Model::HeadObjectRequest headObjectRequest;
  headObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());
  headObjectRequest.SetResponseStreamFactory(
      []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
  auto headObjectOutcome = s3Client.HeadObject(headObjectRequest);
  if (headObjectOutcome.IsSuccess()) {
    stats->length = headObjectOutcome.GetResult().GetContentLength();
    stats->is_directory = 0;
    stats->mtime_nsec =
        headObjectOutcome.GetResult().GetLastModified().Millis() * 1e6;
    found = true;
  }
  string prefix = object;
  if (prefix.back() != '/') {
    prefix.push_back('/');
  }
  Aws::S3::Model::ListObjectsRequest listObjectsRequest;
  listObjectsRequest.WithBucket(bucket.c_str())
      .WithPrefix(prefix.c_str())
      .WithMaxKeys(1);
  listObjectsRequest.SetResponseStreamFactory(
      []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
  auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
  if (listObjectsOutcome.IsSuccess()) {
    if (listObjectsOutcome.GetResult().GetContents().size() > 0) {
      stats->length = 0;
      stats->is_directory = 1;
      found = true;
    }
  }
  if (!found) {
    return errors::NotFound("Object ", fname, " does not exist");
  }
  return Status::OK();
}

Status S3FileSystem::DeleteFile(const string& fname) {
  string bucket, object;
  TF_RETURN_IF_ERROR(ParseS3Path(fname, false, &bucket, &object));

  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
  Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;
  deleteObjectRequest.WithBucket(bucket.c_str()).WithKey(object.c_str());

  auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
  if (!deleteObjectOutcome.IsSuccess()) {
    string error = strings::StrCat(
        deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
        deleteObjectOutcome.GetError().GetMessage().c_str());
    return errors::Internal(error);
  }
  return Status::OK();
}

Status S3FileSystem::CreateDir(const string& dirname) {
  string bucket, object;
  TF_RETURN_IF_ERROR(ParseS3Path(dirname, true, &bucket, &object));

  if (object.empty()) {
    Aws::S3::S3Client s3Client(GetDefaultClientConfig());
    Aws::S3::Model::HeadBucketRequest headBucketRequest;
    headBucketRequest.WithBucket(bucket.c_str());
    auto headBucketOutcome = s3Client.HeadBucket(headBucketRequest);
    if (!headBucketOutcome.IsSuccess()) {
      return errors::NotFound("The bucket ", bucket, " was not found.");
    }
    return Status::OK();
  }
  string filename = dirname;
  if (filename.back() != '/') {
    filename.push_back('/');
  }
  std::unique_ptr<WritableFile> file;
  TF_RETURN_IF_ERROR(NewWritableFile(filename, &file));
  TF_RETURN_IF_ERROR(file->Close());
  return Status::OK();
}

Status S3FileSystem::DeleteDir(const string& dirname) {
  string bucket, object;
  TF_RETURN_IF_ERROR(ParseS3Path(dirname, false, &bucket, &object));

  Aws::S3::S3Client s3Client(GetDefaultClientConfig());
  string prefix = object;
  if (prefix.back() != '/') {
    prefix.push_back('/');
  }
  Aws::S3::Model::ListObjectsRequest listObjectsRequest;
  listObjectsRequest.WithBucket(bucket.c_str())
      .WithPrefix(prefix.c_str())
      .WithMaxKeys(2);
  listObjectsRequest.SetResponseStreamFactory(
      []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });
  auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
  if (listObjectsOutcome.IsSuccess()) {
    auto contents = listObjectsOutcome.GetResult().GetContents();
    if (contents.size() > 1 ||
        (contents.size() == 1 && contents[0].GetKey() != prefix.c_str())) {
      return errors::FailedPrecondition("Cannot delete a non-empty directory.");
    }
    if (contents.size() == 1 && contents[0].GetKey() == prefix.c_str()) {
      string filename = dirname;
      if (filename.back() != '/') {
        filename.push_back('/');
      }
      return DeleteFile(filename);
    }
  }
  return Status::OK();
}

Status S3FileSystem::GetFileSize(const string& fname, uint64* file_size) {
  FileStatistics stats;
  TF_RETURN_IF_ERROR(this->Stat(fname, &stats));
  *file_size = stats.length;
  return Status::OK();
}

Status S3FileSystem::RenameFile(const string& src, const string& target) {
  string src_bucket, src_object, target_bucket, target_object;
  TF_RETURN_IF_ERROR(ParseS3Path(src, false, &src_bucket, &src_object));
  TF_RETURN_IF_ERROR(
      ParseS3Path(target, false, &target_bucket, &target_object));
  if (src_object.back() == '/') {
    if (target_object.back() != '/') {
      target_object.push_back('/');
    }
  } else {
    if (target_object.back() == '/') {
      target_object.pop_back();
    }
  }

  Aws::S3::S3Client s3Client(GetDefaultClientConfig());

  Aws::S3::Model::CopyObjectRequest copyObjectRequest;
  Aws::S3::Model::DeleteObjectRequest deleteObjectRequest;

  Aws::S3::Model::ListObjectsRequest listObjectsRequest;
  listObjectsRequest.WithBucket(src_bucket.c_str())
      .WithPrefix(src_object.c_str())
      .WithMaxKeys(kS3GetChildrenMaxKeys);
  listObjectsRequest.SetResponseStreamFactory(
      []() { return Aws::New<Aws::StringStream>(kS3FileSystemAllocationTag); });

  Aws::S3::Model::ListObjectsResult listObjectsResult;
  do {
    auto listObjectsOutcome = s3Client.ListObjects(listObjectsRequest);
    if (!listObjectsOutcome.IsSuccess()) {
      string error = strings::StrCat(
          listObjectsOutcome.GetError().GetExceptionName().c_str(), ": ",
          listObjectsOutcome.GetError().GetMessage().c_str());
      return errors::Internal(error);
    }

    listObjectsResult = listObjectsOutcome.GetResult();
    for (const auto& object : listObjectsResult.GetContents()) {
      Aws::String src_key = object.GetKey();
      Aws::String target_key = src_key;
      target_key.replace(0, src_object.length(), target_object.c_str());
      Aws::String source = Aws::String(src_bucket.c_str()) + "/" + src_key;

      copyObjectRequest.SetBucket(target_bucket.c_str());
      copyObjectRequest.SetKey(target_key);
      copyObjectRequest.SetCopySource(source);

      auto copyObjectOutcome = s3Client.CopyObject(copyObjectRequest);
      if (!copyObjectOutcome.IsSuccess()) {
        string error = strings::StrCat(
            copyObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
            copyObjectOutcome.GetError().GetMessage().c_str());
        return errors::Internal(error);
      }

      deleteObjectRequest.SetBucket(src_bucket.c_str());
      deleteObjectRequest.SetKey(src_key.c_str());

      auto deleteObjectOutcome = s3Client.DeleteObject(deleteObjectRequest);
      if (!deleteObjectOutcome.IsSuccess()) {
        string error = strings::StrCat(
            deleteObjectOutcome.GetError().GetExceptionName().c_str(), ": ",
            deleteObjectOutcome.GetError().GetMessage().c_str());
        return errors::Internal(error);
      }
    }
    listObjectsRequest.SetMarker(listObjectsResult.GetNextMarker());
  } while (listObjectsResult.GetIsTruncated());

  return Status::OK();
}

REGISTER_FILE_SYSTEM("s3", S3FileSystem);

}  // namespace tensorflow
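For context, a minimal usage sketch of the registered "s3" scheme through the standard Env API; this is not part of the change itself. The bucket and object key below are placeholders, and the calls assume the default client configuration can find valid AWS credentials and a region.

#include <string>

#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/env.h"

// Hypothetical usage sketch; "my-bucket" and the object key are placeholders.
void ExampleS3Access() {
  tensorflow::Env* env = tensorflow::Env::Default();
  // Succeeds once the object (or a keyed "directory" prefix) exists.
  TF_CHECK_OK(env->FileExists("s3://my-bucket/some/object.txt"));

  tensorflow::uint64 size = 0;
  TF_CHECK_OK(env->GetFileSize("s3://my-bucket/some/object.txt", &size));

  std::string contents;
  TF_CHECK_OK(tensorflow::ReadFileToString(
      env, "s3://my-bucket/some/object.txt", &contents));
}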
tensorflow/contrib/s3/s3_file_system.h (new file, 60 lines)
@@ -0,0 +1,60 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
#define TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_

#include "tensorflow/core/platform/env.h"

namespace tensorflow {

class S3FileSystem : public FileSystem {
 public:
  S3FileSystem();
  ~S3FileSystem();

  Status NewRandomAccessFile(
      const string& fname, std::unique_ptr<RandomAccessFile>* result) override;

  Status NewWritableFile(const string& fname,
                         std::unique_ptr<WritableFile>* result) override;

  Status NewAppendableFile(const string& fname,
                           std::unique_ptr<WritableFile>* result) override;

  Status NewReadOnlyMemoryRegionFromFile(
      const string& fname,
      std::unique_ptr<ReadOnlyMemoryRegion>* result) override;

  Status FileExists(const string& fname) override;

  Status GetChildren(const string& dir, std::vector<string>* result) override;

  Status Stat(const string& fname, FileStatistics* stat) override;

  Status DeleteFile(const string& fname) override;

  Status CreateDir(const string& name) override;

  Status DeleteDir(const string& name) override;

  Status GetFileSize(const string& fname, uint64* size) override;

  Status RenameFile(const string& src, const string& target) override;
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CONTRIB_S3_S3_FILE_SYSTEM_H_
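Before the new unit tests, a hedged sketch of driving the class directly, which is the same pattern the test file below uses. The s3:// paths are placeholders and valid AWS credentials are assumed.

#include <memory>
#include <string>
#include <vector>

#include "tensorflow/contrib/s3/s3_file_system.h"
#include "tensorflow/core/lib/core/status.h"

// Hypothetical direct-use sketch; the bucket and keys are placeholders.
void ExampleDirectUse() {
  tensorflow::S3FileSystem s3fs;

  // Write an object, then list the "directory" prefix that contains it.
  std::unique_ptr<tensorflow::WritableFile> writer;
  TF_CHECK_OK(s3fs.NewWritableFile("s3://my-bucket/dir/file.txt", &writer));
  TF_CHECK_OK(writer->Append("hello"));
  TF_CHECK_OK(writer->Close());

  std::vector<std::string> children;
  TF_CHECK_OK(s3fs.GetChildren("s3://my-bucket/dir", &children));
}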
tensorflow/contrib/s3/s3_file_system_test.cc (new file, 233 lines)
@@ -0,0 +1,233 @@
/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/contrib/s3/s3_file_system.h"

#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/lib/gtl/stl_util.h"
#include "tensorflow/core/lib/io/path.h"
#include "tensorflow/core/platform/file_system.h"
#include "tensorflow/core/platform/test.h"

namespace tensorflow {

namespace {

class S3FileSystemTest : public ::testing::Test {
 protected:
  S3FileSystemTest() {}

  string TmpDir(const string& path) {
    char* test_dir = getenv("S3_TEST_TMPDIR");
    if (test_dir != nullptr) {
      return io::JoinPath(string(test_dir), path);
    } else {
      return "s3://" + io::JoinPath(testing::TmpDir(), path);
    }
  }

  Status WriteString(const string& fname, const string& content) {
    std::unique_ptr<WritableFile> writer;
    TF_RETURN_IF_ERROR(s3fs.NewWritableFile(fname, &writer));
    TF_RETURN_IF_ERROR(writer->Append(content));
    TF_RETURN_IF_ERROR(writer->Close());
    return Status::OK();
  }

  Status ReadAll(const string& fname, string* content) {
    std::unique_ptr<RandomAccessFile> reader;
    TF_RETURN_IF_ERROR(s3fs.NewRandomAccessFile(fname, &reader));

    uint64 file_size = 0;
    TF_RETURN_IF_ERROR(s3fs.GetFileSize(fname, &file_size));

    content->resize(file_size);
    StringPiece result;
    TF_RETURN_IF_ERROR(
        reader->Read(0, file_size, &result, gtl::string_as_array(content)));
    if (file_size != result.size()) {
      return errors::DataLoss("expected ", file_size, " got ", result.size(),
                              " bytes");
    }
    return Status::OK();
  }

  S3FileSystem s3fs;
};

TEST_F(S3FileSystemTest, NewRandomAccessFile) {
  const string fname = TmpDir("RandomAccessFile");
  const string content = "abcdefghijklmn";

  TF_ASSERT_OK(WriteString(fname, content));

  std::unique_ptr<RandomAccessFile> reader;
  TF_EXPECT_OK(s3fs.NewRandomAccessFile(fname, &reader));

  string got;
  got.resize(content.size());
  StringPiece result;
  TF_EXPECT_OK(
      reader->Read(0, content.size(), &result, gtl::string_as_array(&got)));
  EXPECT_EQ(content.size(), result.size());
  EXPECT_EQ(content, result);

  got.clear();
  got.resize(4);
  TF_EXPECT_OK(reader->Read(2, 4, &result, gtl::string_as_array(&got)));
  EXPECT_EQ(4, result.size());
  EXPECT_EQ(content.substr(2, 4), result);
}

TEST_F(S3FileSystemTest, NewWritableFile) {
  std::unique_ptr<WritableFile> writer;
  const string fname = TmpDir("WritableFile");
  TF_EXPECT_OK(s3fs.NewWritableFile(fname, &writer));
  TF_EXPECT_OK(writer->Append("content1,"));
  TF_EXPECT_OK(writer->Append("content2"));
  TF_EXPECT_OK(writer->Flush());
  TF_EXPECT_OK(writer->Sync());
  TF_EXPECT_OK(writer->Close());

  string content;
  TF_EXPECT_OK(ReadAll(fname, &content));
  EXPECT_EQ("content1,content2", content);
}

TEST_F(S3FileSystemTest, NewAppendableFile) {
  std::unique_ptr<WritableFile> writer;

  const string fname = TmpDir("AppendableFile");
  TF_ASSERT_OK(WriteString(fname, "test"));

  TF_EXPECT_OK(s3fs.NewAppendableFile(fname, &writer));
  TF_EXPECT_OK(writer->Append("content"));
  TF_EXPECT_OK(writer->Close());
}

TEST_F(S3FileSystemTest, NewReadOnlyMemoryRegionFromFile) {
  const string fname = TmpDir("MemoryFile");
  const string content = "content";
  TF_ASSERT_OK(WriteString(fname, content));
  std::unique_ptr<ReadOnlyMemoryRegion> region;
  TF_EXPECT_OK(s3fs.NewReadOnlyMemoryRegionFromFile(fname, &region));

  EXPECT_EQ(content, StringPiece(reinterpret_cast<const char*>(region->data()),
                                 region->length()));
}

TEST_F(S3FileSystemTest, FileExists) {
  const string fname = TmpDir("FileExists");
  EXPECT_EQ(error::Code::NOT_FOUND, s3fs.FileExists(fname).code());
  TF_ASSERT_OK(WriteString(fname, "test"));
  TF_EXPECT_OK(s3fs.FileExists(fname));
}

TEST_F(S3FileSystemTest, GetChildren) {
  const string base = TmpDir("GetChildren");
  TF_EXPECT_OK(s3fs.CreateDir(base));

  const string file = io::JoinPath(base, "TestFile.csv");
  TF_EXPECT_OK(WriteString(file, "test"));

  const string subdir = io::JoinPath(base, "SubDir");
  TF_EXPECT_OK(s3fs.CreateDir(subdir));
  // S3 object storage doesn't support empty directories, so create a file in
  // the directory.
  const string subfile = io::JoinPath(subdir, "TestSubFile.csv");
  TF_EXPECT_OK(WriteString(subfile, "test"));

  std::vector<string> children;
  TF_EXPECT_OK(s3fs.GetChildren(base, &children));
  std::sort(children.begin(), children.end());
  EXPECT_EQ(std::vector<string>({"SubDir", "TestFile.csv"}), children);
}

TEST_F(S3FileSystemTest, DeleteFile) {
  const string fname = TmpDir("DeleteFile");
  TF_ASSERT_OK(WriteString(fname, "test"));
  TF_EXPECT_OK(s3fs.DeleteFile(fname));
}

TEST_F(S3FileSystemTest, GetFileSize) {
  const string fname = TmpDir("GetFileSize");
  TF_ASSERT_OK(WriteString(fname, "test"));
  uint64 file_size = 0;
  TF_EXPECT_OK(s3fs.GetFileSize(fname, &file_size));
  EXPECT_EQ(4, file_size);
}

TEST_F(S3FileSystemTest, CreateDir) {
  // S3 object storage doesn't support empty directories, so create a file in
  // the directory.
  const string dir = TmpDir("CreateDir");
  TF_EXPECT_OK(s3fs.CreateDir(dir));

  const string file = io::JoinPath(dir, "CreateDirFile.csv");
  TF_EXPECT_OK(WriteString(file, "test"));
  FileStatistics stat;
  TF_EXPECT_OK(s3fs.Stat(dir, &stat));
  EXPECT_TRUE(stat.is_directory);
}

TEST_F(S3FileSystemTest, DeleteDir) {
  // S3 object storage doesn't support empty directories, so create a file in
  // the directory.
  const string dir = TmpDir("DeleteDir");
  const string file = io::JoinPath(dir, "DeleteDirFile.csv");
  TF_EXPECT_OK(WriteString(file, "test"));
  EXPECT_FALSE(s3fs.DeleteDir(dir).ok());

  TF_EXPECT_OK(s3fs.DeleteFile(file));
  TF_EXPECT_OK(s3fs.DeleteDir(dir));
  FileStatistics stat;
  EXPECT_FALSE(s3fs.Stat(dir, &stat).ok());
}

TEST_F(S3FileSystemTest, RenameFile) {
  const string fname1 = TmpDir("RenameFile1");
  const string fname2 = TmpDir("RenameFile2");
  TF_ASSERT_OK(WriteString(fname1, "test"));
  TF_EXPECT_OK(s3fs.RenameFile(fname1, fname2));
  string content;
  TF_EXPECT_OK(ReadAll(fname2, &content));
  EXPECT_EQ("test", content);
}

TEST_F(S3FileSystemTest, RenameFile_Overwrite) {
  const string fname1 = TmpDir("RenameFile1");
  const string fname2 = TmpDir("RenameFile2");

  TF_ASSERT_OK(WriteString(fname2, "test"));
  TF_EXPECT_OK(s3fs.FileExists(fname2));

  TF_ASSERT_OK(WriteString(fname1, "test"));
  TF_EXPECT_OK(s3fs.RenameFile(fname1, fname2));
  string content;
  TF_EXPECT_OK(ReadAll(fname2, &content));
  EXPECT_EQ("test", content);
}

TEST_F(S3FileSystemTest, StatFile) {
  const string fname = TmpDir("StatFile");
  TF_ASSERT_OK(WriteString(fname, "test"));
  FileStatistics stat;
  TF_EXPECT_OK(s3fs.Stat(fname, &stat));
  EXPECT_EQ(4, stat.length);
  EXPECT_FALSE(stat.is_directory);
}

}  // namespace
}  // namespace tensorflow
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
 
@@ -26,14 +27,7 @@ REGISTER_OP("InfeedDequeue")
     .Attr("dtype: type")
     .Attr("shape: shape")
     .SetIsStateful()
-    .SetShapeFn([](InferenceContext* c) {
-      PartialTensorShape shape;
-      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
-      ShapeHandle out;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
-      c->set_output(0, out);
-      return Status::OK();
-    })
+    .SetShapeFn(shape_inference::ExplicitShape)
     .Doc(R"doc(
 A placeholder op for a value that will be fed into the computation.
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
 
@@ -48,14 +49,7 @@ REGISTER_OP("OutfeedDequeue")
     .Attr("shape: shape")
     .Attr("device_ordinal: int = -1")
     .SetIsStateful()
-    .SetShapeFn([](InferenceContext* c) {
-      PartialTensorShape shape;
-      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
-      ShapeHandle out;
-      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
-      c->set_output(0, out);
-      return Status::OK();
-    })
+    .SetShapeFn(shape_inference::ExplicitShape)
     .Doc(R"doc(
 Retrieves a single tensor from the computation outfeed. This operation will
 block indefinitely until data is available.
@@ -1200,6 +1200,15 @@ Status ScatterNdUpdateShape(InferenceContext* c) {
   return Status::OK();
 }
 
+Status ExplicitShape(InferenceContext* c) {
+  PartialTensorShape shape;
+  TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
+  ShapeHandle output_shape;
+  TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &output_shape));
+  c->set_output(0, output_shape);
+  return Status::OK();
+}
+
 }  // namespace shape_inference
 
 }  // namespace tensorflow
@@ -213,6 +213,9 @@ Status ValidateSparseTensor(InferenceContext* c, ShapeHandle indices_shape,
 // Shape function for ScatterNd update/add/sub/... operations.
 Status ScatterNdUpdateShape(InferenceContext* c);
 
+// Shape function for ops with an explicit "shape" attribute.
+Status ExplicitShape(InferenceContext* c);
+
 }  // namespace shape_inference
 
 }  // namespace tensorflow
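The InfeedDequeue/OutfeedDequeue hunks earlier in this diff show the intended call sites for the new helper. As a hedged illustration, any op that carries an explicit "shape" attribute can point its shape function at shape_inference::ExplicitShape instead of an inline lambda; the op name below is made up and not part of TensorFlow.

#include "tensorflow/core/framework/common_shape_fns.h"
#include "tensorflow/core/framework/op.h"

namespace tensorflow {

// Hypothetical registration ("MyPlaceholderLikeOp" is not a real op): the
// output shape is taken directly from the "shape" attribute.
REGISTER_OP("MyPlaceholderLikeOp")
    .Output("output: dtype")
    .Attr("dtype: type")
    .Attr("shape: shape")
    .SetShapeFn(shape_inference::ExplicitShape)
    .Doc("A placeholder-style op whose output shape comes from the 'shape' attr.");

}  // namespace tensorflow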
@@ -451,7 +451,7 @@ Buffer<T>::~Buffer() {
 // default value for T.
 //
 // This routine is using the typed fields (float_val, etc.) in the
-// tenor proto as opposed to the untyped binary representation
+// tensor proto as opposed to the untyped binary representation
 // (tensor_content). This is used when we expect the TensorProto is
 // used by a client program which may not know how to encode a tensor
 // in the compact binary representation.
@@ -281,8 +281,8 @@ Status SingleMachine::ResetSession() {
     // Make sure the session is properly closed
     TF_RETURN_IF_ERROR(Shutdown());
 
-    // Destroying the object deletes all its varibles as well. This is only true
-    // for DirectSession.
+    // Destroying the object deletes all its variables as well. This is only
+    // true for DirectSession.
     session_.reset();
   }
 
@ -26,8 +26,8 @@ namespace grappler {
|
||||
|
||||
constexpr int kOpsPerMac = 2;
|
||||
constexpr char kConv2d[] = "Conv2D";
|
||||
constexpr char kConv2dBackPropFilter[] = "Conv2DBackpropFilter";
|
||||
constexpr char kConv2dBackPropInput[] = "Conv2DBackpropInput";
|
||||
constexpr char kConv2dBackpropFilter[] = "Conv2DBackpropFilter";
|
||||
constexpr char kConv2dBackpropInput[] = "Conv2DBackpropInput";
|
||||
constexpr char kMatMul[] = "MatMul";
|
||||
constexpr char kSparseMatMul[] = "SparseMatMul";
|
||||
constexpr char kIdentity[] = "Identity";
|
||||
@ -150,10 +150,10 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
|
||||
|
||||
device_cost_impl_ = {
|
||||
{kConv2d, wrap(&OpLevelCostEstimator::PredictConv2D)},
|
||||
{kConv2dBackPropFilter,
|
||||
wrap(&OpLevelCostEstimator::PredictConv2DBackPropFilter)},
|
||||
{kConv2dBackPropInput,
|
||||
wrap(&OpLevelCostEstimator::PredictConv2DBackPropInput)},
|
||||
{kConv2dBackpropFilter,
|
||||
wrap(&OpLevelCostEstimator::PredictConv2DBackpropFilter)},
|
||||
{kConv2dBackpropInput,
|
||||
wrap(&OpLevelCostEstimator::PredictConv2DBackpropInput)},
|
||||
{kMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
|
||||
{kSparseMatMul, wrap(&OpLevelCostEstimator::PredictMatMul)},
|
||||
{kIdentity, wrap(&OpLevelCostEstimator::PredictNoOp)},
|
||||
@ -668,20 +668,20 @@ int64 OpLevelCostEstimator::CountBatchMatMulOperations(
|
||||
return ops;
|
||||
}
|
||||
|
||||
// TODO(cliffy): Dedup this method and CountConv2DBackPropFilterOperations.
|
||||
int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
|
||||
// TODO(cliffy): Dedup this method and CountConv2DBackpropFilterOperations.
|
||||
int64 OpLevelCostEstimator::CountConv2DBackpropInputOperations(
|
||||
const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
|
||||
bool* found_unknown_shapes) const {
|
||||
int64 ops = 0;
|
||||
|
||||
if (op_features.op() != kConv2dBackPropInput) {
|
||||
if (op_features.op() != kConv2dBackpropInput) {
|
||||
LOG(ERROR) << "Invalid Operation";
|
||||
return ops;
|
||||
}
|
||||
|
||||
if (op_features.outputs_size() != 1) {
|
||||
// Need _output_shapes for input shape.
|
||||
LOG(ERROR) << "No output shape in Conv2DBackPropInput op.";
|
||||
LOG(ERROR) << "No output shape in Conv2DBackpropInput op.";
|
||||
return ops;
|
||||
}
|
||||
|
||||
@ -696,7 +696,7 @@ int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
|
||||
ops *= conv_dims.iz * conv_dims.oz;
|
||||
ops *= kOpsPerMac;
|
||||
|
||||
VLOG(1) << "Operations for Conv2DBackPropInput " << ops;
|
||||
VLOG(1) << "Operations for Conv2DBackpropInput " << ops;
|
||||
|
||||
if (returned_conv_dims != nullptr) {
|
||||
*returned_conv_dims = conv_dims;
|
||||
@ -704,18 +704,18 @@ int64 OpLevelCostEstimator::CountConv2DBackPropInputOperations(
|
||||
return ops;
|
||||
}
|
||||
|
||||
int64 OpLevelCostEstimator::CountConv2DBackPropFilterOperations(
|
||||
int64 OpLevelCostEstimator::CountConv2DBackpropFilterOperations(
|
||||
const OpInfo& op_features, ConvolutionDimensions* returned_conv_dims,
|
||||
bool* found_unknown_shapes) const {
|
||||
int64 ops = 0;
|
||||
if (op_features.op() != kConv2dBackPropFilter) {
|
||||
if (op_features.op() != kConv2dBackpropFilter) {
|
||||
LOG(ERROR) << "Invalid Operation";
|
||||
return ops;
|
||||
}
|
||||
|
||||
if (op_features.outputs_size() != 1) {
|
||||
// Need _output_shapes for input shape.
|
||||
LOG(ERROR) << "No output shape in Conv2DBackPropFilter op.";
|
||||
LOG(ERROR) << "No output shape in Conv2DBackpropFilter op.";
|
||||
return ops;
|
||||
}
|
||||
|
||||
@ -730,7 +730,7 @@ int64 OpLevelCostEstimator::CountConv2DBackPropFilterOperations(
|
||||
ops *= conv_dims.iz * conv_dims.oz;
|
||||
ops *= kOpsPerMac;
|
||||
|
||||
VLOG(1) << "Operations for Conv2DBackPropFilter" << ops;
|
||||
VLOG(1) << "Operations for Conv2DBackpropFilter" << ops;
|
||||
|
||||
if (returned_conv_dims != nullptr) {
|
||||
*returned_conv_dims = conv_dims;
|
||||
@ -814,22 +814,22 @@ Costs OpLevelCostEstimator::PredictConv2D(const OpInfo& op_features) const {
|
||||
return costs;
|
||||
}
|
||||
|
||||
Costs OpLevelCostEstimator::PredictConv2DBackPropInput(
|
||||
Costs OpLevelCostEstimator::PredictConv2DBackpropInput(
|
||||
const OpInfo& op_features) const {
|
||||
bool found_unknown_shapes = false;
|
||||
auto costs =
|
||||
PredictOpCountBasedCost(CountConv2DBackPropInputOperations(
|
||||
PredictOpCountBasedCost(CountConv2DBackpropInputOperations(
|
||||
op_features, nullptr, &found_unknown_shapes),
|
||||
op_features);
|
||||
costs.inaccurate = found_unknown_shapes;
|
||||
return costs;
|
||||
}
|
||||
|
||||
Costs OpLevelCostEstimator::PredictConv2DBackPropFilter(
|
||||
Costs OpLevelCostEstimator::PredictConv2DBackpropFilter(
|
||||
const OpInfo& op_features) const {
|
||||
bool found_unknown_shapes = false;
|
||||
auto costs =
|
||||
PredictOpCountBasedCost(CountConv2DBackPropFilterOperations(
|
||||
PredictOpCountBasedCost(CountConv2DBackpropFilterOperations(
|
||||
op_features, nullptr, &found_unknown_shapes),
|
||||
op_features);
|
||||
costs.inaccurate = found_unknown_shapes;
|
||||
|
@ -82,10 +82,10 @@ class OpLevelCostEstimator {
|
||||
bool* found_unknown_shapes) const;
|
||||
int64 CountBatchMatMulOperations(const OpInfo& op_features,
|
||||
bool* found_unknown_shapes) const;
|
||||
int64 CountConv2DBackPropInputOperations(const OpInfo& op_features,
|
||||
int64 CountConv2DBackpropInputOperations(const OpInfo& op_features,
|
||||
ConvolutionDimensions* conv_info,
|
||||
bool* found_unknown_shapes) const;
|
||||
int64 CountConv2DBackPropFilterOperations(const OpInfo& op_features,
|
||||
int64 CountConv2DBackpropFilterOperations(const OpInfo& op_features,
|
||||
ConvolutionDimensions* conv_info,
|
||||
bool* found_unknown_shapes) const;
|
||||
|
||||
@ -124,8 +124,8 @@ class OpLevelCostEstimator {
|
||||
// device.
|
||||
Costs PredictConv2D(const OpInfo& op_features) const;
|
||||
Costs PredictCwiseOp(const OpInfo& op_features) const;
|
||||
Costs PredictConv2DBackPropInput(const OpInfo& op_features) const;
|
||||
Costs PredictConv2DBackPropFilter(const OpInfo& op_features) const;
|
||||
Costs PredictConv2DBackpropInput(const OpInfo& op_features) const;
|
||||
Costs PredictConv2DBackpropFilter(const OpInfo& op_features) const;
|
||||
Costs PredictMatMul(const OpInfo& op_features) const;
|
||||
Costs PredictNoOp(const OpInfo& op_features) const;
|
||||
Costs PredictBatchMatMul(const OpInfo& op_features) const;
|
||||
|
@ -4429,6 +4429,7 @@ filegroup(
|
||||
"depthtospace_op.h",
|
||||
"depthwise_conv_op.h",
|
||||
"fake_quant_ops_functor.h",
|
||||
"fused_batch_norm_op.h",
|
||||
"gemm_functors.h",
|
||||
"image_resizer_state.h",
|
||||
"maxpooling_op.h",
|
||||
|
@ -555,7 +555,7 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
|
||||
int col_stride, const Padding& padding, Tensor* filter_backprop,
|
||||
TensorFormat data_format) {
|
||||
using perftools::gputools::dnn::AlgorithmConfig;
|
||||
using perftools::gputools::dnn::AlgorithmType;
|
||||
using perftools::gputools::dnn::AlgorithmDesc;
|
||||
using perftools::gputools::dnn::ProfileResult;
|
||||
|
||||
std::vector<int32> strides(4, 1);
|
||||
@ -816,35 +816,40 @@ void LaunchConv2DBackpropFilterOp<Eigen::GpuDevice, T>::operator()(
|
||||
AlgorithmConfig algorithm_config;
|
||||
if (cudnn_use_autotune && !AutoTuneConvBwdFilter::GetInstance()->Find(
|
||||
conv_parameters, &algorithm_config)) {
|
||||
std::vector<AlgorithmType> algorithms;
|
||||
std::vector<AlgorithmDesc::Index> algorithms;
|
||||
CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
|
||||
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
|
||||
ProfileResult best_result;
|
||||
ProfileResult best_result_no_scratch;
|
||||
for (auto profile_algorithm : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveBackwardFilterScratchSize,
|
||||
ctx);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveBackwardFilterWithAlgorithm(
|
||||
input_desc, input_ptr, output_desc, out_backprop_ptr,
|
||||
conv_desc, filter_desc, &filter_backprop_ptr,
|
||||
&scratch_allocator, AlgorithmConfig(profile_algorithm),
|
||||
&profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
|
||||
// if it's not enabled.
|
||||
for (bool use_tensor_ops : {false, true}) {
|
||||
for (auto algo_index : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
|
||||
CudnnScratchAllocator scratch_allocator(
|
||||
ConvolveBackwardFilterScratchSize, ctx);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveBackwardFilterWithAlgorithm(
|
||||
input_desc, input_ptr, output_desc, out_backprop_ptr,
|
||||
conv_desc, filter_desc, &filter_backprop_ptr,
|
||||
&scratch_allocator, AlgorithmConfig(profile_algorithm),
|
||||
&profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -630,7 +630,7 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
|
||||
int col_stride, const Padding& padding, Tensor* in_backprop,
|
||||
TensorFormat data_format) {
|
||||
using perftools::gputools::dnn::AlgorithmConfig;
|
||||
using perftools::gputools::dnn::AlgorithmType;
|
||||
using perftools::gputools::dnn::AlgorithmDesc;
|
||||
using perftools::gputools::dnn::ProfileResult;
|
||||
|
||||
std::vector<int32> strides(4, 1);
|
||||
@ -870,34 +870,39 @@ void LaunchConv2DBackpropInputOp<GPUDevice, T>::operator()(
|
||||
AlgorithmConfig algorithm_config;
|
||||
if (cudnn_use_autotune && !AutoTuneConvBwdData::GetInstance()->Find(
|
||||
conv_parameters, &algorithm_config)) {
|
||||
std::vector<AlgorithmType> algorithms;
|
||||
std::vector<AlgorithmDesc::Index> algorithms;
|
||||
CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
|
||||
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
|
||||
ProfileResult best_result;
|
||||
ProfileResult best_result_no_scratch;
|
||||
for (auto profile_algorithm : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
|
||||
ctx);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveBackwardDataWithAlgorithm(
|
||||
filter_desc, filter_ptr, output_desc, out_backprop_ptr,
|
||||
conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator,
|
||||
AlgorithmConfig(profile_algorithm), &profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
|
||||
// if it's not enabled.
|
||||
for (bool use_tensor_ops : {false, true}) {
|
||||
for (auto algo_index : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
|
||||
ctx);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveBackwardDataWithAlgorithm(
|
||||
filter_desc, filter_ptr, output_desc, out_backprop_ptr,
|
||||
conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator,
|
||||
AlgorithmConfig(profile_algorithm), &profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -649,40 +649,45 @@ class Conv3DBackpropInputOp<GPUDevice, T> : public OpKernel {
|
||||
};
|
||||
|
||||
using perftools::gputools::dnn::AlgorithmConfig;
|
||||
using perftools::gputools::dnn::AlgorithmType;
|
||||
using perftools::gputools::dnn::AlgorithmDesc;
|
||||
using perftools::gputools::dnn::ProfileResult;
|
||||
using perftools::gputools::dnn::kDefaultAlgorithm;
|
||||
AlgorithmConfig algorithm_config;
|
||||
if (cudnn_use_autotune_ && !AutoTuneConv3dBwdData::GetInstance()->Find(
|
||||
conv_parameters, &algorithm_config)) {
|
||||
std::vector<AlgorithmType> algorithms;
|
||||
std::vector<AlgorithmDesc::Index> algorithms;
|
||||
CHECK(stream->parent()->GetConvolveBackwardDataAlgorithms(
|
||||
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
|
||||
ProfileResult best_result;
|
||||
ProfileResult best_result_no_scratch;
|
||||
for (auto profile_algorithm : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveBackwardDataScratchSize,
|
||||
context);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveBackwardDataWithAlgorithm(
|
||||
filter_desc, filter_ptr, output_desc, out_backprop_ptr,
|
||||
conv_desc, input_desc, &in_backprop_ptr, &scratch_allocator,
|
||||
AlgorithmConfig(profile_algorithm), &profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
|
||||
// if it's not enabled.
|
||||
for (bool use_tensor_ops : {false, true}) {
|
||||
for (auto algo_index : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
|
||||
CudnnScratchAllocator scratch_allocator(
|
||||
ConvolveBackwardDataScratchSize, context);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveBackwardDataWithAlgorithm(
|
||||
filter_desc, filter_ptr, output_desc, out_backprop_ptr,
|
||||
conv_desc, input_desc, &in_backprop_ptr,
|
||||
&scratch_allocator, AlgorithmConfig(profile_algorithm),
|
||||
&profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -1016,41 +1021,45 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
|
||||
};
|
||||
|
||||
using perftools::gputools::dnn::AlgorithmConfig;
|
||||
using perftools::gputools::dnn::AlgorithmType;
|
||||
using perftools::gputools::dnn::AlgorithmDesc;
|
||||
using perftools::gputools::dnn::ProfileResult;
|
||||
using perftools::gputools::dnn::kDefaultAlgorithm;
|
||||
AlgorithmConfig algorithm_config;
|
||||
if (cudnn_use_autotune_ && !AutoTuneConv3dBwdFilter::GetInstance()->Find(
|
||||
conv_parameters, &algorithm_config)) {
|
||||
std::vector<AlgorithmType> algorithms;
|
||||
std::vector<AlgorithmDesc::Index> algorithms;
|
||||
CHECK(stream->parent()->GetConvolveBackwardFilterAlgorithms(
|
||||
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
|
||||
ProfileResult best_result;
|
||||
ProfileResult best_result_no_scratch;
|
||||
for (auto profile_algorithm : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
CudnnScratchAllocator scratch_allocator(
|
||||
ConvolveBackwardFilterScratchSize, context);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveBackwardFilterWithAlgorithm(
|
||||
input_desc, input_ptr, output_desc, out_backprop_ptr,
|
||||
conv_desc, filter_desc, &filter_backprop_ptr,
|
||||
&scratch_allocator, AlgorithmConfig(profile_algorithm),
|
||||
&profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
|
||||
// if it's not enabled.
|
||||
for (bool use_tensor_ops : {false, true}) {
|
||||
for (auto algo_index : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
|
||||
CudnnScratchAllocator scratch_allocator(
|
||||
ConvolveBackwardFilterScratchSize, context);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveBackwardFilterWithAlgorithm(
|
||||
input_desc, input_ptr, output_desc, out_backprop_ptr,
|
||||
conv_desc, filter_desc, &filter_backprop_ptr,
|
||||
&scratch_allocator, AlgorithmConfig(profile_algorithm),
|
||||
&profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -447,9 +447,8 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
|
||||
int col_stride, const Padding& padding, Tensor* output,
|
||||
TensorFormat data_format) {
|
||||
using perftools::gputools::dnn::AlgorithmConfig;
|
||||
using perftools::gputools::dnn::AlgorithmType;
|
||||
using perftools::gputools::dnn::AlgorithmDesc;
|
||||
using perftools::gputools::dnn::ProfileResult;
|
||||
using perftools::gputools::dnn::kDefaultAlgorithm;
|
||||
auto* stream = ctx->op_device_context()->stream();
|
||||
OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));
|
||||
|
||||
@ -663,33 +662,38 @@ void LaunchConv2DOp<GPUDevice, T>::operator()(
|
||||
AlgorithmConfig algorithm_config;
|
||||
if (cudnn_use_autotune &&
|
||||
!AutoTuneConv::GetInstance()->Find(conv_parameters, &algorithm_config)) {
|
||||
std::vector<AlgorithmType> algorithms;
|
||||
std::vector<AlgorithmDesc::Index> algorithms;
|
||||
CHECK(stream->parent()->GetConvolveAlgorithms(
|
||||
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
|
||||
ProfileResult best_result;
|
||||
ProfileResult best_result_no_scratch;
|
||||
for (auto profile_algorithm : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveWithAlgorithm(
|
||||
input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
|
||||
output_desc, &output_ptr, &scratch_allocator,
|
||||
AlgorithmConfig(profile_algorithm), &profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
|
||||
// if it's not enabled.
|
||||
for (bool use_tensor_ops : {false, true}) {
|
||||
for (auto algo_index : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveWithAlgorithm(
|
||||
input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
|
||||
output_desc, &output_ptr, &scratch_allocator,
|
||||
AlgorithmConfig(profile_algorithm), &profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -383,41 +383,45 @@ struct LaunchConvOp<GPUDevice, T> {
|
||||
};
|
||||
|
||||
using perftools::gputools::dnn::AlgorithmConfig;
|
||||
using perftools::gputools::dnn::AlgorithmType;
|
||||
using perftools::gputools::dnn::AlgorithmDesc;
|
||||
using perftools::gputools::dnn::ProfileResult;
|
||||
using perftools::gputools::dnn::kDefaultAlgorithm;
|
||||
|
||||
AlgorithmConfig algorithm_config;
|
||||
|
||||
if (cudnn_use_autotune && !AutoTuneConv3d::GetInstance()->Find(
|
||||
conv_parameters, &algorithm_config)) {
|
||||
std::vector<AlgorithmType> algorithms;
|
||||
std::vector<AlgorithmDesc::Index> algorithms;
|
||||
CHECK(stream->parent()->GetConvolveAlgorithms(
|
||||
conv_parameters.ShouldIncludeWinogradNonfusedAlgo<T>(), &algorithms));
|
||||
ProfileResult best_result;
|
||||
ProfileResult best_result_no_scratch;
|
||||
for (auto profile_algorithm : algorithms) {
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveWithAlgorithm(
|
||||
input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
|
||||
output_desc, &output_ptr, &scratch_allocator,
|
||||
AlgorithmConfig(profile_algorithm), &profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
// TODO(benbarsdell): Ideally this should not attempt using tensor op math
|
||||
// if it's not enabled.
|
||||
for (bool use_tensor_ops : {false, true}) {
|
||||
for (auto algo_index : algorithms) {
|
||||
AlgorithmDesc profile_algorithm(algo_index, use_tensor_ops);
|
||||
// TODO(zhengxq): profile each algorithm multiple times to better
|
||||
// accuracy.
|
||||
CudnnScratchAllocator scratch_allocator(ConvolveScratchSize, ctx);
|
||||
ProfileResult profile_result;
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenConvolveWithAlgorithm(
|
||||
input_desc, input_ptr, filter_desc, filter_ptr, conv_desc,
|
||||
output_desc, &output_ptr, &scratch_allocator,
|
||||
AlgorithmConfig(profile_algorithm), &profile_result)
|
||||
.ok();
|
||||
if (cudnn_launch_status) {
|
||||
if (profile_result.is_valid()) {
|
||||
if (profile_result.elapsed_time_in_ms() <
|
||||
best_result.elapsed_time_in_ms()) {
|
||||
best_result = profile_result;
|
||||
}
|
||||
if (scratch_allocator.TotalByteSize() == 0 &&
|
||||
profile_result.elapsed_time_in_ms() <
|
||||
best_result_no_scratch.elapsed_time_in_ms()) {
|
||||
best_result_no_scratch = profile_result;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -81,21 +81,26 @@ struct GlimpseExtractionOp {
|
||||
for (Index i = 0; i < batch_size; ++i) {
|
||||
float x = offsets_[i].first, y = offsets_[i].second;
|
||||
|
||||
// Un-normalize coordinates back to pixel space if normalized.
|
||||
if (normalized_) {
|
||||
// Un-normalize coordinates back to pixel space if normalized.
|
||||
x *= input_width;
|
||||
y *= input_height;
|
||||
if (centered_) {
|
||||
// Un-center if coordinates are centered on the image center.
|
||||
x /= 2.0f;
|
||||
y /= 2.0f;
|
||||
x += input_width / 2.0f;
|
||||
y += input_height / 2.0f;
|
||||
// Remove half of the glimpse window.
|
||||
x -= width_ / 2.0f;
|
||||
y -= height_ / 2.0f;
|
||||
}
|
||||
} else {
|
||||
if (centered_) {
|
||||
x += input_width / 2.0f;
|
||||
y += input_height / 2.0f;
|
||||
}
|
||||
}
|
||||
// Un-center if coordinates are centered on the image center.
|
||||
if (centered_) {
|
||||
x /= 2.0f;
|
||||
y /= 2.0f;
|
||||
x += input_width / 2.0f;
|
||||
y += input_height / 2.0f;
|
||||
}
|
||||
// Remove half of the glimpse window.
|
||||
x -= width_ / 2.0f;
|
||||
y -= height_ / 2.0f;
|
||||
|
||||
const Index offset_x = (Index) x;
|
||||
const Index offset_y = (Index) y;
|
||||
|
@ -17,7 +17,6 @@ limitations under the License.
|
||||
|
||||
#if GOOGLE_CUDA
|
||||
#define EIGEN_USE_GPU
|
||||
#include "tensorflow/core/kernels/fused_batch_norm_op.h"
|
||||
#include "tensorflow/core/kernels/conv_2d.h"
|
||||
#include "tensorflow/core/kernels/conv_ops_gpu.h"
|
||||
#include "tensorflow/core/util/stream_executor_util.h"
|
||||
@ -28,6 +27,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/framework/register_types.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
#include "tensorflow/core/framework/tensor_types.h"
|
||||
#include "tensorflow/core/kernels/fused_batch_norm_op.h"
|
||||
#include "tensorflow/core/util/tensor_format.h"
|
||||
|
||||
namespace tensorflow {
|
||||
@ -39,7 +39,8 @@ namespace functor {
|
||||
// Functor used by FusedBatchNormOp to do the computations.
|
||||
template <typename Device, typename T>
|
||||
struct FusedBatchNorm;
|
||||
// Functor used by FusedBatchNormGradOp to do the computations.
|
||||
// Functor used by FusedBatchNormGradOp to do the computations when
|
||||
// is_training=True.
|
||||
template <typename Device, typename T>
|
||||
struct FusedBatchNormGrad;
|
||||
|
||||
@ -352,7 +353,7 @@ template <typename T>
|
||||
struct FusedBatchNormGrad<GPUDevice, T> {
|
||||
void operator()(OpKernelContext* context, const Tensor& y_backprop,
|
||||
const Tensor& x, const Tensor& scale, const Tensor& mean,
|
||||
const Tensor& variance, T epsilon, Tensor* x_backprop,
|
||||
const Tensor& inv_variance, T epsilon, Tensor* x_backprop,
|
||||
Tensor* scale_backprop, Tensor* offset_backprop,
|
||||
TensorFormat tensor_format) {
|
||||
auto* stream = context->op_device_context()->stream();
|
||||
@ -441,16 +442,18 @@ struct FusedBatchNormGrad<GPUDevice, T> {
|
||||
auto x_ptr = StreamExecutorUtil::AsDeviceMemory<T>(x_maybe_transformed);
|
||||
auto scale_ptr = StreamExecutorUtil::AsDeviceMemory<T>(scale);
|
||||
auto mean_ptr = StreamExecutorUtil::AsDeviceMemory<T>(mean);
|
||||
auto variance_ptr = StreamExecutorUtil::AsDeviceMemory<T>(variance);
|
||||
auto inv_variance_ptr = StreamExecutorUtil::AsDeviceMemory<T>(inv_variance);
|
||||
auto scale_backprop_ptr =
|
||||
StreamExecutorUtil::AsDeviceMemory<T>(*scale_backprop);
|
||||
auto offset_backprop_ptr =
|
||||
StreamExecutorUtil::AsDeviceMemory<T>(*offset_backprop);
|
||||
|
||||
// the cudnn kernel outputs inverse variance in forward and reuse it in
|
||||
// backward
|
||||
bool cudnn_launch_status =
|
||||
stream
|
||||
->ThenBatchNormalizationBackward(
|
||||
y_backprop_ptr, x_ptr, scale_ptr, mean_ptr, variance_ptr,
|
||||
y_backprop_ptr, x_ptr, scale_ptr, mean_ptr, inv_variance_ptr,
|
||||
x_desc, scale_offset_desc, static_cast<double>(epsilon),
|
||||
&x_backprop_ptr, &scale_backprop_ptr, &offset_backprop_ptr)
|
||||
.ok();
|
||||
@ -468,6 +471,20 @@ struct FusedBatchNormGrad<GPUDevice, T> {
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Forward declarations of the functor specializations for GPU.
|
||||
#define DECLARE_GPU_SPEC(T) \
|
||||
template <> \
|
||||
void FusedBatchNormFreezeGrad<GPUDevice, T>::operator()( \
|
||||
const GPUDevice& d, const Tensor& y_backprop_input, \
|
||||
const Tensor& x_input, const Tensor& scale_input, \
|
||||
const Tensor& mean_input, const Tensor& variance_input, T epsilon, \
|
||||
Tensor* x_backprop_output, Tensor* scale_backprop_output, \
|
||||
Tensor* offset_backprop_output, typename TTypes<T>::Vec scratch1, \
|
||||
typename TTypes<T>::Vec scratch2); \
|
||||
extern template struct FusedBatchNormFreezeGrad<GPUDevice, T>;
|
||||
DECLARE_GPU_SPEC(float);
|
||||
|
||||
#endif // GOOGLE_CUDA
|
||||
} // namespace functor
|
||||
|
||||
@ -511,7 +528,7 @@ class FusedBatchNormOp : public OpKernel {
|
||||
if (is_training_) {
|
||||
OP_REQUIRES(
|
||||
context, estimated_mean.dim_size(0) == 0,
|
||||
errors::InvalidArgument("estimated_mean empty for training",
|
||||
errors::InvalidArgument("estimated_mean must be empty for training",
|
||||
estimated_mean.shape().DebugString()));
|
||||
OP_REQUIRES(context, estimated_variance.dim_size(0) == 0,
|
||||
errors::InvalidArgument(
|
||||
@ -531,14 +548,14 @@ class FusedBatchNormOp : public OpKernel {
|
||||
Tensor* saved_mean = nullptr;
|
||||
OP_REQUIRES_OK(context,
|
||||
context->allocate_output(3, scale.shape(), &saved_mean));
|
||||
Tensor* saved_inv_var = nullptr;
|
||||
OP_REQUIRES_OK(context,
|
||||
context->allocate_output(4, scale.shape(), &saved_inv_var));
|
||||
Tensor* saved_maybe_inv_var = nullptr;
|
||||
OP_REQUIRES_OK(context, context->allocate_output(4, scale.shape(),
|
||||
&saved_maybe_inv_var));
|
||||
|
||||
functor::FusedBatchNorm<Device, T>()(
|
||||
context, x, scale, offset, estimated_mean, estimated_variance, epsilon_,
|
||||
y, batch_mean, batch_var, saved_mean, saved_inv_var, tensor_format_,
|
||||
is_training_);
|
||||
y, batch_mean, batch_var, saved_mean, saved_maybe_inv_var,
|
||||
tensor_format_, is_training_);
|
||||
}
|
||||
|
||||
private:
|
||||
@ -559,16 +576,21 @@ class FusedBatchNormGradOp : public OpKernel {
|
||||
OP_REQUIRES_OK(context, context->GetAttr("data_format", &tensor_format));
|
||||
OP_REQUIRES(context, FormatFromString(tensor_format, &tensor_format_),
|
||||
errors::InvalidArgument("Invalid data format"));
|
||||
OP_REQUIRES_OK(context, context->GetAttr("is_training", &is_training_));
|
||||
}
|
||||
|
||||
void Compute(OpKernelContext* context) override {
|
||||
const Tensor& y_backprop = context->input(0);
|
||||
const Tensor& x = context->input(1);
|
||||
const Tensor& scale = context->input(2);
|
||||
const Tensor& saved_mean = context->input(3);
|
||||
// The Eigen implementation saves variance in the forward pass, while cuDNN
|
||||
// When is_training=True, batch mean and variance/inverted variance are
|
||||
// saved in the forward pass to be reused here. When is_training=False,
|
||||
// population mean and variance need to be forwarded here to compute the
|
||||
// gradients.
|
||||
const Tensor& saved_mean_or_pop_mean = context->input(3);
|
||||
// The Eigen implementation saves variance in the forward pass, while cuDNN
|
||||
// saves inverted variance.
|
||||
const Tensor& saved_maybe_inv_var = context->input(4);
|
||||
const Tensor& saved_maybe_inv_var_or_pop_var = context->input(4);
|
||||
|
||||
OP_REQUIRES(context, y_backprop.dims() == 4,
|
||||
errors::InvalidArgument("input must be 4-dimensional",
|
||||
@ -579,13 +601,14 @@ class FusedBatchNormGradOp : public OpKernel {
|
||||
OP_REQUIRES(context, scale.dims() == 1,
|
||||
errors::InvalidArgument("scale must be 1-dimensional",
|
||||
scale.shape().DebugString()));
|
||||
OP_REQUIRES(context, saved_mean.dims() == 1,
|
||||
errors::InvalidArgument("saved mean must be 1-dimensional",
|
||||
saved_mean.shape().DebugString()));
|
||||
OP_REQUIRES(
    context, saved_maybe_inv_var.dims() == 1,
    errors::InvalidArgument("saved variance must be 1-dimensional",
                            saved_maybe_inv_var.shape().DebugString()));
    context, saved_mean_or_pop_mean.dims() == 1,
    errors::InvalidArgument("saved mean must be 1-dimensional",
                            saved_mean_or_pop_mean.shape().DebugString()));
OP_REQUIRES(context, saved_maybe_inv_var_or_pop_var.dims() == 1,
            errors::InvalidArgument(
                "saved variance must be 1-dimensional",
                saved_maybe_inv_var_or_pop_var.shape().DebugString()));

Tensor* x_backprop = nullptr;
OP_REQUIRES_OK(context,
@@ -607,14 +630,37 @@ class FusedBatchNormGradOp : public OpKernel {
    OP_REQUIRES_OK(
        context, context->allocate_output(4, TensorShape({}), &placeholder_2));

    functor::FusedBatchNormGrad<Device, T>()(
        context, y_backprop, x, scale, saved_mean, saved_maybe_inv_var,
        epsilon_, x_backprop, scale_backprop, offset_backprop, tensor_format_);
    if (is_training_) {
      functor::FusedBatchNormGrad<Device, T>()(
          context, y_backprop, x, scale, saved_mean_or_pop_mean,
          saved_maybe_inv_var_or_pop_var, epsilon_, x_backprop, scale_backprop,
          offset_backprop, tensor_format_);
    } else {
      // Necessary layout conversion is currently done in python.
      CHECK(tensor_format_ == FORMAT_NHWC)
          << "The implementation of FusedBatchNormGrad with is_training=False "
             "only support "
          << "NHWC tensor format for now.";
      Tensor scratch1, scratch2;
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DataTypeToEnum<T>::value,
                                            scale_offset_shape, &scratch1));
      OP_REQUIRES_OK(context,
                     context->allocate_temp(DataTypeToEnum<T>::value,
                                            scale_offset_shape, &scratch2));
      functor::FusedBatchNormFreezeGrad<Device, T>()(
          context->eigen_device<Device>(), y_backprop, x, scale,
          saved_mean_or_pop_mean, saved_maybe_inv_var_or_pop_var, epsilon_,
          x_backprop, scale_backprop, offset_backprop, scratch1.vec<T>(),
          scratch2.vec<T>());
    }
  }

 private:
  T epsilon_;
  TensorFormat tensor_format_;
  bool is_training_;
};

REGISTER_KERNEL_BUILDER(Name("FusedBatchNorm").Device(DEVICE_CPU),

@@ -22,6 +22,8 @@ limitations under the License.
namespace tensorflow {
namespace functor {

template struct FusedBatchNormFreezeGrad<Eigen::GpuDevice, float>;

template <class T>
__global__ void VarianceToInvVarianceKernel(int nthreads, const T* input,
                                            double epsilon, T* output) {

@@ -17,9 +17,14 @@ limitations under the License.
#define TENSORFLOW_KERNELS_FUSED_BATCH_NORM_OP_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"

namespace tensorflow {
namespace functor {

#if GOOGLE_CUDA

// There is a behavior difference between cuDNN v4 and v5 with regard to the
// scaling factor for function cudnnBatchNormalizationForwardInference.
// This function corrects the scaling factor if cuDNN v4 is used, so that
@@ -43,6 +48,72 @@ struct InvVarianceToVariance {
  void operator()(const Eigen::GpuDevice& d, double epsilon, int sample_size,
                  int channels, T* variance);
};

#endif  // GOOGLE_CUDA

// Functor used by FusedBatchNormGradOp to do the computations when
// is_training=False. Both CPU and GPU will use this functor.
template <typename Device, typename T>
struct FusedBatchNormFreezeGrad {
  void operator()(const Device& d, const Tensor& y_backprop_input,
                  const Tensor& x_input, const Tensor& scale_input,
                  const Tensor& pop_mean_input,
                  const Tensor& pop_variance_input, T epsilon,
                  Tensor* x_backprop_output, Tensor* scale_backprop_output,
                  Tensor* offset_backprop_output,
                  typename TTypes<T>::Vec scratch1,
                  typename TTypes<T>::Vec scratch2) {
    typename TTypes<T, 4>::ConstTensor y_backprop(
        y_backprop_input.tensor<T, 4>());
    typename TTypes<T, 4>::ConstTensor input(x_input.tensor<T, 4>());
    typename TTypes<T>::ConstVec scale(scale_input.vec<T>());
    typename TTypes<T>::ConstVec pop_mean(pop_mean_input.vec<T>());
    typename TTypes<T>::ConstVec pop_var(pop_variance_input.vec<T>());
    typename TTypes<T, 4>::Tensor x_backprop(x_backprop_output->tensor<T, 4>());
    typename TTypes<T>::Vec scale_backprop(scale_backprop_output->vec<T>());
    typename TTypes<T>::Vec offset_backprop(offset_backprop_output->vec<T>());

    const int depth = pop_mean.dimension(0);
    const int rest_size = input.size() / depth;

    Eigen::DSizes<Eigen::Index, 2> rest_by_depth(rest_size, depth);
#if !defined(EIGEN_HAS_INDEX_LIST)
    Eigen::DSizes<Eigen::Index, 2> one_by_depth(1, depth);
    Eigen::array<int, 1> reduction_axis{0};
    Eigen::array<int, 2> rest_by_one({rest_size, 1});
#else
    Eigen::IndexList<Eigen::type2index<1>, Eigen::Index> one_by_depth;
    one_by_depth.set(1, depth);
    Eigen::IndexList<Eigen::type2index<0> > reduction_axis;
    Eigen::IndexList<Eigen::Index, Eigen::type2index<1> > rest_by_one;
    rest_by_one.set(0, rest_size);
#endif

    // offset_backprop = sum(y_backprop)
    // scale_backprop = y_backprop * ((x - pop_mean) * rsqrt(pop_var + epsilon))
    // x_backprop = y_backprop * (scale * rsqrt(pop_var + epsilon))
    offset_backprop.device(d) =
        y_backprop.reshape(rest_by_depth).sum(reduction_axis);

    // scratch1 = rsqrt(pop_var + epsilon)
    scratch1.device(d) = (pop_var + pop_var.constant(epsilon)).rsqrt();

    // scratch2 = sum(y_backprop * (x - mean))
    scratch2.device(d) =
        (y_backprop.reshape(rest_by_depth) *
         (input.reshape(rest_by_depth) -
          pop_mean.reshape(one_by_depth).broadcast(rest_by_one)))
            .sum(reduction_axis);

    x_backprop.reshape(rest_by_depth).device(d) =
        y_backprop.reshape(rest_by_depth) * ((scratch1 * scale)
                                                 .eval()
                                                 .reshape(one_by_depth)
                                                 .broadcast(rest_by_one));
    scale_backprop.device(d) = scratch2 * scratch1;
  }
};

}  // namespace functor
}  // namespace tensorflow
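The new FusedBatchNormFreezeGrad functor above implements the frozen (is_training=False) batch-norm gradients from the population statistics. As an illustration only, here is a minimal NumPy sketch of the same formulas, assuming NHWC layout; the function and variable names are ours, not part of the diff:

```
# Illustration: NumPy version of the math in FusedBatchNormFreezeGrad (assumed NHWC).
import numpy as np

def freeze_grad(dy, x, scale, pop_mean, pop_var, eps):
    axes = tuple(range(dy.ndim - 1))                 # reduce all but the channel axis
    rsqrt_var = 1.0 / np.sqrt(pop_var + eps)         # scratch1 in the C++ code
    offset_backprop = dy.sum(axis=axes)              # sum(y_backprop)
    scratch2 = (dy * (x - pop_mean)).sum(axis=axes)  # sum(y_backprop * (x - mean))
    scale_backprop = scratch2 * rsqrt_var
    x_backprop = dy * (scale * rsqrt_var)
    return x_backprop, scale_backprop, offset_backprop
```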
@@ -54,17 +54,62 @@ class MklAddNOp : public OpKernel {
    GetMklShape(ctx, 1, &(mkl_context.input2_shape));
    bool input2_in_mkl_format = mkl_context.input2_shape.IsMklTensor();

    // handle the case of a scalar
    if (!input1_in_mkl_format && input0.dims() == 0) {
      const TensorShape& o_shape = input0.shape();
      Tensor* out_tensor = nullptr;
      mkl_context.output_shape.SetMklTensor(false);
      AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
                                mkl_context.output_shape);
      float user_i1 = (input0.scalar<T>()());
      ;
      float user_i2 = (input1.scalar<T>()());
      ;
      out_tensor->scalar<T>()() = std::plus<float>{}(user_i1, user_i2);
      return;
    }

    mkl_context.in_dims = input1_in_mkl_format
                              ? mkl_context.input1_shape.GetDimension()
                              : input0.dims();
    mkl_context.in_dims = input2_in_mkl_format
                              ? mkl_context.input2_shape.GetDimension()
                              : input1.dims();

    // If there is nothing to compute, return.
    if (!input1_in_mkl_format && !input2_in_mkl_format) {
      const TensorShape& o_shape = input0.shape();
      if (o_shape.num_elements() == 0) {
        Tensor* out_tensor = nullptr;
        mkl_context.output_shape.SetMklTensor(false);
        AllocateOutputSetMklShape(ctx, 0, &out_tensor, o_shape,
                                  mkl_context.output_shape);
        return;
      }
    }

    mkl_context.in_sizes = new size_t[mkl_context.in_dims];
    mkl_context.in_strides = new size_t[mkl_context.in_dims];
    // Generate size, stride for input if input is in MKL format.
    ExtractMklOpParams(&mkl_context.in1_sizes,
                       &mkl_context.in1_strides, input0, &mkl_context.input1_shape);
    ExtractMklOpParams(&mkl_context.in2_sizes,
                       &mkl_context.in2_strides, input1, &mkl_context.input2_shape);
    if (input1_in_mkl_format || input2_in_mkl_format) {
      const MklShape* tmp_mkl_shape = (input1_in_mkl_format)
                                          ? &mkl_context.input1_shape
                                          : &mkl_context.input2_shape;
      for (int i = 0; i < mkl_context.in_dims; i++) {
        mkl_context.in_sizes[i] = tmp_mkl_shape->GetSizes()[i];
        mkl_context.in_strides[i] = tmp_mkl_shape->GetStrides()[i];
      }
    } else {
      for (int i = 0; i < mkl_context.in_dims; i++) {
        mkl_context.in_sizes[i] =
            input0.dim_size((mkl_context.in_dims - 1) - i);
      }
      mkl_context.in_strides[0] = 1;
      for (int i = 1; i < mkl_context.in_dims; i++) {
        mkl_context.in_strides[i] =
            mkl_context.in_strides[i - 1] * mkl_context.in_sizes[i - 1];
      }
    }

    std::vector<float> coeff(2, 1.0);
    mkl_context.MklCreateInputLayouts(ctx);
@@ -82,7 +127,7 @@ class MklAddNOp : public OpKernel {
    mkl_context.output_shape.SetMklLayout(mkl_context.Eltwise, dnnResourceDst);

    mkl_context.output_shape.SetTfLayout(
        mkl_context.in_dims, mkl_context.in1_sizes, mkl_context.in1_strides);
        mkl_context.in_dims, mkl_context.in_sizes, mkl_context.in_strides);
    if (input1_in_mkl_format == true) {
      mkl_context.output_shape.SetTfDimOrder(mkl_context.in_dims,
          mkl_context.input1_shape.GetTfToMklDimMap());
@@ -113,44 +158,11 @@ class MklAddNOp : public OpKernel {
    mkl_context.MklCleanup();
  }

  void ExtractMklOpParams(size_t** out_sizes, size_t** out_strides,
                          const Tensor& input, const MklShape* input_shape) {
    bool input_in_mkl_format = input_shape->IsMklTensor();
    int in_dims = input_in_mkl_format
                      ? input_shape->GetDimension()
                      : input.dims();
    size_t* in_sizes = new size_t[in_dims];
    size_t* in_strides = new size_t[in_dims];

    if (input_in_mkl_format) {
      for (int i = 0; i < in_dims; i++) {
        in_sizes[i] = input_shape->GetSizes()[i];
        in_strides[i] = input_shape->GetStrides()[i];
      }
    } else {
      for (int i = 0; i < in_dims; i++) {
        in_sizes[i] =
            input.dim_size((in_dims - 1) - i);
      }
      in_strides[0] = 1;
      for (int i = 1; i < in_dims; i++) {
        in_strides[i] =
            in_strides[i - 1] * in_sizes[i - 1];
      }
    }
    *out_sizes = in_sizes;
    *out_strides = in_strides;
  }

 private:
  typedef struct {
    int in_dims;
    size_t* in1_sizes;
    size_t* in1_strides;

    size_t* in2_sizes;
    size_t* in2_strides;
    size_t* in_sizes = nullptr;
    size_t* in_strides = nullptr;
    dnnPrimitive_t Eltwise = nullptr;
    dnnPrimitiveAttributes_t attributes = nullptr;
    void* Eltwise_res[dnnResourceNumber];
@@ -160,18 +172,16 @@ class MklAddNOp : public OpKernel {
    void MklCreateInputLayouts(OpKernelContext* context) {
      bool input1_in_mkl_format = input1_shape.IsMklTensor();
      if (!input1_in_mkl_format) {
        CHECK_EQ(
            dnnLayoutCreate_F32(&lt_input1, in_dims, in1_sizes, in1_strides),
            E_SUCCESS);
        CHECK_EQ(dnnLayoutCreate_F32(&lt_input1, in_dims, in_sizes, in_strides),
                 E_SUCCESS);
      } else {
        lt_input1 = static_cast<dnnLayout_t>(input1_shape.GetCurLayout());
      }

      bool input2_in_mkl_format = input2_shape.IsMklTensor();
      if (!input2_in_mkl_format) {
        CHECK_EQ(
            dnnLayoutCreate_F32(&lt_input2, in_dims, in2_sizes, in2_strides),
            E_SUCCESS);
        CHECK_EQ(dnnLayoutCreate_F32(&lt_input2, in_dims, in_sizes, in_strides),
                 E_SUCCESS);
      } else {
        lt_input2 = static_cast<dnnLayout_t>(input2_shape.GetCurLayout());
      }
@@ -246,15 +256,15 @@ class MklAddNOp : public OpKernel {
      bool input1_in_mkl_format = input1_shape.IsMklTensor();
      bool input2_in_mkl_format = input2_shape.IsMklTensor();
      dnnDelete_F32(Eltwise);
      if (!input1_in_mkl_format || !input2_in_mkl_format) {
        delete[] in_sizes;
        delete[] in_strides;
      }
      if (!input1_in_mkl_format) {
        dnnLayoutDelete_F32(lt_input1);
        delete [] in1_sizes;
        delete [] in1_strides;
      }
      if (!input2_in_mkl_format) {
        dnnLayoutDelete_F32(lt_input2);
        delete [] in2_sizes;
        delete [] in2_strides;
      }
    }
  } MklAddNOpContext;

@@ -213,7 +213,7 @@ struct LaunchPoolingOp<SYCLDevice, T, MAX> {
  }
};
// MaxPool3DGrad SYCL kernel. Expects the number of threads to be equal to the
// number of elements in the output backprop tenor (i.e. the number of elements
// number of elements in the output backprop tensor (i.e. the number of elements
// in the input data tensor).
//
// For each output backprop element we compute the possible window of values in

@@ -173,7 +173,7 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
  JDIMENSION target_output_width = cinfo.output_width;
  JDIMENSION target_output_height = cinfo.output_height;
  JDIMENSION skipped_scanlines = 0;
#if !defined(WIN32)
#if defined(LIBJPEG_TURBO_VERSION)
  if (flags.crop) {
    // Update target output height and width based on crop window.
    target_output_height = flags.crop_height;
@@ -219,7 +219,7 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
  argball->height_ = target_output_height;
  argball->stride_ = stride;

#if defined(WIN32)
#if !defined(LIBJPEG_TURBO_VERSION)
  uint8* dstdata = nullptr;
  if (flags.crop) {
    dstdata = new JSAMPLE[stride * target_output_height];
@@ -336,7 +336,7 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
  delete[] tempdata;
  tempdata = nullptr;

#if !defined(WIN32)
#if defined(LIBJPEG_TURBO_VERSION)
  if (flags.crop && cinfo.output_scanline < cinfo.output_height) {
    // Skip the rest of scanlines, required by jpeg_destroy_decompress.
    jpeg_skip_scanlines(&cinfo,
@@ -418,7 +418,7 @@ uint8* UncompressLow(const void* srcdata, FewerArgsForCompiler* argball) {
    break;
  }

#if defined(WIN32)
#if !defined(LIBJPEG_TURBO_VERSION)
  // TODO(tanmingxing): delete all these code after migrating to libjpeg_turbo
  // for Windows.
  if (flags.crop) {
@@ -635,15 +635,7 @@ REGISTER_OP("ImmutableConst")
    .Attr("shape: shape")
    .Attr("memory_region_name: string")
    .Output("tensor: dtype")
    .SetShapeFn([](InferenceContext* c) {
      TensorShape shape_from_attr;
      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape_from_attr));
      ShapeHandle output_shape;
      TF_RETURN_IF_ERROR(
          c->MakeShapeFromPartialTensorShape(shape_from_attr, &output_shape));
      c->set_output(0, output_shape);
      return Status::OK();
    })
    .SetShapeFn(shape_inference::ExplicitShape)
    .Doc(R"doc(
Returns immutable tensor from memory region.

@@ -1307,15 +1299,7 @@ REGISTER_OP("_ParallelConcatStart")
    .Attr("shape: shape")
    .Attr("dtype: type")
    .SetIsStateful()
    .SetShapeFn([](InferenceContext* c) {
      PartialTensorShape shape;
      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
      ShapeHandle output_shape;
      TF_RETURN_IF_ERROR(
          c->MakeShapeFromPartialTensorShape(shape, &output_shape));
      c->set_output(0, output_shape);
      return Status::OK();
    })
    .SetShapeFn(shape_inference::ExplicitShape)
    .Doc(R"doc(
Creates an empty Tensor with shape `shape` and type `dtype`.

@@ -3083,14 +3067,7 @@ REGISTER_OP("PlaceholderV2")
    .Output("output: dtype")
    .Attr("dtype: type")
    .Attr("shape: shape")
    .SetShapeFn([](InferenceContext* c) {
      PartialTensorShape shape;
      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
      ShapeHandle output;
      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &output));
      c->set_output(0, output);
      return Status::OK();
    })
    .SetShapeFn(shape_inference::ExplicitShape)
    .Deprecated(23, "Placeholder now behaves the same as PlaceholderV2.")
    .Doc(R"doc(
A placeholder op for a value that will be fed into the computation.

@@ -329,7 +329,7 @@ batch_variance: A 1D Tensor for the computed batch variance, to be used by
reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
  in the gradient computation.
reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
  in the cuDNN case), to be used in the gradient computation.
  in the cuDNN case), to be reused in the gradient computation.
T: The data type for the elements of input and output Tensors.
epsilon: A small float number added to the variance of x.
data_format: The data format for x and y. Either "NHWC" (default) or "NCHW".
@@ -409,10 +409,14 @@ The size of 1D Tensors matches the dimension C of the 4D Tensors.
y_backprop: A 4D Tensor for the gradient with respect to y.
x: A 4D Tensor for input data.
scale: A 1D Tensor for scaling factor, to scale the normalized x.
reserve_space_1: A 1D Tensor for the computed batch mean, to be reused
  in the gradient computation.
reserve_space_2: A 1D Tensor for the computed batch variance (inverted variance
  in the cuDNN case), to be used in the gradient computation.
reserve_space_1: When is_training is True, a 1D Tensor for the computed batch mean
  to be reused in gradient computation.
  When is_training is False, a 1D Tensor for the population mean
  to be reused in both 1st and 2nd order gradient computation.
reserve_space_2: When is_training is True, a 1D Tensor for the computed batch variance
  (inverted variance in the cuDNN case) to be reused in gradient computation.
  When is_training is False, a 1D Tensor for the population variance
  to be reused in both 1st and 2nd order gradient computation.
x_backprop: A 4D Tensor for the gradient with respect to x.
scale_backprop: A 1D Tensor for the gradient with respect to scale.
offset_backprop: A 1D Tensor for the gradient with respect to offset.
@@ -15867,6 +15867,25 @@ op {
  }
  summary: "Transforms a serialized tensorflow.TensorProto proto into a Tensor."
}
op {
  name: "SerializeTensor"
  input_arg {
    name: "tensor"
    description: "A Tensor of type `T`."
    type_attr: "T"
  }
  output_arg {
    name: "serialized"
    description: "A serialized TensorProto proto of the input tensor."
    type: DT_STRING
  }
  attr {
    name: "T"
    type: "type"
    description: "The type of the input tensor."
  }
  summary: "Transforms a Tensor into a serialized TensorProto proto."
}
op {
  name: "Placeholder"
  output_arg {
@@ -28,15 +28,7 @@ REGISTER_OP("VariableV2")
    .Attr("container: string = ''")
    .Attr("shared_name: string = ''")
    .SetIsStateful()
    .SetShapeFn([](InferenceContext* c) {
      PartialTensorShape shape;
      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
      ShapeHandle output_shape;
      TF_RETURN_IF_ERROR(
          c->MakeShapeFromPartialTensorShape(shape, &output_shape));
      c->set_output(0, output_shape);
      return Status::OK();
    })
    .SetShapeFn(shape_inference::ExplicitShape)
    .Doc(R"doc(
Holds state in the form of a tensor that persists across steps.

@@ -99,14 +91,7 @@ REGISTER_OP("TemporaryVariable")
    .Attr("dtype: type")
    .Attr("var_name: string = ''")
    .SetIsStateful()
    .SetShapeFn([](InferenceContext* c) {
      PartialTensorShape shape;
      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
      ShapeHandle output;
      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &output));
      c->set_output(0, output);
      return Status::OK();
    })
    .SetShapeFn(shape_inference::ExplicitShape)
    .Doc(R"doc(
Returns a tensor that may be mutated, but only persists within a single step.

@@ -255,7 +255,6 @@ class CPUIDInfo {
  int model_num() { return model_num_; }

 private:
  int highest_eax_;
  int have_adx_ : 1;
  int have_aes_ : 1;
  int have_avx_ : 1;

@@ -19,12 +19,12 @@ limitations under the License.
// TensorFlow uses semantic versioning, see http://semver.org/.

#define TF_MAJOR_VERSION 1
#define TF_MINOR_VERSION 4
#define TF_MINOR_VERSION 3
#define TF_PATCH_VERSION 0

// TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
// "-beta", "-rc", "-rc.1")
#define TF_VERSION_SUFFIX "-dev"
#define TF_VERSION_SUFFIX ""

#define TF_STR_HELPER(x) #x
#define TF_STR(x) TF_STR_HELPER(x)

@@ -102,6 +102,7 @@ Feature columns provide a mechanism to map data to a model.
* @{tf.contrib.layers.sparse_column_with_hash_bucket}
* @{tf.contrib.layers.sparse_column_with_integerized_feature}
* @{tf.contrib.layers.sparse_column_with_keys}
* @{tf.contrib.layers.sparse_column_with_vocabulary_file}
* @{tf.contrib.layers.weighted_sparse_column}
* @{tf.contrib.layers.weighted_sum_from_feature_columns}
* @{tf.contrib.layers.infer_real_valued_columns}

@@ -496,6 +496,6 @@ that allow the user to change the input pipeline without rebuilding the graph or
session.

Note: Regardless of the implementation, many
operations (like ${tf.layers.batch_normalization}, and @{tf.layers.dropout})
operations (like @{tf.layers.batch_normalization}, and @{tf.layers.dropout})
need to know if they are in training or evaluation mode, and you must be
careful to set this appropriately if you change the data source.
@@ -444,19 +444,19 @@ Now that you know how to build a basic (and somewhat restricted) op and
implementation, we'll look at some of the more complicated things you will
typically need to build into your op. This includes:

* [Conditional checks and validation](#validate)
* Op registration
* [Conditional checks and validation](#conditional_checks_and_validation)
* [Op registration](#op_registration)
* [Attrs](#attrs)
* [Attr types](#attr-types)
* [Attr types](#attr_types)
* [Polymorphism](#polymorphism)
* [Inputs and outputs](#inputs-outputs)
* [Backwards compatibility](#backward-compat)
* [GPU support](#gpu-support)
* [Compiling the kernel for the GPU device](#compiling-kernel)
* [Implement the gradient in Python](#implement-gradient)
* [Shape functions in C++](#shape-functions)
* [Inputs and outputs](#inputs_and_outputs)
* [Backwards compatibility](#backwards_compatibility)
* [GPU support](#gpu_support)
* [Compiling the kernel for the GPU device](#compiling_the_kernel_for_the_gpu_device)
* [Implement the gradient in Python](#implement_the_gradient_in_python)
* [Shape functions in C++](#shape_functions_in_c)

### Conditional checks and validation {#validate}
### Conditional checks and validation

The example above assumed that the op applied to a tensor of any shape. What
if it only applied to vectors? That means adding a check to the above OpKernel
@@ -497,7 +497,7 @@ function on error.

### Op registration

#### Attrs {#attrs}
#### Attrs

Ops can have attrs, whose values are set when the op is added to a graph. These
are used to configure the op, and their values can be accessed both within the
@@ -519,7 +519,7 @@ using the `Attr` method, which expects a spec of the form:

where `<name>` begins with a letter and can be composed of alphanumeric
characters and underscores, and `<attr-type-expr>` is a type expression of the
form [described below](#attr-types).
form [described below](#attr_types).

For example, if you'd like the `ZeroOut` op to preserve a user-specified index,
instead of only the 0th element, you can register the op like so:
@@ -530,7 +530,7 @@ REGISTER\_OP("ZeroOut")
    .Output("zeroed: int32");
</code></pre>

(Note that the set of [attribute types](#attr-types) is different from the
(Note that the set of [attribute types](#attr_types) is different from the
@{tf.DType$tensor types} used for inputs and outputs.)

Your kernel can then access this attr in its constructor via the `context`
@@ -574,7 +574,7 @@ which can then be used in the `Compute` method:
  }
</code></pre>

#### Attr types {#attr-types}
#### Attr types

The following types are supported in an attr:

@@ -707,7 +707,7 @@ REGISTER_OP("AttrDefaultExampleForAllTypes")
Note in particular that the values of type `type`
use @{tf.DType$the `DT_*` names for the types}.

#### Polymorphism {#polymorphism}
#### Polymorphism

##### Type Polymorphism

@@ -1009,7 +1009,7 @@ REGISTER_OP("MinimumLengthPolymorphicListExample")
    .Output("out: T");
```

#### Inputs and Outputs {#inputs-outputs}
#### Inputs and Outputs

To summarize the above, an op registration can have multiple inputs and outputs:

@@ -1110,7 +1110,7 @@ expressions:
For more details, see
[`tensorflow/core/framework/op_def_builder.h`][op_def_builder].

#### Backwards compatibility {#backward-compat}
#### Backwards compatibility

Let's assume you have written a nice, custom op and shared it with others, so
you have happy customers using your operation. However, you'd like to make
@@ -1172,7 +1172,7 @@ new optional arguments to the end. Generally incompatible changes may only be
made when TensorFlow's changes major versions, and must conform to the
@{$version_compat#compatibility_of_graphs_and_checkpoints$`GraphDef` version semantics}.

### GPU Support {#gpu-support}
### GPU Support

You can implement different OpKernels and register one for CPU and another for
GPU, just like you can [register kernels for different types](#polymorphism).
@@ -1204,7 +1204,7 @@ kept on the CPU, add a `HostMemory()` call to the kernel registration, e.g.:
                        PadOp<GPUDevice, T>)
```

#### Compiling the kernel for the GPU device {#compiling-kernel}
#### Compiling the kernel for the GPU device

Look at
[cuda_op_kernel.cu.cc](https://www.tensorflow.org/code/tensorflow/examples/adding_an_op/cuda_op_kernel.cu.cc)
@@ -1237,7 +1237,7 @@ For example, add `-L /usr/local/cuda-8.0/lib64/` if your CUDA is installed in

> Note in some linux settings, additional options to `nvcc` compiling step are needed. Add `-D_MWAITXINTRIN_H_INCLUDED` to the `nvcc` command line to avoid errors from `mwaitxintrin.h`.

### Implement the gradient in Python {#implement-gradient}
### Implement the gradient in Python

Given a graph of ops, TensorFlow uses automatic differentiation
(backpropagation) to add new ops representing gradients with respect to the
@@ -1317,7 +1317,7 @@ Note that at the time the gradient function is called, only the data flow graph
of ops is available, not the tensor data itself. Thus, all computation must be
performed using other tensorflow ops, to be run at graph execution time.

### Shape functions in C++ {#shape-functions}
### Shape functions in C++

The TensorFlow API has a feature called "shape inference" that provides
information about the shapes of tensors without having to execute the
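The adding-an-op excerpt above notes that a Python gradient must be built entirely from TensorFlow ops so it can run at graph execution time. As an illustration only (essentially the guide's own `ZeroOut` example, lightly condensed; not part of this diff), a gradient registration looks like:

```
import tensorflow as tf

@tf.RegisterGradient("ZeroOut")
def _zero_out_grad(op, grad):
  """Gradient for the guide's hypothetical ZeroOut op, built only from TF ops."""
  to_zero = op.inputs[0]
  shape = tf.shape(to_zero)
  index = tf.zeros_like(shape)
  # Only the 0th input element reaches the output, so it receives all the gradient.
  first_grad = tf.reshape(grad, [-1])[0]
  to_zero_grad = tf.sparse_to_dense([index], shape, first_grad, 0)
  return [to_zero_grad]
```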
@ -180,11 +180,11 @@ You can think of it as converting tallies
|
||||
of evidence into probabilities of our input being in each class.
|
||||
It's defined as:
|
||||
|
||||
$$\text{softmax}(x) = \text{normalize}(\exp(x))$$
|
||||
$$\text{softmax}(evidence) = \text{normalize}(\exp(evidence))$$
|
||||
|
||||
If you expand that equation out, you get:
|
||||
|
||||
$$\text{softmax}(x)_i = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$$
|
||||
$$\text{softmax}(evidence)_i = \frac{\exp(evidence_i)}{\sum_j \exp(evidence_j)}$$
|
||||
|
||||
But it's often more helpful to think of softmax the first way: exponentiating
|
||||
its inputs and then normalizing them. The exponentiation means that one more
|
||||
|
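For readers of the MNIST tutorial hunk above, a tiny NumPy illustration of that definition (not part of the diff; the max-shift is an extra, commonly used numerical-stability safeguard):

```
import numpy as np

def softmax(evidence):
    # Exponentiate the evidence, then normalize so the outputs sum to 1.
    exp = np.exp(evidence - np.max(evidence))  # shift for numerical stability
    return exp / exp.sum()

print(softmax(np.array([2.0, 1.0, 0.1])))  # approx. [0.659 0.242 0.099]
```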
@@ -35,7 +35,7 @@ enable TensorFlow for C:
         OS="linux" # Change to "darwin" for Mac OS
         TARGET_DIRECTORY="/usr/local"
         curl -L \
           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.4.0-dev.tar.gz" |
           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.3.0.tar.gz" |
         sudo tar -C $TARGET_DIRECTORY -xz

     The `tar` command extracts the TensorFlow C library into the `lib`

@@ -35,7 +35,7 @@ steps to install this library and enable TensorFlow for Go:
         TF_TYPE="cpu" # Change to "gpu" for GPU support
         TARGET_DIRECTORY='/usr/local'
         curl -L \
           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.4.0-dev.tar.gz" |
           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.3.0.tar.gz" |
         sudo tar -C $TARGET_DIRECTORY -xz

     The `tar` command extracts the TensorFlow C library into the `lib`

@@ -34,7 +34,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
<dependency>
  <groupId>org.tensorflow</groupId>
  <artifactId>tensorflow</artifactId>
  <version>1.4.0-dev</version>
  <version>1.3.0</version>
</dependency>
```

@@ -63,7 +63,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
       <dependency>
         <groupId>org.tensorflow</groupId>
         <artifactId>tensorflow</artifactId>
         <version>1.4.0-dev</version>
         <version>1.3.0</version>
       </dependency>
     </dependencies>
   </project>
@@ -122,7 +122,7 @@ refer to the simpler instructions above instead.
Take the following steps to install TensorFlow for Java on Linux or Mac OS:

1. Download
   [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-dev.jar),
   [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0.jar),
   which is the TensorFlow Java Archive (JAR).

2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -141,7 +141,7 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
         OS=$(uname -s | tr '[:upper:]' '[:lower:]')
         mkdir -p ./jni
         curl -L \
           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.4.0-dev.tar.gz" |
           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.3.0.tar.gz" |
         tar -xz -C ./jni

### Install on Windows
@@ -149,10 +149,10 @@ Take the following steps to install TensorFlow for Java on Linux or Mac OS:
Take the following steps to install TensorFlow for Java on Windows:

1. Download
   [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.4.0-dev.jar),
   [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.3.0.jar),
   which is the TensorFlow Java Archive (JAR).
2. Download the following Java Native Interface (JNI) file appropriate for
   [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.4.0-dev.zip).
   [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.3.0.zip).
3. Extract this .zip file.

@@ -200,7 +200,7 @@ must be part of your `classpath`. For example, you can include the
downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
as follows:

<pre><b>javac -cp libtensorflow-1.4.0-dev.jar HelloTF.java</b></pre>
<pre><b>javac -cp libtensorflow-1.3.0.jar HelloTF.java</b></pre>

### Running
@@ -214,11 +214,11 @@ two files are available to the JVM:
For example, the following command line executes the `HelloTF` program on Linux
and Mac OS X:

<pre><b>java -cp libtensorflow-1.4.0-dev.jar:. -Djava.library.path=./jni HelloTF</b></pre>
<pre><b>java -cp libtensorflow-1.3.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>

And the following command line executes the `HelloTF` program on Windows:

<pre><b>java -cp libtensorflow-1.4.0-dev.jar;. -Djava.library.path=jni HelloTF</b></pre>
<pre><b>java -cp libtensorflow-1.3.0.jar;. -Djava.library.path=jni HelloTF</b></pre>

If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
installed TensorFlow for Java and are ready to use the API. If the program

@@ -445,7 +445,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
2. Create a conda environment named <tt>tensorflow</tt> to run a version
   of Python by invoking the following command:

     <pre>$ <b>conda create -n tensorflow</b></pre>
     <pre>$ <b>conda create -n tensorflow python=2.7 # or python=3.3, etc.</b></pre>

3. Activate the conda environment by issuing the following command:

@@ -321,7 +321,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
2. Create a conda environment named `tensorflow`
   by invoking the following command:

     <pre>$ <b>conda create -n tensorflow</b></pre>
     <pre>$ <b>conda create -n tensorflow python=2.7 # or python=3.3, etc.</b></pre>

3. Activate the conda environment by issuing the following command:

@@ -26,7 +26,7 @@ on API >= 14 devices.
        in an overlay on the camera image.
2. [TF Detect](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/android/src/org/tensorflow/demo/DetectorActivity.java):
        Demonstrates an SSD-Mobilenet model trained using the
        [Tensorflow Object Detection API](https://github.com/tensorflow/models/tree/master/object_detection/)
        [Tensorflow Object Detection API](https://github.com/tensorflow/models/tree/master/research/object_detection/)
        introduced in [Speed/accuracy trade-offs for modern convolutional object detectors](https://arxiv.org/abs/1611.10012) to
        localize and track objects (from 80 categories) in the camera preview
        in real-time.

@@ -32,6 +32,5 @@ py_test(
    deps = [
        "//tensorflow:tensorflow_py",
        "//third_party/py/numpy",
        "//third_party/py/pandas",
    ],
)
@@ -75,6 +75,7 @@ import os.path
import sys

import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

import input_data
@@ -113,8 +114,8 @@ def main(_):
  # example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
  # will run 13,000 training loops in total, with a rate of 0.001 for the first
  # 10,000, and 0.0001 for the final 3,000.
  training_steps_list = map(int, FLAGS.how_many_training_steps.split(','))
  learning_rates_list = map(float, FLAGS.learning_rate.split(','))
  training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
  learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
  if len(training_steps_list) != len(learning_rates_list):
    raise Exception(
        '--how_many_training_steps and --learning_rate must be equal length '
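The hunk above wraps map() in list() so the later len() checks keep working under Python 3, where map() returns a lazy iterator rather than a list. A quick illustration (not part of the diff):

```
steps = map(int, "10000,3000".split(','))
# len(steps) raises TypeError on Python 3: map objects have no length.
steps = list(map(int, "10000,3000".split(',')))
print(len(steps), steps)  # 2 [10000, 3000]
```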
@@ -89,6 +89,10 @@ func (s *Session) Run(feeds map[Output]*Tensor, fetches []Output, targets []*Ope
		ptrOutput(c.fetches), ptrTensor(c.fetchTensors), C.int(len(fetches)),
		ptrOperation(c.targets), C.int(len(targets)),
		nil, status.c)

	// Make sure GC won't harvest input tensors until SessionRun() is finished
	runtime.KeepAlive(feeds)

	if err := status.Err(); err != nil {
		return nil, err
	}

@@ -100,7 +100,7 @@ func NewTensor(value interface{}) (*Tensor, error) {
		}
	} else {
		e := stringEncoder{offsets: buf, data: raw[nflattened*8 : len(raw)], status: newStatus()}
		if e.encode(reflect.ValueOf(value)); err != nil {
		if err := e.encode(reflect.ValueOf(value)); err != nil {
			return nil, err
		}
		if int64(buf.Len()) != nflattened*8 {
@@ -25,6 +25,7 @@ import java.nio.FloatBuffer;
import java.nio.IntBuffer;
import java.nio.LongBuffer;
import java.util.Arrays;
import java.util.HashMap;

/**
 * A typed multi-dimensional array.
@@ -97,9 +98,19 @@ public final class Tensor implements AutoCloseable {
   * using {@link #create(DataType, long[], ByteBuffer)} instead.
   */
  public static Tensor create(Object obj) {
    return create(obj, dataTypeOf(obj));
  }

  /**
   * Create a Tensor of data type {@code dtype} from a Java object.
   *
   * @param dtype the intended tensor data type. It must match the run-time type of the object.
   */
  static Tensor create(Object obj, DataType dtype) {
    Tensor t = new Tensor();
    t.dtype = dataTypeOf(obj);
    t.shapeCopy = new long[numDimensions(obj)];
    t.dtype = dtype;
    t.shapeCopy = new long[numDimensions(obj, dtype)];
    assert objectCompatWithType(obj, dtype);
    fillShape(obj, 0, t.shapeCopy);
    if (t.dtype != DataType.STRING) {
      int byteSize = elemByteSize(t.dtype) * numElements(t.shapeCopy);
@@ -190,8 +201,7 @@ public final class Tensor implements AutoCloseable {
   *
   * <p>Creates a Tensor with the provided shape of any type where the tensor's data has been
   * encoded into {@code data} as per the specification of the TensorFlow <a
   * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C
   * API</a>.
   * href="https://www.tensorflow.org/code/tensorflow/c/c_api.h">C API</a>.
   *
   * @param dataType the tensor datatype.
   * @param shape the tensor shape.
@@ -537,56 +547,70 @@ public final class Tensor implements AutoCloseable {
    }
  }

  private static HashMap<Class<?>, DataType> classDataTypes = new HashMap<>();

  static {
    classDataTypes.put(int.class, DataType.INT32);
    classDataTypes.put(Integer.class, DataType.INT32);
    classDataTypes.put(long.class, DataType.INT64);
    classDataTypes.put(Long.class, DataType.INT64);
    classDataTypes.put(float.class, DataType.FLOAT);
    classDataTypes.put(Float.class, DataType.FLOAT);
    classDataTypes.put(double.class, DataType.DOUBLE);
    classDataTypes.put(Double.class, DataType.DOUBLE);
    classDataTypes.put(byte.class, DataType.STRING);
    classDataTypes.put(Byte.class, DataType.STRING);
    classDataTypes.put(boolean.class, DataType.BOOL);
    classDataTypes.put(Boolean.class, DataType.BOOL);
  }

  private static DataType dataTypeOf(Object o) {
    if (o.getClass().isArray()) {
      if (Array.getLength(o) == 0) {
        throw new IllegalArgumentException("cannot create Tensors with a 0 dimension");
      }
      // byte[] is a DataType.STRING scalar.
      Object e = Array.get(o, 0);
      if (e == null) {
        throwExceptionIfNotByteOfByteArrays(o);
        return DataType.STRING;
      }
      if (Byte.class.isInstance(e) || byte.class.isInstance(e)) {
        return DataType.STRING;
      }
      return dataTypeOf(e);
    Class<?> c = o.getClass();
    while (c.isArray()) {
      c = c.getComponentType();
    }
    if (Float.class.isInstance(o) || float.class.isInstance(o)) {
      return DataType.FLOAT;
    } else if (Double.class.isInstance(o) || double.class.isInstance(o)) {
      return DataType.DOUBLE;
    } else if (Integer.class.isInstance(o) || int.class.isInstance(o)) {
      return DataType.INT32;
    } else if (Long.class.isInstance(o) || long.class.isInstance(o)) {
      return DataType.INT64;
    } else if (Boolean.class.isInstance(o) || boolean.class.isInstance(o)) {
      return DataType.BOOL;
    } else {
      throw new IllegalArgumentException("cannot create Tensors of " + o.getClass().getName());
    DataType ret = classDataTypes.get(c);
    if (ret != null) {
      return ret;
    }
    throw new IllegalArgumentException("cannot create Tensors of type " + c.getName());
  }

  private static int numDimensions(Object o) {
    if (o.getClass().isArray()) {
      Object e = Array.get(o, 0);
      if (e == null) {
        throwExceptionIfNotByteOfByteArrays(o);
        return 1;
      } else if (Byte.class.isInstance(e) || byte.class.isInstance(e)) {
        return 0;
      }
      return 1 + numDimensions(e);
  /**
   * Returns the number of dimensions of a tensor of type dtype when represented by the object o.
   */
  private static int numDimensions(Object o, DataType dtype) {
    int ret = numArrayDimensions(o);
    if (dtype == DataType.STRING && ret > 0) {
      return ret - 1;
    }
    return 0;
    return ret;
  }

  /** Returns the number of dimensions of the array object o. Returns 0 if o is not an array. */
  private static int numArrayDimensions(Object o) {
    Class<?> c = o.getClass();
    int i = 0;
    while (c.isArray()) {
      c = c.getComponentType();
      i++;
    }
    return i;
  }

  /**
   * Fills in the remaining entries in the shape array starting from position {@code dim} with the
   * dimension sizes of the multidimensional array o. Checks that all arrays reachable from o have
   * sizes consistent with the filled-in shape, throwing IllegalArgumentException otherwise.
   */
  private static void fillShape(Object o, int dim, long[] shape) {
    if (shape == null || dim == shape.length) {
      return;
    }
    final int len = Array.getLength(o);
    if (len == 0) {
      throw new IllegalArgumentException("cannot create Tensors with a 0 dimension");
    }
    if (shape[dim] == 0) {
      shape[dim] = len;
    } else if (shape[dim] != len) {
@@ -598,15 +622,27 @@ public final class Tensor implements AutoCloseable {
    }
  }

  /** Returns whether the object {@code obj} can represent a tensor with data type {@code dtype}. */
  private static boolean objectCompatWithType(Object obj, DataType dtype) {
    DataType dto = dataTypeOf(obj);
    if (dto.equals(dtype)) {
      return true;
    }
    if (dto == DataType.STRING && dtype == DataType.UINT8) {
      return true;
    }
    return false;
  }

  private void throwExceptionIfTypeIsIncompatible(Object o) {
    final int rank = numDimensions();
    final int oRank = numDimensions(o);
    final int oRank = numDimensions(o, dtype);
    if (oRank != rank) {
      throw new IllegalArgumentException(
          String.format(
              "cannot copy Tensor with %d dimensions into an object with %d", rank, oRank));
    }
    if (dataTypeOf(o) != dtype) {
    if (!objectCompatWithType(o, dtype)) {
      throw new IllegalArgumentException(
          String.format(
              "cannot copy Tensor with DataType %s into an object of type %s",
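One subtlety in the Tensor.java hunk above: the reported rank now depends on the requested dtype, because for STRING tensors the innermost byte[] is the scalar element. A hypothetical Python sketch of that rule (names are illustrative, not TensorFlow API):

```
def tensor_rank(java_array_rank, dtype):
    # For STRING tensors the innermost byte[] is one scalar element,
    # so the tensor rank is one less than the Java array rank.
    if dtype == "STRING" and java_array_rank > 0:
        return java_array_rank - 1
    return java_array_rank

print(tensor_rank(1, "STRING"))  # byte[]   -> rank 0 (a single scalar string)
print(tensor_rank(2, "STRING"))  # byte[][] -> rank 1 (a vector of strings)
print(tensor_rank(1, "UINT8"))   # byte[] as UINT8 -> rank 1 vector
```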
@@ -410,6 +410,19 @@ public class TensorTest {
    }
  }

  @Test
  public void testUInt8Tensor() {
    byte[] vector = new byte[] { 1, 2, 3, 4 };
    try (Tensor t = Tensor.create(vector, DataType.UINT8)) {
      assertEquals(DataType.UINT8, t.dataType());
      assertEquals(1, t.numDimensions());
      assertArrayEquals(new long[] {4}, t.shape());

      byte[] got = t.copyTo(new byte[4]);
      assertArrayEquals(got, vector);
    }
  }

  @Test
  public void failCreateOnMismatchedDimensions() {
    int[][][] invalid = new int[3][1][];