From 302358459136384bb23c075b633f4c0e3159df49 Mon Sep 17 00:00:00 2001
From: Clayne Robison
Date: Tue, 21 May 2019 16:46:36 -0700
Subject: [PATCH] Add Dockerfile partials to support Mkl + MPI + Horovod; Remove
 trailing whitespace from python.partial.Dockerfile

---
 .../partials/devel-horovod.partial.Dockerfile |  3 ++
 .../partials/horovod.partial.Dockerfile       |  2 +
 .../partials/mpi.partial.Dockerfile           | 44 ++++++++++++++++++
 .../partials/ubuntu/python.partial.Dockerfile |  2 +-
 tensorflow/tools/dockerfiles/spec.yml         | 33 ++++++++++++-
 .../dockerfiles/tests/build-mkl-horovod.sh    | 46 +++++++++++++++++++
 .../dockerfiles/tests/import-mkl-horovod.sh   | 18 ++++++++
 7 files changed, 146 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/tools/dockerfiles/partials/devel-horovod.partial.Dockerfile
 create mode 100644 tensorflow/tools/dockerfiles/partials/horovod.partial.Dockerfile
 create mode 100644 tensorflow/tools/dockerfiles/partials/mpi.partial.Dockerfile
 create mode 100755 tensorflow/tools/dockerfiles/tests/build-mkl-horovod.sh
 create mode 100755 tensorflow/tools/dockerfiles/tests/import-mkl-horovod.sh

diff --git a/tensorflow/tools/dockerfiles/partials/devel-horovod.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/devel-horovod.partial.Dockerfile
new file mode 100644
index 00000000000..dab42914df3
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/devel-horovod.partial.Dockerfile
@@ -0,0 +1,3 @@
+# Check out horovod source code if --build-arg CHECKOUT_HOROVOD_SRC=1
+ARG CHECKOUT_HOROVOD_SRC=0
+RUN test "${CHECKOUT_HOROVOD_SRC}" -eq 1 && git clone --recursive https://github.com/uber/horovod.git /horovod_src || true
diff --git a/tensorflow/tools/dockerfiles/partials/horovod.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/horovod.partial.Dockerfile
new file mode 100644
index 00000000000..b8b6aab3af2
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/horovod.partial.Dockerfile
@@ -0,0 +1,2 @@
+# Install Horovod
+RUN ${PIP} install --no-cache-dir horovod
diff --git a/tensorflow/tools/dockerfiles/partials/mpi.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/mpi.partial.Dockerfile
new file mode 100644
index 00000000000..5c0de90549f
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/mpi.partial.Dockerfile
@@ -0,0 +1,44 @@
+# Install libnuma, openssh and wget via apt-get, else yum; each alternative is a subshell so || and && group as intended
+RUN ( apt-get update && apt-get install -y --no-install-recommends --fix-missing \
+        libnuma-dev \
+        openssh-server \
+        openssh-client \
+        wget && \
+      apt-get clean && \
+      rm -rf /var/lib/apt/lists/* ) || \
+    ( yum -y update && yum -y install \
+        numactl-devel \
+        openssh-server \
+        openssh-clients \
+        wget && \
+      yum clean all ) || \
+    ( echo "Unsupported Linux distribution. Aborting!" && exit 1 )
+
+# Install Open MPI 4.0.0 from source; the build tree is removed in the same layer
+RUN mkdir /tmp/openmpi && \
+    cd /tmp/openmpi && \
+    wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-4.0.0.tar.gz && \
+    tar zxf openmpi-4.0.0.tar.gz && \
+    cd openmpi-4.0.0 && \
+    ./configure --enable-orterun-prefix-by-default && \
+    make -j $(nproc) all && \
+    make install && \
+    ldconfig && \
+    rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
+    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
+    echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
+    chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI with good defaults: exclude the lo and docker0 interfaces from the TCP BTL
+RUN echo "btl_tcp_if_exclude = lo,docker0" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Create the sshd runtime directory; OpenSSH is what MPI uses to communicate between containers
+RUN mkdir -p /var/run/sshd
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new && \
+    echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new && \
+    mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile
b/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile index 6af47319538..602bdbf5606 100644 --- a/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile +++ b/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile @@ -15,4 +15,4 @@ RUN ${PIP} --no-cache-dir install --upgrade \ setuptools # Some TF tools expect a "python" binary -RUN ln -s $(which ${PYTHON}) /usr/local/bin/python +RUN ln -s $(which ${PYTHON}) /usr/local/bin/python diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml index 6fddfe000c6..ea5a70222f5 100644 --- a/tensorflow/tools/dockerfiles/spec.yml +++ b/tensorflow/tools/dockerfiles/spec.yml @@ -1,5 +1,5 @@ header: | - # Copyright 2018 The TensorFlow Authors. All Rights Reserved. + # Copyright 2019 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -83,6 +83,21 @@ slice_sets: - ubuntu/python - tensorflow - shell + - add_to_name: "-horovod" + dockerfile_exclusive_name: "horovod" + dockerfile_subdirectory: "mkl" + partials: + - ubuntu/version + - ubuntu/cpu + - ubuntu/python + - tensorflow + - mpi + - horovod + - shell + tests: + - import-mkl-horovod.sh + args: + - TF_PACKAGE=intel-tensorflow - add_to_name: "-gpu" dockerfile_exclusive_name: "gpu" args: @@ -110,6 +125,22 @@ slice_sets: - build-cpu.sh args: - CHECKOUT_TF_SRC=1 + - add_to_name: "devel-horovod" + dockerfile_exclusive_name: "devel-horovod" + dockerfile_subdirectory: "mkl" + partials: + - ubuntu/version + - ubuntu/devel-cpu + - ubuntu/python + - ubuntu/bazel + - mpi + - devel-horovod + - shell + tests: + - build-mkl-horovod.sh + args: + - CHECKOUT_TF_SRC=1 + - CHECKOUT_HOROVOD_SRC=1 - add_to_name: "devel-gpu" dockerfile_exclusive_name: "devel-gpu" partials: diff --git a/tensorflow/tools/dockerfiles/tests/build-mkl-horovod.sh 
b/tensorflow/tools/dockerfiles/tests/build-mkl-horovod.sh
new file mode 100755
index 00000000000..62c2ffbc471
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/build-mkl-horovod.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Builds TensorFlow with MKL from source, then builds and installs Horovod against it.
+# Every step is its own statement: `set -e` does not abort on failures inside `&&` chains.
+# Download and build TensorFlow.
+set -euxo pipefail
+git clone --branch=master --depth=1 https://github.com/tensorflow/tensorflow.git /tensorflow
+cd /tensorflow
+# Link a "python" binary only when the image does not already provide one (plain `ln -s` fails if it exists).
+[ -e /usr/local/bin/python ] || ln -s "$(which "${PYTHON}")" /usr/local/bin/python
+
+# Build TensorFlow with support for Intel(R) MKL-DNN
+# `yes` exits non-zero once configure.py closes the pipe; `|| true` keeps pipefail from failing this step.
+{ yes "" || true; } | ${PYTHON} configure.py
+bazel build -c opt --config=mkl --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+    tensorflow/tools/pip_package:build_pip_package
+bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip
+pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl
+rm -rf /tmp/pip
+rm -rf /root/.cache
+
+# Download and build Horovod
+git clone --recursive https://github.com/uber/horovod.git
+cd horovod
+# TensorFlow-only build; --dist-dir makes setup.py write the archive to ./sdist (its default is ./dist)
+export HOROVOD_WITHOUT_PYTORCH=1
+export HOROVOD_WITH_TENSORFLOW=1
+python setup.py sdist --dist-dir sdist
+pip --no-cache-dir install --upgrade sdist/horovod*.tar.gz
+rm -rf sdist
+rm -rf /root/.cache
diff --git a/tensorflow/tools/dockerfiles/tests/import-mkl-horovod.sh
b/tensorflow/tools/dockerfiles/tests/import-mkl-horovod.sh
new file mode 100755
index 00000000000..b1cae48c6ee
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/import-mkl-horovod.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+# Smoke test: exit 1 unless TensorFlow reports MKL enabled, then check that Horovod's TensorFlow bindings import.
+python -c 'import sys; from tensorflow.python import pywrap_tensorflow as tf_wrap; sys.exit(1) if not tf_wrap.IsMklEnabled() else None; import horovod.tensorflow'