Merge pull request from Intel-tensorflow:ganand1/master_icx_launch

PiperOrigin-RevId: 350040548
Change-Id: I7bc616fb7d1e01e073b3d4da3977be2e5523f71f
This commit is contained in:
TensorFlower Gardener 2021-01-04 13:16:59 -08:00
commit df7465d3f1
4 changed files with 141 additions and 19 deletions

View File

@ -16,10 +16,14 @@ ARG ENABLE_SECURE_BUILD
ARG BAZEL_VERSION=""
ARG ENABLE_DNNL1=""
ARG ENABLE_HOROVOD=""
ARG ENABLE_GCC8=""
ARG OPENMPI_VERSION=""
ARG OPENMPI_DOWNLOAD_URL=""
ARG HOROVOD_VERSION=""
ARG INSTALL_HOROVOD_FROM_COMMIT=""
ARG BUILD_SSH=""
ARG TF_NIGHTLY_FLAG=""
ARG RELEASE_CONTAINER=""
ENV DEBIAN_FRONTEND=noninteractive
@ -31,6 +35,15 @@ RUN if [ "${BAZEL_VERSION}" != "" ]; then \
rm -rf bazel-$BAZEL_VERSION-installer-linux-x86_64.sh; \
fi
# Optionally install GCC 8; this step is a no-op unless the ENABLE_GCC8 build
# argument is "yes".
# gcc-8/g++-8 are installed after adding the ubuntu-toolchain-r/test PPA. Both
# gcc-8 (priority 80) and gcc-7 (priority 70) are then registered with
# update-alternatives, each carrying its matching g++ and gcov as slave links,
# so the higher-priority gcc-8 group is the one selected in automatic mode.
# NOTE(review): the gcc-7 entry presumes gcc-7/g++-7/gcov-7 already exist in
# the base image -- confirm.
RUN if [ "${ENABLE_GCC8}" = "yes" ]; then \
add-apt-repository ppa:ubuntu-toolchain-r/test -y && \
apt-get update && \
apt-get install gcc-8 g++-8 -y && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-8 80 --slave /usr/bin/g++ g++ /usr/bin/g++-8 --slave /usr/bin/gcov gcov /usr/bin/gcov-8 && \
update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-7 70 --slave /usr/bin/g++ g++ /usr/bin/g++-7 --slave /usr/bin/gcov gcov /usr/bin/gcov-7 ;\
fi
# Download and build TensorFlow from the latest sources found in the root container
# make sure that if they pass in a tag, that it is loaded or we'll get an error
WORKDIR /
@ -66,10 +79,17 @@ RUN bazel --bazelrc=/root/.bazelrc build -c opt \
# Optionally install Open MPI and Horovod (ENABLE_HOROVOD=yes).
# The Horovod-related build arguments are forwarded to the install script as
# environment variables. The script is invoked exactly once, with the full set
# of variables (including BUILD_SSH and INSTALL_HOROVOD_FROM_COMMIT), and is
# removed from the image afterwards.
COPY install_openmpi_horovod.sh .
RUN if [ "${ENABLE_HOROVOD}" = "yes" ]; then \
    chmod +x install_openmpi_horovod.sh && \
    OPENMPI_VERSION=${OPENMPI_VERSION} OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL} BUILD_SSH=${BUILD_SSH} \
    INSTALL_HOROVOD_FROM_COMMIT=${INSTALL_HOROVOD_FROM_COMMIT} HOROVOD_VERSION=${HOROVOD_VERSION} ./install_openmpi_horovod.sh && \
    rm -rf install_openmpi_horovod.sh; \
    fi
# Remove crypto python packages for software compliance check.
# Runs only when the RELEASE_CONTAINER build argument is "yes": uninstalls the
# `cryptography` pip package, then deletes the pycrypto 2.6.1 egg-info
# directory from the system dist-packages.
# NOTE(review): ${PIP} is not defined in this excerpt -- presumably set earlier
# in the Dockerfile; verify.
RUN if [ "${RELEASE_CONTAINER}" = "yes" ]; then \
${PIP} uninstall --yes cryptography && \
rm -rf /usr/lib/python3/dist-packages/pycrypto-2.6.1.egg-info; \
fi
# TensorBoard
EXPOSE 6006
# IPython

View File

@ -57,6 +57,8 @@ BUILD_AVX_CONTAINERS=${BUILD_AVX_CONTAINERS:-no}
# Per-platform and per-flavour build switches. Each keeps the value inherited
# from the environment and only falls back to the default shown here when it is
# unset or empty.
: "${BUILD_AVX2_CONTAINERS:=no}"
: "${BUILD_SKX_CONTAINERS:=no}"
: "${BUILD_CLX_CONTAINERS:=no}"
: "${BUILD_ICX_CLIENT_CONTAINERS:=no}"
: "${BUILD_ICX_SERVER_CONTAINERS:=no}"
# The port is read from TF_DOCKER_BUILD_PORT, not from CONTAINER_PORT itself.
CONTAINER_PORT="${TF_DOCKER_BUILD_PORT:-8888}"
: "${BUILD_TF_V2_CONTAINERS:=yes}"
: "${BUILD_TF_BFLOAT16_CONTAINERS:=no}"
@ -65,10 +67,14 @@ BAZEL_VERSION=${BAZEL_VERSION}
# Optional-feature switches: inherited from the environment, "no" when unset
# or empty.
: "${BUILD_PY2_CONTAINERS:=no}"
: "${ENABLE_DNNL1:=no}"
: "${ENABLE_HOROVOD:=no}"
: "${INSTALL_HOROVOD_FROM_COMMIT:=no}"
: "${ENABLE_GCC8:=no}"
# Version/URL overrides have no default here; they are passed through as-is
# (possibly empty).
OPENMPI_VERSION="${OPENMPI_VERSION}"
OPENMPI_DOWNLOAD_URL="${OPENMPI_DOWNLOAD_URL}"
HOROVOD_VERSION="${HOROVOD_VERSION}"
: "${BUILD_SSH:=no}"
: "${IS_NIGHTLY:=no}"
: "${RELEASE_CONTAINER:=no}"
debug "ROOT_CONTAINER=${ROOT_CONTAINER}"
debug "TF_ROOT_CONTAINER_TAG=${TF_ROOT_CONTAINER_TAG}"
@ -80,18 +86,24 @@ debug "BUILD_AVX_CONTAINERS=${BUILD_AVX_CONTAINERS}"
# Report the effective build configuration, one NAME=value line per setting,
# through the script's debug helper. The order of the names is the order in
# which they are reported.
for _cfg_name in \
  BUILD_AVX2_CONTAINERS \
  BUILD_SKX_CONTAINERS \
  BUILD_CLX_CONTAINERS \
  BUILD_ICX_CLIENT_CONTAINERS \
  BUILD_ICX_SERVER_CONTAINERS \
  BUILD_TF_V2_CONTAINERS \
  BUILD_TF_BFLOAT16_CONTAINERS \
  ENABLE_SECURE_BUILD \
  TMP_DIR \
  BAZEL_VERSION \
  ENABLE_GCC8 \
  BUILD_PY2_CONTAINERS \
  ENABLE_DNNL1 \
  ENABLE_HOROVOD \
  INSTALL_HOROVOD_FROM_COMMIT \
  OPENMPI_VERSION \
  OPENMPI_DOWNLOAD_URL \
  HOROVOD_VERSION \
  BUILD_SSH \
  IS_NIGHTLY \
  RELEASE_CONTAINER; do
  # ${!_cfg_name} expands to the value of the variable whose name is held in
  # _cfg_name.
  debug "${_cfg_name}=${!_cfg_name}"
done
# Do not leave the loop variable behind in the script's global scope.
unset _cfg_name
function build_container()
{
@ -147,6 +159,8 @@ function build_container()
TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_VERSION=${OPENMPI_VERSION}")
TF_DOCKER_BUILD_ARGS+=("--build-arg OPENMPI_DOWNLOAD_URL=${OPENMPI_DOWNLOAD_URL}")
TF_DOCKER_BUILD_ARGS+=("--build-arg HOROVOD_VERSION=${HOROVOD_VERSION}")
TF_DOCKER_BUILD_ARGS+=("--build-arg INSTALL_HOROVOD_FROM_COMMIT=${INSTALL_HOROVOD_FROM_COMMIT}")
TF_DOCKER_BUILD_ARGS+=("--build-arg BUILD_SSH=${BUILD_SSH}")
fi
# Add build arg --nightly_flag for the nightly build
@ -154,6 +168,11 @@ function build_container()
TF_DOCKER_BUILD_ARGS+=("--build-arg TF_NIGHTLY_FLAG=--nightly_flag")
fi
# Add build arg GCC8 install
TF_DOCKER_BUILD_ARGS+=("--build-arg ENABLE_GCC8=${ENABLE_GCC8}")
TF_DOCKER_BUILD_ARGS+=("--build-arg RELEASE_CONTAINER=${RELEASE_CONTAINER}")
# Perform docker build
debug "Building docker image with image name and tag: ${TEMP_IMAGE_NAME}"
CMD="${DOCKER_BINARY} build ${TF_DOCKER_BUILD_ARGS[@]} --no-cache --pull -t ${TEMP_IMAGE_NAME} -f Dockerfile.devel-mkl ."
@ -305,6 +324,14 @@ if [[ ${BUILD_CLX_CONTAINERS} == "yes" ]]; then
PLATFORMS+=("icelake")
fi
# Queue the Ice Lake client and server targets when their build switches are
# exactly "yes"; any other value leaves PLATFORMS untouched.
case "${BUILD_ICX_CLIENT_CONTAINERS}" in
  yes) PLATFORMS+=("icelake-client") ;;
esac
case "${BUILD_ICX_SERVER_CONTAINERS}" in
  yes) PLATFORMS+=("icelake-server") ;;
esac
# Checking out sources needs to be done only once
checkout_tensorflow "${TF_REPO}" "${TF_BUILD_VERSION}" "${TF_BUILD_VERSION_IS_PR}"
@ -330,6 +357,14 @@ do
FINAL_TAG="${FINAL_TAG}-avx512-VNNI"
fi
# Append the Ice Lake suffix matching the platform being built; other
# platforms leave FINAL_TAG unchanged here.
case "${PLATFORM}" in
  icelake-client) FINAL_TAG="${FINAL_TAG}-icx-client" ;;
  icelake-server) FINAL_TAG="${FINAL_TAG}-icx-server" ;;
esac
# Add -devel-mkl to the image tag
FINAL_TAG="${FINAL_TAG}-devel-mkl"
if [[ "${PYTHON}" == "python3" ]]; then

View File

@ -22,7 +22,10 @@ set -e
# Defaults. Every setting below can be overridden from the environment; an
# unset or empty value falls back to the default given here.
: "${OPENMPI_VERSION:=openmpi-2.1.1}"
: "${OPENMPI_DOWNLOAD_URL:=https://www.open-mpi.org/software/ompi/v2.1/downloads/openmpi-2.1.1.tar.gz}"
: "${INSTALL_HOROVOD_FROM_COMMIT:=no}"
: "${BUILD_SSH:=no}"
: "${HOROVOD_VERSION:=0.19.1}"
# Always starts at the system-wide location, regardless of the environment.
SSH_CONFIG_PATH="/etc/ssh/ssh_config"
# Install Open MPI
echo "Installing OpenMPI version ${OPENMPI_VERSION} ..."
@ -54,27 +57,49 @@ echo 'OpenMPI version:'
mpirun --version
# Install OpenSSH for MPI to communicate between containers.
#
# BUILD_SSH=yes : build zlib, OpenSSL and OpenSSH from source under
#                 /tmp/buildssh, install only libnuma-dev and cmake from the
#                 distribution, and point SSH_CONFIG_PATH at the ssh_config
#                 installed under /usr/local/etc.
# otherwise     : install the distribution packages, trying apt-get first and
#                 falling back to yum; abort if neither succeeds.
if [[ "${BUILD_SSH}" == "yes" ]]; then
    # `set -e` does not abort on a failure in the middle of an `&&` list, so
    # each source build ends with an explicit exit instead of silently carrying
    # on with a half-built toolchain.
    # NOTE(review): the tarballs are fetched without any checksum verification
    # (zlib over plain HTTP) -- consider pinning digests.
    mkdir -p /tmp/buildssh

    cd /tmp/buildssh && curl -fSsL -O http://www.zlib.net/zlib-1.2.11.tar.gz && tar -xzvf zlib-1.2.11.tar.gz && \
        cd /tmp/buildssh/zlib-1.2.11 && ./configure && make && make install || \
        { echo "FAIL: zlib build" >&2; exit 1; }

    cd /tmp/buildssh && curl -fSsL -O https://www.openssl.org/source/openssl-1.1.1.tar.gz && tar -xzvf openssl-1.1.1.tar.gz && \
        cd /tmp/buildssh/openssl-1.1.1 && ./config && make && make test && make install || \
        { echo "FAIL: OpenSSL build" >&2; exit 1; }

    # The sshd group and user are created before `make install` runs.
    cd /tmp/buildssh && curl -fSsL -O https://mirrors.sonic.net/pub/OpenBSD/OpenSSH/portable/openssh-8.4p1.tar.gz && \
        tar -xzvf openssh-8.4p1.tar.gz && cd /tmp/buildssh/openssh-8.4p1 && \
        ./configure --with-md5-passwords && make && \
        groupadd sshd && useradd -M -g sshd -c 'sshd privsep' -d /var/empty -s /sbin/nologin sshd && passwd -l sshd && \
        make install || \
        { echo "FAIL: OpenSSH build" >&2; exit 1; }

    apt-get clean && apt-get update && \
        apt-get install -y --no-install-recommends --fix-missing \
            libnuma-dev cmake
    SSH_CONFIG_PATH=/usr/local/etc/ssh_config
else
    # Test the package managers directly rather than inspecting $? afterwards.
    if apt-get clean && apt-get update && \
        apt-get install -y --no-install-recommends --fix-missing \
            openssh-client openssh-server libnuma-dev cmake && \
        rm -rf /var/lib/apt/lists/*; then
        echo "PASS: OpenSSH installation"
    elif yum -y update && yum -y install numactl-devel openssh-server openssh-clients cmake && \
        yum clean all; then
        echo "PASS: OpenSSH installation"
    else
        echo "Unsupported Linux distribution. Aborting!" && exit 1
    fi
fi
mkdir -p /var/run/sshd
# Allow OpenSSH to talk to containers without asking for confirmation.
# Any existing StrictHostKeyChecking line is dropped and a single
# "StrictHostKeyChecking no" is appended. The temporary file sits next to the
# config being rewritten, so it never depends on /etc/ssh existing when the
# config lives under /usr/local/etc.
grep -v StrictHostKeyChecking "${SSH_CONFIG_PATH}" > "${SSH_CONFIG_PATH}.new"
echo " StrictHostKeyChecking no" >> "${SSH_CONFIG_PATH}.new"
mv "${SSH_CONFIG_PATH}.new" "${SSH_CONFIG_PATH}"
# Install Horovod
# HOROVOD_WITH_TENSORFLOW must be exported: a bare `VAR=1` on its own line only
# sets a shell variable, which the pip process (and Horovod's build) never sees.
export HOROVOD_WITH_TENSORFLOW=1
if [[ "${INSTALL_HOROVOD_FROM_COMMIT}" == "yes" ]]; then
    # HOROVOD_VERSION is used as the git ref of the Horovod repository.
    python3 -m pip install --no-cache-dir "git+https://github.com/horovod/horovod.git@${HOROVOD_VERSION}"
else
    # HOROVOD_VERSION is used as the released package version.
    python3 -m pip install --no-cache-dir "horovod==${HOROVOD_VERSION}"
fi

View File

@ -180,6 +180,46 @@ class CascadelakePlatform(IntelPlatform):
CASCADELAKE_ARCH_NEW + " "
class IcelakeClientPlatform(IntelPlatform):
  """Bazel compiler-flag provider for the "icelake-client" target."""

  def __init__(self):
    # NOTE(review): (8, 4) is presumably the minimum GCC major/minor version
    # expected by IntelPlatform -- confirm against the base class.
    IntelPlatform.__init__(self, 8, 4)

  def get_bazel_gcc_flags(self):
    """Returns the Bazel flag string for this platform.

    When `use_old_arch_names(8, 4)` is true, the result is the
    "skylake-avx512" architecture flag followed by one feature flag each for
    avx512f and avx512cd. Otherwise it is the single "icelake-client"
    architecture flag. Every flag in the result is followed by one space.
    """
    legacy_arch = "skylake-avx512"
    native_arch = "icelake-client"
    extra_features = ["avx512f", "avx512cd"]

    arch_flag = self.BAZEL_PREFIX_ + self.ARCH_PREFIX_
    if not IntelPlatform.use_old_arch_names(self, 8, 4):
      return arch_flag + native_arch + " "

    # Fallback path: older architecture name plus explicit feature flags.
    pieces = [arch_flag + legacy_arch + " "]
    pieces.extend(
        self.BAZEL_PREFIX_ + self.FLAG_PREFIX_ + feature + " "
        for feature in extra_features)
    return "".join(pieces)
class IcelakeServerPlatform(IntelPlatform):
  """Bazel compiler-flag provider for the "icelake-server" target."""

  def __init__(self):
    # NOTE(review): (8, 4) is presumably the minimum GCC major/minor version
    # expected by IntelPlatform -- confirm against the base class.
    IntelPlatform.__init__(self, 8, 4)

  def get_bazel_gcc_flags(self):
    """Returns the Bazel flag string for this platform.

    When `use_old_arch_names(8, 4)` is true, the result is the
    "skylake-avx512" architecture flag followed by one feature flag each for
    avx512f and avx512cd. Otherwise it is the single "icelake-server"
    architecture flag. Every flag in the result is followed by one space.
    """
    legacy_arch = "skylake-avx512"
    native_arch = "icelake-server"
    extra_features = ["avx512f", "avx512cd"]

    arch_flag = self.BAZEL_PREFIX_ + self.ARCH_PREFIX_
    if not IntelPlatform.use_old_arch_names(self, 8, 4):
      return arch_flag + native_arch + " "

    # Fallback path: older architecture name plus explicit feature flags.
    pieces = [arch_flag + legacy_arch + " "]
    pieces.extend(
        self.BAZEL_PREFIX_ + self.FLAG_PREFIX_ + feature + " "
        for feature in extra_features)
    return "".join(pieces)
class BuildEnvSetter(object):
"""Prepares the proper environment settings for various Intel platforms."""
default_platform_ = "haswell"
@ -189,7 +229,9 @@ class BuildEnvSetter(object):
"sandybridge": SandyBridgePlatform(),
"haswell": HaswellPlatform(),
"skylake": SkylakePlatform(),
"cascadelake": CascadelakePlatform()
"cascadelake": CascadelakePlatform(),
"icelake-client": IcelakeClientPlatform(),
"icelake-server": IcelakeServerPlatform(),
}
def __init__(self):