STT-tensorflow/tensorflow/tools/ci_build/ctpu/ctpu.sh
A. Unique TensorFlower 8f399978bd Kokoro related change for switching nighlty-2.x to the nightly and removing older nightly which was pointing to nightly-1.15.
PiperOrigin-RevId: 298924942
Change-Id: Id4cf52e5ff5d82efd6ef129740bbecde7b05c9fe
2020-03-04 13:27:39 -08:00

134 lines
3.9 KiB
Bash

#!/bin/bash
# Copyright 2019 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Reusable functions for using CTPU in a Kokoro build.
# These functions are unit tested in ctpu_test.sh.
# Installs the Cloud TPU CLI to the current directory.
# Pass pip command as first arg, ex: install_ctpu pip3.7
function install_ctpu {
PIP_CMD="${1:-pip}"
# TPUClusterResolver has a runtime dependency cloud-tpu-client when
# resolving a Cloud TPU. It's very likely we want this installed if we're
# using CTPU.
# Replace cloud-tpu-client with google-api-python-client oauth2client to test
# the client at head.
"${PIP_CMD}" install --user --upgrade cloud-tpu-client
wget -nv "https://dl.google.com/cloud_tpu/ctpu/latest/linux/ctpu"
chmod a+x ctpu
}
# Starts a Cloud TPU, storing metadata into artifacts dir for export.
#
# This function supports overriding the default parameters, using optional
# single-letter flags.
#
# Usage:
# ctpu_up -n [tpu name] -z [zone] -s [tpu size] -v [tf-version] \
# -p [cloud project] -g [gcp-network]
function ctpu_up {
local OPTIND o # Used for flag parsing
# Generate a unique random name for TPU, as we might be running multiple builds in parallel.
local name="kokoro-tpu-${RANDOM}"
local zone="us-central1-c"
local size="v2-8"
local version="nightly"
local project # Project automatically detected from environment.
local gcp_network # Network needed only if project default is Legacy.
# Override any of the above params from flags.
while getopts ":n:z:p:s:v:g:" o; do
case "${o}" in
n)
name="${OPTARG}"
;;
z)
zone="${OPTARG}"
;;
p)
project="${OPTARG}"
;;
s)
size="${OPTARG}"
;;
v)
version="${OPTARG}"
;;
g)
gcp_network="${OPTARG}"
;;
*)
echo "Unexpected parameter for ctpu_up: ${o}"
exit 1
esac
done
shift $((OPTIND-1))
export TPU_NAME="${name}"
export TPU_ZONE="${zone}"
# Store name and zone into artifacts dir so cleanup job has access.
echo "${TPU_NAME}" > "${TF_ARTIFACTS_DIR}/tpu_name"
echo "${TPU_ZONE}" > "${TF_ARTIFACTS_DIR}/tpu_zone"
local args=(
"--zone=${zone}"
"--tf-version=${version}"
"--name=${name}"
"--tpu-size=${size}"
"--tpu-only"
"-noconf"
)
# "-v" is a bash 4.2 builtin for checking that a variable is set.
if [[ -v gcp_network ]]; then
args+=("--gcp-network=${gcp_network}")
fi
if [[ -v project ]]; then
args+=("--project=${project}")
export TPU_PROJECT="${project}"
echo "${project}" > "${TF_ARTIFACTS_DIR}/tpu_project"
fi
./ctpu up "${args[@]}"
}
# Delete the Cloud TPU specified by the metadata in the gfile directory.
function ctpu_delete {
export TPU_NAME="$(cat "${TF_GFILE_DIR}/tpu_name")"
export TPU_ZONE="$(cat "${TF_GFILE_DIR}/tpu_zone")"
TPU_PROJECT_FILE="${TF_GFILE_DIR}/tpu_project"
if [ -f "${TPU_PROJECT_FILE}" ]; then
export TPU_PROJECT="$(cat ${TPU_PROJECT_FILE})"
else
export TPU_PROJECT="tensorflow-testing"
fi
# Retry due to rare race condition where TPU creation hasn't propagated by
# the time we try to delete it.
for i in 1 2 3; do
./ctpu delete \
--project=${TPU_PROJECT} \
--zone="${TPU_ZONE}" \
--name="${TPU_NAME}" \
--tpu-only \
-noconf && return 0 || sleep 60
done
return 1
}