hadoop/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-submarine/installation/scripts/nvidia-docker.sh

100 lines
3.7 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
## @description download the nvidia-docker rpm into
## @description ${DOWNLOAD_DIR}/nvidia-docker/ unless the file is already there.
## @description Reads DOWNLOAD_HTTP, DOWNLOAD_DIR, NVIDIA_DOCKER_RPM and
## @description NVIDIA_DOCKER_RPM_URL; sets the global MY_NVIDIA_DOCKER_RPM_URL.
## @audience public
## @stability stable
## @return 0 if the rpm is already present or wget succeeded
## @return wget's non-zero exit status if the download failed
function download_nvidia_docker_bin()
{
  local rpm_dir="${DOWNLOAD_DIR}/nvidia-docker"
  local rpm_file="${rpm_dir}/${NVIDIA_DOCKER_RPM}"
  local rc=0

  # When DOWNLOAD_HTTP is set, fetch from that server instead of
  # NVIDIA_DOCKER_RPM_URL. ":-" keeps this safe if the caller uses "set -u".
  if [[ -n "${DOWNLOAD_HTTP:-}" ]]; then
    MY_NVIDIA_DOCKER_RPM_URL="${DOWNLOAD_HTTP}/downloads/nvidia-docker/${NVIDIA_DOCKER_RPM}"
  else
    MY_NVIDIA_DOCKER_RPM_URL="${NVIDIA_DOCKER_RPM_URL}"
  fi

  # Nothing to do when the rpm was fetched by an earlier run.
  if [[ -f "${rpm_file}" ]]; then
    echo "${rpm_file} is exist."
    return 0
  fi

  echo "download ${MY_NVIDIA_DOCKER_RPM_URL} ..."
  # Quote both arguments so paths/URLs containing whitespace or glob
  # characters reach wget as single words.
  wget -P "${rpm_dir}/" "${MY_NVIDIA_DOCKER_RPM_URL}" || rc=$?
  if [[ "${rc}" -ne 0 ]]; then
    echo "ERROR: download ${MY_NVIDIA_DOCKER_RPM_URL} failed (wget exit code ${rc})." >&2
  fi
  return "${rc}"
}
## @description install nvidia docker: download and install the rpm, start
## @description the nvidia-docker service, show its status/log, probe the
## @description plugin endpoint on localhost:3476, then copy the host NVIDIA
## @description binaries and libraries into
## @description /var/lib/nvidia-docker/volumes/nvidia_driver/<version>/.
## @description Reads DOWNLOAD_DIR, NVIDIA_DOCKER_RPM and DOCKER_REGISTRY;
## @description calls download_nvidia_docker_bin and get_nvidia_version.
## @audience public
## @stability stable
## @return 0 on success
## @return 1 if the rpm cannot be downloaded, the driver version cannot be
## @return   detected, or the driver volume directories cannot be created
function install_nvidia_docker()
{
  local rpm_file="${DOWNLOAD_DIR}/nvidia-docker/${NVIDIA_DOCKER_RPM}"
  local driver_root="/var/lib/nvidia-docker/volumes/nvidia_driver"
  local nvidia_version
  local version_dir

  # Without the rpm there is nothing to install, so stop here.
  if ! download_nvidia_docker_bin; then
    echo "ERROR: unable to download ${NVIDIA_DOCKER_RPM}, nvidia-docker is not installed." >&2
    return 1
  fi

  # rpm -i also fails when the package is already installed, so a failure
  # is reported but does not abort the remaining (re-runnable) steps.
  if ! sudo rpm -i "${rpm_file}"; then
    echo "WARN: rpm -i ${rpm_file} failed (the package may already be installed)." >&2
  fi

  echo -e "\033[32m===== Start nvidia-docker =====\033[0m"
  sudo systemctl start nvidia-docker

  # --no-pager keeps systemctl/journalctl from blocking the installation in
  # an interactive pager when run from a terminal.
  echo -e "\033[32m===== Check nvidia-docker status =====\033[0m"
  systemctl --no-pager status nvidia-docker

  echo -e "\033[32m===== Check nvidia-docker log =====\033[0m"
  journalctl --no-pager -u nvidia-docker

  echo -e "\033[32m===== Test nvidia-docker-plugin =====\033[0m"
  curl http://localhost:3476/v1.0/docker/cli

  # create nvidia driver library path
  # NOTE(review): no sudo here, unlike the rpm/systemctl calls above;
  # presumably the script runs as root -- confirm.
  if [[ ! -d "${driver_root}" ]]; then
    echo "WARN: /var/lib/nvidia-docker/volumes/nvidia_driver folder path is not exist!"
    mkdir -p "${driver_root}" || return 1
  fi

  # Declared separately above so "local" does not mask the command.
  nvidia_version=$(get_nvidia_version)
  if [[ -z "${nvidia_version}" ]]; then
    # An empty version would copy the driver files into ".../nvidia_driver//bin".
    echo "ERROR: unable to detect the nvidia driver version." >&2
    return 1
  fi
  echo -e "\033[31m nvidia detect version is ${nvidia_version}\033[0m"

  # -p makes a second run succeed when the directories already exist.
  version_dir="${driver_root}/${nvidia_version}"
  mkdir -p "${version_dir}/bin" "${version_dir}/lib64" || return 1

  # The globs are intentionally unquoted so the shell expands them.
  cp /usr/bin/nvidia* "${version_dir}/bin"
  cp /usr/lib64/libcuda* "${version_dir}/lib64"
  cp /usr/lib64/libnvidia* "${version_dir}/lib64"

  # The continuation lines of the two multi-line strings below start at
  # column 0 on purpose: leading whitespace would become part of the output.
  echo -e "\033[32m===== Please manually execute the following command =====\033[0m"
  echo -e "\033[32mshell:> nvidia-docker run --rm ${DOCKER_REGISTRY}/nvidia/cuda:9.0-devel nvidia-smi
# If you don't see the list of graphics cards above, the NVIDIA driver installation failed. =====
\033[0m"
  echo -e "\033[32m===== Please manually execute the following command =====\033[0m"
  echo -e "\033[32m# Test with tf.test.is_gpu_available()
shell:> nvidia-docker run -it ${DOCKER_REGISTRY}/tensorflow/tensorflow:1.9.0-gpu bash
# In docker container
container:> python
python:> import tensorflow as tf
python:> tf.test.is_gpu_available()
python:> exit()
\033[0m"
}
## @description uninstall nvidia docker (placeholder: it only prints a
## @description notice that removal is not implemented)
## @audience public
## @stability stable
function uninstall_nvidia_docker()
{
  printf '%s\n' "This method is not implemented."
}