-先确认显卡硬件是否已被系统正确识别,可以通过命令lspci查看
lspci
---启用gpu加速计算
--Docker19.03之后,内置gpu支持
--提前禁用nouveau
lsmod | grep nouveau
-没有任何输出即表示nouveau模块未加载(已禁用)
--显卡驱动
-安装nvidia驱动、cuda
-安装nvidia-container-runtime
distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
---centos---
curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo
yum install -y nvidia-container-toolkit nvidia-container-runtime
---debian/Ubuntu---
curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
---
curl -fsSL http://h.htmltoo.com/backup/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
apt-get update && apt-get install -y nvidia-container-toolkit nvidia-docker2
--Configuration
nvidia-ctk runtime configure --runtime=docker --config=/etc/docker/daemon.json
systemctl restart docker
--Configuring containerd (for Kubernetes)
nvidia-ctk runtime configure --runtime=containerd
systemctl restart containerd
--Configuring CRI-O
nvidia-ctk runtime configure --runtime=crio
systemctl restart crio
-查看GPU设备
nvidia-smi
-运行容器时,添加--gpus参数启用gpu支持
docker run --gpus all
-使用两个GPU
docker run --gpus 2
-指定GPU运行
docker run --gpus '"device=1,2"' ...
docker run --gpus '"device=UUID-ABCDEF,1"' ...
# Proxmox VE, docker-nvidia显卡
#gpu 版本
apt-get install -y dkms
-检查显卡硬件是否安装正确,查看列表中是否有NVIDIA显卡的信息;
lspci -nn
-删除所有的已安装的NVIDIA驱动的相关文件:
apt-get remove --purge '^nvidia-.*'
apt-get --purge remove "*nvidia*" "libxnvctrl*"
-图形
----
-nvidia, tesla-v100
https://www.nvidia.cn/drivers/lookup/
cd /data/site/htmltoo.opt/common
chmod -R 777 ubuntu2404-560.35.03_1.0-1_amd64.deb
dpkg -i ubuntu2404-560.35.03_1.0-1_amd64.deb
-
apt search nvidia-driver
ubuntu-drivers autoinstall
-
apt-get install -y nvidia-driver
echo "options nvidia NVreg_OpenRmEnableUnsupportedGpus=1" | sudo tee /etc/modprobe.d/nvidia-gsp.conf
apt-get install -y nvidia-open
apt-get install -y nvidia-container-toolkit nvidia-docker2
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/x86_64/cuda-keyring_1.1-1_all.deb
dpkg -i cuda-keyring_1.1-1_all.deb
apt-get update
apt-get -y install cuda-toolkit-12-6
----
sh /opt/NVIDIA-Linux-x86_64-560.28.03.run --no-x-check
sh /opt/NVIDIA-Linux-x86_64-560.28.03.run -no-opengl-files
sh /opt/NVIDIA-Linux-x86_64-560.28.03.run -no-x-check -no-nouveau-check -no-opengl-files --dkms
-cuda
https://developer.nvidia.com/cuda-downloads
wget https://developer.download.nvidia.com/compute/cuda/repos/debian12/x86_64/cuda-keyring_1.1-1_all.deb
dpkg -i cuda-keyring_1.1-1_all.deb
apt-get update
apt-get install -y cuda-drivers
apt-get install -y cuda-toolkit
apt-get install -y nvidia-gds
wget https://developer.download.nvidia.com/compute/cuda/12.6.0/local_installers/cuda_12.6.0_560.28.03_linux.run
dracut --force
-在出现的对话框中不勾选驱动安装(因为之前已经安装过了)
chmod -R 777 /opt/cuda_12.6.0_560.28.03_linux.run
sh /opt/cuda_12.6.0_560.28.03_linux.run --silent
nvidia-xconfig
-保存退出后更新服务
update-initramfs -u
update-grub
reboot
-查看已安装驱动的版本信息
ls /usr/src | grep nvidia
-验证 CUDA 安装
nvidia-smi
/usr/local/cuda/bin/nvcc --version
-
curl -fsSL http://h.htmltoo.com/backup/gpgkey
-
curl -fsSL https://nvidia.github.io/libnvidia-container/gpgkey | sudo gpg --dearmor -o /usr/share/keyrings/nvidia-container-toolkit-keyring.gpg \
&& curl -s -L https://nvidia.github.io/libnvidia-container/stable/deb/nvidia-container-toolkit.list | \
sed 's#deb https://#deb [signed-by=/usr/share/keyrings/nvidia-container-toolkit-keyring.gpg] https://#g' | \
sudo tee /etc/apt/sources.list.d/nvidia-container-toolkit.list
sed -i -e '/experimental/ s/^#//g' /etc/apt/sources.list.d/nvidia-container-toolkit.list
apt-get update
apt-get install -y nvidia-container-toolkit
-
vim /etc/docker/daemon.json
{
"exec-opts": [
"native.cgroupdriver=systemd"
],
"insecure-registries": [
"hub.htmltoo.com:5000"
],
"log-driver": "json-file",
"log-opts": {
"max-file": "3",
"max-size": "5m"
},
"max-concurrent-downloads": 1,
"max-concurrent-uploads": 1,
"registry-mirrors": [
"https://registry.docker-cn.com"
],
"runtimes": {
"nvidia": {
"args": [],
"path": "nvidia-container-runtime"
}
},
"storage-driver": "overlay2"
}
nvidia-ctk runtime configure --runtime=docker --config=/etc/docker/daemon.json
systemctl --user restart docker
nvidia-ctk config --set nvidia-container-cli.no-cgroups --in-place
systemctl daemon-reload && systemctl restart docker
# 卸载NVIDIA显卡驱动
#先查看驱动以及版本安装情况,命令如下:
ls /usr/src | grep nvidia
cd /usr/bin
ls nvidia-*
nvidia-uninstall
#如果需要卸载干净所有英伟达驱动命令,如下
apt-get remove --purge 'nvidia-*'   # 或者使用 'nvidia*'
apt autoremove
---2---
./显卡驱动包名称 --uninstall
apt-get purge nvidia*
apt-get autoremove
reboot
-停止占用NVIDIA显卡的图形界面服务(显示管理器lightdm):
service lightdm stop
-运行以下命令以卸载NVIDIA驱动程序:
apt-get purge nvidia-*
-运行以下命令以删除任何残余的配置文件或依赖项:
apt-get autoremove
sudo systemctl stop nvidia-persistenced
sudo systemctl stop nvidia-fallback
接着,输入以下命令来卸载NVIDIA显卡驱动:
sudo apt-get purge nvidia-*
这个命令会将系统中所有与NVIDIA显卡相关的驱动程序和配置文件全部清除。在卸载完成后,为了确保系统的稳定性,建议重启系统。
重启之后,可以通过以下命令来检查NVIDIA显卡驱动是否已成功卸载:
lsmod | grep nvidia
如果没有显示任何内容,说明NVIDIA显卡驱动已经成功卸载。