From 68849b86f973db7dec46f683f1f849f623ebedc0 Mon Sep 17 00:00:00 2001
From: dvirlabs
Date: Sun, 1 Mar 2026 17:24:03 +0200
Subject: [PATCH] fix: target NVIDIA device plugin to GPU node k3s-worker-5-gpu with proper tolerations

---
Review notes (free-form text in this section is ignored by `git am`):

* What the hunk does: replaces the commented-out nodeSelector guidance in
  the pod spec with an active nodeSelector on
  `kubernetes.io/hostname: k3s-worker-5-gpu`, and adds a toleration for the
  taint `gpu=true:NoSchedule` ahead of the existing `nvidia.com/gpu`
  (operator: Exists, effect: NoSchedule) toleration.
* The toleration value is written as the quoted string "true" so it is
  parsed as a string rather than a YAML boolean.
* NOTE(review): selecting by hostname ties this manifest to one specific
  node name. Presumably intentional for this cluster -- confirm. If the
  node is ever renamed or replaced, the selector will match nothing.
* NOTE(review): the removed comments relied on FAIL_ON_INIT_ERROR=false.
  Whether that setting is still present elsewhere in the manifest is not
  visible in this hunk -- verify.
* NOTE(review): leading whitespace inside the hunk below was laid out from
  the YAML nesting (including the spacing inside the removed
  `#   nvidia.com/gpu.present` comment line). Run `git apply --check`
  against the target tree before applying.

 .../nvidia-device-plugin.yaml | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/manifests/nvidia-device-plugin/nvidia-device-plugin.yaml b/manifests/nvidia-device-plugin/nvidia-device-plugin.yaml
index 409a610..c50cc06 100644
--- a/manifests/nvidia-device-plugin/nvidia-device-plugin.yaml
+++ b/manifests/nvidia-device-plugin/nvidia-device-plugin.yaml
@@ -28,13 +28,16 @@ spec:
       labels:
         name: nvidia-device-plugin-ds
     spec:
-      # nodeSelector removed for initial deployment to avoid chicken-egg problem
-      # The nvidia.com/gpu.present label is applied BY the device plugin itself
-      # FAIL_ON_INIT_ERROR=false ensures graceful skip on nodes without GPU
-      # After first deployment, you can optionally re-enable with:
-      # nodeSelector:
-      #   nvidia.com/gpu.present: "true"
+      # Schedule ONLY on the GPU node by hostname
+      nodeSelector:
+        kubernetes.io/hostname: k3s-worker-5-gpu
       tolerations:
+      # Tolerate the gpu=true:NoSchedule taint on the GPU node
+      - key: gpu
+        operator: Equal
+        value: "true"
+        effect: NoSchedule
+      # Also tolerate nvidia.com/gpu taint if present
       - key: nvidia.com/gpu
         operator: Exists
         effect: NoSchedule