fix: target NVIDIA device plugin to GPU node k3s-worker-5-gpu with proper tolerations

2026-03-01 17:24:03 +02:00 · 2026-03-01 17:24:03 +02:00 · 68849b86f9
commit 68849b86f9
parent 19d1d5fa4a
1 changed file with 9 additions and 6 deletions
--- a/manifests/nvidia-device-plugin/nvidia-device-plugin.yaml
+++ b/manifests/nvidia-device-plugin/nvidia-device-plugin.yaml
@@ -28,13 +28,16 @@ spec:
      labels:
        name: nvidia-device-plugin-ds
    spec:
-      # nodeSelector removed for initial deployment to avoid chicken-egg problem
+      # Schedule ONLY on the GPU node by hostname
-      # The nvidia.com/gpu.present label is applied BY the device plugin itself
+      nodeSelector:
-      # FAIL_ON_INIT_ERROR=false ensures graceful skip on nodes without GPU
+        kubernetes.io/hostname: k3s-worker-5-gpu
      # After first deployment, you can optionally re-enable with:
      # nodeSelector:
      #   nvidia.com/gpu.present: "true"
      tolerations:
      # Tolerate the gpu=true:NoSchedule taint on the GPU node
      - key: gpu
        operator: Equal
        value: "true"
        effect: NoSchedule
      # Also tolerate nvidia.com/gpu taint if present
      - key: nvidia.com/gpu
        operator: Exists
        effect: NoSchedule