From a5b53fd50b9a687af72f1713fbad8e39bb652b25 Mon Sep 17 00:00:00 2001
From: dvirlabs
Date: Sun, 1 Mar 2026 16:39:20 +0200
Subject: [PATCH] Add nvidia-device-plugin

---
 argocd-apps/nvidia-device-plugin.yaml         | 20 +++++++
 .../nvidia-device-plugin.yaml                 | 58 +++++++++++++++++++
 2 files changed, 78 insertions(+)
 create mode 100644 argocd-apps/nvidia-device-plugin.yaml
 create mode 100644 manifests/nvidia-device-plugin/nvidia-device-plugin.yaml

diff --git a/argocd-apps/nvidia-device-plugin.yaml b/argocd-apps/nvidia-device-plugin.yaml
new file mode 100644
index 0000000..137a926
--- /dev/null
+++ b/argocd-apps/nvidia-device-plugin.yaml
@@ -0,0 +1,20 @@
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: nvidia-device-plugin
+  namespace: argocd
+spec:
+  project: infra
+  source:
+    repoURL: 'https://git.dvirlabs.com/dvirlabs/infra.git'
+    targetRevision: HEAD  # follow the tip of the repo's default branch
+    path: manifests/nvidia-device-plugin  # directory added by this patch
+  destination:
+    server: https://kubernetes.default.svc  # in-cluster API server
+    namespace: kube-system
+  syncPolicy:
+    automated:
+      prune: true  # delete resources that were removed from Git
+      selfHeal: true  # revert manual drift in the cluster
+    syncOptions:
+      - CreateNamespace=false  # kube-system is built in; nothing to create
diff --git a/manifests/nvidia-device-plugin/nvidia-device-plugin.yaml b/manifests/nvidia-device-plugin/nvidia-device-plugin.yaml
new file mode 100644
index 0000000..d490e48
--- /dev/null
+++ b/manifests/nvidia-device-plugin/nvidia-device-plugin.yaml
@@ -0,0 +1,58 @@
+# Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: nvidia-device-plugin-daemonset
+  namespace: kube-system
+spec:
+  selector:
+    matchLabels:
+      name: nvidia-device-plugin-ds
+  updateStrategy:
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        name: nvidia-device-plugin-ds
+    spec:
+      nodeSelector:
+        nvidia.com/gpu.present: "true"  # only schedule onto nodes with this label
+      tolerations:
+        - key: nvidia.com/gpu
+          operator: Exists
+          effect: NoSchedule
+      # Mark this pod as a critical add-on; when enabled, the critical add-on
+      # scheduler reserves resources for critical add-on pods so that they can
+      # be rescheduled after a failure.
+      # See https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/
+      priorityClassName: "system-node-critical"
+      containers:
+        - image: nvcr.io/nvidia/k8s-device-plugin:v0.15.0
+          name: nvidia-device-plugin-ctr
+          env:
+            - name: FAIL_ON_INIT_ERROR
+              value: "false"  # env values must be strings, hence the quotes
+          securityContext:
+            allowPrivilegeEscalation: false
+            capabilities:
+              drop: ["ALL"]
+          volumeMounts:
+            - name: device-plugin
+              mountPath: /var/lib/kubelet/device-plugins
+      volumes:
+        - name: device-plugin
+          hostPath:
+            path: /var/lib/kubelet/device-plugins