Compare commits

..

No commits in common. "fbe623fd76c0b86fba178cd691926636f2dfa233" and "9322dd049dca1d137f201a9e3b64a2904a8feed9" have entirely different histories.

2 changed files with 9 additions and 48 deletions

View File

@@ -1,35 +1,15 @@
service: service:
type: ClusterIP type: ClusterIP
# Enable GPU support
ollama:
gpu:
enabled: true
type: nvidia
number: 1
resources: resources:
requests: requests:
cpu: 250m
memory: 1Gi
limits:
cpu: 2 cpu: 2
memory: 4Gi memory: 4Gi
limits:
cpu: 4
memory: 8Gi
# Schedule on GPU worker node
nodeSelector:
kubernetes.io/hostname: k3s-worker-5-gpu
# Tolerate GPU node taint
tolerations:
- key: gpu
operator: Exists
effect: NoSchedule
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule
persistentVolume: persistentVolume:
enabled: true enabled: true
size: 50Gi size: 30Gi
storageClass: local-path storageClass: nfs-client

View File

@@ -1,17 +1,16 @@
service: service:
type: ClusterIP type: ClusterIP
# IMPORTANT: disable the embedded Ollama that the chart can deploy env:
ollama: - name: OLLAMA_BASE_URL
enabled: false value: http://ollama.ai-stack.svc.cluster.local:11434
# IMPORTANT: set BOTH vars to your existing Ollama service
extraEnvVars: extraEnvVars:
- name: OLLAMA_BASE_URL - name: OLLAMA_BASE_URL
value: http://ollama:11434 value: http://ollama:11434
- name: OLLAMA_BASE_URLS - name: OLLAMA_BASE_URLS
value: http://ollama:11434 value: http://ollama:11434
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
@@ -19,21 +18,3 @@ resources:
limits: limits:
cpu: 500m cpu: 500m
memory: 1Gi memory: 1Gi
persistence:
enabled: true
storageClass: local-path
size: 2Gi
# Schedule on GPU worker node (same as Ollama for low latency)
nodeSelector:
kubernetes.io/hostname: k3s-worker-5-gpu
# Tolerate GPU node taint
tolerations:
- key: gpu
operator: Exists
effect: NoSchedule
- key: nvidia.com/gpu
operator: Exists
effect: NoSchedule