From 4710193c147a64bc6cbebc294abfccbd35ab7a73 Mon Sep 17 00:00:00 2001 From: Robert Love Date: Sun, 12 Jan 2025 09:07:54 -0500 Subject: [PATCH] ai-prio-class --- kubernetes/main/apps/ai/kustomization.yaml | 1 + kubernetes/main/apps/ai/ollama/app/helmrelease.yaml | 2 ++ kubernetes/main/apps/ai/priority-class.yaml | 8 ++++++++ .../apps/ai/stable-diffusion/comfyui/helmrelease.yaml | 2 ++ kubernetes/main/apps/media/immich/app/helmrelease.yaml | 2 ++ .../exporters/dcgm-exporter/app/helm-release.yaml | 1 + 6 files changed, 16 insertions(+) create mode 100644 kubernetes/main/apps/ai/priority-class.yaml diff --git a/kubernetes/main/apps/ai/kustomization.yaml b/kubernetes/main/apps/ai/kustomization.yaml index 55b952fe3c..ea2660c8af 100644 --- a/kubernetes/main/apps/ai/kustomization.yaml +++ b/kubernetes/main/apps/ai/kustomization.yaml @@ -3,6 +3,7 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: + - ./priority-class.yaml - ./k8sgpt/ks.yaml - ./ollama/ks.yaml - ./stable-diffusion/ks.yaml diff --git a/kubernetes/main/apps/ai/ollama/app/helmrelease.yaml b/kubernetes/main/apps/ai/ollama/app/helmrelease.yaml index 8479f812f7..a6c6b5c4db 100644 --- a/kubernetes/main/apps/ai/ollama/app/helmrelease.yaml +++ b/kubernetes/main/apps/ai/ollama/app/helmrelease.yaml @@ -69,6 +69,8 @@ spec: runtimeClassName: nvidia + priorityClassName: ai-gpu-critical + securityContext: privileged: true diff --git a/kubernetes/main/apps/ai/priority-class.yaml b/kubernetes/main/apps/ai/priority-class.yaml new file mode 100644 index 0000000000..a9122e9be2 --- /dev/null +++ b/kubernetes/main/apps/ai/priority-class.yaml @@ -0,0 +1,8 @@ +--- +apiVersion: scheduling.k8s.io/v1 +description: Used for pods that must run on a node with a GPU. +kind: PriorityClass +metadata: + name: ai-gpu-critical +preemptionPolicy: PreemptLowerPriority +value: 100040 diff --git a/kubernetes/main/apps/ai/stable-diffusion/comfyui/helmrelease.yaml b/kubernetes/main/apps/ai/stable-diffusion/comfyui/helmrelease.yaml index 8679dedbf5..6667c072e7 100644 --- a/kubernetes/main/apps/ai/stable-diffusion/comfyui/helmrelease.yaml +++ b/kubernetes/main/apps/ai/stable-diffusion/comfyui/helmrelease.yaml @@ -36,6 +36,8 @@ spec: pod: runtimeClassName: nvidia + priorityClassName: ai-gpu-critical + affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: diff --git a/kubernetes/main/apps/media/immich/app/helmrelease.yaml b/kubernetes/main/apps/media/immich/app/helmrelease.yaml index cae3c51d59..94ecddc803 100644 --- a/kubernetes/main/apps/media/immich/app/helmrelease.yaml +++ b/kubernetes/main/apps/media/immich/app/helmrelease.yaml @@ -111,6 +111,8 @@ spec: machine-learning: runtimeClassName: nvidia + priorityClassName: ai-gpu-critical + securityContext: privileged: true diff --git a/kubernetes/main/apps/monitoring/exporters/dcgm-exporter/app/helm-release.yaml b/kubernetes/main/apps/monitoring/exporters/dcgm-exporter/app/helm-release.yaml index d1a49f2a36..f918373ee2 100644 --- a/kubernetes/main/apps/monitoring/exporters/dcgm-exporter/app/helm-release.yaml +++ b/kubernetes/main/apps/monitoring/exporters/dcgm-exporter/app/helm-release.yaml @@ -37,6 +37,7 @@ spec: nodeSelector: nvidia.com/gpu.present: "true" runtimeClassName: nvidia + priorityClassName: ai-gpu-critical resources: limits: nvidia.com/gpu: 1