From 3d1e1ccec077252c09ff8b98e866685b8a7f190b Mon Sep 17 00:00:00 2001 From: Dmitry Shmulevich Date: Mon, 30 Dec 2024 11:57:12 -0800 Subject: [PATCH] add Grafana dashboard for GPU occupancy (#116) Signed-off-by: Dmitry Shmulevich --- resources/dashboards/GPU-occupancy.json | 254 ++++++++++++++++++++++++ 1 file changed, 254 insertions(+) create mode 100644 resources/dashboards/GPU-occupancy.json diff --git a/resources/dashboards/GPU-occupancy.json b/resources/dashboards/GPU-occupancy.json new file mode 100644 index 0000000..e4c9d8c --- /dev/null +++ b/resources/dashboards/GPU-occupancy.json @@ -0,0 +1,254 @@ +{ + "__inputs": [ + { + "name": "DS_PROMETHEUS", + "label": "Prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__elements": {}, + "__requires": [ + { + "type": "panel", + "id": "gauge", + "name": "Gauge", + "version": "" + }, + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "11.0.0" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" + } + ], + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "red", + "value": null + }, + { + "color": "orange", + "value": 80 + }, + { + "color": "yellow", + "value": 90 + }, + { + "color": "green", + "value": 95 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true, + "sizing": "auto" + }, + "pluginVersion": "11.0.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(avg_over_time(node_resource_occupancy{resource=\"nvidia.com/gpu\"}[$__range]))", + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Avg. cluster GPU occupancy (%)", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "max": 110, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 14, + "x": 4, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "maxHeight": 600, + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "avg(node_resource_occupancy{resource=\"nvidia.com/gpu\"})", + "instant": false, + "interval": "10s", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Avg. node GPU occupancy (%)", + "type": "timeseries" + } + ], + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timeRangeUpdatedDuringEditOrView": false, + "timepicker": {}, + "timezone": "browser", + "title": "GPU occupancy", + "uid": "ddt1v72mdqxa8a", + "version": 18, + "weekStart": "" +} \ No newline at end of file