From a52c9c0561c216b9deb848093f1459a0a946a3b5 Mon Sep 17 00:00:00 2001 From: Christian Rohmann Date: Mon, 8 Jul 2024 11:26:15 +0200 Subject: [PATCH] [dashboard] Rework dashboard (MIG support, Grafana deprecations, Hostname) * Use PromQL aggregations to take MIG subdevices into account (see #353) * Update all panels to use Timeseries panels (instead of deprecated Graph) * Switch from instance to Hostname to select individual systems to avoid duplicated timeseries for Kubernetes daemonsets and their Pod names * Use DCGM_FI_DEV_FB_FREE instead of DCGM_FI_DEV_GPU_TEMP to also cover vGPU (~ PR #240) * Use DCGM_FI_PROF_GR_ENGINE_ACTIVE to determine utilization to cover MIG (and vGPU) Fixes: #353, #236 Signed-off-by: Christian Rohmann --- grafana/dcgm-exporter-dashboard.json | 1205 ++++++++++++++------------ 1 file changed, 658 insertions(+), 547 deletions(-) diff --git a/grafana/dcgm-exporter-dashboard.json b/grafana/dcgm-exporter-dashboard.json index c1681041..f4fdd174 100644 --- a/grafana/dcgm-exporter-dashboard.json +++ b/grafana/dcgm-exporter-dashboard.json @@ -1,4 +1,6 @@ { + "__inputs": [], + "__elements": {}, "__requires": [ { "type": "panel", @@ -10,136 +12,185 @@ "type": "grafana", "id": "grafana", "name": "Grafana", - "version": "6.7.3" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" + "version": "9.1.5" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" + }, + { + "type": "panel", + "id": "timeseries", + "name": "Time series", + "version": "" } ], "annotations": { "list": [ { - "$$hashKey": "object:192", "builtIn": 1, - "datasource": "-- Grafana --", + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, "type": "dashboard" } ] }, "description": "This dashboard is to display the metrics from DCGM Exporter on a Kubernetes (1.19+) cluster", "editable": true, + "fiscalYearStartMonth": 0, "gnetId": 12239, "graphTooltip": 0, "id": null, - "iteration": 1588401887165, "links": [], + "liveNow": false, "panels": [ { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 18, "x": 0, "y": 0 }, - "hiddenSeries": false, "id": 12, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.5", "targets": [ { - "expr": "DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"}", + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$Hostname\", gpu=~\"$gpu\"}) by (Hostname, gpu, uuid)", "instant": false, "interval": "", - "legendFormat": "GPU {{gpu}}", + "legendFormat": "{{Hostname}} - GPU {{gpu}}", "refId": "A" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, "title": "GPU Temperature", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "celsius", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "timeseries" }, { - "datasource": "$datasource", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 100, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 83 + }, + { + "color": "red", + "value": 87 + } + ] + }, + "unit": "celsius" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 6, @@ -148,151 +199,167 @@ }, "id": 14, "options": { - "fieldOptions": { + "orientation": "auto", + "reduceOptions": { "calcs": [ "mean" ], - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 100, - "min": 0, - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 83 - }, - { - "color": "red", - "value": 87 - } - ] - }, - "unit": "celsius" - }, - "overrides": [], + "fields": "", "values": false }, - "orientation": "auto", "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "6.7.3", + "pluginVersion": "9.1.5", "targets": [ { - "expr": "avg(DCGM_FI_DEV_GPU_TEMP{instance=~\"$instance\", gpu=~\"$gpu\"})", + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "avg(DCGM_FI_DEV_GPU_TEMP{Hostname=~\"$Hostname\", gpu=~\"$gpu\"})", "interval": "", "legendFormat": "", + "range": true, "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "GPU Avg. Temp", "type": "gauge" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 18, "x": 0, "y": 8 }, - "hiddenSeries": false, "id": 10, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", "options": { - "dataLinks": [] - }, - "percentage": false, - "pluginVersion": "6.5.2", - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.5", "targets": [ { - "expr": "DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"}", + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$Hostname\", gpu=~\"$gpu\"}) by (Hostname, gpu, uuid)", "interval": "", - "legendFormat": "GPU {{gpu}}", + "legendFormat": "{{Hostname}} - GPU {{gpu}}", + "range": true, "refId": "A" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, "title": "GPU Power Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "watt", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "timeseries" }, { - "cacheTimeout": null, - "datasource": "$datasource", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "mappings": [], + "max": 2400, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "#EAB839", + "value": 1800 + }, + { + "color": "red", + "value": 2200 + } + ] + }, + "unit": "watt" + }, + "overrides": [] + }, "gridPos": { "h": 8, "w": 6, @@ -302,48 +369,25 @@ "id": 16, "links": [], "options": { - "fieldOptions": { + "orientation": "horizontal", + "reduceOptions": { "calcs": [ "sum" ], - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "max": 2400, - "min": 0, - "nullValueMode": "connected", - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "#EAB839", - "value": 1800 - }, - { - "color": "red", - "value": 2200 - } - ] - }, - "unit": "watt" - }, - "overrides": [], + "fields": "", "values": false }, - "orientation": "horizontal", "showThresholdLabels": false, "showThresholdMarkers": true }, - "pluginVersion": "6.7.3", + "pluginVersion": "9.1.5", "targets": [ { - "expr": "sum(DCGM_FI_DEV_POWER_USAGE{instance=~\"$instance\", gpu=~\"$gpu\"})", + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "sum(max(DCGM_FI_DEV_POWER_USAGE{Hostname=~\"$Hostname\", gpu=~\"$gpu\"}) by (gpu))", "instant": true, "interval": "", "legendFormat": "", @@ -351,386 +395,451 @@ "refId": "A" } ], - "timeFrom": null, - "timeShift": null, "title": "GPU Power Total", "type": "gauge" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "hertz" + }, + "overrides": [] + }, "gridPos": { - "h": 8, - "w": 12, + "h": 9, + "w": 24, "x": 0, "y": 16 }, - "hiddenSeries": false, "id": 2, "interval": "", - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.5", "targets": [ { - "expr": "DCGM_FI_DEV_SM_CLOCK{instance=~\"$instance\", gpu=~\"$gpu\"} * 1000000", + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max(DCGM_FI_DEV_SM_CLOCK{Hostname=~\"$Hostname\", gpu=~\"$gpu\"} * 1000000) by (Hostname, UUID, gpu)", "format": "time_series", "interval": "", "intervalFactor": 1, - "legendFormat": "GPU {{gpu}}", + "legendFormat": "{{Hostname}} - GPU {{gpu}}", + "range": true, "refId": "A" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, "title": "GPU SM Clocks", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "hertz", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "timeseries" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, "gridPos": { "h": 8, - "w": 12, + "w": 24, "x": 0, - "y": 24 + "y": 25 }, - "hiddenSeries": false, "id": 6, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.5", "targets": [ { - "expr": "DCGM_FI_DEV_GPU_UTIL{instance=~\"$instance\", gpu=~\"$gpu\"}", + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max(DCGM_FI_PROF_GR_ENGINE_ACTIVE{Hostname=~\"$Hostname\", gpu=~\"$gpu\"}) by (Hostname, gpu, GPU_I_ID, GPU_I_PROFILE)", "interval": "", - "legendFormat": "GPU {{gpu}}", + "legendFormat": "GPU {{gpu}}/{{GPU_I_ID}} ({{GPU_I_PROFILE}})", + "range": true, "refId": "A" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, "title": "GPU Utilization", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": "100", - "min": "0", - "show": true - }, + "transformations": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "id": "renameByRegex", + "options": { + "regex": "(.*)/ \\(\\)$", + "renamePattern": "$1" + } } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "timeseries" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "decimals": 1, + "links": [], + "mappings": [], + "max": 1, + "min": 0, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percentunit" + }, + "overrides": [] + }, "gridPos": { - "h": 8, - "w": 12, + "h": 9, + "w": 24, "x": 0, - "y": 32 + "y": 33 }, - "hiddenSeries": false, - "id": 18, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", + "id": 4, "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.5", "targets": [ { - "expr": "DCGM_FI_DEV_FB_USED{instance=~\"$instance\", gpu=~\"$gpu\"}", + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max(DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{Hostname=~\"$Hostname\", gpu=~\"$gpu\"}) by (Hostname, gpu, GPU_I_ID, GPU_I_PROFILE)", "interval": "", - "legendFormat": "GPU {{gpu}}", + "legendFormat": "{{Hostname}} GPU {{gpu}}/{{GPU_I_ID}} ({{GPU_I_PROFILE}})", + "range": true, "refId": "A" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "GPU Framebuffer Mem Used", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "decmbytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, + "title": "Tensor Core Utilization", + "transformations": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "id": "renameByRegex", + "options": { + "regex": "(.*)/ \\(\\)$", + "renamePattern": "$1" + } } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "timeseries" }, { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "$datasource", - "fill": 1, - "fillGradient": 0, + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "never", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "links": [], + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, "gridPos": { - "h": 8, - "w": 12, + "h": 9, + "w": 24, "x": 0, - "y": 24 + "y": 42 }, - "hiddenSeries": false, - "id": 4, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "nullPointMode": "null", + "id": 18, "options": { - "dataLinks": [] - }, - "percentage": false, - "pointradius": 2, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, + "legend": { + "calcs": [ + "mean", + "lastNotNull", + "max" + ], + "displayMode": "table", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "multi", + "sort": "none" + } + }, + "pluginVersion": "9.1.5", "targets": [ { - "expr": "DCGM_FI_PROF_PIPE_TENSOR_ACTIVE{instance=~\"$instance\", gpu=~\"$gpu\"}", + "datasource": { + "uid": "$datasource" + }, + "editorMode": "code", + "expr": "max(DCGM_FI_DEV_FB_USED{Hostname=~\"$Hostname\", gpu=~\"$gpu\"}) by (Hostname, gpu, GPU_I_ID, GPU_I_PROFILE)", "interval": "", - "legendFormat": "GPU {{gpu}}", + "legendFormat": "{{Hostname}} - GPU {{gpu}}/{{GPU_I_ID}} ({{GPU_I_PROFILE}})", + "range": true, "refId": "A" } ], - "thresholds": [], - "timeFrom": null, - "timeRegions": [], - "timeShift": null, - "title": "Tensor Core Utilization", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": "1", - "min": "0", - "show": true - }, + "title": "GPU Framebuffer Mem Used", + "transformations": [ { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true + "id": "renameByRegex", + "options": { + "regex": "(.*)/ \\(\\)$", + "renamePattern": "$1" + } } ], - "yaxis": { - "align": false, - "alignLevel": null - } + "type": "timeseries" } ], "refresh": false, - "schemaVersion": 22, + "schemaVersion": 37, "style": "dark", "tags": [], "templating": { "list": [ { "current": { - "selected": true, + "selected": false, "text": "Prometheus", "value": "Prometheus" }, @@ -747,47 +856,51 @@ "type": "datasource" }, { - "allValue": null, "current": {}, - "datasource": "$datasource", - "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "definition": "label_values(DCGM_FI_DEV_FB_FREE, Hostname)", "hide": 0, "includeAll": true, - "index": -1, - "label": null, "multi": true, - "name": "instance", + "name": "Hostname", "options": [], - "query": "label_values(DCGM_FI_DEV_GPU_TEMP, instance)", + "query": { + "query": "label_values(DCGM_FI_DEV_FB_FREE, Hostname)", + "refId": "StandardVariableQuery" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { - "allValue": null, "current": {}, - "datasource": "$datasource", - "definition": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", + "datasource": { + "type": "prometheus", + "uid": "$datasource" + }, + "definition": "label_values(DCGM_FI_DEV_FB_FREE{Hostname=~\"$Hostname\"}, gpu)", "hide": 0, "includeAll": true, - "index": -1, - "label": null, "multi": true, "name": "gpu", "options": [], - "query": "label_values(DCGM_FI_DEV_GPU_TEMP, gpu)", + "query": { + "query": "label_values(DCGM_FI_DEV_FB_FREE{Hostname=~\"$Hostname\"}, gpu)", + "refId": "StandardVariableQuery" + }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", - "tags": [], "tagsQuery": "", "type": "query", "useTags": false @@ -815,8 +928,6 @@ "timezone": "", "title": "NVIDIA DCGM Exporter Dashboard", "uid": "Oxed_c6Wz", - "variables": { - "list": [] - }, - "version": 1 + "version": 1, + "weekStart": "" }