-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocker-compose-node-gpu.yaml
80 lines (75 loc) · 1.79 KB
/
docker-compose-node-gpu.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
# Host-level metrics exporters: GPU (DCGM), node (node_exporter) and
# container (cAdvisor). All three join the `gpu_metrics` network.
#
# Environment variables read by Compose interpolation:
#   DCGM_EXPORTER_IMAGE_TAG   required - tag of the dcgm-exporter image
#   DCGM_EXPORTER_HOST_PORT   required - host port mapped to container port 9400
#   NODE_EXPORTER_IMAGE_TAG   optional - node-exporter tag (default: latest)
services:
  dcgm_exporter:
    # `:?` aborts `docker compose` with the given message when the variable
    # is unset or empty, instead of silently producing the invalid image
    # reference `...dcgm-exporter:`.
    image: "nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_IMAGE_TAG:?DCGM_EXPORTER_IMAGE_TAG must be set}"
    deploy:
      resources:
        reservations:
          # Reserve every NVIDIA GPU on the host for this container.
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: unless-stopped
    environment:
      # NOTE(review): presumably drops the hostname label from exported
      # metrics - confirm against the dcgm-exporter documentation.
      - DCGM_EXPORTER_NO_HOSTNAME=1
    cap_add:
      # NOTE(review): presumably needed for DCGM profiling metrics - confirm.
      - SYS_ADMIN
    ports:
      # Host port is mandatory; without the guard an unset variable would
      # collapse the mapping to ":9400".
      - "${DCGM_EXPORTER_HOST_PORT:?DCGM_EXPORTER_HOST_PORT must be set}:9400"
    networks:
      - gpu_metrics

  # Disabled alternative exporter, kept for reference. Re-enabling it
  # uses the `prometheus_textfiles` volume.
  # nvidia-smi-exporter:
  #   privileged: true
  #   build: ./nvidia-smi-exporter
  #   runtime: nvidia
  #   pid: "host"
  #   volumes:
  #     - prometheus_textfiles:/run/prometheus
  #     - /var/run/docker.sock:/var/run/docker.sock
  #   restart: unless-stopped

  node_exporter:
    # Tag is overridable so deployments can pin a release; falls back to
    # `latest` when NODE_EXPORTER_IMAGE_TAG is unset or empty.
    image: "prom/node-exporter:${NODE_EXPORTER_IMAGE_TAG:-latest}"
    restart: unless-stopped
    volumes:
      # Read-only views of the host that the --path.* flags below point at.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--web.listen-address=:9401'
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # `$$` is Compose's escape for a literal `$` (regex end-of-string).
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "9401:9401"
    networks:
      - gpu_metrics

  cadvisor:
    image: gcr.io/cadvisor/cadvisor:v0.47.1
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      # Linux only; this mount does not work on macOS.
      # - /cgroup:/cgroup:ro
    restart: unless-stopped
    command:
      # Must match the container side of the port mapping below.
      - '-port=8098'
    ports:
      - "8098:8098"
    networks:
      - gpu_metrics
# Named volumes declared by this stack.
volumes:
  # NOTE(review): prometheus_data and grafana_data are not mounted by any
  # active service visible in this file - presumably consumed by a
  # companion compose file; confirm before removing.
  prometheus_data:
    driver: local

  grafana_data:
    driver: local

  # tmpfs-backed scratch volume; its only visible reference is in the
  # commented-out nvidia-smi-exporter service.
  prometheus_textfiles:
    driver_opts:
      type: tmpfs
      device: tmpfs
# Bridge network joined by dcgm_exporter, node_exporter and cadvisor.
networks:
  gpu_metrics:
    driver: bridge