-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdocker-compose.yaml
101 lines (95 loc) · 2.6 KB
/
docker-compose.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Monitoring stack: GPU, host and container metric exporters plus Prometheus
# and Grafana. Every service joins the `gpu_metrics` network.
#
# Variables that must be supplied (shell environment or .env file):
#   DCGM_EXPORTER_IMAGE_TAG, DCGM_EXPORTER_HOST_PORT,
#   PROMETHEUS_IMAGE_TAG, PROMETHEUS_HOST_PORT,
#   GRAFANA_IMAGE_TAG, GRAFANA_HOST_PORT,
#   GRAFANA_ADMIN_USER, GRAFANA_ADMIN_PASSWORD
# Optional overrides (defaults reproduce the previously hard-coded values):
#   NODE_EXPORTER_IMAGE_TAG, NODE_EXPORTER_HOST_PORT,
#   CADVISOR_IMAGE_TAG, CADVISOR_HOST_PORT
services:
  # NVIDIA DCGM exporter; reserves every NVIDIA GPU on the host.
  dcgm_exporter:
    image: "nvcr.io/nvidia/k8s/dcgm-exporter:${DCGM_EXPORTER_IMAGE_TAG}"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    restart: always
    environment:
      # NOTE(review): presumably omits the hostname label from exported
      # metrics - confirm against the dcgm-exporter documentation.
      - DCGM_EXPORTER_NO_HOSTNAME=1
    cap_add:
      - SYS_ADMIN
    ports:
      - "${DCGM_EXPORTER_HOST_PORT}:9400"
    networks:
      - gpu_metrics

  # Host-level metrics. The host's /proc, /sys and / are mounted read-only
  # and the exporter is pointed at those mount points via --path.* flags.
  node_exporter:
    # Set NODE_EXPORTER_IMAGE_TAG to pin a release; `latest` is a moving tag.
    image: "prom/node-exporter:${NODE_EXPORTER_IMAGE_TAG:-latest}"
    restart: unless-stopped
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--web.listen-address=:9401'
      - '--path.procfs=/host/proc'
      - '--path.rootfs=/rootfs'
      - '--path.sysfs=/host/sys'
      # `$$` is Compose's escape for a literal `$` in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    ports:
      - "${NODE_EXPORTER_HOST_PORT:-9401}:9401"
    networks:
      - gpu_metrics

  # Per-container resource metrics; listens on 8098 inside the container.
  cadvisor:
    image: "gcr.io/cadvisor/cadvisor:${CADVISOR_IMAGE_TAG:-v0.47.1}"
    container_name: cadvisor
    privileged: true
    devices:
      - /dev/kmsg:/dev/kmsg
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      # - /cgroup:/cgroup:ro  # Linux only; does not work on macOS
    restart: unless-stopped
    command:
      - '-port=8098'
    ports:
      - "${CADVISOR_HOST_PORT:-8098}:8098"
    networks:
      - gpu_metrics

  # Prometheus server; TSDB data lives in the `prometheus_data` volume.
  prometheus:
    image: "prom/prometheus:${PROMETHEUS_IMAGE_TAG}"
    ports:
      - "${PROMETHEUS_HOST_PORT}:9090"
    command:
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/usr/share/prometheus/console_libraries'
      - '--web.console.templates=/usr/share/prometheus/consoles'
      - '--config.file=/home/prometheus.yml'
      # Allows reload/shutdown through the HTTP API.
      - '--web.enable-lifecycle'
    restart: always
    volumes:
      # The config only has to be readable by the container user, e.g.
      #   chmod 644 gpu-metrics-prometheus-config.yaml
      # World-writable 777 is not required; the file is mounted read-only.
      - ./gpu-metrics-prometheus-config.yaml:/home/prometheus.yml:ro
      - prometheus_data:/prometheus
    networks:
      - gpu_metrics

  # Grafana; state lives in the `grafana_data` volume, while provisioning
  # files and dashboards are mounted read-only from the project directory.
  grafana:
    image: "grafana/grafana:${GRAFANA_IMAGE_TAG}"
    ports:
      - "${GRAFANA_HOST_PORT}:3000"
    restart: always
    environment:
      # Admin credentials come from the environment; keep them out of VCS.
      - GF_SECURITY_ADMIN_USER=${GRAFANA_ADMIN_USER}
      - GF_SECURITY_ADMIN_PASSWORD=${GRAFANA_ADMIN_PASSWORD}
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana_provisioning:/etc/grafana/provisioning:ro
      - ./grafana_dashboards:/etc/dashboards:ro
    networks:
      - gpu_metrics

# Named volumes that keep service data across container re-creation.
volumes:
  # Prometheus TSDB storage, mounted at /prometheus by the prometheus service.
  prometheus_data:
    driver: local

  # Grafana state, mounted at /var/lib/grafana by the grafana service.
  grafana_data:
    driver: local

# Bridge network that every service in this file attaches to.
networks:
  gpu_metrics:
    driver: bridge