-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Adding monitoring for AMD CPU w/ NVIDIA
- Loading branch information
1 parent
ebfc85e
commit 91ac489
Showing
4 changed files
with
341 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
# Build stage: pre-build a wheel for every Python dependency so the runtime
# image does not need a compiler toolchain.
FROM python:slim-bookworm AS builder

WORKDIR /app

# Compiler and Python headers - presumably required by dependencies that
# ship C extensions.
RUN apt-get update -y && apt-get install -y gcc python3-dev

# NOTE(review): the requirements file comes from the intel_x86 directory even
# though this image runs the amdcpu_nvidiagpu monitor - presumably the
# dependency list is shared between the two; confirm.
COPY ./intel_x86/requirements.txt .

RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt


# Runtime stage: install the pre-built wheels and add the application code.
FROM python:slim-bookworm

COPY --from=builder /app/wheels /wheels
COPY --from=builder /app/requirements.txt .

RUN pip3 install --no-cache-dir /wheels/*

# Copied before WORKDIR is set, so the shared libraries do not end up under
# /hw_telemetry. main.py adds the parent of its own directory to sys.path to
# import them - assumes the base image's default working directory is "/".
COPY ./hw_monitoring_libraries ./hw_monitoring_libraries

WORKDIR /hw_telemetry

COPY ./amdcpu_nvidiagpu ./

ENTRYPOINT ["python3", "/hw_telemetry/main.py"]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
#!/usr/bin/env python | ||
# Markham Lee (C) 2023 | ||
# Hardware Monitor for Linux & Windows: | ||
# https://github.com/MarkhamLee/HardwareMonitoring | ||
# This is for Linux devices running on AMD CPUs with an NVIDIA GPU | ||
# Configuration is read from environment variables rather than CLI | ||
# arguments: TOPIC, INTERVAL, MQTT_BROKER, MQTT_USER, MQTT_SECRET, MQTT_PORT | ||
# e.g., TOPIC='/home/amd' and INTERVAL=5 (sleep interval as an integer) | ||
import gc | ||
import json | ||
import logging | ||
import os | ||
import sys | ||
from time import sleep | ||
|
||
# this allows us to import modules from the parent directory | ||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | ||
sys.path.append(parent_dir) | ||
|
||
from hw_monitoring_libraries.amd_apu import AMDCPUData # noqa: E402 | ||
from hw_monitoring_libraries.nvidia_gpu import NvidiaSensors # noqa: E402 | ||
from hw_monitoring_libraries.logging_util import logger # noqa: E402 | ||
from hw_monitoring_libraries.hw_monitoring\ | ||
import MonitoringUtilities # noqa: E402 | ||
|
||
|
||
def monitor(client: object, get_data: object,
            gpu_data: object,
            TOPIC: str,
            INTERVAL: int):
    """Collect CPU/GPU telemetry in an endless loop and publish it via MQTT.

    Every pass gathers CPU load, RAM use, CPU frequency and the NVMe/CPU/iGPU
    temperatures from ``get_data`` plus the discrete GPU readings from
    ``gpu_data``, serialises them to a JSON string, logs and publishes that
    string to ``TOPIC`` and then sleeps for ``INTERVAL``. The loop has no
    exit condition; it only ends when an exception propagates.

    Args:
        client: MQTT client. ``publish(topic, payload)`` must return a
            sequence whose first item is the return code; any non-zero
            code is treated as a failure.
        get_data: Object providing ``get_cpu_data()``, ``get_ram_data()``,
            ``get_freq()`` and ``amd_linux_temp_data()``.
        gpu_data: Object providing ``gpu_query()``.
        TOPIC: MQTT topic the payload is published to.
        INTERVAL: Value handed to ``sleep`` between two passes.
    """
    while True:

        cpu_util = get_data.get_cpu_data()
        ram_util = get_data.get_ram_data()

        # the core count comes back with the frequency but is not published
        cpu_freq, _core_count = get_data.get_freq()

        nvme_temp, cpu_temp, amdgpu_temp = get_data.amd_linux_temp_data()

        # NVIDIA GPU readings, all fetched with a single query
        gpu_temp, gpu_load, gpu_vram, gpu_power, gpu_clock = \
            gpu_data.gpu_query()

        payload = json.dumps({
            "cpu_temp": cpu_temp,
            "amdgpu_temp": amdgpu_temp,
            "nvme_temp": nvme_temp,
            "cpu_freq": cpu_freq,
            "cpu_use": cpu_util,
            "ram_use": ram_util,
            "gpu_temp": gpu_temp,
            "gpu_load": gpu_load,
            "gpu_vram": gpu_vram,
            "gpu_power": gpu_power,
            "gpu_clock": gpu_clock,
        })
        logger.info(payload)

        result = client.publish(TOPIC, payload)
        status = result[0]

        if status != 0:
            # Report failures through the same logger as everything else;
            # the root logger's debug level is usually filtered out.
            logger.error(f'Failed to send {payload} to: {TOPIC}, '
                         f'MQTT publishing failure, return code: {status}')

        # Explicit collection kept from the original implementation. The
        # per-iteration names are simply rebound on the next pass, so no
        # manual ``del`` is needed.
        gc.collect()

        sleep(INTERVAL)
|
||
|
||
def main():
    """Wire up the MQTT client and sensor readers, then start monitoring.

    All settings are read from the environment: ``TOPIC``, ``INTERVAL``,
    ``MQTT_BROKER``, ``MQTT_USER``, ``MQTT_SECRET`` and ``MQTT_PORT``. A
    missing variable raises ``KeyError``; a non-integer ``INTERVAL`` or
    ``MQTT_PORT`` raises ``ValueError``. The client's ``loop_stop()`` is
    always called once monitoring ends, whatever the reason.
    """
    utilities = MonitoringUtilities()

    env = os.environ

    # publishing settings
    topic = env['TOPIC']
    interval = int(env['INTERVAL'])

    # broker connection settings
    broker = env["MQTT_BROKER"]
    user = env['MQTT_USER']
    secret = env['MQTT_SECRET']
    port = int(env['MQTT_PORT'])

    # the second item returned by mqttClient is not used here
    client, _code = utilities.mqttClient(utilities.getClientID(),
                                         user,
                                         secret,
                                         broker,
                                         port)

    # CPU reader first, then the GPU reader
    cpu_reader = AMDCPUData()
    gpu_reader = NvidiaSensors()
    logger.info('CPU & GPU monitoring classes instantiated')

    try:
        monitor(client, cpu_reader, gpu_reader, topic, interval)
    finally:
        client.loop_stop()


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
# Markham Lee (C) 2023 | ||
# Hardware Monitor for Linux & Windows: | ||
# https://github.com/MarkhamLee/k3s-data-platform-IoT | ||
# script to retrieve CPU related data on an AMD x86 machine | ||
# and then write to InfluxDB | ||
import json | ||
import psutil | ||
|
||
|
||
class AMDCPUData():
    """Read CPU, RAM and temperature data on an AMD machine through psutil."""

    def __init__(self):
        # Physical core count; used as the upper bound when iterating over
        # per-core readings such as the current clock speed.
        self.core_count = psutil.cpu_count(logical=False)

    def build_payload(self, inputFunction, index=0):
        """Serialise per-core ``.current`` readings to a JSON string.

        Args:
            inputFunction: Indexable collection of objects exposing a
                ``current`` attribute (e.g. the per-CPU result of
                ``psutil.cpu_freq``). Must hold at least ``core_count``
                items, otherwise ``IndexError`` is raised.
            index: First core to include; cores ``index`` up to
                ``core_count - 1`` are reported.

        Returns:
            JSON object string mapping ``"core <n>"`` to the reading
            rounded to one decimal place.
        """
        readings = {
            f'core {core}': round(inputFunction[core].current, 1)
            for core in range(index, self.core_count)
        }

        return json.dumps(readings)

    # get average clock speed for all cores
    def get_freq(self, all_cpu=False):
        """Return ``(current frequency rounded to 1 decimal, core count)``.

        NOTE(review): only the default ``all_cpu=False`` looks usable - with
        ``True`` the first element is presumably a per-core record rather
        than a number, which ``round`` cannot handle; confirm before use.
        """
        current_freq = round(psutil.cpu_freq(percpu=all_cpu)[0], 1)

        return current_freq, self.core_count

    # get frequency per core
    def freq_per_core(self, all_cpu=True):
        """Return the per-core frequencies as a JSON string.

        This has to be a regular method: it needs ``self`` to reach
        ``build_payload`` and ``core_count``.
        """
        return self.build_payload(psutil.cpu_freq(percpu=all_cpu))

    # CPU load
    @staticmethod
    def get_cpu_data():
        """Return CPU utilisation sampled over one second, 1 decimal."""
        return round(psutil.cpu_percent(interval=1), 1)

    # get current RAM used
    @staticmethod
    def get_ram_data():
        """Return the RAM currently in use in GiB, rounded to 2 decimals."""
        bytes_per_gib = 1024 ** 3
        # ``used`` is the field at index 3 of psutil's virtual memory result
        used_bytes = psutil.virtual_memory().used

        return round(used_bytes / bytes_per_gib, 2)

    @staticmethod
    def amd_linux_temp_data():
        """Return ``(nvme_temp, cpu_temp, amdgpu_temp)``.

        Each value is the ``current`` reading of the first entry reported
        for the ``nvme``, ``k10temp`` and ``amdgpu`` sensors respectively.
        A sensor that is not reported raises ``KeyError``.
        """
        # Query the sensors once so all three values share one snapshot.
        sensors = psutil.sensors_temperatures()

        nvme_temp = sensors['nvme'][0].current
        cpu_temp = sensors['k10temp'][0].current
        amdgpu_temp = sensors['amdgpu'][0].current

        return nvme_temp, cpu_temp, amdgpu_temp
117 changes: 117 additions & 0 deletions
117
hardware_monitoring/hw_monitoring_libraries/nvidia_gpu.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
# Markham Lee 2023 - 2024 | ||
# Retrieving temp, load, etc., data from NVIDIA GPUs | ||
# # https://github.com/MarkhamLee/HardwareMonitoring | ||
# All the GPU data comes from NVIDIA SMI queries, you can read more here: | ||
# https://nvidia.custhelp.com/app/answers/detail/a_id/3751/~/useful-nvidia-smi-queries | ||
# this script is platform agnostic, should work exactly the same on Linux and | ||
# Windows devices running x86 processors | ||
import subprocess as sp | ||
|
||
|
||
class NvidiaSensors():
    """Read NVIDIA GPU telemetry by running ``nvidia-smi`` queries.

    Only the first row of the query output is used, i.e. the first GPU
    listed by ``nvidia-smi``.
    """

    def __init__(self):

        pass

    @staticmethod
    def _parse_smi_output(raw):
        """Turn raw ``nvidia-smi`` CSV bytes into digit-only field strings.

        Only the first output line is parsed. Every character that is not
        a digit is dropped from each comma separated field, so units
        vanish and so does a decimal point: ``"45.32 W"`` becomes
        ``"4532"``. A field without any digit becomes ``""``.
        """
        first_row = raw.decode("utf-8").strip().split("\n")[0]

        return [''.join(ch for ch in field if ch.isdigit())
                for field in first_row.split(',')]

    # parsing the data from the smi query is fairly consistent, so this
    # generalized query method can either be called directly with a query
    # or via the other methods that have the "canned" queries already
    # set up.
    @staticmethod
    def smi_parser(query: str):
        """Run ``nvidia-smi --query-gpu=<query>`` and return the parsed row.

        Args:
            query: Comma separated list of nvidia-smi query fields.

        Returns:
            List with one digit-only string per requested field.

        Raises:
            subprocess.CalledProcessError: ``nvidia-smi`` exited with a
                non-zero status.
            FileNotFoundError: ``nvidia-smi`` could not be found.
        """
        # An argument list avoids handing a concatenated string to a shell.
        cmd = ["nvidia-smi", "--query-gpu=" + query, "--format=csv,noheader"]

        return NvidiaSensors._parse_smi_output(sp.check_output(cmd))

    # getting all the data in one query saves on quite a bit of latency,
    # as whether it's one item or six, these queries run in about 30-40ms,
    # doing them separate it was closer to 250ms
    @staticmethod
    def gpu_query():
        """Fetch all GPU readings with a single query.

        Returns:
            Tuple ``(temp, gpu_load, gpu_vram, gpu_power, gpu_clock)``:
            temperature as int, load divided by 100, memory used divided
            by 1024 and rounded to 2 decimals, power draw, graphics clock
            as int. Note that ``gpu_load`` here is the queried value
            divided by 100, whereas the stand-alone ``gpu_load()`` method
            returns the undivided integer.
        """
        query = ("temperature.gpu,utilization.gpu,memory.used,power.draw,"
                 "clocks.current.graphics,encoder.stats.averageFps")

        data = NvidiaSensors.smi_parser(query)

        temp = int(data[0])
        gpu_load = int(data[1]) / 100
        gpu_vram = round(float(data[2]) / 1024, 2)
        # The parser drops the decimal point; dividing by 100 restores it,
        # which assumes the power draw is printed with two decimals.
        gpu_power = round(float(data[3]) / 100, 2)
        gpu_clock = int(data[4])

        # The FPS value (data[5]) is not returned: the query shows the
        # average across all apps, which is not useful while gaming. An
        # alternate approach (e.g. Riva Statistics Tuner) is needed; the
        # NVIDIA overlay app shows this data, so there is probably an SMI
        # query for it that has not been found yet.

        return temp, gpu_load, gpu_vram, gpu_power, gpu_clock

    # the following are the individual queries for the data points included
    # in the GPU query, kept in case a single data point is needed.
    @staticmethod
    def gpu_load():
        """Return the ``utilization.gpu`` reading as an int."""
        return int(NvidiaSensors.smi_parser("utilization.gpu")[0])

    @staticmethod
    def gpu_temp():
        """Return the ``temperature.gpu`` reading as an int."""
        return int(NvidiaSensors.smi_parser("temperature.gpu")[0])

    @staticmethod
    def vram_used():
        """Return ``memory.used`` divided by 1024, rounded to 2 decimals."""
        used = NvidiaSensors.smi_parser("memory.used")[0]

        return round(float(used) / 1024, 2)

    @staticmethod
    def gpu_power():
        """Return ``power.draw`` rescaled by 1/100, rounded to 2 decimals.

        The division undoes the decimal point lost in parsing; it assumes
        the value is printed with two decimals.
        """
        draw = NvidiaSensors.smi_parser('power.draw')[0]

        return round(float(draw) / 100, 2)

    @staticmethod
    def gpu_fps():
        """Return the ``encoder.stats.averageFps`` reading as an int."""
        return int(NvidiaSensors.smi_parser('encoder.stats.averageFps')[0])

    @staticmethod
    def gpu_clock():
        """Return the ``clocks.current.graphics`` reading as an int."""
        return int(NvidiaSensors.smi_parser('clocks.current.graphics')[0])