From 91ac4899eb7693a39976f52b9b4117da30380c6d Mon Sep 17 00:00:00 2001
From: Markham Lee
Date: Mon, 30 Sep 2024 19:49:44 -0700
Subject: [PATCH] Adding monitoring for AMD CPU w/ NVIDIA

---
 .../amdcpu_nvidiagpu/Dockerfile               |  27 ++++
 hardware_monitoring/amdcpu_nvidiagpu/main.py  | 136 ++++++++++++++++++
 .../hw_monitoring_libraries/amd_apu.py        |  88 +++++++++++
 .../hw_monitoring_libraries/nvidia_gpu.py     | 128 +++++++++++++++++
 4 files changed, 379 insertions(+)
 create mode 100644 hardware_monitoring/amdcpu_nvidiagpu/Dockerfile
 create mode 100644 hardware_monitoring/amdcpu_nvidiagpu/main.py
 create mode 100644 hardware_monitoring/hw_monitoring_libraries/amd_apu.py
 create mode 100644 hardware_monitoring/hw_monitoring_libraries/nvidia_gpu.py

diff --git a/hardware_monitoring/amdcpu_nvidiagpu/Dockerfile b/hardware_monitoring/amdcpu_nvidiagpu/Dockerfile
new file mode 100644
index 0000000..0cbf182
--- /dev/null
+++ b/hardware_monitoring/amdcpu_nvidiagpu/Dockerfile
@@ -0,0 +1,27 @@
+# build phase
+FROM python:slim-bookworm as builder
+
+WORKDIR /app
+
+RUN apt-get update -y && apt-get install -y gcc python3-dev
+
+COPY ./intel_x86/requirements.txt .
+
+RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt
+
+
+# stage two
+FROM python:slim-bookworm
+
+COPY --from=builder /app/wheels /wheels
+COPY --from=builder /app/requirements.txt .
+
+RUN pip3 install --no-cache /wheels/*
+
+COPY ./hw_monitoring_libraries ./hw_monitoring_libraries
+
+WORKDIR /hw_telemetry
+
+COPY ./amdcpu_nvidiagpu ./
+
+ENTRYPOINT ["python3", "/hw_telemetry/main.py"]
\ No newline at end of file
diff --git a/hardware_monitoring/amdcpu_nvidiagpu/main.py b/hardware_monitoring/amdcpu_nvidiagpu/main.py
new file mode 100644
index 0000000..63c479e
--- /dev/null
+++ b/hardware_monitoring/amdcpu_nvidiagpu/main.py
@@ -0,0 +1,136 @@
+#!/usr/bin/env python
+# Markham Lee (C) 2023
+# Hardware Monitor for Linux & Windows:
+# https://github.com/MarkhamLee/HardwareMonitoring
+# This is for Linux devices running on AMD CPUs with an NVIDIA GPU.
+# Settings come from environment variables rather than CLI arguments:
+# TOPIC, INTERVAL, MQTT_BROKER, MQTT_USER, MQTT_SECRET and MQTT_PORT,
+# e.g., TOPIC='/home/amd' INTERVAL=5
+import gc
+import json
+import logging
+import os
+import sys
+from time import sleep
+
+# this allows us to import modules from the parent directory
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from hw_monitoring_libraries.amd_apu import AMDCPUData  # noqa: E402
+from hw_monitoring_libraries.nvidia_gpu import NvidiaSensors  # noqa: E402
+from hw_monitoring_libraries.logging_util import logger  # noqa: E402
+from hw_monitoring_libraries.hw_monitoring\
+    import MonitoringUtilities  # noqa: E402
+
+
+def monitor(client: object, get_data: object,
+            gpu_data: object,
+            TOPIC: str,
+            INTERVAL: int):
+    """Sample CPU/GPU telemetry forever and publish it as JSON over MQTT.
+
+    client: MQTT client; publish(TOPIC, payload) must return a sequence
+        whose first item is the status code (0 = success)
+    get_data: provider of the CPU, RAM, frequency and temperature readings
+    gpu_data: provider of the NVIDIA readings via gpu_query()
+    TOPIC: MQTT topic the payload is published to
+    INTERVAL: seconds to sleep between samples
+    """
+
+    while True:
+
+        # get CPU utilization
+        cpu_util = get_data.get_cpu_data()
+
+        # get current RAM use
+        ram_util = get_data.get_ram_data()
+
+        # get current freq and core count
+        cpu_freq, core_count = get_data.get_freq()
+
+        # get CPU, iGPU and NVME temperatures
+        nvme_temp, cpu_temp, amdgpu_temp = get_data.amd_linux_temp_data()
+
+        # get NVIDIA GPU data
+        gpu_temp, gpu_load, gpu_vram, gpu_power, gpu_clock = gpu_data.\
+            gpu_query()
+
+        payload = {
+            "cpu_temp": cpu_temp,
+            "amdgpu_temp": amdgpu_temp,
+            "nvme_temp": nvme_temp,
+            "cpu_freq": cpu_freq,
+            "cpu_use": cpu_util,
+            "ram_use": ram_util,
+            "gpu_temp": gpu_temp,
+            "gpu_load": gpu_load,
+            "gpu_vram": gpu_vram,
+            "gpu_power": gpu_power,
+            "gpu_clock": gpu_clock
+        }
+
+        payload = json.dumps(payload)
+        logger.info(payload)
+
+        result = client.publish(TOPIC, payload)
+        status = result[0]
+
+        if status != 0:
+            print(f'Failed to send {payload} to: {TOPIC}')
+            # report through the project logger, like the rest of this
+            # module, and at error level so the failure is visible
+            logger.error(f'MQTT publishing failure, return code: {status}')
+
+        del payload, cpu_temp, amdgpu_temp, nvme_temp, cpu_freq, \
+            cpu_util, ram_util, status, result
+        gc.collect()
+
+        sleep(INTERVAL)
+
+
+def main():
+    """Read settings from the environment and start the monitoring loop.
+
+    Raises KeyError if a required environment variable is missing.
+    """
+
+    # instantiate utilities class
+    monitor_utilities = MonitoringUtilities()
+
+    TOPIC = os.environ['TOPIC']
+    INTERVAL = int(os.environ['INTERVAL'])
+
+    # load environmental variables
+    MQTT_BROKER = os.environ["MQTT_BROKER"]
+    MQTT_USER = os.environ['MQTT_USER']
+    MQTT_SECRET = os.environ['MQTT_SECRET']
+    MQTT_PORT = int(os.environ['MQTT_PORT'])
+
+    # get unique client ID
+    client_id = monitor_utilities.getClientID()
+
+    # get mqtt client
+    client, code = monitor_utilities.mqttClient(client_id,
+                                                MQTT_USER,
+                                                MQTT_SECRET,
+                                                MQTT_BROKER,
+                                                MQTT_PORT)
+
+    # instantiate CPU & GPU data classes
+    get_data = AMDCPUData()
+
+    # gpu monitoring class
+    gpu_data = NvidiaSensors()
+    logger.info('CPU & GPU monitoring classes instantiated')
+
+    # start data monitoring
+    try:
+        monitor(client, get_data, gpu_data, TOPIC, INTERVAL)
+
+    finally:
+        client.loop_stop()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/hardware_monitoring/hw_monitoring_libraries/amd_apu.py b/hardware_monitoring/hw_monitoring_libraries/amd_apu.py
new file mode 100644
index 0000000..28dad4c
--- /dev/null
+++ b/hardware_monitoring/hw_monitoring_libraries/amd_apu.py
@@ -0,0 +1,88 @@
+# Markham Lee (C) 2023
+# Hardware Monitor for Linux & Windows:
+# https://github.com/MarkhamLee/k3s-data-platform-IoT
+# script to retrieve CPU related data on an AMD x86 machine
+# for publishing by the hardware monitoring scripts
+import json
+import psutil
+
+
+class AMDCPUData():
+    """CPU, RAM and temperature readings for AMD x86 machines via psutil."""
+
+    def __init__(self):
+
+        # get the # of cores, as we can use that to iterate through and
+        # get things like current speed for all CPU cores
+        self.core_count = psutil.cpu_count(logical=False)
+
+    def build_payload(self, inputFunction, index=0):
+        """Return a JSON string mapping 'core N' to its rounded frequency.
+
+        inputFunction: per-CPU readings, each exposing a .current value
+        index: number of the first core to include
+        """
+
+        # slicing instead of indexing up to core_count avoids an IndexError
+        # when fewer readings than physical cores are available
+        cores = inputFunction[index:self.core_count]
+        temp_dict = {f'core {core}': round(entry.current, 1)
+                     for core, entry in enumerate(cores, start=index)}
+
+        return json.dumps(temp_dict)
+
+    # get average clock speed for all cores
+    def get_freq(self, all_cpu=False):
+        """Return (current CPU frequency rounded to 0.1, physical core count).
+
+        all_cpu: when True, the per-CPU readings are averaged
+        """
+
+        freq = psutil.cpu_freq(percpu=all_cpu)
+
+        if all_cpu:
+            # percpu=True yields a list of readings, which cannot be rounded
+            # directly, so reduce it to a single average first
+            all_freq = sum(entry.current for entry in freq) / len(freq)
+        else:
+            all_freq = freq.current
+
+        return round(all_freq, 1), self.core_count
+
+    # get frequency per core
+    def freq_per_core(self, all_cpu=True):
+        """Return the per-core frequencies as a JSON string."""
+
+        # instance method (not static): it needs self for build_payload
+        return self.build_payload(psutil.cpu_freq(percpu=all_cpu))
+
+    # CPU load
+    @staticmethod
+    def get_cpu_data():
+        """Return the CPU utilization, measured over a 1 second interval."""
+
+        return round(psutil.cpu_percent(interval=1), 1)
+
+    # get current RAM used
+    @staticmethod
+    def get_ram_data():
+        """Return the RAM currently in use in GiB, rounded to 2 decimals."""
+
+        return round(psutil.virtual_memory().used / 1073741824, 2)
+
+    @staticmethod
+    def amd_linux_temp_data():
+        """Return the (NVMe, CPU, AMD GPU) temperatures.
+
+        Raises KeyError if the 'nvme', 'k10temp' or 'amdgpu' sensor group
+        is missing.
+        """
+
+        # read the sensors once instead of once per device
+        temps = psutil.sensors_temperatures()
+
+        nvme_temp = temps['nvme'][0].current
+        cpu_temp = temps['k10temp'][0].current
+        amdgpu_temp = temps['amdgpu'][0].current
+
+        return nvme_temp, cpu_temp, amdgpu_temp
diff --git a/hardware_monitoring/hw_monitoring_libraries/nvidia_gpu.py b/hardware_monitoring/hw_monitoring_libraries/nvidia_gpu.py
new file mode 100644
index 0000000..6153d0e
--- /dev/null
+++ b/hardware_monitoring/hw_monitoring_libraries/nvidia_gpu.py
@@ -0,0 +1,128 @@
+# Markham Lee 2023 - 2024
+# Retrieving temp, load, etc., data from NVIDIA GPUs
+# https://github.com/MarkhamLee/HardwareMonitoring
+# All the GPU data comes from NVIDIA SMI queries, you can read more here:
+# https://nvidia.custhelp.com/app/answers/detail/a_id/3751/~/useful-nvidia-smi-queries
+# this script is platform agnostic, should work exactly the same on Linux and
+# Windows devices running x86 processors
+import subprocess as sp
+
+
+class NvidiaSensors():
+    """NVIDIA GPU readings obtained from nvidia-smi queries."""
+
+    def __init__(self):
+
+        pass
+
+    # parsing the data from the smi query is fairly consistent, so created
+    # this generalized query method I can either call directly with a query
+    # or call via the other methods that have the "canned" queries already
+    # set up.
+
+    @staticmethod
+    def smi_parser(query: str):
+        """Run an nvidia-smi --query-gpu query and return its parsed fields.
+
+        query: comma separated nvidia-smi field names
+
+        Returns one string per field of the first output line, holding only
+        the field's digits: units and decimal points are dropped, so a
+        field of "45.32 W" comes back as "4532".
+        """
+
+        # argument list and no shell, so the query text is never interpreted
+        # as shell syntax
+        cmd = ["nvidia-smi", f"--query-gpu={query}", "--format=csv,noheader"]
+        data = sp.check_output(cmd)
+        first_line = data.decode("utf-8").strip().split("\n")[0]
+
+        return [''.join(x for x in field if x.isdigit())
+                for field in first_line.split(',')]
+
+    # getting all the data in one query saves on quite a bit of latency,
+    # as whether it's one item or six, these queries run in about 30-40ms,
+    # doing them separate it was closer to 250ms
+    @staticmethod
+    def gpu_query():
+        """Return (temp, load, vram, power, clock) from a single query.
+
+        load is a fraction (percentage / 100), unlike gpu_load() which
+        returns the percentage itself; vram is memory.used / 1024.
+        """
+
+        query = ("temperature.gpu,utilization.gpu,memory.used,power.draw,"
+                 "clocks.current.graphics,encoder.stats.averageFps")
+
+        data = NvidiaSensors.smi_parser(query)
+
+        # split out each value from the returned list of values
+
+        temp = int(data[0])
+        gpu_load = int(data[1]) / 100
+        gpu_vram = round(((float(data[2])) / 1024), 2)
+        # smi_parser drops the decimal point, / 100 puts it back; assumes
+        # nvidia-smi reports power.draw with two decimals - TODO confirm
+        gpu_power = round((float(data[3])) / 100, 2)
+        gpu_clock = int(data[4])
+
+        # commented this out, as the FPS query shows the average across
+        # all apps not useful for when I'm playing games. Will need to look
+        # at an alternate approach like RivaTuner Statistics Server. Also the
+        # NVIDIA overlay app shows this data, so there is probably a way to
+        # get this data via SMI queries that I just haven't discovered yet.
+        # fps = int(data[5])
+
+        return temp, gpu_load, gpu_vram, gpu_power, gpu_clock
+
+    # the following are the individual queries for the data points included in
+    # the GPU query, I just put these here in case I needed them/needed to just
+    # get an individual data point.
+    @staticmethod
+    def gpu_load():
+        """Return the utilization.gpu reading as an int."""
+
+        data = NvidiaSensors.smi_parser("utilization.gpu")
+
+        return int(data[0])
+
+    @staticmethod
+    def gpu_temp():
+        """Return the temperature.gpu reading as an int."""
+
+        data = NvidiaSensors.smi_parser("temperature.gpu")
+
+        return int(data[0])
+
+    @staticmethod
+    def vram_used():
+        """Return the memory.used reading divided by 1024, to 2 decimals."""
+
+        data = NvidiaSensors.smi_parser("memory.used")
+
+        return round((float(data[0]) / 1024), 2)
+
+    @staticmethod
+    def gpu_power():
+        """Return the power.draw reading, rounded to 2 decimals."""
+
+        data = NvidiaSensors.smi_parser('power.draw')
+
+        # / 100 restores the decimal point that smi_parser drops
+        return round((float(data[0])) / 100, 2)
+
+    @staticmethod
+    def gpu_fps():
+        """Return the encoder.stats.averageFps reading as an int."""
+
+        data = NvidiaSensors.smi_parser('encoder.stats.averageFps')
+
+        return int(data[0])
+
+    @staticmethod
+    def gpu_clock():
+        """Return the clocks.current.graphics reading as an int."""
+
+        data = NvidiaSensors.smi_parser('clocks.current.graphics')
+
+        return int(data[0])