Adding monitoring for AMD CPU w/ NVIDIA

MarkhamLee · Oct 1, 2024 · 91ac489 · 91ac489
1 parent ebfc85e
commit 91ac489
Show file tree

Hide file tree

Showing 4 changed files with 341 additions and 0 deletions.
diff --git a/hardware_monitoring/amdcpu_nvidiagpu/Dockerfile b/hardware_monitoring/amdcpu_nvidiagpu/Dockerfile
@@ -0,0 +1,27 @@
+# build phase: compile the Python dependencies into wheels here so that the
+# final image does not need a compiler installed
+FROM python:slim-bookworm as builder
+
+WORKDIR /app
+
+# gcc and the Python headers are only installed in this builder stage
+RUN apt-get update -y && apt-get install -y gcc  python3-dev
+
+# NOTE(review): the requirements file is taken from ./intel_x86 even though
+# this image is for amdcpu_nvidiagpu - presumably the dependency list is
+# shared; confirm. The relative COPY paths in this file suggest the build
+# context is the parent hardware_monitoring directory - TODO confirm.
+COPY ./intel_x86/requirements.txt .
+
+# build wheels for the packages listed in requirements.txt into /app/wheels
+# (--no-deps: their dependencies are not built here)
+RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt
+
+
+# stage two: the runtime image, started from a fresh base
+FROM python:slim-bookworm
+
+# bring over only the build results from the first stage
+COPY --from=builder /app/wheels /wheels
+COPY --from=builder /app/requirements.txt .
+
+# install the wheels that were built in the first stage
+RUN pip3 install --no-cache /wheels/*
+
+# shared library code; main.py adds the parent of its own directory to
+# sys.path in order to import hw_monitoring_libraries
+COPY ./hw_monitoring_libraries ./hw_monitoring_libraries
+
+WORKDIR /hw_telemetry
+
+# the monitoring script itself, which ends up at /hw_telemetry/main.py
+COPY ./amdcpu_nvidiagpu ./
+
+# NOTE(review): the GPU library shells out to nvidia-smi, which is not
+# installed by this file - presumably supplied by the container runtime;
+# confirm.
+ENTRYPOINT ["python3", "/hw_telemetry/main.py"] 
diff --git a/hardware_monitoring/amdcpu_nvidiagpu/main.py b/hardware_monitoring/amdcpu_nvidiagpu/main.py
@@ -0,0 +1,122 @@
+#!/usr/bin/env python
+# Markham Lee (C) 2023
+# Hardware Monitor for Linux & Windows:
+# https://github.com/MarkhamLee/HardwareMonitoring
+# This is for Linux devices running on AMD CPUs with NVIDIA GPUs
+# Configuration is read from environment variables rather than CLI
+# arguments: TOPIC, INTERVAL (seconds to sleep between readings),
+# MQTT_BROKER, MQTT_USER, MQTT_SECRET and MQTT_PORT
+import gc
+import json
+import logging
+import os
+import sys
+from time import sleep
+
+# this allows us to import modules from the parent directory
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from hw_monitoring_libraries.amd_apu import AMDCPUData  # noqa: E402
+from hw_monitoring_libraries.nvidia_gpu import NvidiaSensors  # noqa: E402
+from hw_monitoring_libraries.logging_util import logger  # noqa: E402
+from hw_monitoring_libraries.hw_monitoring\
+    import MonitoringUtilities  # noqa: E402
+
+
+def monitor(client: object, get_data: object,
+            gpu_data: object,
+            TOPIC: str,
+            INTERVAL: int):
+    """Collect CPU and GPU telemetry in an endless loop and publish it.
+
+    Each pass reads the sensors, serialises the readings to a JSON string,
+    logs that string and publishes it to TOPIC. The function never returns
+    normally; it only exits if one of the calls raises.
+
+    Args:
+        client: object exposing publish(topic, payload); the first element
+            of the value it returns is treated as a status code, where 0
+            means success.
+        get_data: object exposing get_cpu_data(), get_ram_data(),
+            get_freq() and amd_linux_temp_data().
+        gpu_data: object exposing gpu_query(), returning five values
+            (temperature, load, VRAM, power, clock).
+        TOPIC: topic the JSON payload is published to.
+        INTERVAL: seconds to sleep between two passes.
+    """
+
+    while True:
+
+        # get CPU utilization
+        cpu_util = get_data.get_cpu_data()
+
+        # get current RAM use
+        ram_util = get_data.get_ram_data()
+
+        # get current freq; the core count that comes back with it is
+        # not part of the payload
+        cpu_freq, _ = get_data.get_freq()
+
+        # get CPU, iGPU and NVME temperatures
+        nvme_temp, cpu_temp, amdgpu_temp = get_data.amd_linux_temp_data()
+
+        # get NVIDIA GPU data
+        gpu_temp, gpu_load, gpu_vram, gpu_power, gpu_clock = \
+            gpu_data.gpu_query()
+
+        payload = json.dumps({
+            "cpu_temp": cpu_temp,
+            "amdgpu_temp": amdgpu_temp,
+            "nvme_temp": nvme_temp,
+            "cpu_freq": cpu_freq,
+            "cpu_use": cpu_util,
+            "ram_use": ram_util,
+            "gpu_temp": gpu_temp,
+            "gpu_load": gpu_load,
+            "gpu_vram": gpu_vram,
+            "gpu_power": gpu_power,
+            "gpu_clock": gpu_clock,
+        })
+        logger.info(payload)
+
+        result = client.publish(TOPIC, payload)
+        status = result[0]
+
+        if status != 0:
+            print(f'Failed to send {payload} to: {TOPIC}')
+            # report through the project logger (not the bare logging
+            # module) and at error level, so that a failed publish is
+            # not silently dropped
+            logger.error('MQTT publishing failure, return code: %s', status)
+
+        # every name above is rebound on the next pass, so no explicit
+        # del is needed before the collection
+        gc.collect()
+
+        sleep(INTERVAL)
+
+
+def main():
+    """Read the configuration from the environment and start monitoring.
+
+    Builds the MQTT client through MonitoringUtilities, creates the CPU and
+    GPU data classes and hands everything to monitor(). A missing
+    environment variable raises KeyError; the MQTT client's loop is stopped
+    once monitor() exits.
+    """
+
+    # utilities class provides the client ID and the MQTT client
+    monitor_utilities = MonitoringUtilities()
+
+    env = os.environ
+
+    # publishing settings
+    TOPIC = env['TOPIC']
+    INTERVAL = int(env['INTERVAL'])
+
+    # broker connection settings
+    MQTT_BROKER = env["MQTT_BROKER"]
+    MQTT_USER = env['MQTT_USER']
+    MQTT_SECRET = env['MQTT_SECRET']
+    MQTT_PORT = int(env['MQTT_PORT'])
+
+    # unique client ID, then the MQTT client itself; the second value that
+    # mqttClient returns is not used here
+    client_id = monitor_utilities.getClientID()
+    connection = (MQTT_USER, MQTT_SECRET, MQTT_BROKER, MQTT_PORT)
+    client, _ = monitor_utilities.mqttClient(client_id, *connection)
+
+    # CPU and GPU monitoring classes
+    get_data = AMDCPUData()
+    gpu_data = NvidiaSensors()
+    logger.info('CPU & GPU monitoring classes instantiated')
+
+    # start data monitoring; always stop the client loop on the way out
+    try:
+        monitor(client, get_data, gpu_data, TOPIC, INTERVAL)
+
+    finally:
+        client.loop_stop()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/hardware_monitoring/hw_monitoring_libraries/amd_apu.py b/hardware_monitoring/hw_monitoring_libraries/amd_apu.py
@@ -0,0 +1,75 @@
+# Markham Lee (C) 2023
+# Hardware Monitor for Linux & Windows:
+# https://github.com/MarkhamLee/k3s-data-platform-IoT
+# helper class to retrieve CPU, RAM and temperature data on an AMD x86
+# machine via psutil; sending the data anywhere is left to the caller
+import json
+import psutil
+
+
+class AMDCPUData():
+    """Reads CPU load, clock speed, RAM use and temperatures via psutil."""
+
+    # number of bytes in one GiB, used to convert psutil's byte counts
+    _BYTES_PER_GIB = 1073741824
+
+    def __init__(self):
+
+        # get the # of cores, as we can use that to iterate through and
+        # get things like current speed for all CPU cores
+        # NOTE(review): psutil documents that this can be None when the
+        # count cannot be determined - confirm that cannot happen on the
+        # target machines
+        self.core_count = psutil.cpu_count(logical=False)
+
+    def build_payload(self, inputFunction, index=0):
+        """Return a JSON string of per-core readings.
+
+        Args:
+            inputFunction: sequence of readings, each with a ``current``
+                attribute (e.g. the list from psutil.cpu_freq(percpu=True)).
+            index: position of the first reading to include.
+
+        Returns:
+            JSON object string mapping 'core <n>' to the reading rounded to
+            one decimal place.
+        """
+
+        # never read past the end of the sequence: there may be fewer
+        # readings than physical cores
+        last = min(self.core_count, len(inputFunction))
+
+        temp_dict = {
+            f'core {core}': round(inputFunction[core].current, 1)
+            for core in range(index, last)
+        }
+
+        return json.dumps(temp_dict)
+
+    # get average clock speed for all cores
+    def get_freq(self, all_cpu=False):
+        """Return (current clock speed rounded to 0.1, physical core count).
+
+        Args:
+            all_cpu: passed to psutil.cpu_freq as ``percpu``. When True the
+                per-CPU readings are averaged into a single value.
+        """
+
+        freq = psutil.cpu_freq(percpu=all_cpu)
+
+        if all_cpu:
+            # percpu=True returns a list of readings rather than a single
+            # one, so average their current values
+            current = sum(cpu.current for cpu in freq) / len(freq)
+        else:
+            current = freq.current
+
+        return round(current, 1), self.core_count
+
+    # get frequency per core
+    def freq_per_core(self, all_cpu=True):
+        """Return the per-core clock speeds as a JSON string.
+
+        This is an instance method because it needs build_payload and the
+        core count stored on the instance.
+        """
+
+        return self.build_payload(psutil.cpu_freq(percpu=all_cpu))
+
+    # CPU load
+    @staticmethod
+    def get_cpu_data():
+        """Return CPU utilisation in percent, rounded to one decimal.
+
+        The reading is taken with interval=1, so the call blocks for about
+        one second.
+        """
+
+        return round(psutil.cpu_percent(interval=1), 1)
+
+    # get current RAM used
+    @staticmethod
+    def get_ram_data():
+        """Return the RAM currently in use, in GiB rounded to two decimals."""
+
+        ram_use = psutil.virtual_memory().used / AMDCPUData._BYTES_PER_GIB
+
+        return round(ram_use, 2)
+
+    @staticmethod
+    def amd_linux_temp_data():
+        """Return the (nvme, cpu, amdgpu) temperatures.
+
+        Uses the first reading of the 'nvme', 'k10temp' and 'amdgpu'
+        sensor groups.
+
+        Raises:
+            KeyError: if one of the three sensor groups is not present.
+        """
+
+        # read the sensors once so all three values come from one snapshot
+        temps = psutil.sensors_temperatures()
+
+        nvme_temp = temps['nvme'][0].current
+        cpu_temp = temps['k10temp'][0].current
+        amdgpu_temp = temps['amdgpu'][0].current
+
+        return nvme_temp, cpu_temp, amdgpu_temp
diff --git a/hardware_monitoring/hw_monitoring_libraries/nvidia_gpu.py b/hardware_monitoring/hw_monitoring_libraries/nvidia_gpu.py
@@ -0,0 +1,117 @@
+# Markham Lee 2023 - 2024
+# Retrieving temp, load, etc., data from NVIDIA GPUs
+# https://github.com/MarkhamLee/HardwareMonitoring
+# All the GPU data comes from NVIDIA SMI queries, you can read more here:
+# https://nvidia.custhelp.com/app/answers/detail/a_id/3751/~/useful-nvidia-smi-queries
+# this script is platform agnostic, should work exactly the same on Linux and
+# Windows devices running x86 processors
+import subprocess as sp
+
+
+class NvidiaSensors():
+    """Reads NVIDIA GPU telemetry by running nvidia-smi queries.
+
+    Every method is static and starts one nvidia-smi process per call. Only
+    the first line of output (the first GPU listed) is used.
+    """
+
+    def __init__(self):
+
+        pass
+
+    # parsing the data from the smi query is fairly consistent, so created
+    # this generalized query method I can either call directly with a query
+    # or call via the other methods that have the "canned" queries already
+    # set up.
+
+    @staticmethod
+    def smi_parser(query: str):
+        """Run an nvidia-smi --query-gpu query and return its fields.
+
+        Args:
+            query: comma separated list of nvidia-smi field names.
+
+        Returns:
+            One string per field of the first output line, reduced to its
+            digit characters only: units, spaces, signs and the decimal
+            point are all removed.
+
+        Raises:
+            subprocess.CalledProcessError: if nvidia-smi exits non-zero.
+        """
+
+        # pass an argument list and no shell, so the query text is handed
+        # to nvidia-smi as-is and is never interpreted by a shell
+        cmd = ["nvidia-smi", "--query-gpu=" + query, "--format=csv,noheader"]
+        output = sp.check_output(cmd).decode("utf-8")
+
+        first_line = output.strip().split("\n")[0]
+
+        return [''.join(ch for ch in field if ch.isdigit())
+                for field in first_line.split(',')]
+
+    @staticmethod
+    def _vram_gib(field: str):
+        """Convert a memory.used field to a value rounded to 2 decimals."""
+
+        # presumably MiB -> GiB; confirm against the nvidia-smi output
+        return round(float(field) / 1024, 2)
+
+    @staticmethod
+    def _power_watts(field: str):
+        """Convert a power.draw field to a value rounded to 2 decimals."""
+
+        # smi_parser drops the decimal point, so dividing by 100 puts it
+        # back; this assumes the reading is printed with exactly two
+        # decimal places - TODO confirm
+        return round(float(field) / 100, 2)
+
+    # getting all the data in one query saves on quite a bit of latency,
+    # as whether it's one item or six, these queries run in about 30-40ms,
+    # doing them separate it was closer to 250ms
+    @staticmethod
+    def gpu_query():
+        """Return (temp, load, vram, power, clock) from a single query.
+
+        Note that load is divided by 100 here, unlike gpu_load(), which
+        returns the reading as reported.
+        """
+
+        query = ("temperature.gpu,utilization.gpu,memory.used,power.draw,"
+                 "clocks.current.graphics,encoder.stats.averageFps")
+
+        data = NvidiaSensors.smi_parser(query)
+
+        # split out each value from the returned list of values
+
+        temp = int(data[0])
+        gpu_load = int(data[1]) / 100
+        gpu_vram = NvidiaSensors._vram_gib(data[2])
+        gpu_power = NvidiaSensors._power_watts(data[3])
+        gpu_clock = int(data[4])
+
+        # commented this out, as the FPS query shows the average across
+        # all apps not useful for when I'm playing games. Will need to look
+        # at an alternate approach like Riva Statistics Tuner. Also the
+        # NVIDIA overlay app shows this data, so there is probably a way to
+        # get this data via SMI queries that I just haven't discovered yet.
+        # fps = int(data[5]) #
+
+        return temp, gpu_load, gpu_vram, gpu_power, gpu_clock
+
+    # the following are the individual queries for the data points included in
+    # the GPU query, I just put these here in case I needed them/needed to just
+    # get an individual data point.
+    @staticmethod
+    def gpu_load():
+        """Return utilization.gpu as an int (not divided by 100)."""
+
+        return int(NvidiaSensors.smi_parser("utilization.gpu")[0])
+
+    @staticmethod
+    def gpu_temp():
+        """Return temperature.gpu as an int."""
+
+        return int(NvidiaSensors.smi_parser("temperature.gpu")[0])
+
+    @staticmethod
+    def vram_used():
+        """Return memory.used divided by 1024, rounded to two decimals."""
+
+        return NvidiaSensors._vram_gib(
+            NvidiaSensors.smi_parser("memory.used")[0])
+
+    @staticmethod
+    def gpu_power():
+        """Return power.draw rounded to two decimals."""
+
+        return NvidiaSensors._power_watts(
+            NvidiaSensors.smi_parser('power.draw')[0])
+
+    @staticmethod
+    def gpu_fps():
+        """Return encoder.stats.averageFps as an int."""
+
+        return int(NvidiaSensors.smi_parser('encoder.stats.averageFps')[0])
+
+    @staticmethod
+    def gpu_clock():
+        """Return clocks.current.graphics as an int."""
+
+        return int(NvidiaSensors.smi_parser('clocks.current.graphics')[0])