Skip to content

Commit

Permalink
Adding monitoring for AMD CPU w/ NVIDIA
Browse files Browse the repository at this point in the history
  • Loading branch information
MarkhamLee committed Oct 1, 2024
1 parent ebfc85e commit 91ac489
Show file tree
Hide file tree
Showing 4 changed files with 341 additions and 0 deletions.
27 changes: 27 additions & 0 deletions hardware_monitoring/amdcpu_nvidiagpu/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# build phase: build wheels for the Python dependencies in a throwaway image
# so that only the finished wheels are carried into the runtime image.
# All COPY source paths are relative to the build context, so the context has
# to be the directory that contains intel_x86/, hw_monitoring_libraries/ and
# amdcpu_nvidiagpu/.
FROM python:slim-bookworm as builder

WORKDIR /app

# compiler and Python headers, presumably for dependencies that have to be
# compiled when their wheels are built
RUN apt-get update -y && apt-get install -y gcc python3-dev

# NOTE(review): the requirements file is taken from intel_x86/ rather than
# amdcpu_nvidiagpu/ - presumably shared between the monitors; confirm
COPY ./intel_x86/requirements.txt .

RUN pip3 wheel --no-cache-dir --no-deps --wheel-dir /app/wheels -r requirements.txt


# stage two: runtime image, installs the wheels built above
FROM python:slim-bookworm

COPY --from=builder /app/wheels /wheels
COPY --from=builder /app/requirements.txt .

RUN pip3 install --no-cache /wheels/*

# shared library package; copied before WORKDIR changes, main.py adds its
# parent directory to sys.path in order to import it
COPY ./hw_monitoring_libraries ./hw_monitoring_libraries

WORKDIR /hw_telemetry

# monitor sources (including main.py) land in /hw_telemetry
COPY ./amdcpu_nvidiagpu ./

# NOTE(review): nvidia-smi is not installed by this file; presumably it is
# provided to the container at run time - confirm
ENTRYPOINT ["python3", "/hw_telemetry/main.py"]
122 changes: 122 additions & 0 deletions hardware_monitoring/amdcpu_nvidiagpu/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env python
# Markham Lee (C) 2023
# Hardware Monitor for Linux & Windows:
# https://github.com/MarkhamLee/HardwareMonitoring
# This is for Linux devices running on AMD CPUs with an NVIDIA GPU
# Configuration is read from environment variables rather than CLI arguments:
# TOPIC (MQTT topic name), INTERVAL (sleep interval in seconds, an integer),
# MQTT_BROKER, MQTT_USER, MQTT_SECRET and MQTT_PORT
import gc
import json
import logging
import os
import sys
from time import sleep

# this allows us to import modules from the parent directory
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)

from hw_monitoring_libraries.amd_apu import AMDCPUData # noqa: E402
from hw_monitoring_libraries.nvidia_gpu import NvidiaSensors # noqa: E402
from hw_monitoring_libraries.logging_util import logger # noqa: E402
from hw_monitoring_libraries.hw_monitoring\
import MonitoringUtilities # noqa: E402


def monitor(client: object, get_data: object,
            gpu_data: object,
            TOPIC: str,
            INTERVAL: int):
    """Sample CPU/GPU telemetry forever and publish each sample over MQTT.

    Args:
        client: MQTT client; only ``publish(topic, payload)`` is called and
            element ``[0]`` of its result is read as the status code.
        get_data: CPU data source providing ``get_cpu_data``,
            ``get_ram_data``, ``get_freq`` and ``amd_linux_temp_data``.
        gpu_data: GPU data source providing ``gpu_query``.
        TOPIC: MQTT topic the JSON payload is published to.
        INTERVAL: seconds to sleep between two samples.

    The loop has no exit condition, so this function only ends when one of
    the calls inside it raises.
    """

    while True:

        # get CPU utilization
        cpu_util = get_data.get_cpu_data()

        # get current RAM use
        ram_util = get_data.get_ram_data()

        # get current freq; the core count comes back with it but is not
        # part of the published payload
        cpu_freq, _core_count = get_data.get_freq()

        # get CPU, iGPU and NVME temperatures
        nvme_temp, cpu_temp, amdgpu_temp = get_data.amd_linux_temp_data()

        # get NVIDIA GPU data
        gpu_temp, gpu_load, gpu_vram, gpu_power, gpu_clock = \
            gpu_data.gpu_query()

        payload = json.dumps({
            "cpu_temp": cpu_temp,
            "amdgpu_temp": amdgpu_temp,
            "nvme_temp": nvme_temp,
            "cpu_freq": cpu_freq,
            "cpu_use": cpu_util,
            "ram_use": ram_util,
            "gpu_temp": gpu_temp,
            "gpu_load": gpu_load,
            "gpu_vram": gpu_vram,
            "gpu_power": gpu_power,
            "gpu_clock": gpu_clock
        })
        logger.info(payload)

        result = client.publish(TOPIC, payload)
        status = result[0]

        # a non-zero status is treated as a failed publish; report it through
        # the same logger as the rest of this function, at error level, so
        # the failure is not dropped by a DEBUG filter
        if status != 0:
            logger.error(f'Failed to send {payload} to: {TOPIC}, '
                         f'MQTT return code: {status}')

        # every local is rebound on the next pass, so no explicit del is
        # needed; the explicit collection between samples is kept
        gc.collect()

        sleep(INTERVAL)


def main():
    """Read settings from the environment, get an MQTT client, run monitor.

    Raises:
        KeyError: if one of the required environment variables is missing.
        ValueError: if INTERVAL or MQTT_PORT is not an integer string.
    """

    # helper object that supplies the client ID and the MQTT client
    utilities = MonitoringUtilities()

    # publishing settings
    topic = os.environ['TOPIC']
    interval = int(os.environ['INTERVAL'])

    # broker connection settings
    broker = os.environ["MQTT_BROKER"]
    user = os.environ['MQTT_USER']
    secret = os.environ['MQTT_SECRET']
    port = int(os.environ['MQTT_PORT'])

    # mqttClient returns a pair; its second element is not used here
    client, _ = utilities.mqttClient(utilities.getClientID(),
                                     user,
                                     secret,
                                     broker,
                                     port)

    # data sources for the CPU side and the NVIDIA GPU side
    cpu_sensors = AMDCPUData()
    gpu_sensors = NvidiaSensors()
    logger.info('CPU & GPU monitoring classes instantiated')

    # monitor loops until something raises; loop_stop() is called on the
    # client on the way out in either case
    try:
        monitor(client, cpu_sensors, gpu_sensors, topic, interval)

    finally:
        client.loop_stop()


# start the monitor only when this file is executed as a script
if __name__ == "__main__":
    main()
75 changes: 75 additions & 0 deletions hardware_monitoring/hw_monitoring_libraries/amd_apu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# Markham Lee (C) 2023
# Hardware Monitor for Linux & Windows:
# https://github.com/MarkhamLee/k3s-data-platform-IoT
# script to retrieve CPU related data on an AMD x86 machine
# and then write to InfluxDB
import json
import psutil


class AMDCPUData():
    """Read CPU, RAM and temperature data on an AMD machine via psutil."""

    def __init__(self):

        # get the # of physical cores, as we can use that to iterate through
        # and get things like current speed for all CPU cores
        self.core_count = psutil.cpu_count(logical=False)

    def build_payload(self, inputFunction, index=0):
        """Serialise per-core ``.current`` readings into a JSON string.

        Args:
            inputFunction: sequence of objects exposing a ``current``
                attribute, e.g. the list from psutil.cpu_freq(percpu=True).
            index: position of the first entry to report.

        Returns:
            JSON object string mapping ``'core <n>'`` to the reading rounded
            to one decimal place.
        """
        # psutil can hand back fewer entries than there are cores (a single
        # element when per-CPU frequencies are unsupported) and core_count
        # may be None, so never index past the readings actually available
        available = len(inputFunction)
        if self.core_count is None:
            limit = available
        else:
            limit = min(self.core_count, available)

        temp_dict = {
            f'core {core}': round(inputFunction[core].current, 1)
            for core in range(index, limit)
        }

        return json.dumps(temp_dict)

    # get average clock speed for all cores
    def get_freq(self, all_cpu=False):
        """Return ``(current frequency rounded to 1 decimal, core_count)``.

        Args:
            all_cpu: forwarded to psutil.cpu_freq as ``percpu``. When psutil
                returns a per-CPU list, the ``current`` values are averaged.
        """
        freq = psutil.cpu_freq(percpu=all_cpu)

        if isinstance(freq, list):
            # per-CPU mode: a list of readings cannot be rounded directly,
            # so reduce it to the mean of the individual readings
            current = sum(item.current for item in freq) / len(freq)
        else:
            current = freq.current

        return round(current, 1), self.core_count

    # get frequency per core
    def freq_per_core(self, all_cpu=True):
        """Return the per-core frequencies as a JSON string.

        This needs instance state (core_count), so it is a regular method
        that delegates to build_payload.
        """
        per_core_freq = self.build_payload(psutil.cpu_freq(percpu=all_cpu))

        return per_core_freq

    # CPU load
    @staticmethod
    def get_cpu_data():
        """Return CPU utilisation in percent, rounded to one decimal.

        psutil is asked to measure over a one second interval.
        """
        cpu_util = (psutil.cpu_percent(interval=1))
        cpu_util = round(cpu_util, 1)

        return cpu_util

    # get current RAM used
    @staticmethod
    def get_ram_data():
        """Return field 3 of psutil.virtual_memory() divided by 1024**3.

        The result is rounded to two decimals.
        """
        ram_use = (psutil.virtual_memory()[3]) / 1073741824
        ram_use = round(ram_use, 2)

        return ram_use

    @staticmethod
    def amd_linux_temp_data():
        """Return ``(nvme_temp, cpu_temp, amdgpu_temp)``.

        Each value is the ``current`` reading of the first entry listed
        under the 'nvme', 'k10temp' and 'amdgpu' sensor groups.

        Raises:
            KeyError: if one of those sensor groups is not reported.
        """
        # read the sensors once per sample instead of once per value
        temps = psutil.sensors_temperatures()

        nvme_temp = temps['nvme'][0].current
        cpu_temp = temps['k10temp'][0].current
        amdgpu_temp = temps['amdgpu'][0].current

        return nvme_temp, cpu_temp, amdgpu_temp
117 changes: 117 additions & 0 deletions hardware_monitoring/hw_monitoring_libraries/nvidia_gpu.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
# Markham Lee 2023 - 2024
# Retrieving temp, load, etc., data from NVIDIA GPUs
# https://github.com/MarkhamLee/HardwareMonitoring
# All the GPU data comes from NVIDIA SMI queries, you can read more here:
# https://nvidia.custhelp.com/app/answers/detail/a_id/3751/~/useful-nvidia-smi-queries
# this script is platform agnostic, should work exactly the same on Linux and
# Windows devices running x86 processors
import subprocess as sp


class NvidiaSensors():
    """Query NVIDIA GPU telemetry through the ``nvidia-smi`` command."""

    def __init__(self):

        pass

    @staticmethod
    def _digits_per_field(raw: bytes):
        """Reduce the first CSV row of nvidia-smi output to digit strings.

        Only the first output line is used. Every comma separated field is
        stripped of all non-digit characters, so units, spaces and also the
        decimal point disappear (``"35.42 W"`` becomes ``"3542"``).
        """
        first_row = raw.decode("utf-8").strip().split("\n")[0]

        return [''.join(ch for ch in field if ch.isdigit())
                for field in first_row.split(',')]

    # parsing the data from the smi query is fairly consistent, so created
    # this generalized query method I can either call directly with a query
    # or call via the other methods that have the "canned" queries already
    # set up.
    @staticmethod
    def smi_parser(query: str):
        """Run ``nvidia-smi --query-gpu=<query>`` and return digit strings.

        Args:
            query: comma separated nvidia-smi property names.

        Returns:
            List with one digit-only string per queried property.

        Raises:
            subprocess.CalledProcessError: if nvidia-smi exits non-zero.
        """
        # argument list without a shell: the query text is handed to
        # nvidia-smi as a single argument and is never interpreted by a shell
        cmd = ["nvidia-smi", "--query-gpu=" + query, "--format=csv,noheader"]

        return NvidiaSensors._digits_per_field(sp.check_output(cmd))

    # getting all the data in one query saves on quite a bit of latency,
    # as whether it's one item or six, these queries run in about 30-40ms,
    # doing them separate it was closer to 250ms
    @staticmethod
    def gpu_query():
        """Return ``(temp, gpu_load, gpu_vram, gpu_power, gpu_clock)``.

        gpu_load is the utilisation value divided by 100, gpu_vram is the
        memory value divided by 1024, gpu_power is the digit string of the
        power reading divided by 100.
        """
        query = ("temperature.gpu,utilization.gpu,memory.used,power.draw,"
                 "clocks.current.graphics,encoder.stats.averageFps")

        data = NvidiaSensors.smi_parser(query)

        # split out each value from the returned list of values
        temp = int(data[0])
        gpu_load = int(data[1]) / 100
        gpu_vram = round(((float(data[2])) / 1024), 2)
        # the parser drops the decimal point, so dividing by 100 restores it
        # NOTE(review): assumes power.draw is printed with two decimals
        gpu_power = round((float(data[3])) / 100, 2)
        gpu_clock = int(data[4])

        # the FPS value (data[5]) is queried but not used: the FPS query
        # shows the average across all apps, which is not useful while
        # playing games. An alternate approach is still to be found.

        return temp, gpu_load, gpu_vram, gpu_power, gpu_clock

    # the following are the individual queries for the data points included in
    # the GPU query, I just put these here in case I needed them/needed to just
    # get an individual data point.
    @staticmethod
    def gpu_load():
        """Return the utilization.gpu value as an int.

        NOTE(review): unlike gpu_query this value is not divided by 100.
        """
        query = "utilization.gpu"
        data = NvidiaSensors.smi_parser(query)
        data = int(data[0])

        return data

    @staticmethod
    def gpu_temp():
        """Return the temperature.gpu value as an int."""
        query = "temperature.gpu"
        data = NvidiaSensors.smi_parser(query)
        data = int(data[0])

        return data

    @staticmethod
    def vram_used():
        """Return the memory.used value divided by 1024, two decimals."""
        query = "memory.used"
        data = NvidiaSensors.smi_parser(query)
        data = round((float(data[0]) / 1024), 2)

        return data

    @staticmethod
    def gpu_power():
        """Return the power.draw digit string divided by 100, two decimals."""
        query = 'power.draw'
        data = NvidiaSensors.smi_parser(query)
        data = round((float(data[0])) / 100, 2)

        return data

    @staticmethod
    def gpu_fps():
        """Return the encoder.stats.averageFps value as an int."""
        query = 'encoder.stats.averageFps'
        data = NvidiaSensors.smi_parser(query)
        data = int(data[0])

        return data

    @staticmethod
    def gpu_clock():
        """Return the clocks.current.graphics value as an int."""
        query = 'clocks.current.graphics'
        data = NvidiaSensors.smi_parser(query)
        data = int(data[0])

        return data

0 comments on commit 91ac489

Please sign in to comment.