Skip to content

Commit

Permalink
Updated monitoring class to send alerts via Slack, added excessive lo…
Browse files Browse the repository at this point in the history
…ad alerting to UPS monitoring
  • Loading branch information
MarkhamLee committed Apr 25, 2024
1 parent 379d8f9 commit 8d03d14
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 7 deletions.
31 changes: 24 additions & 7 deletions hardware_monitoring/cyberpowerpc_pfc1500lcda_ups/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@
from hw_monitoring_libraries.hw_monitoring\
import MonitoringUtilities # noqa: E402

# instantiate hardware monitoring class
monitor_utilities = MonitoringUtilities()

UPS_ID = os.environ['UPS_ID']


Expand All @@ -29,6 +32,8 @@ def ups_monitoring(CMD: str, TOPIC: str, client: object):
INTERVAL = int(os.environ['UPS_INTERVAL'])

logger.info(f'Starting monitoring for {UPS_ID}')
excessive_load_count = 0
load_threshold = 900/INTERVAL

while True:

Expand Down Expand Up @@ -57,6 +62,20 @@ def ups_monitoring(CMD: str, TOPIC: str, client: object):
"device_model": test_dict['device.model']
}

# check load status, send alert if it's too high
# TODO: add a series of alerts based on the values above
# Note: running on battery already generates alerts via the
# Firewall.
if float(test_dict['ups.load']) > 50:
excessive_load_count += 1

if excessive_load_count > load_threshold:
SLACK_WEBHOOK = os.environ['SLACK_HW_ALERTS']
message = (f'Power load has exceeded 50% on {UPS_ID} for more than 15 minutes') # noqa: E501
logger.info(message)
monitor_utilities.send_slack_webhook(SLACK_WEBHOOK, message)
excessive_load_count = 0 # reset the timer

# build json payload
payload = json.dumps(payload)

Expand All @@ -71,26 +90,24 @@ def ups_monitoring(CMD: str, TOPIC: str, client: object):

except Exception as e:
logger.debug(f'Failed to read data from UPS: {UPS_ID} with error: {e}') # noqa: E501
# TODO: add Slack alert for when UPS goes down, low priority right
# now as the Firewall will send an alert if the UPS goes down
# TODO: add Slack alert for when UPS goes down, low priority for
# now as the firewall catches this, but will be needed once more
# UPS devices are added.
sleep(600)

sleep(INTERVAL)


def build_query():
def build_ups_query():
    """Assemble the ``upsc`` command string used to query the UPS.

    The UPS address is read from the ``UPS_IP`` environment variable and
    joined with the module-level ``UPS_ID`` as ``upsc <UPS_ID>@<UPS_IP>``.

    Returns:
        The command string.

    Raises:
        KeyError: if ``UPS_IP`` is not set in the environment.
    """
    # look the address up first so a missing variable fails before any
    # string building happens
    ups_address = os.environ['UPS_IP']

    return f"upsc {UPS_ID}@{ups_address}"


def main():

# instantiate hardware monitoring class
monitor_utilities = MonitoringUtilities()
logger.info('Monitoring utilities class instantiated')

# operating parameters
Expand All @@ -102,7 +119,7 @@ def main():
MQTT_SECRET = os.environ['MQTT_SECRET']
MQTT_PORT = int(os.environ['MQTT_PORT'])

CMD = build_query()
CMD = build_ups_query()

# get unique client ID
clientID = monitor_utilities.getClientID()
Expand Down
30 changes: 30 additions & 0 deletions hardware_monitoring/hw_monitoring_libraries/hw_monitoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
# possible via the typical K8s HW monitoring tools. E.g., monitoring
# GPU temps for single board computers.
import os
import requests
import sys
import uuid
from paho.mqtt import client as mqtt
Expand Down Expand Up @@ -55,3 +56,32 @@ def connectionStatus(client, userdata, flags, code):
client.loop_start()

return client, code

@staticmethod
def send_slack_webhook(url: str, message: str, timeout: float = 10.0):
    """Post a plain-text message to a Slack incoming webhook.

    Args:
        url: the Slack webhook URL to post to.
        message: text sent as the ``"text"`` field of the JSON payload.
        timeout: seconds to wait for Slack before giving up. Defaults to
            10 so a stalled endpoint cannot block the caller forever.

    Returns:
        The value returned by ``evaluate_slack_response`` for the HTTP
        status code of the request.

    Raises:
        requests.exceptions.RequestException: if the request cannot be
            completed (connection error, timeout, ...).
    """
    headers = {
        'Content-type': 'application/json'
    }

    payload = {
        "text": message
    }

    # requests waits indefinitely unless a timeout is given, so always
    # pass one explicitly.
    response = requests.post(url, headers=headers, json=payload,
                             timeout=timeout)

    # Only report that the request completed; whether it succeeded is
    # decided (and logged) by evaluate_slack_response below.
    logger.debug(f'Slack webhook request completed with status code: {response.status_code}')  # noqa: E501

    return MonitoringUtilities.\
        evaluate_slack_response(response.status_code, 'webhook')

@staticmethod
def evaluate_slack_response(code: int, type: str):
    """Log the outcome of a Slack publish attempt and return its code.

    Args:
        code: HTTP status code of the Slack request; 200 is treated as
            success, anything else as failure.
        type: label for the Slack channel kind, used only in the log
            text (e.g. ``'webhook'``). The name shadows the builtin but
            is kept because it is part of the existing signature.

    Returns:
        ``code`` unchanged, so callers can act on it.
    """
    published = code == 200

    # guard clause: handle the failure case first and leave
    if not published:
        logger.debug(f'Publishing of alert to Slack {type} failed, with error code {code}')  # noqa: E501
        return code

    logger.info(f'Publishing of alert to Slack {type} was successful')
    return code

0 comments on commit 8d03d14

Please sign in to comment.