feat: simulate
shahargl committed Jan 25, 2024
1 parent df433c2 commit dae6b83
Showing 10 changed files with 430 additions and 7 deletions.
3 changes: 0 additions & 3 deletions .gitignore
@@ -189,6 +189,3 @@ storage

# otel files
tempo-data/

# simulation
simulate*
13 changes: 13 additions & 0 deletions keep/providers/base/base_provider.py
@@ -515,3 +515,16 @@ def _push_alert(self, alert: dict):
self.logger.error(
f"Failed to push alert to {self.provider_id}: {response.content}"
)

@staticmethod
    def simulate_alert(**kwargs) -> dict:
        """
        Simulate an alert in the provider's native event format.

        Args:
            **kwargs (dict): Optional simulation parameters (e.g. alert_type).

        Returns:
            dict: The simulated alert event.
        """
raise NotImplementedError("simulate_alert() method not implemented")
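A minimal usage sketch, not part of this commit: any provider class that overrides the hook can be asked for a synthetic event directly, and providers that do not implement it fall through to the NotImplementedError above. The import paths follow the file layout in this diff; the GrafanaProvider class name is assumed from grafana_provider.py.

# A minimal usage sketch (not part of this commit).
from keep.providers.datadog_provider.datadog_provider import DatadogProvider
from keep.providers.grafana_provider.grafana_provider import GrafanaProvider

if __name__ == "__main__":
    for provider_cls in (DatadogProvider, GrafanaProvider):
        try:
            event = provider_cls.simulate_alert()
        except NotImplementedError:
            # Providers that have not implemented the hook still raise, as defined above.
            continue
        print(provider_cls.__name__, event)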
38 changes: 38 additions & 0 deletions keep/providers/datadog_provider/alerts_mock.py
@@ -0,0 +1,38 @@
ALERTS = {
"high_cpu_usage": {
"payload": {
"title": "High CPU Usage",
"type": "metric alert",
"query": "avg(last_5m):avg:system.cpu.user{*} by {host} > 90",
"message": "CPU usage is over 90% on {{host.name}}.",
"tags": "environment:production, team:backend",
"priority": "P3",
"monitor_id": "1234567890",
},
"parameters": {
"tags": [
"environment:production,team:backend,monitor",
"environment:staging,team:backend,monitor",
],
"priority": ["P2", "P3", "P4"],
},
},
"low_disk_space": {
"payload": {
"title": "Low Disk Space",
"type": "metric alert",
"query": "avg(last_1h):min:system.disk.free{*} by {host} < 20",
"message": "Disk space is below 20% on {{host.name}}.",
"tags": "environment:production,team:database",
"priority": 4,
"monitor_id": "1234567891",
},
"parameters": {
"tags": [
"environment:production,team:analytics,monitor",
"environment:staging,team:database,monitor",
],
"priority": ["P1", "P3", "P4"],
},
},
}
34 changes: 34 additions & 0 deletions keep/providers/datadog_provider/datadog_provider.py
@@ -795,6 +795,40 @@ def get_logs(self, limit: int = 5) -> list:
def get_alert_schema():
return DatadogAlertFormatDescription.schema()

@staticmethod
def simulate_alert() -> dict:
# Choose a random alert type
        import hashlib
        import random
        import time  # local import for the last_updated timestamp below

from keep.providers.datadog_provider.alerts_mock import ALERTS

alert_type = random.choice(list(ALERTS.keys()))
alert_data = ALERTS[alert_type]

# Start with the base payload
simulated_alert = alert_data["payload"].copy()

# Apply variability based on parameters
for param, choices in alert_data.get("parameters", {}).items():
# Split param on '.' for nested parameters (if any)
param_parts = param.split(".")
target = simulated_alert
for part in param_parts[:-1]:
target = target.setdefault(part, {})

# Choose a random value for the parameter
target[param_parts[-1]] = random.choice(choices)

simulated_alert["last_updated"] = int(time.time() * 1000)
simulated_alert["alert_transition"] = random.choice(
list(DatadogProvider.STATUS_MAP.keys())
)
simulated_alert["id"] = hashlib.sha256(
str(simulated_alert).encode()
).hexdigest()
return simulated_alert


if __name__ == "__main__":
# Output debug messages
134 changes: 134 additions & 0 deletions keep/providers/grafana_provider/alerts_mock.py
@@ -0,0 +1,134 @@
ALERTS = {
"database_connection_failure": {
"payload": {
"condition": "A",
"data": [
{
"datasourceUid": "datasource1",
"model": {
"conditions": [
{
"evaluator": {"params": [1], "type": "gt"},
"operator": {"type": "and"},
"query": {"params": ["A", "5m", "now"]},
"reducer": {"params": [], "type": "last"},
"type": "query",
}
],
"datasource": {"type": "grafana", "uid": "datasource1"},
"expression": "",
"hide": False,
"intervalMs": 1000,
"maxDataPoints": 100,
"refId": "A",
"type": "classic_conditions",
},
"queryType": "",
"refId": "A",
"relativeTimeRange": {"from": 300, "to": 0},
}
],
"execErrState": "Alerting",
"folderUID": "keep_alerts",
"for_": "5m",
"isPaused": False,
"labels": {"severity": "critical", "monitor": "database"},
"noDataState": "NoData",
"orgID": 1,
"ruleGroup": "keep_group_1",
"title": "Database Connection Failure",
},
"parameters": {
"labels.monitor": ["database1", "database2", "database3"],
"for_": ["5m", "10m", "15m"],
},
},
"high_memory_usage": {
"payload": {
"condition": "B",
"data": [
{
"datasourceUid": "datasource2",
"model": {
"conditions": [
{
"evaluator": {"params": [80], "type": "gt"},
"operator": {"type": "or"},
"query": {"params": ["B", "10m", "now"]},
"reducer": {"params": [], "type": "avg"},
"type": "query",
}
],
"datasource": {"type": "grafana", "uid": "datasource2"},
"expression": "",
"hide": False,
"intervalMs": 2000,
"maxDataPoints": 50,
"refId": "B",
"type": "classic_conditions",
},
"queryType": "",
"refId": "B",
"relativeTimeRange": {"from": 600, "to": 0},
}
],
"execErrState": "Alerting",
"folderUID": "keep_alerts",
"for_": "10m",
"isPaused": False,
"labels": {"severity": "warning", "monitor": "memory"},
"noDataState": "NoData",
"orgID": 1,
"ruleGroup": "keep_group_2",
"title": "High Memory Usage",
},
"parameters": {
"labels.monitor": ["server1", "server2", "server3"],
"for_": ["10m", "30m", "1h"],
},
},
"network_latency_high": {
"payload": {
"condition": "C",
"data": [
{
"datasourceUid": "datasource3",
"model": {
"conditions": [
{
"evaluator": {"params": [100], "type": "gt"},
"operator": {"type": "and"},
"query": {"params": ["C", "15m", "now"]},
"reducer": {"params": [], "type": "max"},
"type": "query",
}
],
"datasource": {"type": "grafana", "uid": "datasource3"},
"expression": "",
"hide": False,
"intervalMs": 3000,
"maxDataPoints": 30,
"refId": "C",
"type": "classic_conditions",
},
"queryType": "",
"refId": "C",
"relativeTimeRange": {"from": 900, "to": 0},
}
],
"execErrState": "Alerting",
"folderUID": "keep_alerts",
"for_": "15m",
"isPaused": False,
"labels": {"severity": "info", "monitor": "network"},
"noDataState": "NoData",
"orgID": 1,
"ruleGroup": "keep_group_3",
"title": "Network Latency High",
},
"parameters": {
"labels.monitor": ["router1", "router2", "router3"],
"for_": ["15m", "45m", "1h"],
},
},
}
48 changes: 48 additions & 0 deletions keep/providers/grafana_provider/grafana_provider.py
@@ -487,6 +487,54 @@ def _get_alerts(self) -> list[AlertDto]:
return alerts
return []

@staticmethod
def simulate_alert(**kwargs) -> dict:
        import copy  # deep-copy the mock payload so repeated simulations stay independent
        import hashlib
        import json
        import random

from keep.providers.grafana_provider.alerts_mock import ALERTS

alert_type = kwargs.get("alert_type")
if not alert_type:
alert_type = random.choice(list(ALERTS.keys()))

        alert_payload = copy.deepcopy(ALERTS[alert_type]["payload"])
alert_parameters = ALERTS[alert_type].get("parameters", {})
# Generate random data for parameters
for parameter, parameter_options in alert_parameters.items():
if "." in parameter:
parameter = parameter.split(".")
if parameter[0] not in alert_payload:
alert_payload[parameter[0]] = {}
alert_payload[parameter[0]][parameter[1]] = random.choice(
parameter_options
)
else:
alert_payload[parameter] = random.choice(parameter_options)

        # Add Grafana-specific fields so the event resembles a real alert notification
alert_payload["state"] = AlertStatus.FIRING.value
alert_payload["evalMatches"] = [
{
"value": random.randint(0, 100),
"metric": "some_metric",
"tags": alert_payload.get("labels", {}),
}
]

# Generate fingerprint
fingerprint_src = json.dumps(alert_payload, sort_keys=True)
fingerprint = hashlib.md5(fingerprint_src.encode()).hexdigest()
alert_payload["fingerprint"] = fingerprint

return {
"alerts": [alert_payload],
"severity": alert_payload.get("labels", {}).get("severity"),
"title": "Grafana Alert - {}".format(alert_type),
}


if __name__ == "__main__":
# Output debug messages
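A possible smoke test for the simulated Grafana event, shown as a sketch rather than as part of this commit. It relies only on the dictionary shape returned above and on the severities defined in alerts_mock.py; the GrafanaProvider class name is assumed from the file name.

# Sketch of a smoke test for GrafanaProvider.simulate_alert() (not part of this commit).
from keep.providers.grafana_provider.grafana_provider import GrafanaProvider

def test_simulate_alert_shape():
    event = GrafanaProvider.simulate_alert(alert_type="high_memory_usage")
    assert len(event["alerts"]) == 1
    alert = event["alerts"][0]
    # The fingerprint is an md5 hexdigest over the sorted-JSON payload, so it is 32 hex chars.
    assert len(alert["fingerprint"]) == 32
    # The severity comes from the payload labels defined in alerts_mock.py.
    assert event["severity"] == "warning"
    assert event["title"] == "Grafana Alert - high_memory_usage"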
53 changes: 53 additions & 0 deletions keep/providers/prometheus_provider/alerts_mock.py
@@ -0,0 +1,53 @@
ALERTS = {
"high_cpu_usage": {
"payload": {
"summary": "CPU usage is over 90%",
"labels": {
"instance": "example1",
"job": "example2",
"workfload": "somecoolworkload",
"severity": "critical",
},
},
"parameters": {
"labels.host": ["host1", "host2", "host3"],
"labels.instance": ["instance1", "instance2", "instance3"],
},
},
"mq_third_full": {
"payload": {
"summary": "Message queue is over 33% capacity",
"labels": {
"severity": "warning",
},
},
"parameters": {
"labels.queue": ["queue1", "queue2", "queue3"],
"labels.mq_manager": ["mq_manager1", "mq_manager2", "mq_manager3"],
},
},
"disk_space_low": {
"payload": {
"summary": "Disk space is below 20%",
"labels": {
"severity": "warning",
},
},
"parameters": {
"labels.host": ["host1", "host2", "host3"],
"labels.instance": ["instance1", "instance2", "instance3"],
},
},
"network_latency_high": {
"payload": {
"summary": "Network latency is higher than normal",
"labels": {
"severity": "info",
},
},
"parameters": {
"labels.host": ["host1", "host2", "host3"],
"labels.instance": ["instance1", "instance2", "instance3"],
},
},
}
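The prometheus_provider.py side of this commit is among the changed files not rendered on this page. As a hedged sketch only, a simulate_alert for Prometheus could consume this mock the same way the Datadog and Grafana providers above consume theirs; the Alertmanager-style envelope fields (status, startsAt) are assumptions, not taken from this diff.

# Hedged sketch (not shown in this diff): a possible simulate_alert() for the
# Prometheus provider, mirroring the Datadog/Grafana pattern above.
import datetime
import random

from keep.providers.prometheus_provider.alerts_mock import ALERTS


def simulate_prometheus_alert(**kwargs) -> dict:
    alert_type = kwargs.get("alert_type") or random.choice(list(ALERTS.keys()))
    alert = dict(ALERTS[alert_type]["payload"])
    alert["labels"] = dict(alert.get("labels", {}))  # avoid mutating the shared mock
    # Apply the "labels.<name>" parameters the same way the Grafana provider does.
    for parameter, options in ALERTS[alert_type].get("parameters", {}).items():
        prefix, _, label = parameter.partition(".")
        if prefix == "labels" and label:
            alert["labels"][label] = random.choice(options)
        else:
            alert[parameter] = random.choice(options)
    # Alertmanager-style envelope fields (assumed, not from this diff).
    alert["status"] = "firing"
    alert["startsAt"] = datetime.datetime.utcnow().isoformat() + "Z"
    return alert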
