Skip to content

Commit

Permalink
feat: Export labels for incidents (#1981)
Browse files Browse the repository at this point in the history
  • Loading branch information
Matvey-Kuk authored Sep 24, 2024
1 parent a8febc1 commit 5ba341b
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 10 deletions.
23 changes: 23 additions & 0 deletions keep/api/core/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import logging
import random
import uuid
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Any, Dict, List, Tuple, Union
Expand Down Expand Up @@ -2552,6 +2553,28 @@ def get_incident_unique_fingerprint_count(tenant_id: str, incident_id: str) -> i
).scalar()


def get_last_alerts_for_incidents(
    incident_ids: List[str | UUID],
) -> Dict[str, List[Alert]]:
    """Load the alerts attached to the given incidents, grouped by incident.

    Args:
        incident_ids: Ids of the incidents whose alerts should be fetched.

    Returns:
        A ``defaultdict(list)`` keyed by ``str(incident_id)``. Each value
        lists that incident's alerts ordered by ``Alert.timestamp``
        descending, so index 0 is the most recent alert. Incidents with no
        linked alerts get no entry; because the mapping is a defaultdict,
        looking one up yields an empty list instead of raising ``KeyError``.
    """
    incidents_alerts = defaultdict(list)

    # Nothing to look up: avoid opening a session and sending a query with
    # an empty IN () clause.
    if not incident_ids:
        return incidents_alerts

    with Session(engine) as session:
        rows = (
            session.query(Alert, AlertToIncident.incident_id)
            .join(AlertToIncident, Alert.id == AlertToIncident.alert_id)
            .filter(AlertToIncident.incident_id.in_(incident_ids))
            # Newest first, so each per-incident list starts with the
            # latest alert.
            .order_by(Alert.timestamp.desc())
            .all()
        )

        for alert, incident_id in rows:
            # Keys are normalised to str so callers can look up by
            # str(incident.id) whether the id is a UUID or a string.
            incidents_alerts[str(incident_id)].append(alert)

    return incidents_alerts

def remove_alerts_to_incident_by_incident_id(
tenant_id: str, incident_id: str | UUID, alert_ids: List[UUID]
) -> Optional[int]:
Expand Down
48 changes: 40 additions & 8 deletions keep/api/routes/metrics.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
import chevron

from fastapi import Query
from typing import List
from fastapi import APIRouter, Depends, Response

from keep.api.core.db import get_last_incidents
from keep.api.models.alert import AlertDto
from keep.api.core.db import get_last_incidents, get_last_alerts_for_incidents
from keep.identitymanager.authenticatedentity import AuthenticatedEntity
from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory

Expand All @@ -11,6 +16,7 @@

@router.get("")
def get_metrics(
labels: List[str] = Query(None),
authenticated_entity: AuthenticatedEntity = Depends(
IdentityManagerFactory.get_auth_verifier(["read:metrics"])
),
Expand All @@ -26,12 +32,22 @@ def get_metrics(
```
scrape_configs:
- job_name: "scrape_keep"
scrape_interval: 5m # It's important to scrape not too often to avoid rate limiting.
static_configs:
- targets: ["https://api.keephq.dev"] # Or your own domain.
authorization:
type: Bearer
credentials: "{Your API Key}"
scrape_interval: 5m # It's important to scrape not too often to avoid rate limiting.
static_configs:
- targets: ["https://api.keephq.dev"] # Or your own domain.
authorization:
type: Bearer
credentials: "{Your API Key}"
# Optional, you can add labels to exported incidents.
# Each label's value is taken from the matching field in the payload of the incident's most recent alert.
# Attention! Don't add "flaky" labels which could change from alert to alert within the same incident.
# Good labels: ['labels.department', 'labels.team'], bad labels: ['labels.severity', 'labels.pod_id']
# Check Keep -> Feed -> "extraPayload" column, it will help in writing labels.
params:
labels: ['labels.service', 'labels.queue']
# Will result in: "labels_service" and "labels_queue".
```
"""
# We don't use in-memory metrics counters here, which are typical for Prometheus exporters,
Expand All @@ -50,9 +66,25 @@ def get_metrics(
limit=1000,
is_confirmed=True,
)

last_alerts_for_incidents = get_last_alerts_for_incidents([incident.id for incident in incidents])

for incident in incidents:
incident_name = incident.user_generated_name if incident.user_generated_name else incident.ai_generated_name
export += f'alerts_total{{incident_name="{incident_name}" incident_id="{incident.id}"}} {incident.alerts_count}\n'
extra_labels = ""
try:
last_alert = last_alerts_for_incidents[str(incident.id)][0]
last_alert_dto = AlertDto(**last_alert.event)
except IndexError:
last_alert_dto = None

if labels is not None:
for label in labels:
label_value = chevron.render("{{ " + label + " }}", last_alert_dto)
label = label.replace(".", "_")
extra_labels += f' {label}="{label_value}"'

export += f'alerts_total{{incident_name="{incident_name}" incident_id="{incident.id}"{extra_labels}}} {incident.alerts_count}\n'

# Exporting stats about open incidents
export += "\n\n"
Expand Down
4 changes: 2 additions & 2 deletions tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@ def test_add_remove_alert_to_incidents(

add_alerts_to_incident_by_incident_id("keep", incident.id, [a.id for a in alerts])

response = client.get("/metrics", headers={"X-API-KEY": "valid_api_key"})
response = client.get("/metrics?labels=a.b", headers={"X-API-KEY": "valid_api_key"})

# Checking for alert_total metric
assert (
f'alerts_total{{incident_name="test" incident_id="{incident.id}"}} 14'
f'alerts_total{{incident_name="test" incident_id="{incident.id}" a_b=""}} 14'
in response.text.split("\n")
)

Expand Down

0 comments on commit 5ba341b

Please sign in to comment.