From fc38842c9e4657dea5002c228b5d89d412316632 Mon Sep 17 00:00:00 2001 From: Shahar Glazner Date: Tue, 17 Sep 2024 15:42:20 +0300 Subject: [PATCH 01/16] fix: add minor sleep to setup_alerts to avoid lastRec (#1951) --- tests/conftest.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/conftest.py b/tests/conftest.py index 245fde4dc..641f885f6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,6 +1,7 @@ import inspect import os import random +import time import uuid from datetime import datetime, timedelta, timezone from unittest.mock import Mock, patch @@ -488,6 +489,8 @@ def setup_alerts(elastic_client, db_session, request): alert_details = request.param.get("alert_details") alerts = [] for i, detail in enumerate(alert_details): + # sleep to avoid same lastReceived + time.sleep(0.02) detail["fingerprint"] = f"test-{i}" alerts.append( Alert( From a68909f3a5da832a8cdc33101b1ead0de2d84628 Mon Sep 17 00:00:00 2001 From: GlebBerjoskin <36132515+GlebBerjoskin@users.noreply.github.com> Date: Tue, 17 Sep 2024 17:03:17 +0200 Subject: [PATCH 02/16] feat: introduce iterative alert batch processing (#1913) --- ee/experimental/generative_utils.py | 239 +++++++ ee/experimental/graph_utils.py | 37 +- ee/experimental/incident_utils.py | 1015 ++++++++++----------------- keep/api/arq_worker.py | 9 +- keep/api/core/db.py | 166 ++--- keep/api/core/db_on_start.py | 3 +- keep/api/models/db/tenant.py | 2 +- keep/api/routes/incidents.py | 37 +- keep/api/utils/import_ee.py | 3 +- tests/test_alert_correlation.py | 133 ++++ 10 files changed, 837 insertions(+), 807 deletions(-) create mode 100644 ee/experimental/generative_utils.py create mode 100644 tests/test_alert_correlation.py diff --git a/ee/experimental/generative_utils.py b/ee/experimental/generative_utils.py new file mode 100644 index 000000000..5689eb7c0 --- /dev/null +++ b/ee/experimental/generative_utils.py @@ -0,0 +1,239 @@ +import logging +import os + +import numpy as np +from openai import OpenAI + +from keep.api.core.db import get_incident_by_id + +from keep.api.models.db.alert import Incident + +logger = logging.getLogger(__name__) + +SUMMARY_GENERATOR_VERBOSE_NAME = "Summary generator v0.1" +NAME_GENERATOR_VERBOSE_NAME = "Name generator v0.1" +MAX_SUMMARY_LENGTH = 900 +MAX_NAME_LENGTH = 75 + +def generate_incident_summary( + incident: Incident, + use_n_alerts_for_summary: int = -1, + generate_summary: str = None, + max_summary_length: int = None, +) -> str: + if "OPENAI_API_KEY" not in os.environ: + logger.error( + "OpenAI API key is not set. Incident summary generation is not available.", + extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, + "incident_id": incident.id, "tenant_id": incident.tenant_id} + ) + return "" + + if not generate_summary: + generate_summary = os.environ.get("GENERATE_INCIDENT_SUMMARY", "True") + + if generate_summary == "False": + logger.info(f"Incident summary generation is disabled. 
Aborting.", + extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + return "" + + if incident.user_summary: + return "" + + if not max_summary_length: + max_summary_length = os.environ.get( + "MAX_SUMMARY_LENGTH", MAX_SUMMARY_LENGTH) + + try: + client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) + + incident = get_incident_by_id(incident.tenant_id, incident.id) + + description_strings = np.unique( + [f'{alert.event["name"]}' for alert in incident.alerts] + ).tolist() + + if use_n_alerts_for_summary > 0: + incident_description = "\n".join( + description_strings[:use_n_alerts_for_summary] + ) + else: + incident_description = "\n".join(description_strings) + + timestamps = [alert.timestamp for alert in incident.alerts] + incident_start = min(timestamps).replace(microsecond=0) + incident_end = max(timestamps).replace(microsecond=0) + + model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini") + + summary = ( + client.chat.completions.create( + model=model, + messages=[ + { + "role": "system", + "content": f"""You are a very skilled DevOps specialist who can summarize any incident based on alert descriptions. + When provided with information, summarize it in a 2-3 sentences explaining what happened and when. + ONLY SUMMARIZE WHAT YOU SEE. In the end add information about potential scenario of the incident. + When provided with information, answer with max a {int(max_summary_length * 0.9)} symbols excerpt + describing incident thoroughly. + + EXAMPLE: + An incident occurred between 2022-11-17 14:11:04 and 2022-11-22 22:19:04, involving a + total of 200 alerts. The alerts indicated critical and warning issues such as high CPU and memory + usage in pods and nodes, as well as stuck Kubernetes Daemonset rollout. Potential incident scenario: + Kubernetes Daemonset rollout stuck due to high CPU and memory usage in pods and nodes. This caused a + long tail of alerts on various topics.""", + }, + { + "role": "user", + "content": f"""Here are alerts of an incident for summarization:\n{incident_description}\n This incident started on + {incident_start}, ended on {incident_end}, included {incident.alerts_count} alerts.""", + }, + ], + ) + .choices[0] + .message.content + ) + + logger.info(f"Generated incident summary with length {len(summary)} symbols", + extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + + if len(summary) > max_summary_length: + logger.info(f"Generated incident summary is too long. Applying smart truncation", + extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + + summary = ( + client.chat.completions.create( + model=model, + messages=[ + { + "role": "system", + "content": f"""You are a very skilled DevOps specialist who can summarize any incident based on a description. + When provided with information, answer with max a {int(max_summary_length * 0.9)} symbols excerpt describing + incident thoroughly. + """, + }, + { + "role": "user", + "content": f"""Here is the description of an incident for summarization:\n{summary}""", + }, + ], + ) + .choices[0] + .message.content + ) + + logger.info(f"Generated new incident summary with length {len(summary)} symbols", + extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + + if len(summary) > max_summary_length: + logger.info(f"Generated incident summary is too long. 
Applying hard truncation", + extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + summary = summary[: max_summary_length] + + return summary + except Exception as e: + logger.error(f"Error in generating incident summary: {e}", + extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + return "" + + +def generate_incident_name(incident: Incident, generate_name: str = None, max_name_length: int = None, use_n_alerts_for_name: int = -1) -> str: + if "OPENAI_API_KEY" not in os.environ: + logger.error( + "OpenAI API key is not set. Incident name generation is not available.", + extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, + "incident_id": incident.id, "tenant_id": incident.tenant_id} + ) + return "" + + if not generate_name: + generate_name = os.environ.get("GENERATE_INCIDENT_NAME", "True") + + if generate_name == "False": + logger.info(f"Incident name generation is disabled. Aborting.", + extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + return "" + + if incident.user_generated_name: + return "" + + if not max_name_length: + max_name_length = os.environ.get( + "MAX_NAME_LENGTH", MAX_NAME_LENGTH) + + try: + client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) + + incident = get_incident_by_id(incident.tenant_id, incident.id) + + description_strings = np.unique( + [f'{alert.event["name"]}' for alert in incident.alerts]).tolist() + + if use_n_alerts_for_name > 0: + incident_description = "\n".join( + description_strings[:use_n_alerts_for_name]) + else: + incident_description = "\n".join(description_strings) + + timestamps = [alert.timestamp for alert in incident.alerts] + incident_start = min(timestamps).replace(microsecond=0) + + model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini") + + name = client.chat.completions.create(model=model, messages=[ + { + "role": "system", + "content": f"""You are a very skilled DevOps specialist who can name any incident based on alert descriptions. + When provided with information, output a short descriptive name of incident that could cause these alerts. + Add information about start time to the name. ONLY USE WHAT YOU SEE. Answer with max a {int(max_name_length * 0.9)} + symbols excerpt. + + EXAMPLE: + Kubernetes rollout stuck (started on 2022.11.17 14:11)""" + }, + { + "role": "user", + "content": f"""This incident started on {incident_start}. + Here are alerts of an incident:\n{incident_description}\n""" + } + ]).choices[0].message.content + + logger.info(f"Generated incident name with length {len(name)} symbols", + extra={"incident_id": incident.id, "tenant_id": incident.tenant_id}) + + if len(name) > max_name_length: + logger.info(f"Generated incident name is too long. Applying smart truncation", + extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + + name = client.chat.completions.create(model=model, messages=[ + { + "role": "system", + "content": f"""You are a very skilled DevOps specialist who can name any incident based on a description. + Add information about start time to the name.When provided with information, answer with max a + {int(max_name_length * 0.9)} symbols. + + EXAMPLE: + Kubernetes rollout stuck (started on 2022.11.17 14:11)""" + }, + { + "role": "user", + "content": f"""This incident started on {incident_start}. 
+ Here is the description of an incident to name:\n{name}.""" + } + ]).choices[0].message.content + + logger.info(f"Generated new incident name with length {len(name)} symbols", + extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + + if len(name) > max_name_length: + logger.info(f"Generated incident name is too long. Applying hard truncation", + extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + name = name[: max_name_length] + + return name + except Exception as e: + logger.error(f"Error in generating incident name: {e}", + extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) + return "" diff --git a/ee/experimental/graph_utils.py b/ee/experimental/graph_utils.py index fc297f539..368e747f9 100644 --- a/ee/experimental/graph_utils.py +++ b/ee/experimental/graph_utils.py @@ -5,7 +5,7 @@ from typing import List, Tuple -from keep.api.core.db import get_pmi_values, get_pmi_values_from_temp_file +from keep.api.core.db import get_pmi_values_from_temp_file logger = logging.getLogger(__name__) @@ -49,7 +49,7 @@ def detect_knee_1d(y: List[float], curve: str, direction: str = 'increasing') -> return knee_index_convex, knee_y_convex -def create_graph(tenant_id: str, fingerprints: List[str], temp_dir: str, pmi_threshold: float = 0., knee_threshold: float = 0.8) -> nx.Graph: +def create_graph(tenant_id: str, fingerprints: List[str], pmi_values: np.ndarray, fingerprint2idx: dict, pmi_threshold: float = 0., delete_nodes: bool = False, knee_threshold: float = 0.8) -> nx.Graph: """ This function creates a graph from a list of fingerprints. The graph is created based on the PMI values between the fingerprints. The edges are created between the fingerprints that have a PMI value greater than the threshold. 
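With this hunk, create_graph takes the precomputed PMI matrix and fingerprint-to-index mapping as arguments rather than reading them from the temp directory itself, and knee-based node pruning becomes opt-in through the new delete_nodes flag. A minimal sketch of the updated call pattern (argument values are illustrative; loading via get_pmi_values_from_temp_file mirrors its use elsewhere in this patch):

    # Load the PMI matrix and fingerprint index persisted by calculate_pmi_matrix,
    # then build the alert graph without pruning weakly connected nodes.
    pmi_values, fingerprint2idx = get_pmi_values_from_temp_file(temp_dir)
    graph = create_graph(
        tenant_id,
        fingerprints,
        pmi_values,
        fingerprint2idx,
        pmi_threshold=0.0,   # edges are added only for pairs with PMI above this
        delete_nodes=False,  # skip knee-detection pruning
        knee_threshold=0.8,
    )
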
@@ -69,39 +69,38 @@ def create_graph(tenant_id: str, fingerprints: List[str], temp_dir: str, pmi_thr if len(fingerprints) == 1: graph.add_node(fingerprints[0]) return graph - - pmi_values, fingerpint2idx = get_pmi_values_from_temp_file(temp_dir) - - logger.info(f'Loaded PMI values for {len(pmi_values)**2} fingerprint pairs', extra={'tenant_id': tenant_id}) logger.info(f'Creating alert graph edges', extra={'tenant_id': tenant_id}) for idx_i, fingerprint_i in enumerate(fingerprints): - if fingerprint_i not in fingerpint2idx: + if fingerprint_i not in fingerprint2idx: continue for idx_j in range(idx_i + 1, len(fingerprints)): fingerprint_j = fingerprints[idx_j] - if fingerprint_j not in fingerpint2idx: + if fingerprint_j not in fingerprint2idx: continue - weight = pmi_values[fingerpint2idx[fingerprint_i], fingerpint2idx[fingerprint_j]] + weight = pmi_values[fingerprint2idx[fingerprint_i], fingerprint2idx[fingerprint_j]] if weight > pmi_threshold: graph.add_edge(fingerprint_i, fingerprint_j, weight=weight) - nodes_to_delete = [] - logger.info(f'Preparing candidate nodes for deletion', extra={'tenant_id': tenant_id}) - - for node in graph.nodes: - weights = sorted([edge['weight'] for edge in graph[node].values()]) + if delete_nodes: + nodes_to_delete = [] + logger.info(f'Preparing candidate nodes for deletion', extra={'tenant_id': tenant_id}) - knee_index, knee_statistic = detect_knee_1d_auto_increasing(weights) + for node in graph.nodes: + weights = sorted([edge['weight'] for edge in graph[node].values()]) + + knee_index, knee_statistic = detect_knee_1d_auto_increasing(weights) + + if knee_statistic < knee_threshold: + nodes_to_delete.append(node) - if knee_statistic < knee_threshold: - nodes_to_delete.append(node) - - graph.remove_nodes_from(nodes_to_delete) + logger.info(f'Removing nodes from graph, {len(nodes_to_delete)} nodes will be removed, {len(graph.nodes) - len(nodes_to_delete)} nodes will be left', + extra={'tenant_id': tenant_id}) + graph.remove_nodes_from(nodes_to_delete) return graph \ No newline at end of file diff --git a/ee/experimental/incident_utils.py b/ee/experimental/incident_utils.py index f313cd94a..6593ce830 100644 --- a/ee/experimental/incident_utils.py +++ b/ee/experimental/incident_utils.py @@ -1,16 +1,23 @@ import logging import os -from datetime import datetime, timedelta -from typing import Dict, List +import math import networkx as nx import numpy as np -import pandas as pd -from openai import OpenAI + +from tqdm import tqdm +from datetime import datetime, timedelta +from typing import Dict, List, Set, Tuple, Any +from arq.connections import ArqRedis from ee.experimental.graph_utils import create_graph from ee.experimental.statistical_utils import get_alert_pmi_matrix +from ee.experimental.generative_utils import generate_incident_summary, generate_incident_name, \ + SUMMARY_GENERATOR_VERBOSE_NAME, NAME_GENERATOR_VERBOSE_NAME + from keep.api.arq_pool import get_pool +from keep.api.core.dependencies import get_pusher_client +from keep.api.models.db.alert import Alert, Incident from keep.api.core.db import ( add_alerts_to_incident_by_incident_id, create_incident_from_dict, @@ -20,24 +27,24 @@ update_incident_summary, update_incident_name, write_pmi_matrix_to_temp_file, + get_pmi_values_from_temp_file, + get_tenant_config, + write_tenant_config, ) -from keep.api.core.dependencies import get_pusher_client -from keep.api.models.db.alert import Alert, Incident - logger = logging.getLogger(__name__) ALGORITHM_VERBOSE_NAME = "Correlation algorithm v0.2" 
-SUMMARY_GENERATOR_VERBOSE_NAME = "Summary generator v0.1" -NAME_GENERATOR_VERBOSE_NAME = "Name generator v0.1" USE_N_HISTORICAL_ALERTS_MINING = 10e4 USE_N_HISTORICAL_ALERTS_PMI = 10e4 USE_N_HISTORICAL_INCIDENTS = 10e4 MIN_ALERT_NUMBER = 100 +INCIDENT_VALIDITY_THRESHOLD = 3600 +ALERT_VALIDITY_THRESHOLD = 3600 +# We assume that incident / alert validity threshold is greater than a size of a batch +STRIDE_DENOMINATOR = 4 DEFAULT_TEMP_DIR_LOCATION = "./ee/experimental/ai_temp" -MAX_SUMMARY_LENGTH = 900 -MAX_NAME_LENGTH = 75 - +PMI_SLIDING_WINDOW = 3600 def calculate_pmi_matrix( ctx: dict | None, # arq context @@ -50,26 +57,20 @@ def calculate_pmi_matrix( offload_config: Dict = None, min_alert_number: int = None, ) -> dict: - logger.info( - "Calculating PMI coefficients for alerts", - extra={ - "tenant_id": tenant_id, - }, - ) + logger.info("Calculating PMI coefficients for alerts", extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) if not upper_timestamp: upper_timestamp = os.environ.get("PMI_ALERT_UPPER_TIMESTAMP", datetime.now()) if not use_n_historical_alerts: use_n_historical_alerts = os.environ.get( - "PMI_USE_N_HISTORICAL_ALERTS", USE_N_HISTORICAL_ALERTS_PMI - ) + "PMI_USE_N_HISTORICAL_ALERTS", USE_N_HISTORICAL_ALERTS_PMI) if not sliding_window: - sliding_window = os.environ.get("PMI_SLIDING_WINDOW", 4 * 60 * 60) + sliding_window = os.environ.get("PMI_SLIDING_WINDOW", PMI_SLIDING_WINDOW) if not stride: - stride = os.environ.get('PMI_STRIDE', int(sliding_window // 4)) + stride = os.environ.get("PMI_STRIDE", int(sliding_window // STRIDE_DENOMINATOR)) if not temp_dir: temp_dir = os.environ.get("AI_TEMP_FOLDER", DEFAULT_TEMP_DIR_LOCATION) @@ -87,38 +88,188 @@ def calculate_pmi_matrix( min_alert_number = os.environ.get("MIN_ALERT_NUMBER", MIN_ALERT_NUMBER) alerts = query_alerts( - tenant_id, limit=use_n_historical_alerts, upper_timestamp=upper_timestamp - ) + tenant_id, limit=use_n_historical_alerts, upper_timestamp=upper_timestamp, sort_ascending=True) if len(alerts) < min_alert_number: - logger.info( - "Not enough alerts to mine incidents", - extra={ - "tenant_id": tenant_id, - }, - ) + logger.info("Not enough alerts to mine incidents", extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) return {"status": "failed", "message": "Not enough alerts to mine incidents"} pmi_matrix, pmi_columns = get_alert_pmi_matrix( - alerts, "fingerprint", sliding_window, stride, offload_config - ) + alerts, "fingerprint", sliding_window, stride, offload_config) + + return {"status": "success", "pmi_matrix": pmi_matrix, "pmi_columns": pmi_columns} + + +def update_existing_incident(incident: Incident, alerts: List[Alert]) -> Tuple[str, bool]: + add_alerts_to_incident_by_incident_id(incident.tenant_id, incident.id, alerts) + return incident.id, True + + +def create_new_incident(component: Set[str], alerts: List[Alert], + tenant_id: str) -> Tuple[str, bool]: + incident_start_time = min(alert.timestamp for alert in alerts if alert.fingerprint in component) + incident_start_time = incident_start_time.replace(microsecond=0) + + incident = create_incident_from_dict(tenant_id, + {"ai_generated_name": f"Incident started at {incident_start_time}", + "generated_summary": "Summarization is Disabled", + "is_predicted": True}) + add_alerts_to_incident_by_incident_id( + tenant_id, incident.id, [ + alert.id for alert in alerts if alert.fingerprint in component],) + return incident.id, False + + +async def schedule_incident_processing(pool: ArqRedis, tenant_id: str, incident_id: str) -> None: 
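+    # Enqueue background ARQ jobs for summary and name generation; the AI worker
+    # picks them up asynchronously, so the correlation loop is not blocked on LLM calls.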
+ job_summary = await pool.enqueue_job("process_summary_generation", tenant_id=tenant_id, incident_id=incident_id,) + logger.info(f"Summary generation for incident {incident_id} scheduled, job: {job_summary}", extra={ + "algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "tenant_id": tenant_id, "incident_id": incident_id},) + + job_name = await pool.enqueue_job("process_name_generation", tenant_id=tenant_id, incident_id=incident_id) + logger.info(f"Name generation for incident {incident_id} scheduled, job: {job_name}", extra={ + "algorithm": NAME_GENERATOR_VERBOSE_NAME, "tenant_id": tenant_id, "incident_id": incident_id},) + + +def is_incident_accepting_updates(incident: Incident, current_time: datetime, + incident_validity_threshold: timedelta) -> bool: + return current_time - incident.last_seen_time < incident_validity_threshold + + +def get_component_first_seen_time(component: Set[str], alerts: List[Alert]) -> datetime: + return min(alert.timestamp for alert in alerts if alert.fingerprint in component) - logger.info( - "Calculating PMI coefficients for alerts finished. PMI matrix is being written to the database.", - extra={ - "tenant_id": tenant_id, - }, - ) - write_pmi_matrix_to_temp_file(tenant_id, pmi_matrix, pmi_columns, temp_dir) - logger.info( - "PMI matrix is written to the database.", - extra={ - "tenant_id": tenant_id, - }, - ) +def process_graph_component(component: Set[str], batch_incidents: List[Incident], batch_alerts: List[Alert], batch_fingerprints: Set[str], + tenant_id: str, min_incident_size: int, incident_validity_threshold: timedelta) -> Tuple[str, bool]: + is_component_merged = False + for incident in batch_incidents: + incident_fingerprints = set(alert.fingerprint for alert in incident.alerts) + if incident_fingerprints.issubset(component): + if not incident_fingerprints.intersection(batch_fingerprints): + continue + logger.info(f"Found possible extension for incident {incident.id}", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + + amendment_time = get_component_first_seen_time(component, batch_alerts) + if is_incident_accepting_updates(incident, amendment_time, incident_validity_threshold): + logger.info(f"Incident {incident.id} is accepting updates.", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + + existing_alert_ids = set([alert.id for alert in incident.alerts]) + appendable_alerts = [alert for alert in batch_alerts if alert.fingerprint in component and not alert.id in existing_alert_ids] + + logger.info(f"Appending {len(appendable_alerts)} alerts to incident {incident.id}", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + is_component_merged = True + return update_existing_incident_inmem(incident, appendable_alerts) + else: + logger.info(f"Incident {incident.id} is not accepting updates. 
Aborting merge operation.", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + + if not is_component_merged: + if len(component) >= min_incident_size: + logger.info(f"Creating new incident with {len(component)} alerts", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + return create_new_incident_inmem(component, batch_alerts, tenant_id) + else: + return None, False + + +def process_alert_batch(batch_alerts: List[Alert], batch_incidents: list[Incident], tenant_id: str, min_incident_size: int, + incident_validity_threshold: timedelta, pmi_values, fingerpint2idx, pmi_threshold, delete_nodes, knee_threshold) -> Tuple[str, bool]: + + batch_fingerprints = set([alert.fingerprint for alert in batch_alerts]) + + amended_fingerprints = set(batch_fingerprints) + for incident in batch_incidents: + incident_fingerprints = set(alert.fingerprint for alert in incident.alerts) + + amended_fingerprints = incident_fingerprints.union(batch_fingerprints) + + logger.info("Building alert graph", extra={"tenant_id": tenant_id, "algorithm": NAME_GENERATOR_VERBOSE_NAME}) + amended_graph = create_graph(tenant_id, list(amended_fingerprints), pmi_values, + fingerpint2idx, pmi_threshold, delete_nodes, knee_threshold) + + logger.info("Analyzing alert graph", extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + batch_incident_ids_for_processing = [] + batch_new_incidents = [] + batch_updated_incidents = [] + + for component in nx.connected_components(amended_graph): + incident, is_updated = process_graph_component(component, batch_incidents, batch_alerts, batch_fingerprints, tenant_id, min_incident_size, incident_validity_threshold) + if incident: + batch_incident_ids_for_processing.append(incident.id) + if is_updated: + batch_updated_incidents.append(incident) + else: + batch_new_incidents.append(incident) + + return batch_incident_ids_for_processing, batch_new_incidents, batch_updated_incidents - return {"status": "success"} + +async def generate_update_incident_summary(ctx, tenant_id: str, incident_id: str): + incident = get_incident_by_id(tenant_id, incident_id) + summary = generate_incident_summary(incident) + + if summary: + update_incident_summary(tenant_id, incident_id, summary) + + return summary + + +async def generate_update_incident_name(ctx, tenant_id: str, incident_id: str): + incident = get_incident_by_id(tenant_id, incident_id) + name = generate_incident_name(incident) + + if name: + update_incident_name(tenant_id, incident_id, name) + + return name + + +def get_last_incidents_inmem(incidents: List[Incident], upper_timestamp: datetime, lower_timestamp: datetime) -> List[Incident]: + return [incident for incident in incidents if lower_timestamp < incident.last_seen_time < upper_timestamp] + + +def add_alerts_to_incident_by_incident_id_inmem(incident: Incident, alerts: List[str]): + incident.alerts.extend(alerts) + return incident + + +def create_incident_from_dict_inmem(tenant_id: str, incident_dict: Dict[str, Any]) -> Incident: + return Incident(tenant_id=tenant_id, **incident_dict) + + +def create_new_incident_inmem(component: Set[str], alerts: List[Alert], tenant_id: str) -> Tuple[Incident, bool]: + incident_start_time = min(alert.timestamp for alert in alerts if alert.fingerprint in component) + incident_start_time = incident_start_time.replace(microsecond=0) + + incident = create_incident_from_dict_inmem(tenant_id, + {"name": f"Incident started at {incident_start_time}", + "description": "Summarization is Disabled", + "is_predicted": 
True}) + + incident = add_alerts_to_incident_by_incident_id_inmem( + incident, [alert for alert in alerts if alert.fingerprint in component],) + incident.last_seen_time = max([alert.timestamp for alert in incident.alerts]) + + return incident, False + + +def update_existing_incident_inmem(incident: Incident, alerts: List[str]) -> Tuple[str, bool]: + incident = add_alerts_to_incident_by_incident_id_inmem(incident, alerts) + incident.last_seen_time = max([alert.timestamp for alert in incident.alerts]) + return incident, True + + +def update_incident_summary_inmem(incident: Incident, summary: str): + incident.summary = summary + return incident + + +def update_incident_name_inmem(incident: Incident, name: str): + incident.name = name + return incident async def mine_incidents_and_create_objects( @@ -129,13 +280,16 @@ async def mine_incidents_and_create_objects( use_n_historical_alerts: int = None, incident_lower_timestamp: datetime = None, incident_upper_timestamp: datetime = None, - use_n_hist_incidents: int = None, + use_n_historical_incidents: int = None, pmi_threshold: float = None, + delete_nodes: bool = None, knee_threshold: float = None, min_incident_size: int = None, min_alert_number: int = None, incident_similarity_threshold: float = None, + incident_validity_threshold: timedelta = None, general_temp_dir: str = None, + alert_validity_threshold: int = None, ) -> Dict[str, List[Incident]]: """ This function mines incidents from alerts and creates incidents in the database. @@ -147,7 +301,7 @@ async def mine_incidents_and_create_objects( use_n_historical_alerts (int): number of historical alerts to use incident_lower_timestamp (datetime): lower timestamp for incidents incident_upper_timestamp (datetime): upper timestamp for incidents - use_n_hist_incidents (int): number of historical incidents to use + use_n_historical_incidents (int): number of historical incidents to use pmi_threshold (float): PMI threshold used for incident graph edges creation knee_threshold (float): knee threshold used for incident graph nodes creation min_incident_size (int): minimum incident size @@ -155,649 +309,206 @@ async def mine_incidents_and_create_objects( Returns: Dict[str, List[Incident]]: a dictionary containing the created incidents - """ + """ + # obtain tenant_config + if not general_temp_dir: + general_temp_dir = os.environ.get( + "AI_TEMP_FOLDER", DEFAULT_TEMP_DIR_LOCATION) - if not incident_upper_timestamp: - incident_upper_timestamp = os.environ.get( - "MINE_INCIDENT_UPPER_TIMESTAMP", datetime.now() - ) + temp_dir = f"{general_temp_dir}/{tenant_id}" + os.makedirs(temp_dir, exist_ok=True) - if not incident_lower_timestamp: - incident_validity = timedelta( - days=int(os.environ.get("MINE_INCIDENT_VALIDITY", "1")) - ) - incident_lower_timestamp = incident_upper_timestamp - incident_validity + tenant_config = get_tenant_config(tenant_id) + # obtain alert-related parameters + alert_validity_threshold = int(os.environ.get("ALERT_VALIDITY_THRESHOLD", ALERT_VALIDITY_THRESHOLD)) + alert_batch_stride = alert_validity_threshold // STRIDE_DENOMINATOR + if not alert_upper_timestamp: alert_upper_timestamp = os.environ.get( - "MINE_ALERT_UPPER_TIMESTAMP", datetime.now() - ) + "MINE_ALERT_UPPER_TIMESTAMP", datetime.now()) if not alert_lower_timestamp: - alert_window = timedelta(hours=int(os.environ.get("MINE_ALERT_WINDOW", "12"))) - alert_lower_timestamp = alert_upper_timestamp - alert_window + if tenant_config.get("last_correlated_batch_start", None): + alert_lower_timestamp = datetime.fromisoformat( + 
tenant_config.get("last_correlated_batch_start", None)) + + else: + alert_lower_timestamp = None if not use_n_historical_alerts: use_n_historical_alerts = os.environ.get( - "MINE_USE_N_HISTORICAL_ALERTS", USE_N_HISTORICAL_ALERTS_MINING - ) + "MINE_USE_N_HISTORICAL_ALERTS", + USE_N_HISTORICAL_ALERTS_MINING) - if not use_n_hist_incidents: - use_n_hist_incidents = os.environ.get( - "MINE_USE_N_HISTORICAL_INCIDENTS", USE_N_HISTORICAL_INCIDENTS - ) + # obtain incident-related parameters + if not incident_validity_threshold: + incident_validity_threshold = timedelta( + seconds=int(os.environ.get("MINE_INCIDENT_VALIDITY", INCIDENT_VALIDITY_THRESHOLD))) - if not pmi_threshold: - pmi_threshold = os.environ.get("PMI_THRESHOLD", 0.0) + if not use_n_historical_incidents: + use_n_historical_incidents = os.environ.get( + "MINE_USE_N_HISTORICAL_INCIDENTS", USE_N_HISTORICAL_INCIDENTS) - if not knee_threshold: - knee_threshold = os.environ.get("KNEE_THRESHOLD", 0.8) + if not incident_similarity_threshold: + incident_similarity_threshold = os.environ.get("INCIDENT_SIMILARITY_THRESHOLD", 0.8) if not min_incident_size: min_incident_size = os.environ.get("MIN_INCIDENT_SIZE", 5) - if not incident_similarity_threshold: - incident_similarity_threshold = os.environ.get( - "INCIDENT_SIMILARITY_THRESHOLD", 0.8 - ) + if not pmi_threshold: + pmi_threshold = os.environ.get("PMI_THRESHOLD", 0.0) - if not general_temp_dir: - general_temp_dir = os.environ.get("AI_TEMP_FOLDER", DEFAULT_TEMP_DIR_LOCATION) + if not delete_nodes: + delete_nodes = os.environ.get("DELETE_NODES", False) - temp_dir = f"{general_temp_dir}/{tenant_id}" - os.makedirs(temp_dir, exist_ok=True) + if not knee_threshold: + knee_threshold = os.environ.get("KNEE_THRESHOLD", 0.8) status = calculate_pmi_matrix(ctx, tenant_id, min_alert_number=min_alert_number) - if status.get('status') == 'failed': + if status.get("status") == "failed": + pusher_client = get_pusher_client() + if pusher_client: + log_string = f"{ALGORITHM_VERBOSE_NAME} failed to calculate PMI matrix" + pusher_client.trigger(f"private-{tenant_id}", "ai-logs-change", {"log": "Failed to calculate PMI matrix"}) + return {"incidents": []} - - logger.info( - "Getting new alerts and past incients", - extra={ - "tenant_id": tenant_id, - }, - ) - alerts = query_alerts( - tenant_id, - limit=use_n_historical_alerts, - upper_timestamp=alert_upper_timestamp, - lower_timestamp=alert_lower_timestamp, - ) - incidents, _ = get_last_incidents( - tenant_id, - limit=use_n_hist_incidents, - upper_timestamp=incident_upper_timestamp, - lower_timestamp=incident_lower_timestamp, - ) - fingerprints = list(set([alert.fingerprint for alert in alerts])) - - logger.info( - "Building alert graph", - extra={ - "tenant_id": tenant_id, - }, - ) - - graph = create_graph( - tenant_id, fingerprints, temp_dir, pmi_threshold, knee_threshold - ) - ids = [] - - logger.info( - "Analyzing alert graph", - extra={ - "tenant_id": tenant_id, - }, - ) - - incident_ids_for_summary_generation = [] - - new_incident_count = 0 - updated_incident_count = 0 - for component in nx.connected_components(graph): - if len(component) > min_incident_size: - alerts_appended = False - for incident in incidents: - incident_fingerprints = set( - [alert.fingerprint for alert in incident.alerts] - ) - intersection = incident_fingerprints.intersection(component) - - if len(intersection) / len(component) >= incident_similarity_threshold: - alerts_appended = True - - add_alerts_to_incident_by_incident_id( - tenant_id, - incident.id, - [ - alert.id - for alert in 
alerts - if alert.fingerprint in component - ], - ) - incident_ids_for_summary_generation.append(incident.id) - updated_incident_count += 1 - if not alerts_appended: - incident_start_time = min( - [ - alert.timestamp - for alert in alerts - if alert.fingerprint in component - ] - ) - incident_start_time = incident_start_time.replace(microsecond=0) - - incident = create_incident_from_dict( - tenant_id, - { - "name": f"Incident started at {incident_start_time}", - "description": "Summarization is Disabled", - "is_predicted": True, - }, - ) - ids.append(incident.id) - - add_alerts_to_incident_by_incident_id( - tenant_id, - incident.id, - [alert.id for alert in alerts if alert.fingerprint in component], - ) - incident_ids_for_summary_generation.append(incident.id) - new_incident_count += 1 - - if not ctx: - pool = await get_pool() - else: - pool = ctx["redis"] - - for incident_id in incident_ids_for_summary_generation: - job_summary = await pool.enqueue_job( - "process_summary_generation", - tenant_id=tenant_id, - incident_id=incident_id, - ) + elif status.get("status") == "success": logger.info( - f"Summary generation for incident {incident_id} scheduled, job: {job_summary}", - extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, - "tenant_id": tenant_id, "incident_id": incident_id}, - ) + f"Calculating PMI coefficients for alerts finished. PMI matrix is being written to the database. Total number of PMI coefficients: {status.get('pmi_matrix').size}", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) - job_name = await pool.enqueue_job( - "process_name_generation", - tenant_id=tenant_id, - incident_id=incident_id, - ) - logger.info( - f"Name generation for incident {incident_id} scheduled, job: {job_name}", - extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, - "tenant_id": tenant_id, "incident_id": incident_id}, - ) + pmi_values = status.get("pmi_matrix") + fingerprints = status.get("pmi_columns") + write_pmi_matrix_to_temp_file(tenant_id, pmi_values, fingerprints, temp_dir) + + logger.info("PMI matrix is written to the database.", extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + fingerprint2idx = {fingerprint: i for i, fingerprint in enumerate(fingerprints)} + logger.info("Getting new alerts and incidents", extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + alerts = query_alerts(tenant_id, limit=use_n_historical_alerts, upper_timestamp=alert_upper_timestamp, + lower_timestamp=alert_lower_timestamp, sort_ascending=True) + if not alert_lower_timestamp: + alert_lower_timestamp = min(alert.timestamp for alert in alerts) - pusher_client = get_pusher_client() - if pusher_client: - if new_incident_count > 0 or updated_incident_count > 0: - log_string = f'{ALGORITHM_VERBOSE_NAME} successfully executed. {new_incident_count} new incidents were created \ - and {updated_incident_count} incidents were updated.' - - else: - log_string = f'{ALGORITHM_VERBOSE_NAME} successfully executed. {new_incident_count} new incidents were created \ - and {updated_incident_count} incidents were updated. This may be due to high alert sparsity or low amount \ - of unique alert fingerprints. Increasing "sliding window size" or decreasing "minimal amount of unique \ - fingerprints in an incident" configuration parameters may help.' 
- - pusher_client.trigger( - f"private-{tenant_id}", - "ai-logs-change", - {"log": log_string}, - ) - logger.info( - "Client notified on new AI log", - extra={"tenant_id": tenant_id}, - ) - - return { - "incidents": [get_incident_by_id(tenant_id, incident_id) for incident_id in ids] - } - - -def mine_incidents( - alerts: List[Alert], - incident_sliding_window_size: int = 6 * 24 * 60 * 60, - statistic_sliding_window_size: int = 60 * 60, - jaccard_threshold: float = 0.0, - fingerprint_threshold: int = 1, -): - """ - Mine incidents from alerts. - """ - - alert_dict = { - "fingerprint": [alert.fingerprint for alert in alerts], - "timestamp": [alert.timestamp for alert in alerts], - } - alert_df = pd.DataFrame(alert_dict) - mined_incidents = shape_incidents( - alert_df, - "fingerprint", - incident_sliding_window_size, - statistic_sliding_window_size, - jaccard_threshold, - fingerprint_threshold, - ) - - return [ - { - "incident_fingerprint": incident["incident_fingerprint"], - "alerts": [ - alert - for alert in alerts - if alert.fingerprint in incident["alert_fingerprints"] - ], - } - for incident in mined_incidents - ] - - -def get_batched_alert_counts( - alerts: pd.DataFrame, unique_alert_identifier: str, sliding_window_size: int -) -> np.ndarray: - """ - Get the number of alerts in a sliding window. - """ - - resampled_alert_counts = ( - alerts.set_index("timestamp") - .resample(f"{sliding_window_size//2}s")[unique_alert_identifier] - .value_counts() - .unstack(fill_value=0) - ) - rolling_counts = resampled_alert_counts.rolling( - window=f"{sliding_window_size}s", min_periods=1 - ).sum() - alert_counts = rolling_counts.to_numpy() - - return alert_counts - - -def get_batched_alert_occurrences( - alerts: pd.DataFrame, unique_alert_identifier: str, sliding_window_size: int -) -> np.ndarray: - """ - Get the occurrence of alerts in a sliding window. - """ - - alert_counts = get_batched_alert_counts( - alerts, unique_alert_identifier, sliding_window_size - ) - alert_occurences = np.where(alert_counts > 0, 1, 0) - - return alert_occurences - - -def get_jaccard_scores(P_a: np.ndarray, P_aa: np.ndarray) -> np.ndarray: - """ - Calculate the Jaccard similarity scores between alerts. - """ - - P_a_matrix = P_a[:, None] + P_a - union_matrix = P_a_matrix - P_aa - - with np.errstate(divide="ignore", invalid="ignore"): - jaccard_matrix = np.where(union_matrix != 0, P_aa / union_matrix, 0) - - np.fill_diagonal(jaccard_matrix, 1) - - return jaccard_matrix - - -def get_alert_jaccard_matrix( - alerts: pd.DataFrame, unique_alert_identifier: str, sliding_window_size: int -) -> np.ndarray: - """ - Calculate the Jaccard similarity scores between alerts. - """ - - alert_occurrences = get_batched_alert_occurrences( - alerts, unique_alert_identifier, sliding_window_size - ) - alert_probabilities = np.mean(alert_occurrences, axis=0) - joint_alert_occurrences = np.dot(alert_occurrences.T, alert_occurrences) - pairwise_alert_probabilities = joint_alert_occurrences / alert_occurrences.shape[0] - - return get_jaccard_scores(alert_probabilities, pairwise_alert_probabilities) - - -def build_graph_from_occurrence( - occurrence_row: pd.DataFrame, - jaccard_matrix: np.ndarray, - unique_alert_identifiers: List[str], - jaccard_threshold: float = 0.05, -) -> nx.Graph: - """ - Build a weighted graph using alert occurrence matrix and Jaccard coefficients. 
- """ - - present_indices = np.where(occurrence_row > 0)[0] - - G = nx.Graph() - - for idx in present_indices: - alert_desc = unique_alert_identifiers[idx] - G.add_node(alert_desc) - - for i in present_indices: - for j in present_indices: - if i != j and jaccard_matrix[i, j] >= jaccard_threshold: - alert_i = unique_alert_identifiers[i] - alert_j = unique_alert_identifiers[j] - G.add_edge(alert_i, alert_j, weight=jaccard_matrix[i, j]) - - return G - - -def shape_incidents( - alerts: pd.DataFrame, - unique_alert_identifier: str, - incident_sliding_window_size: int, - statistic_sliding_window_size: int, - jaccard_threshold: float = 0.2, - fingerprint_threshold: int = 5, -) -> List[dict]: - """ - Shape incidents from alerts. - """ + incidents, _ = get_last_incidents(tenant_id, limit=use_n_historical_incidents, upper_timestamp=alert_lower_timestamp + incident_validity_threshold, + lower_timestamp=alert_upper_timestamp - incident_validity_threshold, with_alerts=True) - incidents = [] - incident_number = 0 - - resampled_alert_counts = ( - alerts.set_index("timestamp") - .resample(f"{incident_sliding_window_size//2}s")[unique_alert_identifier] - .value_counts() - .unstack(fill_value=0) - ) - jaccard_matrix = get_alert_jaccard_matrix( - alerts, unique_alert_identifier, statistic_sliding_window_size - ) - - for idx in range(resampled_alert_counts.shape[0]): - graph = build_graph_from_occurrence( - resampled_alert_counts.iloc[idx], - jaccard_matrix, - resampled_alert_counts.columns, - jaccard_threshold=jaccard_threshold, - ) - max_component = max(nx.connected_components(graph), key=len) - - min_starts_at = resampled_alert_counts.index[idx] - max_starts_at = min_starts_at + pd.Timedelta( - seconds=incident_sliding_window_size - ) - - local_alerts = alerts[ - (alerts["timestamp"] >= min_starts_at) - & (alerts["timestamp"] <= max_starts_at) - ] - local_alerts = local_alerts[ - local_alerts[unique_alert_identifier].isin(max_component) - ] - - if len(max_component) > fingerprint_threshold: - - incidents.append( - { - "incident_fingerprint": f"Incident #{incident_number}", - "alert_fingerprints": local_alerts[unique_alert_identifier] - .unique() - .tolist(), - } - ) - - return incidents - - -def generate_incident_summary( - incident: Incident, - use_n_alerts_for_summary: int = -1, - generate_summary: str = None, - max_summary_length: int = None, -) -> str: - if "OPENAI_API_KEY" not in os.environ: - logger.error( - "OpenAI API key is not set. Incident summary generation is not available.", - extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id} - ) - return "" - - if not generate_summary: - generate_summary = os.environ.get("GENERATE_INCIDENT_SUMMARY", "True") - - if generate_summary == "False": - logger.info(f"Incident summary generation is disabled. 
Aborting.", - extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) - return "" - - if incident.user_summary: - return "" - - if not max_summary_length: - max_summary_length = os.environ.get("MAX_SUMMARY_LENGTH", MAX_SUMMARY_LENGTH) - - if not max_summary_length: - max_summary_length = os.environ.get("MAX_SUMMARY_LENGTH", MAX_SUMMARY_LENGTH) - - try: - client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) - - incident = get_incident_by_id(incident.tenant_id, incident.id) - - description_strings = np.unique( - [f'{alert.event["name"]}' for alert in incident.alerts] - ).tolist() - - if use_n_alerts_for_summary > 0: - incident_description = "\n".join( - description_strings[:use_n_alerts_for_summary] - ) - else: - incident_description = "\n".join(description_strings) - - timestamps = [alert.timestamp for alert in incident.alerts] - incident_start = min(timestamps).replace(microsecond=0) - incident_end = max(timestamps).replace(microsecond=0) - - model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini") - - summary = ( - client.chat.completions.create( - model=model, - messages=[ - { - "role": "system", - "content": f"""You are a very skilled DevOps specialist who can summarize any incident based on alert descriptions. - When provided with information, summarize it in a 2-3 sentences explaining what happened and when. - ONLY SUMMARIZE WHAT YOU SEE. In the end add information about potential scenario of the incident. - When provided with information, answer with max a {int(max_summary_length * 0.9)} symbols excerpt - describing incident thoroughly. - - EXAMPLE: - An incident occurred between 2022-11-17 14:11:04 and 2022-11-22 22:19:04, involving a - total of 200 alerts. The alerts indicated critical and warning issues such as high CPU and memory - usage in pods and nodes, as well as stuck Kubernetes Daemonset rollout. Potential incident scenario: - Kubernetes Daemonset rollout stuck due to high CPU and memory usage in pods and nodes. This caused a - long tail of alerts on various topics.""", - }, - { - "role": "user", - "content": f"""Here are alerts of an incident for summarization:\n{incident_description}\n This incident started on - {incident_start}, ended on {incident_end}, included {incident.alerts_count} alerts.""", - }, - ], - ) - .choices[0] - .message.content - ) - - logger.info(f"Generated incident summary with length {len(summary)} symbols", - extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) - - if len(summary) > max_summary_length: - logger.info(f"Generated incident summary is too long. Applying smart truncation", - extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) - - summary = ( - client.chat.completions.create( - model=model, - messages=[ - { - "role": "system", - "content": f"""You are a very skilled DevOps specialist who can summarize any incident based on a description. - When provided with information, answer with max a {int(max_summary_length * 0.9)} symbols excerpt describing - incident thoroughly. 
- """, - }, - { - "role": "user", - "content": f"""Here is the description of an incident for summarization:\n{summary}""", - }, - ], - ) - .choices[0] - .message.content - ) - - logger.info(f"Generated new incident summary with length {len(summary)} symbols", - extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) - - if len(summary) > max_summary_length: - logger.info(f"Generated incident summary is too long. Applying hard truncation", - extra={"algorithm": SUMMARY_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) - summary = summary[: max_summary_length] - - return summary - except Exception as e: - logger.error(f"Error in generating incident summary: {e}") - return "" + n_batches = int(math.ceil((alert_upper_timestamp - alert_lower_timestamp).total_seconds() / alert_batch_stride)) - (STRIDE_DENOMINATOR - 1) + logging.info( + f"Starting alert correlation. Current batch size: {alert_validity_threshold} seconds. Current \ + batch stride: {alert_batch_stride} seconds. Number of batches to process: {n_batches}") + pool = await get_pool() if not ctx else ctx["redis"] -def generate_incident_name(incident: Incident, generate_name: str = None, max_name_length: int = None, use_n_alerts_for_name: int = -1) -> str: - if "OPENAI_API_KEY" not in os.environ: - logger.error( - "OpenAI API key is not set. Incident name generation is not available.", - extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id} - ) - return "" - - if not generate_name: - generate_name = os.environ.get("GENERATE_INCIDENT_NAME", "True") - - if generate_name == "False": - logger.info(f"Incident name generation is disabled. Aborting.", - extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) - return "" - - if incident.user_generated_name: - return "" - - if not max_name_length: - max_name_length = os.environ.get( - "MAX_NAME_LENGTH", MAX_NAME_LENGTH) - - if not max_name_length: - max_name_length = os.environ.get( - "MAX_NAME_LENGTH", MAX_NAME_LENGTH) - - try: - client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) - - incident = get_incident_by_id(incident.tenant_id, incident.id) + new_incident_ids = [] + updated_incident_ids = [] + incident_ids_for_processing = [] + + alert_timestamps = np.array([alert.timestamp.timestamp() for alert in alerts]) + batch_indices = np.arange(0, n_batches) + batch_start_ts = alert_lower_timestamp.timestamp() + np.array([batch_idx * alert_batch_stride for batch_idx in batch_indices]) + batch_end_ts = batch_start_ts + alert_validity_threshold - description_strings = np.unique( - [f'{alert.event["name"]}' for alert in incident.alerts]).tolist() + start_indices = np.searchsorted(alert_timestamps, batch_start_ts, side='left') + end_indices = np.searchsorted(alert_timestamps, batch_end_ts, side='right') - if use_n_alerts_for_name > 0: - incident_description = "\n".join( - description_strings[:use_n_alerts_for_name]) - else: - incident_description = "\n".join(description_strings) + for batch_idx, (start_idx, end_idx) in tqdm(enumerate(zip(start_indices, end_indices)), total=n_batches, desc="Processing alert batches.."): + batch_alerts = alerts[start_idx:end_idx] - timestamps = [alert.timestamp for alert in incident.alerts] - incident_start = min(timestamps).replace(microsecond=0) + logger.info( + f"Processing batch {batch_idx} with start timestamp {datetime.fromtimestamp(batch_start_ts[batch_idx])} \ 
+ and end timestamp {min(datetime.fromtimestamp(batch_end_ts[batch_idx]), alert_upper_timestamp)}. Batch size: {len(batch_alerts)}", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + + if len(batch_alerts) == 0: + continue + + batch_incidents = get_last_incidents_inmem(incidents, datetime.fromtimestamp(batch_end_ts[batch_idx]), + datetime.fromtimestamp(batch_start_ts[batch_idx]) - incident_validity_threshold) - model = os.environ.get("OPENAI_MODEL", "gpt-4o-mini") + logger.info( + f"Found {len(batch_incidents)} incidents that accept updates by {datetime.fromtimestamp(batch_start_ts[batch_idx])}.", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + + batch_incident_ids_for_processing, batch_new_incidents, batch_updated_incidents = process_alert_batch( + batch_alerts, batch_incidents, tenant_id, min_incident_size, incident_validity_threshold, pmi_values, fingerprint2idx, pmi_threshold, delete_nodes, knee_threshold) - name = client.chat.completions.create(model=model, messages=[ - { - "role": "system", - "content": f"""You are a very skilled DevOps specialist who can name any incident based on alert descriptions. - When provided with information, output a short descriptive name of incident that could cause these alerts. - Add information about start time to the name. ONLY USE WHAT YOU SEE. Answer with max a {int(max_name_length * 0.9)} - symbols excerpt. - - EXAMPLE: - Kubernetes rollout stuck (started on 2022.11.17 14:11)""" - }, - { - "role": "user", - "content": f"""This incident started on {incident_start}. - Here are alerts of an incident:\n{incident_description}\n""" + new_incident_ids.extend([incident.id for incident in batch_new_incidents]) + incidents.extend(batch_new_incidents) + updated_incident_ids.extend([incident.id for incident in batch_updated_incidents]) + incident_ids_for_processing.extend(batch_incident_ids_for_processing) + + logger.info(f"Saving last correlated batch start timestamp: {datetime.isoformat(alert_lower_timestamp + timedelta(seconds= (n_batches - 1) * alert_batch_stride))}", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + tenant_config["last_correlated_batch_start"] = datetime.isoformat(alert_lower_timestamp + timedelta(seconds= (n_batches - 1) * alert_batch_stride)) + write_tenant_config(tenant_id, tenant_config) + + logger.info(f"Writing {len(incidents)} incidents to database", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + db_incident_ids_for_processing = [] + db_new_incident_ids = [] + db_updated_incident_ids = [] + for incident in incidents: + if not get_incident_by_id(tenant_id, incident.id): + incident_dict = { + "ai_generated_name": incident.ai_generated_name, + "generated_summary": incident.generated_summary, + "is_predicted": True, } - ]).choices[0].message.content - - logger.info(f"Generated incident name with length {len(name)} symbols", - extra={"incident_id": incident.id, "tenant_id": incident.tenant_id}) - - if len(name) > max_name_length: - logger.info(f"Generated incident name is too long. Applying smart truncation", - extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) - - name = client.chat.completions.create(model=model, messages=[ - { - "role": "system", - "content": f"""You are a very skilled DevOps specialist who can name any incident based on a description. 
- Add information about start time to the name.When provided with information, answer with max a - {int(max_name_length * 0.9)} symbols. - - EXAMPLE: - Kubernetes rollout stuck (started on 2022.11.17 14:11)""" - }, - { - "role": "user", - "content": f"""This incident started on {incident_start}. - Here is the description of an incident to name:\n{name}.""" - } - ]).choices[0].message.content - - logger.info(f"Generated new incident name with length {len(name)} symbols", - extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) - - if len(name) > max_name_length: - logger.info(f"Generated incident name is too long. Applying hard truncation", - extra={"algorithm": NAME_GENERATOR_VERBOSE_NAME, "incident_id": incident.id, "tenant_id": incident.tenant_id}) - name = name[: max_name_length] - - return name - except Exception as e: - logger.error(f"Error in generating incident name: {e}") - return "" + db_incident = create_incident_from_dict(tenant_id, incident_dict) + + incident_id = db_incident.id + else: + incident_id = incident.id + + if incident.id in incident_ids_for_processing: + db_incident_ids_for_processing.append(incident_id) + + if incident.id in new_incident_ids: + db_new_incident_ids.append(incident_id) + + if incident.id in updated_incident_ids: + db_updated_incident_ids.append(incident_id) + + + add_alerts_to_incident_by_incident_id(tenant_id, incident_id, [alert.id for alert in incident.alerts]) + + logger.info(f"Scheduling {len(db_incident_ids_for_processing)} incidents for name / summary generation", + extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) + new_incident_count = len(set(new_incident_ids)) + updated_incident_count = len(set(updated_incident_ids).difference(set(new_incident_ids))) + db_incident_ids_for_processing = list(set(db_incident_ids_for_processing)) + for incident_id in db_incident_ids_for_processing: + await schedule_incident_processing(pool, tenant_id, incident_id) + incident_ids = list(set(db_new_incident_ids + db_updated_incident_ids)) -async def generate_update_incident_summary(ctx, tenant_id: str, incident_id: str): - incident = get_incident_by_id(tenant_id, incident_id) - summary = generate_incident_summary(incident) - if summary: - update_incident_summary(tenant_id, incident_id, summary) - - return summary + pusher_client = get_pusher_client() + if pusher_client: + if new_incident_count > 0 or updated_incident_count > 0: + log_string = f"{ALGORITHM_VERBOSE_NAME} successfully executed. Alerts from {alert_lower_timestamp.replace(microsecond=0)} \ + till {alert_upper_timestamp.replace(microsecond=0)} were processed. Total count of processed alerts: {len(alerts)}. \ + Total count of created incidents: {new_incident_count}. Total count of updated incidents: \ + {updated_incident_count}." + elif len(alerts) > 0: + log_string = f'{ALGORITHM_VERBOSE_NAME} successfully executed. Alerts from {alert_lower_timestamp.replace(microsecond=0)} \ + till {alert_upper_timestamp.replace(microsecond=0)} were processed. Total count of processed alerts: {len(alerts)}. \ + Total count of created incidents: {new_incident_count}. Total count of updated incidents: \ + {updated_incident_count}. This may be due to high alert sparsity or low amount of unique \ + alert fingerprints. Adding more alerts, increasing "sliding window size" or decreasing minimal amount of \ + "minimal amount of unique fingerprints in an incident" configuration parameters may help.' 
+ + else: + log_string = f'{ALGORITHM_VERBOSE_NAME} successfully executed. Alerts from {alert_lower_timestamp.replace(microsecond=0)} \ + till {alert_upper_timestamp.replace(microsecond=0)} were processed. Total count of processed alerts: {len(alerts)}. \ + No incidents were created or updated. Add alerts to the system to enable automatic incident creation.' + pusher_client.trigger(f"private-{tenant_id}", "ai-logs-change", {"log": log_string}) -async def generate_update_incident_name(ctx, tenant_id: str, incident_id: str): - incident = get_incident_by_id(tenant_id, incident_id) - name = generate_incident_name(incident) - if name: - update_incident_name(tenant_id, incident_id, name) + logger.info("Client notified on new AI log", extra={"tenant_id": tenant_id, "algorithm": ALGORITHM_VERBOSE_NAME}) - return name \ No newline at end of file + return {"incidents": [get_incident_by_id(tenant_id, incident_id) + for incident_id in incident_ids]} \ No newline at end of file diff --git a/keep/api/arq_worker.py b/keep/api/arq_worker.py index 1e0667888..29d9af3f1 100644 --- a/keep/api/arq_worker.py +++ b/keep/api/arq_worker.py @@ -63,8 +63,7 @@ FUNCTIONS: list = ( [ - import_string(background_function) - for background_function in list(ARQ_BACKGROUND_FUNCTIONS) + import_string(background_function) for background_function in list(ARQ_BACKGROUND_FUNCTIONS) ] if ARQ_BACKGROUND_FUNCTIONS is not None else list() @@ -86,13 +85,15 @@ def get_arq_worker(queue_name: str) -> Worker: expires = config( "ARQ_EXPIRES", cast=int, default=3600 ) # the default length of time from when a job is expected to start after which the job expires, making it shorter to avoid clogging - + expires_ai = config( + "ARQ_EXPIRES_AI", cast=int, default=3600*1000 + ) # generate a worker id so each worker will have a different health check key worker_id = str(uuid4()).replace("-", "") worker = create_worker( WorkerSettings, keep_result=keep_result, - expires_extra_ms=expires, + expires_extra_ms=expires_ai if KEEP_ARQ_TASK_POOL == KEEP_ARQ_TASK_POOL_AI else expires, queue_name=queue_name, health_check_key=f"{queue_name}:{worker_id}:health-check", ) diff --git a/keep/api/core/db.py b/keep/api/core/db.py index f8ea824b0..234679c23 100644 --- a/keep/api/core/db.py +++ b/keep/api/core/db.py @@ -15,7 +15,6 @@ from uuid import uuid4 import numpy as np -import pandas as pd import validators from dotenv import find_dotenv, load_dotenv from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor @@ -38,7 +37,6 @@ from keep.api.models.db.preset import * # pylint: disable=unused-wildcard-import from keep.api.models.db.provider import * # pylint: disable=unused-wildcard-import from keep.api.models.db.rule import * # pylint: disable=unused-wildcard-import -from keep.api.models.db.statistics import * # pylint: disable=unused-wildcard-import from keep.api.models.db.tenant import * # pylint: disable=unused-wildcard-import from keep.api.models.db.topology import * # pylint: disable=unused-wildcard-import from keep.api.models.db.workflow import * # pylint: disable=unused-wildcard-import @@ -1077,6 +1075,7 @@ def query_alerts( upper_timestamp=None, lower_timestamp=None, skip_alerts_with_null_timestamp=True, + sort_ascending=False, ) -> list[Alert]: """ Get all alerts for a given tenant_id. 
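query_alerts now supports oldest-first ordering and treats limit as optional, which the batch correlation loop relies on when replaying historical alerts in chronological order. A minimal usage sketch (values are illustrative):

    # Fetch up to 10,000 alerts for a tenant, oldest first.
    alerts = query_alerts(tenant_id, limit=10_000, sort_ascending=True)
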
@@ -1126,9 +1125,14 @@ def query_alerts( if skip_alerts_with_null_timestamp: query = query.filter(Alert.timestamp.isnot(None)) - - # Order by timestamp in descending order and limit the results - query = query.order_by(Alert.timestamp.desc()).limit(limit) + + if sort_ascending: + query = query.order_by(Alert.timestamp.asc()) + else: + query = query.order_by(Alert.timestamp.desc()) + + if limit: + query = query.limit(limit) # Execute the query alerts = query.all() @@ -1738,7 +1742,6 @@ def update_key_last_used( session.add(tenant_api_key_entry) session.commit() - def get_linked_providers(tenant_id: str) -> List[Tuple[str, str, datetime]]: with Session(engine) as session: providers = ( @@ -2168,6 +2171,7 @@ def get_last_incidents( is_confirmed: bool = False, sorting: Optional[IncidentSorting] = IncidentSorting.creation_time, with_alerts: bool = False, + is_predicted: bool = None, ) -> Tuple[list[Incident], int]: """ Get the last incidents and total amount of incidents. @@ -2195,6 +2199,9 @@ def get_last_incidents( if with_alerts: query = query.options(joinedload(Incident.alerts)) + if is_predicted is not None: + query = query.filter(Incident.is_predicted == is_predicted) + if timeframe: query = query.filter( Incident.start_time @@ -2416,6 +2423,9 @@ def inner(db_session: Session): def add_alerts_to_incident_by_incident_id( tenant_id: str, incident_id: str | UUID, alert_ids: List[UUID] ) -> Optional[Incident]: + logger.info(f"Adding alerts to incident {incident_id} in database, total {len(alert_ids)} alerts", + extra={"tags": {"tenant_id": tenant_id, "incident_id": incident_id}}) + with Session(engine) as session: incident = session.exec( select(Incident).where( @@ -2427,39 +2437,40 @@ def add_alerts_to_incident_by_incident_id( if not incident: return None - existed_alert_ids = session.exec( - select(AlertToIncident.alert_id).where( - AlertToIncident.tenant_id == tenant_id, - AlertToIncident.incident_id == incident.id, - col(AlertToIncident.alert_id).in_(alert_ids), - ) - ).all() + # Use a set for faster membership checks + existing_alert_ids = set( + session.exec( + select(AlertToIncident.alert_id).where( + AlertToIncident.tenant_id == tenant_id, + AlertToIncident.incident_id == incident.id, + col(AlertToIncident.alert_id).in_(alert_ids), + ) + ).all() + ) - new_alert_ids = [ - alert_id for alert_id in alert_ids if alert_id not in existed_alert_ids - ] + new_alert_ids = [alert_id for alert_id in alert_ids if alert_id not in existing_alert_ids] if not new_alert_ids: return incident alerts_data_for_incident = get_alerts_data_for_incident(new_alert_ids, session) - incident.sources = list( - set(incident.sources) | set(alerts_data_for_incident["sources"]) - ) - incident.affected_services = list( - set(incident.affected_services) | set(alerts_data_for_incident["services"]) - ) + incident.sources = list(set(incident.sources) | set(alerts_data_for_incident["sources"])) + incident.affected_services = list(set(incident.affected_services) | set(alerts_data_for_incident["services"])) incident.alerts_count += alerts_data_for_incident["count"] alert_to_incident_entries = [ - AlertToIncident( - alert_id=alert_id, incident_id=incident.id, tenant_id=tenant_id - ) + AlertToIncident(alert_id=alert_id, incident_id=incident.id, tenant_id=tenant_id) for alert_id in new_alert_ids ] - session.bulk_save_objects(alert_to_incident_entries) + for idx, entry in enumerate(alert_to_incident_entries): + session.add(entry) + if (idx + 1) % 100 == 0: + logger.info(f"Added {idx + 1}/{len(alert_to_incident_entries)} alerts to 
incident {incident.id} in database", + extra={"tags": {"tenant_id": tenant_id, "incident_id": incident.id}}) + session.commit() + session.flush() started_at, last_seen_at = session.exec( select(func.min(Alert.timestamp), func.max(Alert.timestamp)) @@ -2469,9 +2480,9 @@ def add_alerts_to_incident_by_incident_id( AlertToIncident.incident_id == incident.id, ) ).one() + incident.start_time = started_at incident.last_seen_time = last_seen_at - incident.severity = alerts_data_for_incident["max_severity"].order session.add(incident) @@ -2658,78 +2669,6 @@ def write_pmi_matrix_to_temp_file( return True -def write_pmi_matrix_to_db(tenant_id: str, pmi_matrix_df: pd.DataFrame) -> bool: - # TODO: add handlers for sequential launches - with Session(engine) as session: - pmi_entries_to_update = 0 - pmi_entries_to_insert = [] - - # Query for existing entries to differentiate between updates and inserts - existing_entries = session.query(PMIMatrix).filter_by(tenant_id=tenant_id).all() - existing_entries_dict = { - (entry.fingerprint_i, entry.fingerprint_j): entry - for entry in existing_entries - } - - for fingerprint_i in pmi_matrix_df.index: - for fingerprint_j in pmi_matrix_df.columns: - if pmi_matrix_df.at[fingerprint_i, fingerprint_j] == -100: - continue - - pmi = float(pmi_matrix_df.at[fingerprint_i, fingerprint_j]) - - pmi_entry = { - "tenant_id": tenant_id, - "fingerprint_i": fingerprint_i, - "fingerprint_j": fingerprint_j, - "pmi": pmi, - } - - if (fingerprint_i, fingerprint_j) in existing_entries_dict: - existed_entry = existing_entries_dict[ - (fingerprint_i, fingerprint_j) - ] - if existed_entry.pmi != pmi: - session.execute( - update(PMIMatrix) - .where( - PMIMatrix.fingerprint_i == fingerprint_i, - PMIMatrix.fingerprint_j == fingerprint_j, - PMIMatrix.tenant_id == tenant_id, - ) - .values(pmi=pmi) - ) - pmi_entries_to_update += 1 - else: - pmi_entries_to_insert.append(pmi_entry) - - if pmi_entries_to_insert: - session.bulk_insert_mappings(PMIMatrix, pmi_entries_to_insert) - - logger.info( - f"PMI matrix for tenant {tenant_id} updated. 
{pmi_entries_to_update} entries updated, {len(pmi_entries_to_insert)} entries inserted", - extra={"tenant_id": tenant_id}, - ) - - session.commit() - - return True - - -def get_pmi_value( - tenant_id: str, fingerprint_i: str, fingerprint_j: str -) -> Optional[float]: - with Session(engine) as session: - pmi_entry = session.exec( - select(PMIMatrix) - .where(PMIMatrix.tenant_id == tenant_id) - .where(PMIMatrix.fingerprint_i == fingerprint_i) - .where(PMIMatrix.fingerprint_j == fingerprint_j) - ).first() - - return pmi_entry.pmi if pmi_entry else None - - def get_pmi_values_from_temp_file(temp_dir: str) -> Tuple[np.array, Dict[str, int]]: npzfile = np.load(f"{temp_dir}/pmi_matrix.npz", allow_pickle=True) pmi_matrix = npzfile["pmi_matrix"] @@ -2740,18 +2679,25 @@ def get_pmi_values_from_temp_file(temp_dir: str) -> Tuple[np.array, Dict[str, in return pmi_matrix, fingerint2idx -def get_pmi_values( - tenant_id: str, fingerprints: List[str] -) -> Dict[Tuple[str, str], Optional[float]]: +def get_tenant_config(tenant_id: str) -> dict: with Session(engine) as session: - pmi_entries = session.exec( - select(PMIMatrix).where(PMIMatrix.tenant_id == tenant_id) - ).all() + tenant_data = session.exec( + select(Tenant) + .where(Tenant.id == tenant_id) + ).first() + return tenant_data.configuration if tenant_data else {} + - pmi_values = { - (entry.fingerprint_i, entry.fingerprint_j): entry.pmi for entry in pmi_entries - } - return pmi_values +def write_tenant_config(tenant_id: str, config: dict) -> None: + with Session(engine) as session: + tenant_data = session.exec( + select(Tenant) + .where(Tenant.id == tenant_id) + ).first() + tenant_data.configuration = config + session.commit() + session.refresh(tenant_data) + return tenant_data def update_incident_summary( @@ -2895,4 +2841,4 @@ def get_provider_by_name(tenant_id: str, provider_name: str) -> Provider: .where(Provider.tenant_id == tenant_id) .where(Provider.name == provider_name) ).first() - return provider + return provider \ No newline at end of file diff --git a/keep/api/core/db_on_start.py b/keep/api/core/db_on_start.py index a39abf1e7..b65dc698d 100644 --- a/keep/api/core/db_on_start.py +++ b/keep/api/core/db_on_start.py @@ -150,6 +150,7 @@ def try_create_single_tenant(tenant_id: str) -> None: pass logger.info(f"Api key {api_key_name} provisioned") logger.info("Api keys provisioned") + # commit the changes session.commit() logger.info("Single tenant created") @@ -180,4 +181,4 @@ def migrate_db(): os.path.dirname(os.path.abspath(__file__)) + "/../models/db/migrations", ) alembic.command.upgrade(config, "head") - logger.info("Finished migrations") + logger.info("Finished migrations") \ No newline at end of file diff --git a/keep/api/models/db/tenant.py b/keep/api/models/db/tenant.py index d7a67e815..2ccf32833 100644 --- a/keep/api/models/db/tenant.py +++ b/keep/api/models/db/tenant.py @@ -36,4 +36,4 @@ class TenantInstallation(SQLModel, table=True): tenant_id: str = Field(foreign_key="tenant.id") bot_id: str installed: bool = False - tenant: Optional[Tenant] = Relationship(back_populates="installations") + tenant: Optional[Tenant] = Relationship(back_populates="installations") \ No newline at end of file diff --git a/keep/api/routes/incidents.py b/keep/api/routes/incidents.py index dd9a07f05..5db465f3e 100644 --- a/keep/api/routes/incidents.py +++ b/keep/api/routes/incidents.py @@ -49,7 +49,6 @@ sys.path.insert(0, path_with_ee) from ee.experimental.incident_utils import ( # noqa ALGORITHM_VERBOSE_NAME, - mine_incidents, ) @@ -391,29 +390,29 @@ def 
mine( ), alert_lower_timestamp: datetime = None, alert_upper_timestamp: datetime = None, - use_n_historical_alerts: int = 10e10, + use_n_historical_alerts: int = None, incident_lower_timestamp: datetime = None, incident_upper_timestamp: datetime = None, - use_n_hist_incidents: int = 10e10, - pmi_threshold: float = 0.0, - knee_threshold: float = 0.8, - min_incident_size: int = 5, - incident_similarity_threshold: float = 0.8, + use_n_historical_incidents: int = None, + pmi_threshold: float = None, + knee_threshold: float = None, + min_incident_size: int = None, + incident_similarity_threshold: float = None, ) -> dict: result = asyncio.run( mine_incidents_and_create_objects( - None, - authenticated_entity.tenant_id, - alert_lower_timestamp, - alert_upper_timestamp, - use_n_historical_alerts, - incident_lower_timestamp, - incident_upper_timestamp, - use_n_hist_incidents, - pmi_threshold, - knee_threshold, - min_incident_size, - incident_similarity_threshold, + ctx=None, + tenant_id=authenticated_entity.tenant_id, + alert_lower_timestamp=alert_lower_timestamp, + alert_upper_timestamp=alert_upper_timestamp, + use_n_historical_alerts=use_n_historical_alerts, + incident_lower_timestamp=incident_lower_timestamp, + incident_upper_timestamp=incident_upper_timestamp, + use_n_historical_incidents=use_n_historical_incidents, + pmi_threshold=pmi_threshold, + knee_threshold=knee_threshold, + min_incident_size=min_incident_size, + incident_similarity_threshold=incident_similarity_threshold, ) ) return result diff --git a/keep/api/utils/import_ee.py b/keep/api/utils/import_ee.py index 5d06ff228..de1742c1f 100644 --- a/keep/api/utils/import_ee.py +++ b/keep/api/utils/import_ee.py @@ -17,7 +17,8 @@ sys.path.insert(0, path_with_ee) from ee.experimental.incident_utils import mine_incidents_and_create_objects, generate_update_incident_summary, generate_update_incident_name # noqa - from ee.experimental.incident_utils import ALGORITHM_VERBOSE_NAME, SUMMARY_GENERATOR_VERBOSE_NAME, NAME_GENERATOR_VERBOSE_NAME # noqa + from ee.experimental.generative_utils import generate_incident_summary, generate_incident_name, SUMMARY_GENERATOR_VERBOSE_NAME, NAME_GENERATOR_VERBOSE_NAME # noqa + from ee.experimental.incident_utils import ALGORITHM_VERBOSE_NAME # noqa else: mine_incidents_and_create_objects = NotImplemented generate_update_incident_summary = NotImplemented diff --git a/tests/test_alert_correlation.py b/tests/test_alert_correlation.py new file mode 100644 index 000000000..aaa06be16 --- /dev/null +++ b/tests/test_alert_correlation.py @@ -0,0 +1,133 @@ +import os +import pytest +import random +import numpy as np + +from datetime import datetime, timedelta +from unittest.mock import patch, MagicMock, AsyncMock +from keep.api.models.db.alert import Alert +from keep.api.models.db.tenant import Tenant +from ee.experimental.incident_utils import mine_incidents_and_create_objects, calculate_pmi_matrix, DEFAULT_TEMP_DIR_LOCATION + +random.seed(42) + +@pytest.mark.asyncio +async def test_mine_incidents_and_create_objects(db_session, tenant_id='test', n_alerts=10000, n_fingerprints=50): + # Add alerts + current_time = datetime.now() + time_lags = [int(round(random.normalvariate(mu=60*24*30/2, sigma=60*24*30/6))) for _ in range(n_alerts)] + alerts = [ + Alert( + tenant_id=tenant_id, + provider_type="test", + provider_id="test", + event={ + "id": f"test-{i}", + "name": f"Test Alert {i}", + "fingerprint": f"fp-{i % n_fingerprints}", + "lastReceived": (current_time - timedelta(minutes=time_lags[i])).isoformat(), + "severity": 
"critical", + "source": ["test-source"], + }, + fingerprint=f"fp-{i % n_fingerprints}", + timestamp=current_time - timedelta(minutes=time_lags[i]) + ) + for i in range(n_alerts) + ] + db_session.add_all(alerts) + db_session.commit() + + # add Tenant + tenant = Tenant( + id=tenant_id, + name=tenant_id, + configuration={ + "ee_enabled": True, + } + ) + db_session.add(tenant) + db_session.commit() + + # Mock dependencies and call the function + with patch('ee.experimental.incident_utils.get_pusher_client') as mock_pusher, \ + patch('ee.experimental.incident_utils.get_pool') as mock_get_pool: + + mock_pusher.return_value = MagicMock() + mock_pool = AsyncMock() + mock_get_pool.return_value = mock_pool + + result = await mine_incidents_and_create_objects(None, tenant_id) + + assert result is not None + assert mock_pusher.called + assert mock_get_pool.called + +def test_calculate_pmi_matrix(db_session, tenant_id='test', n_alerts=10000, n_fingerprints=50): + # Add Alerts + current_time = datetime.now() + time_lags = [int(round(random.normalvariate(mu=60*24*30/2, sigma=60*24*30/6))) for _ in range(n_alerts)] + alerts = [ + Alert( + tenant_id=tenant_id, + provider_type="test", + provider_id="test", + event={ + "id": f"test-{i}", + "name": f"Test Alert {i}", + "fingerprint": f"fp-{i % n_fingerprints}", + "lastReceived": (current_time - timedelta(minutes=time_lags[i])).isoformat(), + "severity": "critical", + "source": ["test-source"], + }, + fingerprint=f"fp-{i % n_fingerprints}", + timestamp=current_time - timedelta(minutes=time_lags[i]) + ) + for i in range(n_alerts) + ] + db_session.add_all(alerts) + db_session.commit() + + # add Tenant + tenant = Tenant( + id=tenant_id, + name=tenant_id, + configuration={ + "ee_enabled": True, + } + ) + db_session.add(tenant) + db_session.commit() + + # Call the function + result = calculate_pmi_matrix(None, tenant_id) + + assert result["status"] == "success" + pmi_matrix = result["pmi_matrix"] + fingerprints = result["pmi_columns"] + assert (np.unique(fingerprints) == np.unique([f"fp-{i % n_fingerprints}" for i in range(n_fingerprints)])).all() + assert pmi_matrix.shape == (n_fingerprints, n_fingerprints) + + +@pytest.mark.asyncio +async def test_mine_incidents_and_create_objects_with_no_alerts(db_session, tenant_id='test'): + # add Tenant + tenant = Tenant( + id=tenant_id, + name=tenant_id, + configuration={ + "ee_enabled": True, + } + ) + + with patch('ee.experimental.incident_utils.get_pusher_client') as mock_pusher, \ + patch('ee.experimental.incident_utils.get_pool') as mock_get_pool: + + mock_pusher.return_value = MagicMock() + mock_pool = AsyncMock() + mock_get_pool.return_value = mock_pool + + result = await mine_incidents_and_create_objects(None, tenant_id) + + assert result=={"incidents": []} + + From 026d12bc7957cf90ca5f4f013f9f14442850679d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 18 Sep 2024 08:59:31 +0300 Subject: [PATCH 03/16] chore(deps): bump next from 14.2.1 to 14.2.12 in /keep-ui (#1952) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- keep-ui/package-lock.json | 88 +++++++++++++++++++-------------------- keep-ui/package.json | 2 +- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/keep-ui/package-lock.json b/keep-ui/package-lock.json index 36a6162ec..0b385c849 100644 --- a/keep-ui/package-lock.json +++ b/keep-ui/package-lock.json @@ -231,7 +231,7 @@ "mz": "^2.7.0", "nanoid": "^3.3.6", 
"natural-compare": "^1.4.0", - "next": "^14.2.1", + "next": "^14.2.12", "next-auth": "^4.24.7", "node-releases": "^2.0.10", "normalize-path": "^3.0.0", @@ -3061,9 +3061,9 @@ "integrity": "sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==" }, "node_modules/@next/env": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/env/-/env-14.2.1.tgz", - "integrity": "sha512-qsHJle3GU3CmVx7pUoXcghX4sRN+vINkbLdH611T8ZlsP//grzqVW87BSUgOZeSAD4q7ZdZicdwNe/20U2janA==" + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/env/-/env-14.2.12.tgz", + "integrity": "sha512-3fP29GIetdwVIfIRyLKM7KrvJaqepv+6pVodEbx0P5CaMLYBtx+7eEg8JYO5L9sveJO87z9eCReceZLi0hxO1Q==" }, "node_modules/@next/eslint-plugin-next": { "version": "14.2.1", @@ -3117,9 +3117,9 @@ } }, "node_modules/@next/swc-darwin-arm64": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-14.2.1.tgz", - "integrity": "sha512-kGjnjcIJehEcd3rT/3NAATJQndAEELk0J9GmGMXHSC75TMnvpOhONcjNHbjtcWE5HUQnIHy5JVkatrnYm1QhVw==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-arm64/-/swc-darwin-arm64-14.2.12.tgz", + "integrity": "sha512-crHJ9UoinXeFbHYNok6VZqjKnd8rTd7K3Z2zpyzF1ch7vVNKmhjv/V7EHxep3ILoN8JB9AdRn/EtVVyG9AkCXw==", "cpu": [ "arm64" ], @@ -3132,9 +3132,9 @@ } }, "node_modules/@next/swc-darwin-x64": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-14.2.1.tgz", - "integrity": "sha512-dAdWndgdQi7BK2WSXrx4lae7mYcOYjbHJUhvOUnJjMNYrmYhxbbvJ2xElZpxNxdfA6zkqagIB9He2tQk+l16ew==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/swc-darwin-x64/-/swc-darwin-x64-14.2.12.tgz", + "integrity": "sha512-JbEaGbWq18BuNBO+lCtKfxl563Uw9oy2TodnN2ioX00u7V1uzrsSUcg3Ep9ce+P0Z9es+JmsvL2/rLphz+Frcw==", "cpu": [ "x64" ], @@ -3147,9 +3147,9 @@ } }, "node_modules/@next/swc-linux-arm64-gnu": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-14.2.1.tgz", - "integrity": "sha512-2ZctfnyFOGvTkoD6L+DtQtO3BfFz4CapoHnyLTXkOxbZkVRgg3TQBUjTD/xKrO1QWeydeo8AWfZRg8539qNKrg==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-gnu/-/swc-linux-arm64-gnu-14.2.12.tgz", + "integrity": "sha512-qBy7OiXOqZrdp88QEl2H4fWalMGnSCrr1agT/AVDndlyw2YJQA89f3ttR/AkEIP9EkBXXeGl6cC72/EZT5r6rw==", "cpu": [ "arm64" ], @@ -3162,9 +3162,9 @@ } }, "node_modules/@next/swc-linux-arm64-musl": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-14.2.1.tgz", - "integrity": "sha512-jazZXctiaanemy4r+TPIpFP36t1mMwWCKMsmrTRVChRqE6putyAxZA4PDujx0SnfvZHosjdkx9xIq9BzBB5tWg==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/swc-linux-arm64-musl/-/swc-linux-arm64-musl-14.2.12.tgz", + "integrity": "sha512-EfD9L7o9biaQxjwP1uWXnk3vYZi64NVcKUN83hpVkKocB7ogJfyH2r7o1pPnMtir6gHZiGCeHKagJ0yrNSLNHw==", "cpu": [ "arm64" ], @@ -3177,9 +3177,9 @@ } }, "node_modules/@next/swc-linux-x64-gnu": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-14.2.1.tgz", - "integrity": "sha512-VjCHWCjsAzQAAo8lkBOLEIkBZFdfW+Z18qcQ056kL4KpUYc8o59JhLDCBlhg+hINQRgzQ2UPGma2AURGOH0+Qg==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-gnu/-/swc-linux-x64-gnu-14.2.12.tgz", + "integrity": 
"sha512-iQ+n2pxklJew9IpE47hE/VgjmljlHqtcD5UhZVeHICTPbLyrgPehaKf2wLRNjYH75udroBNCgrSSVSVpAbNoYw==", "cpu": [ "x64" ], @@ -3192,9 +3192,9 @@ } }, "node_modules/@next/swc-linux-x64-musl": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-14.2.1.tgz", - "integrity": "sha512-7HZKYKvAp4nAHiHIbY04finRqjeYvkITOGOurP1aLMexIFG/1+oCnqhGogBdc4lao/lkMW1c+AkwWSzSlLasqw==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/swc-linux-x64-musl/-/swc-linux-x64-musl-14.2.12.tgz", + "integrity": "sha512-rFkUkNwcQ0ODn7cxvcVdpHlcOpYxMeyMfkJuzaT74xjAa5v4fxP4xDk5OoYmPi8QNLDs3UgZPMSBmpBuv9zKWA==", "cpu": [ "x64" ], @@ -3207,9 +3207,9 @@ } }, "node_modules/@next/swc-win32-arm64-msvc": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-14.2.1.tgz", - "integrity": "sha512-YGHklaJ/Cj/F0Xd8jxgj2p8po4JTCi6H7Z3Yics3xJhm9CPIqtl8erlpK1CLv+HInDqEWfXilqatF8YsLxxA2Q==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/swc-win32-arm64-msvc/-/swc-win32-arm64-msvc-14.2.12.tgz", + "integrity": "sha512-PQFYUvwtHs/u0K85SG4sAdDXYIPXpETf9mcEjWc0R4JmjgMKSDwIU/qfZdavtP6MPNiMjuKGXHCtyhR/M5zo8g==", "cpu": [ "arm64" ], @@ -3222,9 +3222,9 @@ } }, "node_modules/@next/swc-win32-ia32-msvc": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-14.2.1.tgz", - "integrity": "sha512-o+ISKOlvU/L43ZhtAAfCjwIfcwuZstiHVXq/BDsZwGqQE0h/81td95MPHliWCnFoikzWcYqh+hz54ZB2FIT8RA==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/swc-win32-ia32-msvc/-/swc-win32-ia32-msvc-14.2.12.tgz", + "integrity": "sha512-FAj2hMlcbeCV546eU2tEv41dcJb4NeqFlSXU/xL/0ehXywHnNpaYajOUvn3P8wru5WyQe6cTZ8fvckj/2XN4Vw==", "cpu": [ "ia32" ], @@ -3237,9 +3237,9 @@ } }, "node_modules/@next/swc-win32-x64-msvc": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-14.2.1.tgz", - "integrity": "sha512-GmRoTiLcvCLifujlisknv4zu9/C4i9r0ktsA8E51EMqJL4bD4CpO7lDYr7SrUxCR0tS4RVcrqKmCak24T0ohaw==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/@next/swc-win32-x64-msvc/-/swc-win32-x64-msvc-14.2.12.tgz", + "integrity": "sha512-yu8QvV53sBzoIVRHsxCHqeuS8jYq6Lrmdh0briivuh+Brsp6xjg80MAozUsBTAV9KNmY08KlX0KYTWz1lbPzEg==", "cpu": [ "x64" ], @@ -10721,11 +10721,11 @@ } }, "node_modules/next": { - "version": "14.2.1", - "resolved": "https://registry.npmjs.org/next/-/next-14.2.1.tgz", - "integrity": "sha512-SF3TJnKdH43PMkCcErLPv+x/DY1YCklslk3ZmwaVoyUfDgHKexuKlf9sEfBQ69w+ue8jQ3msLb+hSj1T19hGag==", + "version": "14.2.12", + "resolved": "https://registry.npmjs.org/next/-/next-14.2.12.tgz", + "integrity": "sha512-cDOtUSIeoOvt1skKNihdExWMTybx3exnvbFbb9ecZDIxlvIbREQzt9A5Km3Zn3PfU+IFjyYGsHS+lN9VInAGKA==", "dependencies": { - "@next/env": "14.2.1", + "@next/env": "14.2.12", "@swc/helpers": "0.5.5", "busboy": "1.6.0", "caniuse-lite": "^1.0.30001579", @@ -10740,15 +10740,15 @@ "node": ">=18.17.0" }, "optionalDependencies": { - "@next/swc-darwin-arm64": "14.2.1", - "@next/swc-darwin-x64": "14.2.1", - "@next/swc-linux-arm64-gnu": "14.2.1", - "@next/swc-linux-arm64-musl": "14.2.1", - "@next/swc-linux-x64-gnu": "14.2.1", - "@next/swc-linux-x64-musl": "14.2.1", - "@next/swc-win32-arm64-msvc": "14.2.1", - "@next/swc-win32-ia32-msvc": "14.2.1", - "@next/swc-win32-x64-msvc": "14.2.1" + "@next/swc-darwin-arm64": "14.2.12", + "@next/swc-darwin-x64": "14.2.12", + "@next/swc-linux-arm64-gnu": 
"14.2.12", + "@next/swc-linux-arm64-musl": "14.2.12", + "@next/swc-linux-x64-gnu": "14.2.12", + "@next/swc-linux-x64-musl": "14.2.12", + "@next/swc-win32-arm64-msvc": "14.2.12", + "@next/swc-win32-ia32-msvc": "14.2.12", + "@next/swc-win32-x64-msvc": "14.2.12" }, "peerDependencies": { "@opentelemetry/api": "^1.1.0", diff --git a/keep-ui/package.json b/keep-ui/package.json index ac5372d81..0ad6708e4 100644 --- a/keep-ui/package.json +++ b/keep-ui/package.json @@ -232,7 +232,7 @@ "mz": "^2.7.0", "nanoid": "^3.3.6", "natural-compare": "^1.4.0", - "next": "^14.2.1", + "next": "^14.2.12", "next-auth": "^4.24.7", "node-releases": "^2.0.10", "normalize-path": "^3.0.0", From f97c63b7fd32f03b46c498f0a13a641c6ef2be59 Mon Sep 17 00:00:00 2001 From: Vladimir Filonov Date: Wed, 18 Sep 2024 10:47:37 +0400 Subject: [PATCH 04/16] feat: add status for incidents (#1905) Signed-off-by: Vladimir Filonov Co-authored-by: Tal --- .../app/incidents/[id]/incident-alerts.tsx | 2 +- keep-ui/app/incidents/[id]/incident-info.tsx | 2 +- .../incidents/create-or-update-incident.tsx | 2 +- .../incidents/incident-candidate-actions.tsx | 2 +- .../incident-change-status-modal.tsx | 153 ++++++++++++++++++ keep-ui/app/incidents/incident-pagination.tsx | 2 +- .../incidents/incident-table-component.tsx | 2 +- keep-ui/app/incidents/incident.tsx | 2 +- keep-ui/app/incidents/incidents-table.tsx | 71 +++++++- keep-ui/app/incidents/{model.ts => models.ts} | 9 +- .../incidents/predicted-incidents-table.tsx | 2 +- keep-ui/utils/hooks/useIncidents.ts | 2 +- keep/api/api.py | 2 +- keep/api/core/db.py | 60 ++++--- keep/api/models/alert.py | 31 ++++ keep/api/models/db/alert.py | 3 +- .../versions/2024-07-25-17-13_67f1efb93c99.py | 2 +- .../versions/2024-07-28-16-24_8e5942040de6.py | 2 +- .../versions/2024-07-29-18-10_92f4f93f2140.py | 2 +- .../versions/2024-08-08-13-55_42098785763c.py | 21 --- .../versions/2024-08-09-10-53_6e353161f5a8.py | 21 --- .../versions/2024-08-11-17-38_9453855f3ba0.py | 2 +- .../versions/2024-08-11-19-45_005efc57cc1c.py | 21 --- .../versions/2024-08-13-19-22_0832e0d9889a.py | 2 +- .../versions/2024-09-01-14-04_94886bc59c11.py | 2 +- .../versions/2024-09-04-09-38_b30d2141e1cb.py | 21 --- .../versions/2024-09-10-17-59_710b4ff1d19e.py | 21 --- .../versions/2024-09-11-23-30_c5443d9deb0f.py | 34 ++++ .../versions/2024-09-13-10-48_938b1aa62d5c.py | 2 +- keep/api/routes/alerts.py | 9 ++ keep/api/routes/incidents.py | 77 +++++++-- tests/conftest.py | 1 + tests/test_incidents.py | 65 +++++++- 33 files changed, 484 insertions(+), 168 deletions(-) create mode 100644 keep-ui/app/incidents/incident-change-status-modal.tsx rename keep-ui/app/incidents/{model.ts => models.ts} (85%) delete mode 100644 keep/api/models/db/migrations/versions/2024-08-08-13-55_42098785763c.py delete mode 100644 keep/api/models/db/migrations/versions/2024-08-09-10-53_6e353161f5a8.py delete mode 100644 keep/api/models/db/migrations/versions/2024-08-11-19-45_005efc57cc1c.py delete mode 100644 keep/api/models/db/migrations/versions/2024-09-04-09-38_b30d2141e1cb.py delete mode 100644 keep/api/models/db/migrations/versions/2024-09-10-17-59_710b4ff1d19e.py create mode 100644 keep/api/models/db/migrations/versions/2024-09-11-23-30_c5443d9deb0f.py diff --git a/keep-ui/app/incidents/[id]/incident-alerts.tsx b/keep-ui/app/incidents/[id]/incident-alerts.tsx index cf32d87d4..8e18ae9f3 100644 --- a/keep-ui/app/incidents/[id]/incident-alerts.tsx +++ b/keep-ui/app/incidents/[id]/incident-alerts.tsx @@ -28,7 +28,7 @@ import { ExclamationTriangleIcon } from 
"@radix-ui/react-icons"; import IncidentAlertMenu from "./incident-alert-menu"; import IncidentPagination from "../incident-pagination"; import React, {Dispatch, SetStateAction, useEffect, useState} from "react"; -import {IncidentDto} from "../model"; +import {IncidentDto} from "../models"; interface Props { incident: IncidentDto; diff --git a/keep-ui/app/incidents/[id]/incident-info.tsx b/keep-ui/app/incidents/[id]/incident-info.tsx index 14ada9d65..8bf6cb9c3 100644 --- a/keep-ui/app/incidents/[id]/incident-info.tsx +++ b/keep-ui/app/incidents/[id]/incident-info.tsx @@ -1,6 +1,6 @@ import {Button, Title} from "@tremor/react"; -import { IncidentDto } from "../model"; +import { IncidentDto } from "../models"; import CreateOrUpdateIncident from "../create-or-update-incident"; import Modal from "@/components/ui/Modal"; import React, {useState} from "react"; diff --git a/keep-ui/app/incidents/create-or-update-incident.tsx b/keep-ui/app/incidents/create-or-update-incident.tsx index 0d8c23972..fcfae9934 100644 --- a/keep-ui/app/incidents/create-or-update-incident.tsx +++ b/keep-ui/app/incidents/create-or-update-incident.tsx @@ -12,7 +12,7 @@ import { useSession } from "next-auth/react"; import { FormEvent, useEffect, useState } from "react"; import { toast } from "react-toastify"; import { getApiURL } from "utils/apiUrl"; -import { IncidentDto } from "./model"; +import { IncidentDto } from "./models"; import { useIncidents } from "utils/hooks/useIncidents"; interface Props { diff --git a/keep-ui/app/incidents/incident-candidate-actions.tsx b/keep-ui/app/incidents/incident-candidate-actions.tsx index e6170dd09..1a8a33e8f 100644 --- a/keep-ui/app/incidents/incident-candidate-actions.tsx +++ b/keep-ui/app/incidents/incident-candidate-actions.tsx @@ -1,6 +1,6 @@ import {getApiURL} from "../../utils/apiUrl"; import {toast} from "react-toastify"; -import {IncidentDto, PaginatedIncidentsDto} from "./model"; +import {IncidentDto, PaginatedIncidentsDto} from "./models"; import {Session} from "next-auth"; interface Props { diff --git a/keep-ui/app/incidents/incident-change-status-modal.tsx b/keep-ui/app/incidents/incident-change-status-modal.tsx new file mode 100644 index 000000000..886d205b8 --- /dev/null +++ b/keep-ui/app/incidents/incident-change-status-modal.tsx @@ -0,0 +1,153 @@ +import { Button, Title, Subtitle } from "@tremor/react"; +import Modal from "@/components/ui/Modal"; +import Select, { + CSSObjectWithLabel, + ControlProps, + OptionProps, + GroupBase, +} from "react-select"; +import { useState } from "react"; +import { IncidentDto, Status } from "./models"; +import { getApiURL } from "utils/apiUrl"; +import { useSession } from "next-auth/react"; +import { toast } from "react-toastify"; +import { + CheckCircleIcon, + ExclamationCircleIcon, + PauseIcon, +} from "@heroicons/react/24/outline"; + +const statusIcons = { + [Status.Firing]: , + [Status.Resolved]: , + [Status.Acknowledged]: , +}; + +const customSelectStyles = { + control: ( + base: CSSObjectWithLabel, + state: ControlProps< + { value: Status; label: JSX.Element }, + false, + GroupBase<{ value: Status; label: JSX.Element }> + > + ) => ({ + ...base, + borderColor: state.isFocused ? "orange" : base.borderColor, + boxShadow: state.isFocused ? 
"0 0 0 1px orange" : base.boxShadow, + "&:hover": { + borderColor: "orange", + }, + }), + option: ( + base: CSSObjectWithLabel, + { + isFocused, + }: OptionProps< + { value: Status; label: JSX.Element }, + false, + GroupBase<{ value: Status; label: JSX.Element }> + > + ) => ({ + ...base, + backgroundColor: isFocused ? "rgba(255,165,0,0.1)" : base.backgroundColor, + "&:hover": { + backgroundColor: "rgba(255,165,0,0.2)", + }, + }), +}; + +interface Props { + incident: IncidentDto | null | undefined; + mutate: () => void; + handleClose: () => void; +} + +export default function IncidentChangeStatusModal({ + incident, + mutate, + handleClose, +}: Props) { + const { data: session } = useSession(); + const [selectedStatus, setSelectedStatus] = useState(null); + const [comment, setComment] = useState(""); + + if (!incident) return null; + + const statusOptions = Object.values(Status) + .filter((status) => status !== incident.status) // Exclude current status + .map((status) => ({ + value: status, + label: ( +
+ {statusIcons[status]} + {status.charAt(0).toUpperCase() + status.slice(1)} +
+ ), + })); + + const clearAndClose = () => { + setSelectedStatus(null); + handleClose(); + }; + + const handleChangeStatus = async () => { + if (!selectedStatus) { + toast.error("Please select a new status."); + return; + } + + try { + const response = await fetch(`${getApiURL()}/incidents/${incident.id}/status`, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${session?.accessToken}`, + }, + body: JSON.stringify({ + status: selectedStatus, + comment: comment, + }), + }); + + if (response.ok) { + toast.success("Incident status changed successfully!"); + clearAndClose(); + await mutate(); + } else { + toast.error("Failed to change incident status."); + } + } catch (error) { + toast.error("An error occurred while changing incident status."); + } + }; + + return ( + + Change Incident Status + + Change status from {incident.status} to: +
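(Editorial aside, not part of the patch: the endpoint this modal calls can also
be exercised directly. A hypothetical Python sketch follows — the URL path,
bearer auth, and the {status, comment} body mirror the fetch() above; the
function name and the use of `requests` are assumptions:)

    import requests

    def change_incident_status(api_url: str, token: str, incident_id: str, status: str, comment: str = "") -> None:
        # POST /incidents/{id}/status with the same JSON body the modal sends
        response = requests.post(
            f"{api_url}/incidents/{incident_id}/status",
            headers={
                "Content-Type": "application/json",
                "Authorization": f"Bearer {token}",  # session access token in the UI
            },
            json={"status": status, "comment": comment},
        )
        response.raise_for_status()  # surface non-2xx responses as an exception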
+
- Feel free to edit the payload as you want. However, some of the providers expects specific fields, so be careful.
+
+ Feel free to edit the payload as you want. However, some of the
+ providers expect specific fields, so be careful.
+
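(Editorial aside, not part of the patch: a minimal example of the payload shape
used elsewhere in this series — compare the Alert `event` dicts in
tests/test_alert_correlation.py. Any field beyond these is provider-specific,
which is exactly what the warning above cautions about:)

    example_payload = {
        "id": "test-1",
        "name": "Test Alert 1",
        "fingerprint": "fp-1",
        "lastReceived": "2024-09-18T12:00:00+00:00",  # ISO-8601 timestamp
        "severity": "critical",
        "source": ["test-source"],
    }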
{selectedSource && ( <> From 34a9505776067c398afba5fadfaabcdcff465449 Mon Sep 17 00:00:00 2001 From: Tal Date: Wed, 18 Sep 2024 13:36:02 +0300 Subject: [PATCH 06/16] chore(deps): posthog latest version (#1956) --- docs/providers/overview.mdx | 302 ++++++++++++++++++++++------- keep-ui/package-lock.json | 9 +- keep-ui/package.json | 2 +- scripts/docs_get_providers_list.py | 14 +- 4 files changed, 247 insertions(+), 80 deletions(-) diff --git a/docs/providers/overview.mdx b/docs/providers/overview.mdx index 60f754022..7a45033ce 100644 --- a/docs/providers/overview.mdx +++ b/docs/providers/overview.mdx @@ -15,403 +15,569 @@ By leveraging Keep Providers, users are able to deeply integrate Keep with the t } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } +> + + + } > } + icon={ + + } > } + icon={ + + } +> + + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } +> + + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } > } + icon={ + + } +> + + + } > } + icon={ + + } > } + icon={ + + } > } > - \ No newline at end of file + diff --git a/keep-ui/package-lock.json b/keep-ui/package-lock.json index 0b385c849..4459999ce 100644 --- a/keep-ui/package-lock.json +++ b/keep-ui/package-lock.json @@ -272,7 +272,7 @@ "postcss-nested": "^6.0.1", "postcss-selector-parser": "^6.0.12", "postcss-value-parser": "^4.2.0", - "posthog-js": "^1.157.2", + "posthog-js": "^1.161.6", "posthog-node": "^3.1.1", "preact-render-to-string": "^5.2.6", "prelude-ls": "^1.2.1", @@ -11591,10 +11591,9 @@ "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==" }, "node_modules/posthog-js": { - "version": "1.157.2", - "resolved": "https://registry.npmjs.org/posthog-js/-/posthog-js-1.157.2.tgz", - "integrity": "sha512-ATYKGs+Q51u26nHHhrhWNh1whqFm7j/rwQQYw+y6/YzNmRlo+YsqrGZji9nqXb9/4fo0ModDr+ZmuOI3hKkUXA==", - "license": "MIT", + "version": "1.161.6", + "resolved": "https://registry.npmjs.org/posthog-js/-/posthog-js-1.161.6.tgz", + "integrity": "sha512-UO0z/YTuan55Kl5Yg9Xs5x1PKUkm2zGKUNPioznb4GLRcxFnLBkWoeKQXNro2YZsYJvK+MY8jlF3cdGa8BZ8/Q==", "dependencies": { "fflate": "^0.4.8", "preact": "^10.19.3", diff --git a/keep-ui/package.json b/keep-ui/package.json index 0ad6708e4..206ea0c2e 100644 --- a/keep-ui/package.json +++ b/keep-ui/package.json @@ -273,7 +273,7 @@ "postcss-nested": "^6.0.1", "postcss-selector-parser": "^6.0.12", "postcss-value-parser": "^4.2.0", - "posthog-js": "^1.157.2", + "posthog-js": "^1.161.6", "posthog-node": "^3.1.1", "preact-render-to-string": 
"^5.2.6", "prelude-ls": "^1.2.1", diff --git a/scripts/docs_get_providers_list.py b/scripts/docs_get_providers_list.py index 6d01b3392..a12687f38 100644 --- a/scripts/docs_get_providers_list.py +++ b/scripts/docs_get_providers_list.py @@ -6,10 +6,10 @@ python get_providers_list.py --validate # To check docs/providers/overview.mdx """ +import argparse import glob import os import re -import argparse LOGO_DEV_PUBLISHABLE_KEY = "pk_dfXfZBoKQMGDTIgqu7LvYg" @@ -24,10 +24,13 @@ def validate(providers_to_validate): for provider in providers_to_validate: if provider not in overview_content: - print(f"""Provider {provider} is not in the docs/providers/overview.md file, -use scripts/get_providers_list.py to generate recent providers list and update the file.""") + print( + f"""Provider {provider} is not in the docs/providers/overview.md file, +use scripts/get_providers_list.py to generate recent providers list and update the file.""" + ) exit(1) + def main(): """ This script lists all the integrations in the documentation folder and outputs a markdown list of links. @@ -42,13 +45,12 @@ def main(): if os.path.isfile(file_path): with open(file_path, "r") as file: for line in file.readlines(): - match = re.search(r'title:\s*"([^"]+)"', line) + match = re.search(r"title:\s*[\"|\']([^\"]+)[\"|\']", line) if match: url = "/providers/documentation/" + file_path.replace( "./../docs/providers/documentation/", "" ).replace(".mdx", "") - provider_name = match.group( - 1).replace("Provider", "").strip() + provider_name = match.group(1).replace("Provider", "").strip() # Due to https://github.com/keephq/keep/pull/1239#discussion_r1643196800 if "Slack" in provider_name: From b48b5b059e68eabd6532f54f886e7c4986168b48 Mon Sep 17 00:00:00 2001 From: Tal Date: Wed, 18 Sep 2024 15:03:36 +0300 Subject: [PATCH 07/16] chore(internal): add prettier to pre commits (#1958) --- .pre-commit-config.yaml | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index ec7b221de..2fc9e4e62 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,12 +7,6 @@ repos: language: system types: [python] require_serial: true - # - id: yamllint - # name: yamllint - # description: This hook runs yamllint. - # entry: yamllint - # language: python - # types: [file, yaml] - id: end-of-file-fixer name: Fix End of Files entry: end-of-file-fixer @@ -38,10 +32,17 @@ repos: hooks: # Run the linter. - id: ruff - args: [ --fix ] + args: [--fix] - repo: https://github.com/compilerla/conventional-pre-commit rev: v2.1.1 hooks: - id: conventional-pre-commit stages: [commit-msg] args: [] # optional: list of Conventional Commits types to allow e.g. 
[feat, fix, ci, chore, test] + - repo: https://github.com/pre-commit/mirrors-prettier + rev: v3.0.3 + hooks: + - id: prettier + types_or: + [javascript, jsx, ts, tsx, json, yaml, css, scss, html, markdown] + args: [--write] From 231e07dfc7776c5a61802d29a09f4eaa4dd56101 Mon Sep 17 00:00:00 2001 From: Tal Date: Wed, 18 Sep 2024 15:36:20 +0300 Subject: [PATCH 08/16] fix(api): mapping rule default type (#1960) --- docs/deployment/stress-testing.mdx | 2 +- keep/api/models/db/mapping.py | 10 ++++- .../versions/2024-09-18-14-08_5d7ae55efc6a.py | 38 +++++++++++++++++++ 3 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 keep/api/models/db/migrations/versions/2024-09-18-14-08_5d7ae55efc6a.py diff --git a/docs/deployment/stress-testing.mdx b/docs/deployment/stress-testing.mdx index 92342712b..0959c925b 100644 --- a/docs/deployment/stress-testing.mdx +++ b/docs/deployment/stress-testing.mdx @@ -46,7 +46,7 @@ The primary parameters that affect the specification requirements for Keep are: 3. **High Volume (100,000 - 1,000,000 total alerts, 5000's of alerts per day)**: - **Setup**: Deploy Keep with Elasticsearch for storing alerts as documents. - **Expectations**: The system should maintain performance levels despite the large alert volume, with increased resource usage managed through scaling strategies. -4. **Very High Volume (> 1,000,000 total alerts, 10k's of alerts per day) +4. **Very High Volume (> 1,000,000 total alerts, 10k's of alerts per day)**: - **Setup**: Deploy Keep with Elasticsearch for storing alerts as documents. - **Setup #2**: Deploy Keep with Redis and with ARQ to use Redis as a queue. diff --git a/keep/api/models/db/mapping.py b/keep/api/models/db/mapping.py index 2081656ea..2f0e0ab79 100644 --- a/keep/api/models/db/mapping.py +++ b/keep/api/models/db/mapping.py @@ -2,6 +2,7 @@ from typing import Literal, Optional from pydantic import BaseModel, validator +from sqlalchemy import String from sqlmodel import JSON, Column, Field, SQLModel @@ -19,7 +20,14 @@ class MappingRule(SQLModel, table=True): override: bool = Field(default=True) condition: Optional[str] = Field(max_length=2000) # The type of this mapping rule - type: str = "csv" + type: str = Field( + sa_column=Column( + String(255), + name="type", + server_default="csv", + ), + max_length=255, + ) # The attributes to match against (e.g. ["service","region"]) matchers: list[str] = Field(sa_column=Column(JSON), nullable=False) # The rows of the CSV file [{service: "service1", region: "region1", ...}, ...] diff --git a/keep/api/models/db/migrations/versions/2024-09-18-14-08_5d7ae55efc6a.py b/keep/api/models/db/migrations/versions/2024-09-18-14-08_5d7ae55efc6a.py new file mode 100644 index 000000000..c0c825b1b --- /dev/null +++ b/keep/api/models/db/migrations/versions/2024-09-18-14-08_5d7ae55efc6a.py @@ -0,0 +1,38 @@ +"""mappingrule type default value + +Revision ID: 5d7ae55efc6a +Revises: 938b1aa62d5c +Create Date: 2024-09-18 14:08:49.363483 + +""" + +import sqlalchemy as sa +from alembic import op + +# revision identifiers, used by Alembic. +revision = "5d7ae55efc6a" +down_revision = "938b1aa62d5c" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + with op.batch_alter_table("mappingrule", schema=None) as batch_op: + batch_op.alter_column( + "type", + existing_type=sa.VARCHAR(length=255), + nullable=False, + server_default="csv", + ) + + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + with op.batch_alter_table("mappingrule", schema=None) as batch_op: + batch_op.alter_column( + "type", existing_type=sa.VARCHAR(length=255), nullable=True + ) + # ### end Alembic commands ### From 135e967706a675c1d4d3d70d958ea82bb44bd2c6 Mon Sep 17 00:00:00 2001 From: Shahar Glazner Date: Wed, 18 Sep 2024 17:44:07 +0300 Subject: [PATCH 09/16] fix: netdata GET trigger 500 (google bot) (#1961) --- keep/api/routes/alerts.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/keep/api/routes/alerts.py b/keep/api/routes/alerts.py index f452ebfde..5935d63a4 100644 --- a/keep/api/routes/alerts.py +++ b/keep/api/routes/alerts.py @@ -318,7 +318,11 @@ async def receive_generic_event( description="Helper function to complete Netdata webhook challenge", ) async def webhook_challenge(): - token = Request.query_params.get("token").encode("ascii") + try: + token = Request.query_params.get("token").encode("ascii") + except Exception as e: + logger.exception("Failed to get token", extra={"error": str(e)}) + raise HTTPException(status_code=400, detail="Bad request: failed to get token") KEY = "keep-netdata-webhook-integration" # creates HMAC SHA-256 hash from incomming token and your consumer secret From c1a3c5e7fc59931a968fae6fd877f52eab3b87f6 Mon Sep 17 00:00:00 2001 From: Tal Date: Thu, 19 Sep 2024 10:32:44 +0300 Subject: [PATCH 10/16] feat: incident timeline (#1895) Signed-off-by: Tal --- keep-ui/app/alerts/alert-severity.tsx | 5 +- keep-ui/app/alerts/alert-sidebar.tsx | 2 +- keep-ui/app/alerts/alert-table-utils.tsx | 3 +- keep-ui/app/alerts/alert-timeline.tsx | 25 +- .../app/incidents/[id]/incident-alerts.tsx | 54 ++- keep-ui/app/incidents/[id]/incident-info.tsx | 75 ++- .../app/incidents/[id]/incident-timeline.tsx | 427 ++++++++++++++++++ keep-ui/app/incidents/[id]/incident.tsx | 27 +- keep-ui/package-lock.json | 7 + keep-ui/package.json | 1 + keep-ui/tailwind.config.js | 9 +- keep-ui/utils/hooks/useAlerts.ts | 39 +- keep-ui/utils/hooks/useTopology.ts | 14 +- keep/api/core/db.py | 38 +- keep/api/models/alert_audit.py | 57 +++ keep/api/routes/alerts.py | 64 +-- keep/api/utils/enrichment_helpers.py | 4 +- 17 files changed, 731 insertions(+), 120 deletions(-) create mode 100644 keep-ui/app/incidents/[id]/incident-timeline.tsx create mode 100644 keep/api/models/alert_audit.py diff --git a/keep-ui/app/alerts/alert-severity.tsx b/keep-ui/app/alerts/alert-severity.tsx index a7bdd634d..cafa2fddb 100644 --- a/keep-ui/app/alerts/alert-severity.tsx +++ b/keep-ui/app/alerts/alert-severity.tsx @@ -10,9 +10,10 @@ import { interface Props { severity: Severity | undefined; + marginLeft?: boolean; } -export default function AlertSeverity({ severity }: Props) { +export default function AlertSeverity({ severity, marginLeft = true }: Props) { let icon: any; let color: any; let severityText: string; @@ -56,7 +57,7 @@ export default function AlertSeverity({ severity }: Props) { icon={icon} tooltip={severityText} size="sm" - className="ml-2.5" + className={marginLeft ? 
"ml-2.5" : ""} /> ); } diff --git a/keep-ui/app/alerts/alert-sidebar.tsx b/keep-ui/app/alerts/alert-sidebar.tsx index 2f9195a20..abef81bcb 100644 --- a/keep-ui/app/alerts/alert-sidebar.tsx +++ b/keep-ui/app/alerts/alert-sidebar.tsx @@ -102,7 +102,7 @@ const AlertSidebar = ({ isOpen, toggle, alert }: AlertSidebarProps) => { diff --git a/keep-ui/app/alerts/alert-table-utils.tsx b/keep-ui/app/alerts/alert-table-utils.tsx index 2dc2147d9..01a26fb09 100644 --- a/keep-ui/app/alerts/alert-table-utils.tsx +++ b/keep-ui/app/alerts/alert-table-utils.tsx @@ -2,7 +2,6 @@ import { useState } from "react"; import { ColumnDef, FilterFn, - Row, RowSelectionState, VisibilityState, createColumnHelper, @@ -18,7 +17,7 @@ import AlertAssignee from "./alert-assignee"; import AlertExtraPayload from "./alert-extra-payload"; import AlertMenu from "./alert-menu"; import { isSameDay, isValid, isWithinInterval, startOfDay } from "date-fns"; -import { Severity, severityMapping } from "./models"; +import { severityMapping } from "./models"; import { MdOutlineNotificationsActive, MdOutlineNotificationsOff } from "react-icons/md"; export const DEFAULT_COLS = [ diff --git a/keep-ui/app/alerts/alert-timeline.tsx b/keep-ui/app/alerts/alert-timeline.tsx index 72d1e0f2f..db584d624 100644 --- a/keep-ui/app/alerts/alert-timeline.tsx +++ b/keep-ui/app/alerts/alert-timeline.tsx @@ -1,9 +1,10 @@ -import React, { useState } from "react"; +import React from "react"; import { Subtitle, Button } from "@tremor/react"; import { Chrono } from "react-chrono"; import Image from "next/image"; import { ArrowPathIcon } from "@heroicons/react/24/outline"; import { AlertDto } from "./models"; +import { AuditEvent } from "utils/hooks/useAlerts"; const getInitials = (name: string) => ((name.match(/(^\S\S?|\b\S)?/g) ?? []).join("").match(/(^\S|\S$)?/g) ?? []) @@ -15,13 +16,6 @@ const formatTimestamp = (timestamp: Date | string) => { return date.toLocaleString(); }; -type AuditEvent = { - user_id: string; - action: string; - description: string; - timestamp: string; -}; - type AlertTimelineProps = { alert: AlertDto | null; auditData: AuditEvent[]; @@ -29,7 +23,12 @@ type AlertTimelineProps = { onRefresh: () => void; }; -const AlertTimeline: React.FC = ({ alert, auditData, isLoading, onRefresh }) => { +const AlertTimeline: React.FC = ({ + alert, + auditData, + isLoading, + onRefresh, +}) => { // Default audit event if no audit data is available const defaultAuditEvent = alert ? [ @@ -97,11 +96,9 @@ const AlertTimeline: React.FC = ({ alert, auditData, isLoadi
({ - title: formatTimestamp(entry.timestamp), - }) - ) || [] + auditContent.map((entry) => ({ + title: formatTimestamp(entry.timestamp), + })) || [] } hideControls disableToolbar diff --git a/keep-ui/app/incidents/[id]/incident-alerts.tsx b/keep-ui/app/incidents/[id]/incident-alerts.tsx index 8e18ae9f3..e4946ac37 100644 --- a/keep-ui/app/incidents/[id]/incident-alerts.tsx +++ b/keep-ui/app/incidents/[id]/incident-alerts.tsx @@ -27,8 +27,8 @@ import AlertName from "app/alerts/alert-name"; import { ExclamationTriangleIcon } from "@radix-ui/react-icons"; import IncidentAlertMenu from "./incident-alert-menu"; import IncidentPagination from "../incident-pagination"; -import React, {Dispatch, SetStateAction, useEffect, useState} from "react"; -import {IncidentDto} from "../models"; +import React, { useEffect, useState } from "react"; +import { IncidentDto } from "../models"; interface Props { incident: IncidentDto; @@ -39,7 +39,6 @@ interface Pagination { offset: number; } - const columnHelper = createColumnHelper(); export default function IncidentAlerts({ incident }: Props) { @@ -48,11 +47,15 @@ export default function IncidentAlerts({ incident }: Props) { offset: 0, }); - const { data: alerts, isLoading } = useIncidentAlerts(incident.id, alertsPagination.limit, alertsPagination.offset); + const { data: alerts, isLoading } = useIncidentAlerts( + incident.id, + alertsPagination.limit, + alertsPagination.offset + ); const [pagination, setTablePagination] = useState({ - pageIndex: alerts? Math.ceil(alerts.offset / alerts.limit) : 0, - pageSize: alerts? alerts.limit : 20, + pageIndex: alerts ? Math.ceil(alerts.offset / alerts.limit) : 0, + pageSize: alerts ? alerts.limit : 20, }); useEffect(() => { @@ -60,16 +63,16 @@ export default function IncidentAlerts({ incident }: Props) { setAlertsPagination({ limit: pagination.pageSize, offset: 0, - }) + }); } const currentOffset = pagination.pageSize * pagination.pageIndex; if (alerts && alerts.offset != currentOffset) { setAlertsPagination({ limit: pagination.pageSize, offset: currentOffset, - }) + }); } - }, [pagination]) + }, [pagination]); usePollIncidentAlerts(incident.id); const columns = [ @@ -116,7 +119,7 @@ export default function IncidentAlerts({ incident }: Props) { (context.getValue() ?? []).map((source, index) => ( {source} ( - incident.is_confirmed && + cell: (context) => + incident.is_confirmed && ( - ), + ), }), ]; @@ -164,9 +167,9 @@ export default function IncidentAlerts({ incident }: Props) { {table.getHeaderGroups().map((headerGroup) => ( - {headerGroup.headers.map((header) => { + {headerGroup.headers.map((header, index) => { return ( - + {flexRender( header.column.columnDef.header, header.getContext() @@ -179,10 +182,13 @@ export default function IncidentAlerts({ incident }: Props) { {alerts && alerts?.items?.length > 0 && ( - {table.getRowModel().rows.map((row) => ( - - {row.getVisibleCells().map((cell) => ( - + {table.getRowModel().rows.map((row, index) => ( + + {row.getVisibleCells().map((cell, index) => ( + {flexRender(cell.column.columnDef.cell, cell.getContext())} ))} @@ -196,10 +202,10 @@ export default function IncidentAlerts({ incident }: Props) { {Array(pagination.pageSize) .fill("") - .map((index) => ( - - {columns.map((c) => ( - + .map((index, rowIndex) => ( + + {columns.map((c, cellIndex) => ( + ))} @@ -211,7 +217,7 @@ export default function IncidentAlerts({ incident }: Props) {
- +
); diff --git a/keep-ui/app/incidents/[id]/incident-info.tsx b/keep-ui/app/incidents/[id]/incident-info.tsx index 8bf6cb9c3..842f73b0d 100644 --- a/keep-ui/app/incidents/[id]/incident-info.tsx +++ b/keep-ui/app/incidents/[id]/incident-info.tsx @@ -1,14 +1,17 @@ - -import {Button, Title} from "@tremor/react"; +import { Button, Title } from "@tremor/react"; import { IncidentDto } from "../models"; import CreateOrUpdateIncident from "../create-or-update-incident"; import Modal from "@/components/ui/Modal"; -import React, {useState} from "react"; -import {MdBlock, MdDone, MdModeEdit} from "react-icons/md"; -import {useIncident} from "../../../utils/hooks/useIncidents"; -import {deleteIncident, handleConfirmPredictedIncident} from "../incident-candidate-actions"; -import {useSession} from "next-auth/react"; -import {useRouter} from "next/navigation"; +import React, { useState } from "react"; +import { MdBlock, MdDone, MdModeEdit } from "react-icons/md"; +import { useIncident } from "../../../utils/hooks/useIncidents"; +import { + deleteIncident, + handleConfirmPredictedIncident, +} from "../incident-candidate-actions"; +import { useSession } from "next-auth/react"; +import { useRouter } from "next/navigation"; +import { format } from "date-fns"; // import { RiSparkling2Line } from "react-icons/ri"; interface Props { @@ -34,12 +37,16 @@ export default function IncidentInformation({ incident }: Props) { mutate(); }; + const formatString = "dd, MMM yyyy - HH:mm.ss 'UTC'"; + return (
- {incident.is_confirmed ? "⚔️ " : "Possible "}Incident Information - {incident.is_confirmed && + + {incident.is_confirmed ? "⚔️ " : "Possible "}Incident Information + + {incident.is_confirmed && ( + > + Confirm +
- } + )} +
+
+ {incident.user_generated_name || incident.ai_generated_name}
-
{incident.user_generated_name || incident.ai_generated_name}

Summary: {incident.user_summary || incident.generated_summary}

- {!!incident.start_time &&

Started at: {new Date(incident.start_time + "Z").toLocaleString()}

} - {!!incident.last_seen_time &&

Last seen at: {new Date(incident.last_seen_time + "Z").toLocaleString()}

} - {!!incident.rule_fingerprint &&

Group by value: {incident.rule_fingerprint}

} - + {!!incident.start_time && ( +

+ Started at: {format(new Date(incident.start_time), formatString)} +

+ )} + {!!incident.last_seen_time && ( +

+ Last seen at:{" "} + {format(new Date(incident.last_seen_time), formatString)} +

+ )} + {!!incident.rule_fingerprint && ( +

Group by value: {incident.rule_fingerprint}

+ )}
void; + isSelected: boolean; +} + +const AlertEventInfo: React.FC<{ event: AuditEvent; alert: AlertDto }> = ({ + event, + alert, +}) => { + return ( +
+

+ {alert.name} ({alert.fingerprint}) +

+

{alert.description}

+
+

Date:

+

+ {format(parseISO(event.timestamp), "dd, MMM yyyy - HH:mm.ss 'UTC'")} +

+ +

Action:

+

{event.action}

+ +

Description:

+

{event.description}

+ +

Severity:

+
+ +

{alert.severity}

+
+ +

Source:

+
+ {alert.source.map((source, index) => ( + {source} + ))} +

{alert.source.join(",")}

+
+ +

Status:

+

{alert.status}

+
+
+ ); +}; + +const EventDot: React.FC = ({ + event, + alertStart, + alertEnd, + color, + onClick, + isSelected, +}) => { + const eventTime = parseISO(event.timestamp); + let position = + ((eventTime.getTime() - alertStart.getTime()) / + (alertEnd.getTime() - alertStart.getTime())) * + 100; + if (position == 0) position = 5; + if (position == 100) position = 90; + + return ( +
onClick(event)} + > +
+
+ ); +}; + +interface AlertBarProps { + alert: AlertDto; + auditEvents: AuditEvent[]; + startTime: Date; + endTime: Date; + timeScale: "minutes" | "hours" | "days"; + onEventClick: (event: AuditEvent | null) => void; + selectedEventId: string | null; + isFirstRow: boolean; + isLastRow: boolean; +} + +const AlertBar: React.FC = ({ + alert, + auditEvents, + startTime, + endTime, + timeScale, + onEventClick, + selectedEventId, + isFirstRow, + isLastRow, +}) => { + const alertEvents = auditEvents.filter( + (event) => event.fingerprint === alert.fingerprint + ); + const alertStart = new Date( + Math.min(...alertEvents.map((e) => parseISO(e.timestamp).getTime())) + ); + const alertEnd = new Date( + Math.max(...alertEvents.map((e) => parseISO(e.timestamp).getTime())) + ); + + const startPosition = + ((alertStart.getTime() - startTime.getTime()) / + (endTime.getTime() - startTime.getTime())) * + 100; + let width = + ((alertEnd.getTime() - alertStart.getTime()) / + (endTime.getTime() - startTime.getTime())) * + 100; + + // Ensure the width is at least 0.5% to make it visible + width = Math.max(width, 0.5); + + const handleEventClick = (event: AuditEvent) => { + onEventClick(selectedEventId === event.id ? null : event); + }; + + return ( +
+
+ {Array.from({ length: 24 }).map((_, index) => ( +
+ ))} +
+
+
+
+ + + {alert.name} + +
+ {alertEvents.map((event, index) => ( + + ))} +
+
+
+ ); +}; + +export default function IncidentTimeline({ + incident, +}: { + incident: IncidentDto; +}) { + const { data: alerts, isLoading: alertsLoading } = useIncidentAlerts( + incident.id + ); + const { useMultipleFingerprintsAlertAudit } = useAlerts(); + const { + data: auditEvents, + isLoading: auditEventsLoading, + mutate, + } = useMultipleFingerprintsAlertAudit( + alerts?.items.map((m) => m.fingerprint) + ); + + const [selectedEvent, setSelectedEvent] = useState(null); + + useEffect(() => { + mutate(); + }, [alerts, mutate]); + + const timelineData = useMemo(() => { + if (auditEvents && alerts) { + const allTimestamps = auditEvents.map((event) => + parseISO(event.timestamp).getTime() + ); + + const startTime = new Date(Math.min(...allTimestamps)); + const endTime = new Date(Math.max(...allTimestamps)); + + // Add padding to start and end times + const paddedStartTime = new Date(startTime.getTime() - 1000 * 60 * 10); // 10 minutes before + const paddedEndTime = new Date(endTime.getTime() + 1000 * 60 * 10); // 10 minutes after + + const totalDuration = paddedEndTime.getTime() - paddedStartTime.getTime(); + const pixelsPerMillisecond = 5000 / totalDuration; // Assuming 5000px minimum width + + let timeScale: "minutes" | "hours" | "days"; + let intervalDuration: number; + let formatString: string; + + if (totalDuration > 3 * 24 * 60 * 60 * 1000) { + timeScale = "days"; + intervalDuration = 24 * 60 * 60 * 1000; + formatString = "MMM dd"; + } else if (totalDuration > 24 * 60 * 60 * 1000) { + timeScale = "hours"; + intervalDuration = 60 * 60 * 1000; + formatString = "HH:mm"; + } else { + timeScale = "minutes"; + intervalDuration = 5 * 60 * 1000; // 5-minute intervals + formatString = "HH:mm:ss"; + } + + const intervals: Date[] = []; + let currentTime = paddedStartTime; + while (currentTime <= paddedEndTime) { + intervals.push(new Date(currentTime)); + currentTime = new Date(currentTime.getTime() + intervalDuration); + } + + return { + startTime: paddedStartTime, + endTime: paddedEndTime, + intervals, + formatString, + timeScale, + pixelsPerMillisecond, + }; + } + return {}; + }, [auditEvents, alerts]); + + if (auditEventsLoading || !auditEvents || alertsLoading) return <>No Data; + + const { + startTime, + endTime, + intervals, + formatString, + timeScale, + pixelsPerMillisecond, + } = timelineData; + + if ( + !intervals || + !startTime || + !endTime || + !timeScale || + !pixelsPerMillisecond + ) + return <>No Data; + + const totalWidth = Math.max( + 5000, + (endTime.getTime() - startTime.getTime()) * pixelsPerMillisecond + ); + + return ( +
+
+
+ {/* Time labels */} +
+ {intervals.map((time, index) => ( +
+ {format(time, formatString)} +
+ ))} + {/* Add an extra label for the first time, positioned at the start */} +
+ {format(intervals[0], formatString)} +
+
+ + {/* Alert bars */} +
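+          {/* As wired in this patch: alerts are sorted by their earliest
+              audit-event timestamp so the bars stack chronologically, and each
+              AlertBar filters the shared auditEvents down to its own alert's
+              fingerprint before computing the bar's start and end. */}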
+ {alerts?.items + .sort((a, b) => { + const aStart = Math.min( + ...auditEvents + .filter((e) => e.fingerprint === a.fingerprint) + .map((e) => parseISO(e.timestamp).getTime()) + ); + const bStart = Math.min( + ...auditEvents + .filter((e) => e.fingerprint === b.fingerprint) + .map((e) => parseISO(e.timestamp).getTime()) + ); + return aStart - bStart; + }) + .map((alert, index, array) => ( + + ))} +
+
+
+
+ {/* Event details box */} + {selectedEvent && ( +
+ a.fingerprint === selectedEvent.fingerprint + )! + } + /> +
+ )} +
+ ); +} diff --git a/keep-ui/app/incidents/[id]/incident.tsx b/keep-ui/app/incidents/[id]/incident.tsx index b552b51dc..416bead04 100644 --- a/keep-ui/app/incidents/[id]/incident.tsx +++ b/keep-ui/app/incidents/[id]/incident.tsx @@ -16,7 +16,9 @@ import { import IncidentAlerts from "./incident-alerts"; import { ArrowUturnLeftIcon } from "@heroicons/react/24/outline"; import { useRouter } from "next/navigation"; -import {useState} from "react"; +import IncidentTimeline from "./incident-timeline"; +import { CiBellOn, CiViewTimeline } from "react-icons/ci"; +import { IoIosGitNetwork } from "react-icons/io"; interface Props { incidentId: string; @@ -47,24 +49,29 @@ export default function IncidentView({ incidentId }: Props) { onClick={() => router.back()} />
- -
-
-
+ +
+
+
-
+
- Alerts - Timeline - Topology + Alerts + Timeline + Topology - Coming Soon... + + + Coming Soon... diff --git a/keep-ui/package-lock.json b/keep-ui/package-lock.json index 4459999ce..ebe5dba09 100644 --- a/keep-ui/package-lock.json +++ b/keep-ui/package-lock.json @@ -372,6 +372,7 @@ }, "devDependencies": { "@tailwindcss/typography": "^0.5.12", + "@types/d3-time-format": "^4.0.3", "@types/js-cookie": "^3.0.3", "@types/js-yaml": "^4.0.5", "@types/json-logic-js": "^2.0.7", @@ -4022,6 +4023,12 @@ "resolved": "https://registry.npmjs.org/@types/d3-time/-/d3-time-3.0.3.tgz", "integrity": "sha512-2p6olUZ4w3s+07q3Tm2dbiMZy5pCDfYwtLXXHUnVzXgQlZ/OyPtUz6OL382BkOuGlLXqfT+wqv8Fw2v8/0geBw==" }, + "node_modules/@types/d3-time-format": { + "version": "4.0.3", + "resolved": "https://registry.npmjs.org/@types/d3-time-format/-/d3-time-format-4.0.3.tgz", + "integrity": "sha512-5xg9rC+wWL8kdDj153qZcsJ0FWiFt0J5RB6LYUNZjwSnesfblqrI/bJ1wBdJ8OQfncgbJG5+2F+qfqnqyzYxyg==", + "dev": true + }, "node_modules/@types/d3-timer": { "version": "3.0.2", "resolved": "https://registry.npmjs.org/@types/d3-timer/-/d3-timer-3.0.2.tgz", diff --git a/keep-ui/package.json b/keep-ui/package.json index 206ea0c2e..e1c722b95 100644 --- a/keep-ui/package.json +++ b/keep-ui/package.json @@ -373,6 +373,7 @@ }, "devDependencies": { "@tailwindcss/typography": "^0.5.12", + "@types/d3-time-format": "^4.0.3", "@types/js-cookie": "^3.0.3", "@types/js-yaml": "^4.0.5", "@types/json-logic-js": "^2.0.7", diff --git a/keep-ui/tailwind.config.js b/keep-ui/tailwind.config.js index aae009bf3..1646a014b 100644 --- a/keep-ui/tailwind.config.js +++ b/keep-ui/tailwind.config.js @@ -8,6 +8,10 @@ module.exports = { darkMode: "class", theme: { extend: { + gridTemplateColumns: { + 20: "repeat(20, minmax(0, 1fr))", + 24: "repeat(24, minmax(0, 1fr))", + }, minHeight: { "screen-minus-200": "calc(100vh - 200px)", }, @@ -129,5 +133,8 @@ module.exports = { /^(fill-(?:slate|gray|zinc|neutral|stone|red|orange|amber|yellow|lime|green|emerald|teal|cyan|sky|blue|indigo|violet|purple|fuchsia|pink|rose)-(?:50|100|200|300|400|500|600|700|800|900|950))$/, }, ], - plugins: [require("@headlessui/tailwindcss"), require('@tailwindcss/typography')], + plugins: [ + require("@headlessui/tailwindcss"), + require("@tailwindcss/typography"), + ], }; diff --git a/keep-ui/utils/hooks/useAlerts.ts b/keep-ui/utils/hooks/useAlerts.ts index 53b343b93..23d73139c 100644 --- a/keep-ui/utils/hooks/useAlerts.ts +++ b/keep-ui/utils/hooks/useAlerts.ts @@ -6,6 +6,15 @@ import { getApiURL } from "utils/apiUrl"; import { fetcher } from "utils/fetcher"; import { toDateObjectWithFallback } from "utils/helpers"; +export type AuditEvent = { + id: string; + user_id: string; + action: string; + description: string; + timestamp: string; + fingerprint: string; +}; + export const useAlerts = () => { const apiUrl = getApiURL(); const { data: session } = useSession(); @@ -33,7 +42,8 @@ export const useAlerts = () => { options: SWRConfiguration = { revalidateOnFocus: false } ) => { return useSWR( - () => (session && presetName ? `${apiUrl}/preset/${presetName}/alerts` : null), + () => + session && presetName ? `${apiUrl}/preset/${presetName}/alerts` : null, (url) => fetcher(url, session?.accessToken), options ); @@ -78,12 +88,32 @@ export const useAlerts = () => { }; }; + const useMultipleFingerprintsAlertAudit = ( + fingerprints: string[] | undefined, + options: SWRConfiguration = { revalidateOnFocus: true } + ) => { + return useSWR( + () => (session && fingerprints ? 
`${apiUrl}/alerts/audit` : null), + (url) => + fetcher(url, session?.accessToken, { + method: "POST", + body: JSON.stringify(fingerprints), + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${session?.accessToken}`, + }, + }), + options + ); + }; + const useAlertAudit = ( fingerprint: string, options: SWRConfiguration = { revalidateOnFocus: false } ) => { - return useSWR( - () => (session && fingerprint ? `${apiUrl}/alerts/${fingerprint}/audit` : null), + return useSWR( + () => + session && fingerprint ? `${apiUrl}/alerts/${fingerprint}/audit` : null, (url) => fetcher(url, session?.accessToken), options ); @@ -93,6 +123,7 @@ export const useAlerts = () => { useAlertHistory, useAllAlerts, usePresetAlerts, - useAlertAudit + useAlertAudit, + useMultipleFingerprintsAlertAudit, }; }; diff --git a/keep-ui/utils/hooks/useTopology.ts b/keep-ui/utils/hooks/useTopology.ts index ba959cd54..a3180815c 100644 --- a/keep-ui/utils/hooks/useTopology.ts +++ b/keep-ui/utils/hooks/useTopology.ts @@ -4,7 +4,7 @@ import useSWR from "swr"; import { getApiURL } from "utils/apiUrl"; import { fetcher } from "utils/fetcher"; import { useWebsocket } from "./usePusher"; -import { useCallback, useEffect } from "react"; +import { useCallback, useEffect, useState } from "react"; import { toast } from "react-toastify"; const isNullOrUndefined = (value: any) => value === null || value === undefined; @@ -20,7 +20,7 @@ export const useTopology = ( environment?: string ) => { const { data: session } = useSession(); - useTopologyPolling(); + const { data: pollTopology } = useTopologyPolling(); const apiUrl = getApiURL(); const url = !session @@ -36,6 +36,12 @@ export const useTopology = ( (url: string) => fetcher(url, session!.accessToken) ); + useEffect(() => { + if (pollTopology) { + mutate(); + } + }, [pollTopology, mutate]); + return { topologyData: data, error, @@ -46,12 +52,14 @@ export const useTopology = ( export const useTopologyPolling = () => { const { bind, unbind } = useWebsocket(); + const [pollTopology, setPollTopology] = useState(0); const handleIncoming = useCallback((data: TopologyUpdate) => { toast.success( `Topology pulled from ${data.providerId} (${data.providerType})`, { position: "top-right" } ); + setPollTopology(Math.floor(Math.random() * 10000)); }, []); useEffect(() => { @@ -60,4 +68,6 @@ export const useTopologyPolling = () => { unbind("topology-update", handleIncoming); }; }, [bind, unbind, handleIncoming]); + + return { data: pollTopology }; }; diff --git a/keep/api/core/db.py b/keep/api/core/db.py index 2adf87a74..fbc21d200 100644 --- a/keep/api/core/db.py +++ b/keep/api/core/db.py @@ -2094,16 +2094,38 @@ def get_incidents(tenant_id) -> List[Incident]: def get_alert_audit( - tenant_id: str, fingerprint: str, limit: int = 50 + tenant_id: str, fingerprint: str | list[str], limit: int = 50 ) -> List[AlertAudit]: + """ + Get the alert audit for the given fingerprint(s). + + Args: + tenant_id (str): the tenant_id to filter the alert audit by + fingerprint (str | list[str]): the fingerprint(s) to filter the alert audit by + limit (int, optional): the maximum number of alert audits to return. Defaults to 50. 
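+
+    Example (illustrative only, matching the signature above):
+        get_alert_audit(tenant_id, "fp-1")            # audit trail of one alert
+        get_alert_audit(tenant_id, ["fp-1", "fp-2"])  # batch form; rows ordered
+                                                      # by timestamp desc, then fingerprint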
+ + Returns: + List[AlertAudit]: the alert audit for the given fingerprint(s) + """ with Session(engine) as session: - audit = session.exec( - select(AlertAudit) - .where(AlertAudit.tenant_id == tenant_id) - .where(AlertAudit.fingerprint == fingerprint) - .order_by(desc(AlertAudit.timestamp)) - .limit(limit) - ).all() + if isinstance(fingerprint, list): + query = ( + select(AlertAudit) + .where(AlertAudit.tenant_id == tenant_id) + .where(AlertAudit.fingerprint.in_(fingerprint)) + .order_by(desc(AlertAudit.timestamp), AlertAudit.fingerprint) + ) + if limit: + query = query.limit(limit) + audit = session.exec(query).all() + else: + audit = session.exec( + select(AlertAudit) + .where(AlertAudit.tenant_id == tenant_id) + .where(AlertAudit.fingerprint == fingerprint) + .order_by(desc(AlertAudit.timestamp)) + .limit(limit) + ).all() return audit diff --git a/keep/api/models/alert_audit.py b/keep/api/models/alert_audit.py new file mode 100644 index 000000000..f03a5af67 --- /dev/null +++ b/keep/api/models/alert_audit.py @@ -0,0 +1,57 @@ +from datetime import datetime + +from pydantic import BaseModel + +from keep.api.models.db.alert import AlertActionType, AlertAudit + + +class AlertAuditDto(BaseModel): + id: str + timestamp: datetime + fingerprint: str + action: AlertActionType + user_id: str + description: str + + @classmethod + def from_orm(cls, alert_audit: AlertAudit) -> "AlertAuditDto": + return cls( + id=str(alert_audit.id), + timestamp=alert_audit.timestamp, + fingerprint=alert_audit.fingerprint, + action=alert_audit.action, + user_id=alert_audit.user_id, + description=alert_audit.description, + ) + + @classmethod + def from_orm_list(cls, alert_audits: list[AlertAudit]) -> list["AlertAuditDto"]: + grouped_events = [] + previous_event = None + count = 1 + + for event in alert_audits: + # Check if the current event is similar to the previous event + if previous_event and ( + event.user_id == previous_event.user_id + and event.action == previous_event.action + and event.description == previous_event.description + ): + # Increment the count if the events are similar + count += 1 + else: + # If the events are not similar, append the previous event to the grouped events + if previous_event: + if count > 1: + previous_event.description += f" x{count}" + grouped_events.append(AlertAuditDto.from_orm(previous_event)) + # Update the previous event to the current event and reset the count + previous_event = event + count = 1 + + # Add the last event to the grouped events + if previous_event: + if count > 1: + previous_event.description += f" x{count}" + grouped_events.append(AlertAuditDto.from_orm(previous_event)) + return grouped_events diff --git a/keep/api/routes/alerts.py b/keep/api/routes/alerts.py index 5935d63a4..210d5adea 100644 --- a/keep/api/routes/alerts.py +++ b/keep/api/routes/alerts.py @@ -34,6 +34,7 @@ EnrichAlertRequestBody, UnEnrichAlertRequestBody, ) +from keep.api.models.alert_audit import AlertAuditDto from keep.api.models.db.alert import AlertActionType from keep.api.models.search_alert import SearchAlertsRequest from keep.api.tasks.process_event_task import process_event @@ -686,16 +687,49 @@ async def search_alerts( raise HTTPException(status_code=500, detail="Failed to search alerts") +@router.post( + "/audit", + description="Get alert timeline audit trail for multiple fingerprints", +) +def get_multiple_fingerprint_alert_audit( + fingerprints: list[str], + authenticated_entity: AuthenticatedEntity = Depends( + IdentityManagerFactory.get_auth_verifier(["read:alert"]) + ), +) -> 
list[AlertAuditDto]: + tenant_id = authenticated_entity.tenant_id + logger.info( + "Fetching alert audit", + extra={"fingerprints": fingerprints, "tenant_id": tenant_id}, + ) + alert_audit = get_alert_audit_db(tenant_id, fingerprints) + + if not alert_audit: + raise HTTPException(status_code=404, detail="Alert not found") + grouped_events = [] + + # Group the results by fingerprint for "deduplication" (2x, 3x, etc.) thingy.. + grouped_audit = {} + for audit in alert_audit: + if audit.fingerprint not in grouped_audit: + grouped_audit[audit.fingerprint] = [] + grouped_audit[audit.fingerprint].append(audit) + + for values in grouped_audit.values(): + grouped_events.extend(AlertAuditDto.from_orm_list(values)) + return grouped_events + + @router.get( "/{fingerprint}/audit", - description="Get alert enrichment", + description="Get alert timeline audit trail", ) def get_alert_audit( fingerprint: str, authenticated_entity: AuthenticatedEntity = Depends( IdentityManagerFactory.get_auth_verifier(["read:alert"]) ), -): +) -> list[AlertAuditDto]: tenant_id = authenticated_entity.tenant_id logger.info( "Fetching alert audit", @@ -708,29 +742,5 @@ def get_alert_audit( if not alert_audit: raise HTTPException(status_code=404, detail="Alert not found") - grouped_events = [] - previous_event = None - count = 1 - - for event in alert_audit: - if previous_event and ( - event.user_id == previous_event.user_id - and event.action == previous_event.action - and event.description == previous_event.description - ): - count += 1 - else: - if previous_event: - if count > 1: - previous_event.description += f" x{count}" - grouped_events.append(previous_event.dict()) - previous_event = event - count = 1 - - # Add the last event - if previous_event: - if count > 1: - previous_event.description += f" x{count}" - grouped_events.append(previous_event.dict()) - + grouped_events = AlertAuditDto.from_orm_list(alert_audit) return grouped_events diff --git a/keep/api/utils/enrichment_helpers.py b/keep/api/utils/enrichment_helpers.py index 7085626c0..bd2baa38c 100644 --- a/keep/api/utils/enrichment_helpers.py +++ b/keep/api/utils/enrichment_helpers.py @@ -108,9 +108,7 @@ def convert_db_alerts_to_dto_alerts(alerts: list[Alert]) -> list[AlertDto]: ) continue - # include the db event id if it's not present - if alert_dto.event_id is None: - alert_dto.event_id = str(alert.id) + alert_dto.event_id = str(alert.id) # enrich provider id when it's possible if alert_dto.providerId is None: From 74b1193c12a767a55876607ad54398426876df79 Mon Sep 17 00:00:00 2001 From: Shahar Glazner Date: Thu, 19 Sep 2024 08:55:15 +0100 Subject: [PATCH 11/16] fix: over flowing badge (#1967) --- keep-ui/app/workflows/workflow-tile.tsx | 81 ++++++++++++------------- 1 file changed, 40 insertions(+), 41 deletions(-) diff --git a/keep-ui/app/workflows/workflow-tile.tsx b/keep-ui/app/workflows/workflow-tile.tsx index 73da76321..7e3da37ba 100644 --- a/keep-ui/app/workflows/workflow-tile.tsx +++ b/keep-ui/app/workflows/workflow-tile.tsx @@ -551,31 +551,32 @@ function WorkflowTile({ workflow }: { workflow: Workflow }) {
)} { - e.stopPropagation(); - e.preventDefault(); - if (workflow.id) { - router.push(`/workflows/${workflow.id}`); - } - }} + className="relative flex flex-col justify-between bg-white rounded shadow p-2 h-full hover:border-orange-400 hover:border-2 overflow-hidden" + onClick={(e) => { + e.stopPropagation(); + e.preventDefault(); + if (workflow.id) { + router.push(`/workflows/${workflow.id}`); + } + }} > -
+
{workflow.provisioned && ( - + Provisioned )} - {!!handleRunClick && WorkflowMenuSection({ - onDelete: handleDeleteClick, - onRun: handleRunClick, - onDownload: handleDownloadClick, - onView: handleViewClick, - onBuilder: handleBuilderClick, - runButtonToolTip: message, - isRunButtonDisabled: !!isRunButtonDisabled, - provisioned: workflow.provisioned, - })} + {!!handleRunClick && + WorkflowMenuSection({ + onDelete: handleDeleteClick, + onRun: handleRunClick, + onDownload: handleDownloadClick, + onView: handleViewClick, + onBuilder: handleBuilderClick, + runButtonToolTip: message, + isRunButtonDisabled: !!isRunButtonDisabled, + provisioned: workflow.provisioned, + })}
@@ -634,7 +635,7 @@ function WorkflowTile({ workflow }: { workflow: Workflow }) { className="object-cover" /> ) : ( - + )} Trigger
@@ -684,9 +685,9 @@ function WorkflowTile({ workflow }: { workflow: Workflow }) {
- {!!getTriggerModalProps && } + {!!getTriggerModalProps && ( + + )} { @@ -760,7 +761,6 @@ export function WorkflowTileOld({ workflow }: { workflow: Workflow }) { setFormErrors(updatedFormErrors); }; - const handleDeleteClick = async () => { try { const response = await fetch(`${apiUrl}/workflows/${workflow.id}`, { @@ -863,16 +863,17 @@ export function WorkflowTileOld({ workflow }: { workflow: Workflow }) { {workflow.name} - {!!handleRunClick && WorkflowMenuSection({ - onDelete: handleDeleteClick, - onRun: handleRunClick, - onDownload: handleDownloadClick, - onView: handleViewClick, - onBuilder: handleBuilderClick, - runButtonToolTip: message, - isRunButtonDisabled: !!isRunButtonDisabled, - provisioned: workflow.provisioned, - })} + {!!handleRunClick && + WorkflowMenuSection({ + onDelete: handleDeleteClick, + onRun: handleRunClick, + onDownload: handleDownloadClick, + onView: handleViewClick, + onBuilder: handleBuilderClick, + runButtonToolTip: message, + isRunButtonDisabled: !!isRunButtonDisabled, + provisioned: workflow.provisioned, + })}
@@ -920,9 +921,7 @@ export function WorkflowTileOld({ workflow }: { workflow: Workflow }) { Disabled - - {workflow?.disabled?.toString()} - + {workflow?.disabled?.toString()} @@ -1023,9 +1022,9 @@ export function WorkflowTileOld({ workflow }: { workflow: Workflow }) { )} - {!!getTriggerModalProps && } + {!!getTriggerModalProps && ( + + )}
); } From 6f2ee05a7702ca481e6e4ad6eb8678b665e4b809 Mon Sep 17 00:00:00 2001 From: Rajesh Jonnalagadda <38752904+rajeshj11@users.noreply.github.com> Date: Thu, 19 Sep 2024 13:31:19 +0530 Subject: [PATCH 12/16] fix: save and deploy change (#1938) Co-authored-by: Tal Co-authored-by: Shahar Glazner --- .../app/workflows/builder/ReactFlowEditor.tsx | 22 +- .../app/workflows/builder/builder-store.tsx | 6 +- keep-ui/app/workflows/builder/builder.tsx | 22 +- keep-ui/app/workflows/builder/editors.tsx | 295 ++++++++++-------- 4 files changed, 206 insertions(+), 139 deletions(-) diff --git a/keep-ui/app/workflows/builder/ReactFlowEditor.tsx b/keep-ui/app/workflows/builder/ReactFlowEditor.tsx index 6848dd13c..19e192798 100644 --- a/keep-ui/app/workflows/builder/ReactFlowEditor.tsx +++ b/keep-ui/app/workflows/builder/ReactFlowEditor.tsx @@ -21,15 +21,24 @@ const ReactFlowEditor = ({ }; onDefinitionChange: (def: Definition) => void }) => { - const { selectedNode, changes, v2Properties, nodes, edges, setOpneGlobalEditor, synced, setSynced } = useStore(); + const { selectedNode, changes, v2Properties, nodes, edges, setOpneGlobalEditor, synced, setSynced, setCanDeploy } = useStore(); const [isOpen, setIsOpen] = useState(false); const stepEditorRef = useRef(null); const containerRef = useRef(null); const isTrigger = ['interval', 'manual', 'alert'].includes(selectedNode || '') + const saveRef = useRef(false); + useEffect(()=>{ + if(saveRef.current && synced){ + setCanDeploy(true); + saveRef.current = false; + } + }, [saveRef?.current, synced]) + useEffect(() => { setIsOpen(true); if (selectedNode) { + saveRef.current = false; const timer = setTimeout(() => { if (isTrigger) { setOpneGlobalEditor(true); @@ -114,9 +123,16 @@ const ReactFlowEditor = ({
- + {!selectedNode?.includes('empty') && !isTrigger && } - {!selectedNode?.includes('empty') && !isTrigger && } + {!selectedNode?.includes('empty') && !isTrigger && }
diff --git a/keep-ui/app/workflows/builder/builder-store.tsx b/keep-ui/app/workflows/builder/builder-store.tsx index e19b7d6ea..7adae1287 100644 --- a/keep-ui/app/workflows/builder/builder-store.tsx +++ b/keep-ui/app/workflows/builder/builder-store.tsx @@ -149,6 +149,8 @@ export type FlowState = { errorNode: string | null; synced: boolean; setSynced: (synced: boolean) => void; + canDeploy: boolean; + setCanDeploy: (deploy: boolean) => void; }; @@ -260,6 +262,8 @@ const useStore = create((set, get) => ({ firstInitilisationDone: false, errorNode: null, synced: true, + canDeploy: false, + setCanDeploy: (deploy)=>set({canDeploy: deploy}), setSynced: (sync) => set({ synced: sync }), setErrorNode: (id) => set({ errorNode: id }), setFirstInitilisationDone: (firstInitilisationDone) => set({ firstInitilisationDone }), @@ -291,7 +295,7 @@ const useStore = create((set, get) => ({ }); set({ nodes: updatedNodes, - changes: get().changes + 1 + changes: get().changes + 1, }); } }, diff --git a/keep-ui/app/workflows/builder/builder.tsx b/keep-ui/app/workflows/builder/builder.tsx index 020d13bf4..08d60a4c5 100644 --- a/keep-ui/app/workflows/builder/builder.tsx +++ b/keep-ui/app/workflows/builder/builder.tsx @@ -27,6 +27,7 @@ import { WorkflowExecution, WorkflowExecutionFailure } from "./types"; import ReactFlowBuilder from "./ReactFlowBuilder"; import { ReactFlowProvider } from "@xyflow/react"; import useStore, { ReactFlowDefinition, V2Step, Definition as FlowDefinition } from "./builder-store"; +import { toast } from "react-toastify"; interface Props { loadedAlertFile: string | null; @@ -76,7 +77,7 @@ function Builder({ const [compiledAlert, setCompiledAlert] = useState(null); const searchParams = useSearchParams(); - const { setErrorNode } = useStore(); + const { errorNode, setErrorNode, canDeploy, synced } = useStore(); const setStepValidationErrorV2 = (step: V2Step, error: string | null) => { setStepValidationError(error); @@ -210,7 +211,12 @@ function Builder({ }, [triggerRun]); useEffect(() => { + if (triggerSave) { + if(!synced) { + toast('Please save the previous step or wait while properties sync with the workflow.'); + return; + } if (workflowId) { updateWorkflow(); } else { @@ -220,6 +226,20 @@ function Builder({ // eslint-disable-next-line react-hooks/exhaustive-deps }, [triggerSave]); + useEffect(()=>{ + if (canDeploy && !errorNode && definition.isValid) { + if(!synced) { + toast('Please save the previous step or wait while properties sync with the workflow.'); + return; + } + if (workflowId) { + updateWorkflow(); + } else { + addWorkflow(); + } + } + }, [canDeploy, errorNode, definition?.isValid]) + useEffect(() => { enableGenerate( (definition.isValid && diff --git a/keep-ui/app/workflows/builder/editors.tsx b/keep-ui/app/workflows/builder/editors.tsx index 53283f26b..2670b6904 100644 --- a/keep-ui/app/workflows/builder/editors.tsx +++ b/keep-ui/app/workflows/builder/editors.tsx @@ -13,20 +13,17 @@ import { KeyIcon } from "@heroicons/react/20/solid"; import { Provider } from "app/providers/providers"; import { BackspaceIcon, - BellSnoozeIcon, - ClockIcon, FunnelIcon, - HandRaisedIcon, } from "@heroicons/react/24/outline"; import useStore, { V2Properties } from "./builder-store"; -import { useEffect, useState } from "react"; +import { useEffect, useRef, useState } from "react"; function EditorLayout({ children }: { children: React.ReactNode }) { return
{children}
; } -export function GlobalEditorV2({synced}: {synced: boolean}) { +export function GlobalEditorV2({ synced, saveRef }: { synced: boolean, saveRef: React.MutableRefObject; }) { const { v2Properties: properties, updateV2Properties: setProperty, selectedNode } = useStore(); return ( @@ -45,6 +42,7 @@ export function GlobalEditorV2({synced}: {synced: boolean}) { properties={properties} setProperties={setProperty} selectedNode={selectedNode} + saveRef={saveRef} /> ); @@ -58,7 +56,7 @@ interface keepEditorProps { installedProviders?: Provider[] | null | undefined; providerType?: string; type?: string; - isV2?:boolean + isV2?: boolean } function KeepStepEditor({ @@ -142,7 +140,7 @@ function KeepStepEditor({ placeholder="Enter provider name manually" onChange={(e: any) => updateProperty("config", e.target.value)} className="my-2.5" - value={providerConfig} + value={providerConfig || ""} error={ providerConfig !== "" && providerConfig !== undefined && @@ -151,14 +149,13 @@ function KeepStepEditor({ (p) => p.details?.name === providerConfig ) === undefined } - errorMessage={`${ - providerConfig && isThisProviderNeedsInstallation && - installedProviderByType?.find( - (p) => p.details?.name === providerConfig - ) === undefined + errorMessage={`${providerConfig && isThisProviderNeedsInstallation && + installedProviderByType?.find( + (p) => p.details?.name === providerConfig + ) === undefined ? "Please note this provider is not installed and you'll need to install it before executing this workflow." : "" - }`} + }`} /> Provider Parameters
@@ -168,7 +165,7 @@ function KeepStepEditor({ placeholder="If Condition" onValueChange={(value) => updateProperty("if", value)} className="mb-2.5" - value={properties?.if as string} + value={properties?.if || "" as string} />
{uniqueParams @@ -186,7 +183,7 @@ function KeepStepEditor({ placeholder={key} onChange={propertyChanged} className="mb-2.5" - value={currentPropertyValue} + value={currentPropertyValue || ""} />
); @@ -258,11 +255,13 @@ function KeepForeachEditor({ properties, updateProperty }: keepEditorProps) { function WorkflowEditorV2({ properties, setProperties, - selectedNode + selectedNode, + saveRef }: { properties: V2Properties; setProperties: (updatedProperties: V2Properties) => void; selectedNode: string | null; + saveRef: React.MutableRefObject; }) { const isTrigger = ['interval', 'manual', 'alert'].includes(selectedNode || '') @@ -271,6 +270,9 @@ function WorkflowEditorV2({ const currentFilters = properties.alert || {}; const updatedFilters = { ...currentFilters, [filter]: value }; setProperties({ ...properties, alert: updatedFilters }); + if (saveRef.current) { + saveRef.current = false + } }; const addFilter = () => { @@ -285,8 +287,21 @@ function WorkflowEditorV2({ const currentFilters = { ...properties.alert }; delete currentFilters[filter]; setProperties({ ...properties, alert: currentFilters }); + if (saveRef.current) { + saveRef.current = false + } }; + const handleChange = (key: string, value: string) => { + setProperties({ + ...properties, + [key]: value, + }); + if (saveRef.current) { + saveRef.current = false + } + } + const propertyKeys = Object.keys(properties).filter( (k) => k !== "isLocked" && k !== "id" ); @@ -298,118 +313,112 @@ function WorkflowEditorV2({ const isTrigger = ["manual", "alert", 'interval'].includes(key) ; renderDivider = isTrigger && key === selectedNode ? !renderDivider : false; return ( -
- {renderDivider && } - {((key === selectedNode) || (!isTrigger)) && {key}} - - {(() => { - switch (key) { - case "manual": - return ( - selectedNode === "manual" && ( -
- - setProperties({ - ...properties, - [key]: e.target.checked ? "true" : "false", - }) - } - disabled={true} - /> -
- ) - ); - - case "alert": - return ( - selectedNode === "alert" && ( - <> -
- -
- {properties.alert && - Object.keys(properties.alert as {}).map((filter) => { - return ( - <> - {filter} -
- - updateAlertFilter(filter, e.target.value) - } - value={(properties.alert as any)[filter] as string} - /> - deleteFilter(filter)} - /> -
- - ); - })} - - ) - ); - - case "interval": - return ( - selectedNode === "interval" && ( - - setProperties({ ...properties, [key]: e.target.value }) - } - value={properties[key] as string} - /> - ) - ); - case "disabled": - return ( -
- - setProperties({ - ...properties, - [key]: e.target.checked ? "true" : "false", - }) - } - /> -
- ); - default: - return ( +
+ {renderDivider && } + {((key === selectedNode) || (!isTrigger)) && {key}} + + {(() => { + switch (key) { + case "manual": + return ( + selectedNode === "manual" && ( +
+ + handleChange(key, e.target.checked ? "true" : "false") + } + disabled={true} + /> +
+ ) + ); + + case "alert": + return ( + selectedNode === "alert" && ( + <> +
+ +
+ {properties.alert && + Object.keys(properties.alert as {}).map((filter) => { + return ( + <> + {filter} +
- setProperties({ ...properties, [key]: e.target.value }) - } - value={properties[key] as string} + key={filter} + placeholder={`Set alert ${filter}`} + onChange={(e: any) => + updateAlertFilter(filter, e.target.value) + } + value={(properties.alert as any)[filter] || "" as string} + /> + deleteFilter(filter)} /> - ); +
+ + ); + })} + + ) + ); + + case "interval": + return ( + selectedNode === "interval" && ( + + handleChange(key, e.target.value) + } + value={properties[key] || "" as string} + /> + ) + ); + case "disabled": + return ( +
+ + handleChange(key, e.target.checked ? "true" : "false") + } + /> +
+ ); + default: + return ( + + handleChange(key, e.target.value) } - })()} -
- ); - + value={properties[key] || "" as string} + /> + ); + } + })()} +
+ ); })} ); @@ -420,25 +429,28 @@ function WorkflowEditorV2({ export function StepEditorV2({ providers, installedProviders, - setSynced + setSynced, + saveRef }: { providers: Provider[] | undefined | null; installedProviders?: Provider[] | undefined | null; - setSynced: (sync:boolean) => void; + setSynced: (sync: boolean) => void; + saveRef: React.MutableRefObject; }) { - const [formData, setFormData] = useState<{ name?: string; properties?: V2Properties, type?:string }>({}); + const [formData, setFormData] = useState<{ name?: string; properties?: V2Properties, type?: string }>({}); const { selectedNode, updateSelectedNodeData, - setOpneGlobalEditor, - getNodeById + getNodeById, } = useStore(); + const deployRef = useRef(null); + useEffect(() => { if (selectedNode) { const { data } = getNodeById(selectedNode) || {}; const { name, type, properties } = data || {}; - setFormData({ name, type , properties }); + setFormData({ name, type, properties }); } }, [selectedNode, getNodeById]); @@ -457,6 +469,9 @@ export function StepEditorV2({ properties: { ...formData.properties, [key]: value }, }); setSynced(false); + if (saveRef.current) { + saveRef.current = false; + } }; @@ -464,6 +479,10 @@ export function StepEditorV2({ // Finalize the changes before saving updateSelectedNodeData('name', formData.name); updateSelectedNodeData('properties', formData.properties); + setSynced(false); + if (saveRef && deployRef?.current?.checked) { + saveRef.current = true; + } }; const type = formData ? formData.type?.includes("step-") || formData.type?.includes("action-") : ""; @@ -479,7 +498,7 @@ export function StepEditorV2({ value={formData.name || ''} onChange={handleInputChange} /> - {type && formData.properties ? ( + {type && formData.properties ? ( ) : null} +
+ Deploy + +
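+        {/* Flow, as wired in this patch: when "Deploy" is checked, Save sets
+            saveRef; once the store reports synced, ReactFlowEditor's effect
+            calls setCanDeploy(true) and builder.tsx saves and deploys the
+            workflow. */}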
); From 91d0a9a72132ade98be2489e1b3cdad05da17323 Mon Sep 17 00:00:00 2001 From: Shahar Glazner Date: Thu, 19 Sep 2024 09:05:27 +0100 Subject: [PATCH 13/16] fix: clean secret if service already exists (#1963) --- keep/providers/providers_service.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/keep/providers/providers_service.py b/keep/providers/providers_service.py index 9dbb0a3ba..debb97662 100644 --- a/keep/providers/providers_service.py +++ b/keep/providers/providers_service.py @@ -100,6 +100,16 @@ def install_provider( session.add(provider_model) session.commit() except IntegrityError: + try: + # if the provider is already installed, delete the secret + logger.warning("Provider already installed, deleting secret") + secret_manager.delete_secret( + secret_name=secret_name, + ) + logger.warning("Secret deleted") + except Exception: + logger.exception("Failed to delete the secret") + pass raise HTTPException( status_code=409, detail="Provider already installed" ) From 286e303c1d7e59f3bb0e4b10444c3748b8ef63db Mon Sep 17 00:00:00 2001 From: Vladimir Filonov Date: Thu, 19 Sep 2024 12:34:07 +0400 Subject: [PATCH 14/16] feat: Adding incident workflow triggers (#1861) --- docs/workflows/overview.mdx | 13 +- .../workflow-execution-table.tsx | 4 + keep-ui/app/workflows/builder/CustomNode.tsx | 1 + .../app/workflows/builder/ReactFlowEditor.tsx | 7 +- keep-ui/app/workflows/builder/ToolBox.tsx | 3 +- .../app/workflows/builder/builder-store.tsx | 8 +- .../workflows/builder/builder-validators.tsx | 11 +- keep-ui/app/workflows/builder/editors.tsx | 70 +++++---- keep-ui/app/workflows/builder/utils.tsx | 20 ++- keep-ui/utils/reactFlow.ts | 2 +- keep/api/core/db.py | 23 ++- keep/api/models/alert.py | 26 +++- .../versions/2024-09-13-10-48_938b1aa62d5c.py | 2 +- ...0f.py => 2024-09-17-23-30_c5443d9deb0f.py} | 2 +- .../versions/2024-09-18-02-05_772790c2e50a.py | 38 +++++ .../versions/2024-09-18-14-08_5d7ae55efc6a.py | 2 +- keep/api/models/db/workflow.py | 15 ++ keep/api/routes/incidents.py | 66 ++++++++- keep/contextmanager/contextmanager.py | 5 + keep/step/step.py | 6 +- keep/workflowmanager/workflowmanager.py | 107 ++++++++++---- keep/workflowmanager/workflowscheduler.py | 41 ++++-- tests/test_workflow_execution.py | 135 +++++++++++++++++- 23 files changed, 515 insertions(+), 92 deletions(-) rename keep/api/models/db/migrations/versions/{2024-09-11-23-30_c5443d9deb0f.py => 2024-09-17-23-30_c5443d9deb0f.py} (96%) create mode 100644 keep/api/models/db/migrations/versions/2024-09-18-02-05_772790c2e50a.py diff --git a/docs/workflows/overview.mdx b/docs/workflows/overview.mdx index fcf66c5c3..f728d3e6e 100644 --- a/docs/workflows/overview.mdx +++ b/docs/workflows/overview.mdx @@ -10,7 +10,7 @@ In this section we will review the Workflow components. ## Triggers When you run alert with the CLI using `keep run`, the CLI run the alert regardless of the triggers. A trigger is an event that starts the workflow. It could be a manual trigger, an alert, or an interval depending on your use case. 
-Keep support three types of triggers:
+Keep supports four types of triggers:
 ### Manual trigger
 ```
 # run manually
@@ -28,6 +28,17 @@ triggers:
       value: cloudwatch
 ```
 
+### Incident trigger
+```
+# run when an incident gets created, updated, or deleted
+# You can use multiple events, but at least one is required
+triggers:
+  - type: incident
+    events:
+      - created
+      - deleted
+```
+
 ### Interval trigger
 ```
 # run every 10 seconds
diff --git a/keep-ui/app/workflows/[workflow_id]/workflow-execution-table.tsx b/keep-ui/app/workflows/[workflow_id]/workflow-execution-table.tsx
index 6ca7c1748..1a66001ee 100644
--- a/keep-ui/app/workflows/[workflow_id]/workflow-execution-table.tsx
+++ b/keep-ui/app/workflows/[workflow_id]/workflow-execution-table.tsx
@@ -113,6 +113,7 @@ function getTriggerIcon(triggered_by: string) {
     case "Manual": return FaHandPointer;
     case "Scheduler": return PiDiamondsFourFill;
     case "Alert": return HiBellAlert;
+    case "Incident": return HiBellAlert;
     default: return PiDiamondsFourFill;
   }
 }
@@ -159,6 +160,9 @@ export function ExecutionTable({
       case triggered_by.substring(0, 6) === "manual":
         valueToShow = "Manual";
         break;
+      case triggered_by.substring(0, 8) === "incident":
+        valueToShow = "Incident";
+        break;
     }
   }
 
diff --git a/keep-ui/app/workflows/builder/CustomNode.tsx b/keep-ui/app/workflows/builder/CustomNode.tsx
index 8d7e43ea9..c98d868fa 100644
--- a/keep-ui/app/workflows/builder/CustomNode.tsx
+++ b/keep-ui/app/workflows/builder/CustomNode.tsx
@@ -16,6 +16,7 @@ import { toast } from "react-toastify";
 function IconUrlProvider(data: FlowNode["data"]) {
   const { componentType, type } = data || {};
   if (type === "alert" || type === "workflow" || type === "trigger" || !type) return "/keep.png";
+  if (type === "incident" || type === "workflow" || type === "trigger" || !type) return "/keep.png";
   return `/icons/${type
     ?.replace("step-", "")
     ?.replace("action-", "")
diff --git a/keep-ui/app/workflows/builder/ReactFlowEditor.tsx b/keep-ui/app/workflows/builder/ReactFlowEditor.tsx
index 19e192798..3cbea77ea 100644
--- a/keep-ui/app/workflows/builder/ReactFlowEditor.tsx
+++ b/keep-ui/app/workflows/builder/ReactFlowEditor.tsx
@@ -25,7 +25,7 @@ const ReactFlowEditor = ({
   const [isOpen, setIsOpen] = useState(false);
   const stepEditorRef = useRef(null);
   const containerRef = useRef(null);
-  const isTrigger = ['interval', 'manual', 'alert'].includes(selectedNode || '')
+  const isTrigger = ['interval', 'manual', 'alert', 'incident'].includes(selectedNode || '')
   const saveRef = useRef(false);
   useEffect(()=>{
     if(saveRef.current && synced){
       setCanDeploy(true);
       saveRef.current = false;
     }
   }, [saveRef?.current, synced])
 
-
   useEffect(() => {
     setIsOpen(true);
     if (selectedNode) {
- {!selectedNode?.includes('empty') && !isTrigger && } - {!selectedNode?.includes('empty') && !isTrigger && ['interval', 'manual', 'alert'].includes(node?.id)).reduce((obj: any, node: any) => { + const triggerNodeMap = nodes.filter((node: any) => ['interval', 'manual', 'alert', 'incident'].includes(node?.id)).reduce((obj: any, node: any) => { obj[node.id] = true; return obj; }, {} as Record); diff --git a/keep-ui/app/workflows/builder/builder-store.tsx b/keep-ui/app/workflows/builder/builder-store.tsx index 7adae1287..3bc6fa0f3 100644 --- a/keep-ui/app/workflows/builder/builder-store.tsx +++ b/keep-ui/app/workflows/builder/builder-store.tsx @@ -244,6 +244,10 @@ function addNodeBetween(nodeOrEdge: string | null, step: V2Step, type: string, s set({v2Properties: {...get().v2Properties, [newNodeId]: {}}}); break; } + case "incident": { + set({v2Properties: {...get().v2Properties, [newNodeId]: {}}}); + break; + } } } @@ -437,7 +441,7 @@ const useStore = create((set, get) => ({ finalEdges = edges.filter((edge) => !(idArray.includes(edge.source) || idArray.includes(edge.target))); - if (['interval', 'alert', 'manual'].includes(ids) && edges.some((edge) => edge.source === 'trigger_start' && edge.target !== ids)) { + if (['interval', 'alert', 'manual', 'incident'].includes(ids) && edges.some((edge) => edge.source === 'trigger_start' && edge.target !== ids)) { edges = edges.filter((edge) => !(idArray.includes(edge.source))); } const sources = [...new Set(edges.filter((edge) => startNode.id === edge.target))]; @@ -457,7 +461,7 @@ const useStore = create((set, get) => ({ const newNode = createDefaultNodeV2({ ...nodes[endIndex + 1].data, islayouted: false }, nodes[endIndex + 1].id); const newNodes = [...nodes.slice(0, nodeStartIndex), newNode, ...nodes.slice(endIndex + 2)]; - if(['manual', 'alert', 'interval'].includes(ids)) { + if(['manual', 'alert', 'interval', 'incident'].includes(ids)) { const v2Properties = get().v2Properties; delete v2Properties[ids]; set({ v2Properties }); diff --git a/keep-ui/app/workflows/builder/builder-validators.tsx b/keep-ui/app/workflows/builder/builder-validators.tsx index b9742a2ed..fdbcebab9 100644 --- a/keep-ui/app/workflows/builder/builder-validators.tsx +++ b/keep-ui/app/workflows/builder/builder-validators.tsx @@ -20,9 +20,10 @@ export function globalValidatorV2( !!definition?.properties && !definition.properties['manual'] && !definition.properties['interval'] && - !definition.properties['alert'] + !definition.properties['alert'] && + !definition.properties['incident'] ) { - setGlobalValidationError('trigger_start', "Workflow Should alteast have one trigger."); + setGlobalValidationError('trigger_start', "Workflow Should at least have one trigger."); return false; } @@ -38,6 +39,12 @@ export function globalValidatorV2( return false; } + const incidentActions = Object.values(definition.properties.incident||{}).filter(Boolean) + if(definition?.properties && definition.properties['incident'] && incidentActions.length==0){ + setGlobalValidationError('incident', "Workflow incident trigger cannot be empty."); + return false; + } + const anyStepOrAction = definition?.sequence?.length > 0; if (!anyStepOrAction) { setGlobalValidationError(null, diff --git a/keep-ui/app/workflows/builder/editors.tsx b/keep-ui/app/workflows/builder/editors.tsx index 2670b6904..1df59f1f4 100644 --- a/keep-ui/app/workflows/builder/editors.tsx +++ b/keep-ui/app/workflows/builder/editors.tsx @@ -7,6 +7,7 @@ import { Subtitle, Icon, Button, + Switch, Divider, } from "@tremor/react"; import { 
KeyIcon } from "@heroicons/react/20/solid"; @@ -15,6 +16,7 @@ import { BackspaceIcon, FunnelIcon, } from "@heroicons/react/24/outline"; +import React from "react"; import useStore, { V2Properties } from "./builder-store"; import { useEffect, useRef, useState } from "react"; @@ -263,8 +265,6 @@ function WorkflowEditorV2({ selectedNode: string | null; saveRef: React.MutableRefObject; }) { - const isTrigger = ['interval', 'manual', 'alert'].includes(selectedNode || '') - const updateAlertFilter = (filter: string, value: string) => { const currentFilters = properties.alert || {}; @@ -282,7 +282,6 @@ function WorkflowEditorV2({ } }; - const deleteFilter = (filter: string) => { const currentFilters = { ...properties.alert }; delete currentFilters[filter]; @@ -310,7 +309,7 @@ function WorkflowEditorV2({ <> Workflow Settings {propertyKeys.map((key, index) => { - const isTrigger = ["manual", "alert", 'interval'].includes(key) ; + const isTrigger = ["manual", "alert", 'interval', 'incident'].includes(key); renderDivider = isTrigger && key === selectedNode ? !renderDivider : false; return (
@@ -380,20 +379,41 @@ function WorkflowEditorV2({ ) ); + case "incident": + return selectedNode === 'incident' && <> + Incident events + {Array("created", "updated", "deleted").map((event) => +
+ -1} + onChange={() => { + let events = properties.incident.events || []; + if (events.indexOf(event) > -1) { + events = (events as string[]).filter(e => e !== event) + setProperties({ ...properties, [key]: {events: events } }) + } else { + events.push(event); + setProperties({ ...properties, [key]: {events: events} }) + } + }} + color={"orange"} + /> + +
+ 
+              )}
+            </>;
+          case "interval":
+            return selectedNode === "interval" && (
+                handleChange(key, e.target.value)
+              }
+              value={properties[key] || "" as string}
+            />);
+          case "disabled":
        return (
handleChange(key, e.target.value) } - value={properties[key] || "" as string} - /> - ); - } - })()} -
- ); + value={properties[key] || ""as string} + /> + ); + } + })()} +
+ ); })} ); @@ -489,7 +509,7 @@ export function StepEditorV2({ return ( - {providerType} Editor + {providerType}1 Editor Unique Identifier { - if (['interval', 'manual', 'alert'].includes(key) && properties[key]) { + if (['interval', 'manual', 'alert', 'incident'].includes(key) && properties[key]) { _steps.push({ id: key, type: key, diff --git a/keep/api/core/db.py b/keep/api/core/db.py index fbc21d200..89c550c6d 100644 --- a/keep/api/core/db.py +++ b/keep/api/core/db.py @@ -108,6 +108,7 @@ def create_workflow_execution( event_id: str = None, fingerprint: str = None, execution_id: str = None, + event_type: str = "alert", ) -> str: with Session(engine) as session: try: @@ -126,13 +127,21 @@ def create_workflow_execution( # Ensure the object has an id session.flush() execution_id = workflow_execution.id - if fingerprint: + if fingerprint and event_type == "alert": workflow_to_alert_execution = WorkflowToAlertExecution( workflow_execution_id=execution_id, alert_fingerprint=fingerprint, event_id=event_id, ) session.add(workflow_to_alert_execution) + elif event_type == "incident": + workflow_to_incident_execution = WorkflowToIncidentExecution( + workflow_execution_id=execution_id, + alert_fingerprint=fingerprint, + incident_id=event_id, + ) + session.add(workflow_to_incident_execution) + session.commit() return execution_id except IntegrityError: @@ -687,9 +696,8 @@ def get_workflow_executions( ).scalar() avgDuration = avgDuration if avgDuration else 0.0 - query = ( - query.order_by(desc(WorkflowExecution.started)).limit(limit).offset(offset) - ) + query = (query.order_by(desc(WorkflowExecution.started)).limit(limit).offset(offset) +) # Execute the query workflow_executions = query.all() @@ -2366,7 +2374,7 @@ def get_incidents_count( def get_incident_alerts_by_incident_id( - tenant_id: str, incident_id: str, limit: int, offset: int + tenant_id: str, incident_id: str, limit: Optional[int] = None, offset: Optional[int] = None ) -> (List[Alert], int): with Session(engine) as session: query = ( @@ -2384,7 +2392,10 @@ def get_incident_alerts_by_incident_id( total_count = query.count() - return query.limit(limit).offset(offset).all(), total_count + if limit and offset: + query = query.limit(limit).offset(offset) + + return query.all(), total_count def get_alerts_data_for_incident( diff --git a/keep/api/models/alert.py b/keep/api/models/alert.py index 58747fd1c..bb4c97eb8 100644 --- a/keep/api/models/alert.py +++ b/keep/api/models/alert.py @@ -4,11 +4,11 @@ import logging import uuid from enum import Enum -from typing import Any, Dict +from typing import Any, Dict, List from uuid import UUID import pytz -from pydantic import AnyHttpUrl, BaseModel, Extra, root_validator, validator +from pydantic import AnyHttpUrl, BaseModel, Extra, root_validator, validator, PrivateAttr logger = logging.getLogger(__name__) @@ -385,6 +385,8 @@ class IncidentDto(IncidentDtoIn): rule_fingerprint: str | None + _tenant_id: str = PrivateAttr() + def __str__(self) -> str: # Convert the model instance to a dictionary model_dict = self.dict() @@ -393,12 +395,26 @@ def __str__(self) -> str: class Config: extra = Extra.allow schema_extra = IncidentDtoIn.Config.schema_extra + underscore_attrs_are_private = True json_encoders = { # Converts UUID to their values for JSON serialization UUID: lambda v: str(v), } + @property + def name(self): + return self.user_generated_name or self.ai_generated_name + + @property + def alerts(self) -> List["AlertDto"]: + from keep.api.core.db import get_incident_alerts_by_incident_id + from 
keep.api.utils.enrichment_helpers import convert_db_alerts_to_dto_alerts + if not self._tenant_id: + return [] + alerts, _ = get_incident_alerts_by_incident_id(self._tenant_id, str(self.id)) + return convert_db_alerts_to_dto_alerts(alerts) + @root_validator(pre=True) def set_default_values(cls, values: Dict[str, Any]) -> Dict[str, Any]: # Check and set default status @@ -420,7 +436,7 @@ def from_db_incident(cls, db_incident): if isinstance(db_incident.severity, int) \ else db_incident.severity - return cls( + dto = cls( id=db_incident.id, user_generated_name=db_incident.user_generated_name, ai_generated_name = db_incident.ai_generated_name, @@ -441,6 +457,10 @@ def from_db_incident(cls, db_incident): rule_fingerprint=db_incident.rule_fingerprint, ) + # This field is required for getting alerts when required + dto._tenant_id = db_incident.tenant_id + return dto + class IncidentStatusChangeDto(BaseModel): status: IncidentStatus diff --git a/keep/api/models/db/migrations/versions/2024-09-13-10-48_938b1aa62d5c.py b/keep/api/models/db/migrations/versions/2024-09-13-10-48_938b1aa62d5c.py index 0f4dd1963..72a8082fc 100644 --- a/keep/api/models/db/migrations/versions/2024-09-13-10-48_938b1aa62d5c.py +++ b/keep/api/models/db/migrations/versions/2024-09-13-10-48_938b1aa62d5c.py @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision = "938b1aa62d5c" -down_revision = "c5443d9deb0f" +down_revision = "1aacee84447e" branch_labels = None depends_on = None diff --git a/keep/api/models/db/migrations/versions/2024-09-11-23-30_c5443d9deb0f.py b/keep/api/models/db/migrations/versions/2024-09-17-23-30_c5443d9deb0f.py similarity index 96% rename from keep/api/models/db/migrations/versions/2024-09-11-23-30_c5443d9deb0f.py rename to keep/api/models/db/migrations/versions/2024-09-17-23-30_c5443d9deb0f.py index bc35f2fe1..9ecb5c1cc 100644 --- a/keep/api/models/db/migrations/versions/2024-09-11-23-30_c5443d9deb0f.py +++ b/keep/api/models/db/migrations/versions/2024-09-17-23-30_c5443d9deb0f.py @@ -12,7 +12,7 @@ # revision identifiers, used by Alembic. revision = "c5443d9deb0f" -down_revision = "1aacee84447e" +down_revision = "938b1aa62d5c" branch_labels = None depends_on = None diff --git a/keep/api/models/db/migrations/versions/2024-09-18-02-05_772790c2e50a.py b/keep/api/models/db/migrations/versions/2024-09-18-02-05_772790c2e50a.py new file mode 100644 index 000000000..e04890ab3 --- /dev/null +++ b/keep/api/models/db/migrations/versions/2024-09-18-02-05_772790c2e50a.py @@ -0,0 +1,38 @@ +"""add WorkflowToIncidentExecution + +Revision ID: 772790c2e50a +Revises: 49e7c02579db +Create Date: 2024-09-08 02:05:42.739163 + +""" + +import sqlalchemy as sa +import sqlmodel +from alembic import op + +# revision identifiers, used by Alembic. 
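+# Migration chain after this patch, per the down_revision edits above:
+#   1aacee84447e -> 938b1aa62d5c -> c5443d9deb0f -> 772790c2e50a -> 5d7ae55efc6a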
+revision = "772790c2e50a" +down_revision = "c5443d9deb0f" +branch_labels = None +depends_on = None + + +def upgrade() -> None: + op.create_table( + "workflowtoincidentexecution", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column( + "workflow_execution_id", sqlmodel.sql.sqltypes.AutoString(), nullable=False + ), + sa.Column("incident_id", sqlmodel.sql.sqltypes.AutoString(), nullable=True), + sa.ForeignKeyConstraint( + ["workflow_execution_id"], + ["workflowexecution.id"], + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint("workflow_execution_id", "incident_id"), + ) + + +def downgrade() -> None: + op.drop_table("workflowtoincidentexecution") diff --git a/keep/api/models/db/migrations/versions/2024-09-18-14-08_5d7ae55efc6a.py b/keep/api/models/db/migrations/versions/2024-09-18-14-08_5d7ae55efc6a.py index c0c825b1b..8f0ce6c47 100644 --- a/keep/api/models/db/migrations/versions/2024-09-18-14-08_5d7ae55efc6a.py +++ b/keep/api/models/db/migrations/versions/2024-09-18-14-08_5d7ae55efc6a.py @@ -11,7 +11,7 @@ # revision identifiers, used by Alembic. revision = "5d7ae55efc6a" -down_revision = "938b1aa62d5c" +down_revision = "772790c2e50a" branch_labels = None depends_on = None diff --git a/keep/api/models/db/workflow.py b/keep/api/models/db/workflow.py index 3426a9560..f243b51f0 100644 --- a/keep/api/models/db/workflow.py +++ b/keep/api/models/db/workflow.py @@ -53,6 +53,9 @@ class WorkflowExecution(SQLModel, table=True): workflow_to_alert_execution: "WorkflowToAlertExecution" = Relationship( back_populates="workflow_execution" ) + workflow_to_incident_execution: "WorkflowToIncidentExecution" = Relationship( + back_populates="workflow_execution" + ) class Config: orm_mode = True @@ -71,6 +74,18 @@ class WorkflowToAlertExecution(SQLModel, table=True): ) +class WorkflowToIncidentExecution(SQLModel, table=True): + __table_args__ = (UniqueConstraint("workflow_execution_id", "incident_id"),) + + # https://sqlmodel.tiangolo.com/tutorial/automatic-id-none-refresh/ + id: Optional[int] = Field(primary_key=True, default=None) + workflow_execution_id: str = Field(foreign_key="workflowexecution.id") + incident_id: str | None + workflow_execution: WorkflowExecution = Relationship( + back_populates="workflow_to_incident_execution" + ) + + class WorkflowExecutionLog(SQLModel, table=True): id: int = Field(default=None, primary_key=True) workflow_execution_id: str = Field(foreign_key="workflowexecution.id") diff --git a/keep/api/routes/incidents.py b/keep/api/routes/incidents.py index 6819f4322..ca2bf87aa 100644 --- a/keep/api/routes/incidents.py +++ b/keep/api/routes/incidents.py @@ -38,6 +38,7 @@ ) from keep.identitymanager.authenticatedentity import AuthenticatedEntity from keep.identitymanager.identitymanagerfactory import IdentityManagerFactory +from keep.workflowmanager.workflowmanager import WorkflowManager router = APIRouter() logger = logging.getLogger(__name__) @@ -118,6 +119,21 @@ def create_incident_endpoint( }, ) __update_client_on_incident_change(pusher_client, tenant_id) + + try: + workflow_manager = WorkflowManager.get_instance() + logger.info("Adding incident to the workflow manager queue") + workflow_manager.insert_incident(tenant_id, new_incident_dto, "created") + logger.info("Added incident to the workflow manager queue") + except Exception: + logger.exception( + "Failed to run workflows based on incident", + extra={ + "incident_id": new_incident_dto.id, + "tenant_id": tenant_id + }, + ) + return new_incident_dto @@ -219,7 +235,19 @@ def update_incident( raise 
HTTPException(status_code=404, detail="Incident not found") new_incident_dto = IncidentDto.from_db_incident(incident) - + try: + workflow_manager = WorkflowManager.get_instance() + logger.info("Adding incident to the workflow manager queue") + workflow_manager.insert_incident(tenant_id, new_incident_dto, "updated") + logger.info("Added incident to the workflow manager queue") + except Exception: + logger.exception( + "Failed to run workflows based on incident", + extra={ + "incident_id": new_incident_dto.id, + "tenant_id": tenant_id + }, + ) return new_incident_dto @@ -242,10 +270,30 @@ def delete_incident( "tenant_id": tenant_id, }, ) + + incident = get_incident_by_id(tenant_id=tenant_id, incident_id=incident_id) + if not incident: + raise HTTPException(status_code=404, detail="Incident not found") + + incident_dto = IncidentDto.from_db_incident(incident) + deleted = delete_incident_by_id(tenant_id=tenant_id, incident_id=incident_id) if not deleted: raise HTTPException(status_code=404, detail="Incident not found") __update_client_on_incident_change(pusher_client, tenant_id) + try: + workflow_manager = WorkflowManager.get_instance() + logger.info("Adding incident to the workflow manager queue") + workflow_manager.insert_incident(tenant_id, incident_dto, "deleted") + logger.info("Added incident to the workflow manager queue") + except Exception: + logger.exception( + "Failed to run workflows based on incident", + extra={ + "incident_id": incident_dto.id, + "tenant_id": tenant_id + }, + ) return Response(status_code=202) @@ -329,6 +377,22 @@ async def add_alerts_to_incident( add_alerts_to_incident_by_incident_id(tenant_id, incident_id, alert_ids) __update_client_on_incident_change(pusher_client, tenant_id, incident_id) + incident_dto = IncidentDto.from_db_incident(incident) + + try: + workflow_manager = WorkflowManager.get_instance() + logger.info("Adding incident to the workflow manager queue") + workflow_manager.insert_incident(tenant_id, incident_dto, "updated") + logger.info("Added incident to the workflow manager queue") + except Exception: + logger.exception( + "Failed to run workflows based on incident", + extra={ + "incident_id": incident_dto.id, + "tenant_id": tenant_id + }, + ) + fingerprints_count = get_incident_unique_fingerprint_count(tenant_id, incident_id) if ( diff --git a/keep/contextmanager/contextmanager.py b/keep/contextmanager/contextmanager.py index 953a97757..2f5611fa7 100644 --- a/keep/contextmanager/contextmanager.py +++ b/keep/contextmanager/contextmanager.py @@ -22,6 +22,7 @@ def __init__(self, tenant_id, workflow_id=None, workflow_execution_id=None): self.providers_context = {} self.actions_context = {} self.event_context = {} + self.incident_context = {} self.foreach_context = { "value": None, } @@ -78,6 +79,9 @@ def get_logger(self): def set_event_context(self, event): self.event_context = event + def set_incident_context(self, incident): + self.incident_context = incident + def get_workflow_id(self): return self.workflow_id @@ -104,6 +108,7 @@ def get_full_context(self, exclude_providers=False, exclude_env=False): "event": self.event_context, "last_workflow_results": self.last_workflow_execution_results, "alert": self.event_context, # this is an alias so workflows will be able to use alert.source + "incident": self.incident_context, # this is an alias so workflows will be able to use alert.source } if not exclude_providers: diff --git a/keep/step/step.py b/keep/step/step.py index be7fae881..4b0caede4 100644 --- a/keep/step/step.py +++ b/keep/step/step.py @@ 
-91,8 +91,10 @@ def _get_foreach_items(self) -> list | list[list]: index = [i.strip() for i in index] items = self.context_manager.get_full_context() for i in index: - # try to get it as a dict - items = items.get(i, {}) + if isinstance(items, dict): + items = items.get(i, {}) + else: + items = getattr(items, i, {}) foreach_items.append(items) if not foreach_items: return [] diff --git a/keep/workflowmanager/workflowmanager.py b/keep/workflowmanager/workflowmanager.py index d50ab8939..acbdb286b 100644 --- a/keep/workflowmanager/workflowmanager.py +++ b/keep/workflowmanager/workflowmanager.py @@ -4,13 +4,15 @@ import typing import uuid +from pandas.core.common import flatten + from keep.api.core.config import AuthenticationType, config from keep.api.core.db import ( get_enrichment, get_previous_alert_by_fingerprint, save_workflow_results, ) -from keep.api.models.alert import AlertDto, AlertSeverity +from keep.api.models.alert import AlertDto, AlertSeverity, IncidentDto from keep.providers.providers_factory import ProviderConfigurationException from keep.workflowmanager.workflow import Workflow from keep.workflowmanager.workflowscheduler import WorkflowScheduler @@ -68,7 +70,74 @@ def _apply_filter(self, filter_val, value): return value == str(filter_val) return value == filter_val - def insert_events(self, tenant_id, events: typing.List[AlertDto]): + def _get_workflow_from_store(self, tenant_id, workflow_model): + try: + # get the actual workflow that can be triggered + self.logger.info("Getting workflow from store") + workflow = self.workflow_store.get_workflow( + tenant_id, workflow_model.id + ) + self.logger.info("Got workflow from store") + return workflow + except ProviderConfigurationException: + self.logger.exception( + "Workflow have a provider that is not configured", + extra={ + "workflow_id": workflow_model.id, + "tenant_id": tenant_id, + }, + ) + except Exception: + self.logger.exception( + "Error getting workflow", + extra={ + "workflow_id": workflow_model.id, + "tenant_id": tenant_id, + }, + ) + + def insert_incident(self, tenant_id: str, incident: IncidentDto, trigger: str): + all_workflow_models = self.workflow_store.get_all_workflows(tenant_id) + self.logger.info( + "Got all workflows", + extra={ + "num_of_workflows": len(all_workflow_models), + }, + ) + for workflow_model in all_workflow_models: + + if workflow_model.is_disabled: + self.logger.debug( + f"Skipping the workflow: id={workflow_model.id}, name={workflow_model.name}, " + f"tenant_id={workflow_model.tenant_id} - Workflow is disabled." 
+                )
+                continue
+            workflow = self._get_workflow_from_store(tenant_id, workflow_model)
+            if workflow is None:
+                continue
+
+            incident_triggers = flatten(
+                [t.get("events", []) for t in workflow.workflow_triggers if t["type"] == "incident"]
+            )
+
+            if trigger not in incident_triggers:
+                self.logger.debug("workflow does not contain trigger %s, skipping", trigger)
+                continue
+
+            self.logger.info("Adding workflow to run")
+            with self.scheduler.lock:
+                self.scheduler.workflows_to_run.append(
+                    {
+                        "workflow": workflow,
+                        "workflow_id": workflow_model.id,
+                        "tenant_id": tenant_id,
+                        "triggered_by": "incident:{}".format(trigger),
+                        "event": incident,
+                    }
+                )
+            self.logger.info("Workflow added to run")
+
+    def insert_events(self, tenant_id, events: typing.List[AlertDto | IncidentDto]):
         for event in events:
             self.logger.info("Getting all workflows")
             all_workflow_models = self.workflow_store.get_all_workflows(tenant_id)
@@ -79,37 +148,17 @@ def insert_events(self, tenant_id, events: typing.List[AlertDto]):
                 },
             )
             for workflow_model in all_workflow_models:
+
                 if workflow_model.is_disabled:
                     self.logger.debug(
                         f"Skipping the workflow: id={workflow_model.id}, name={workflow_model.name}, "
                         f"tenant_id={workflow_model.tenant_id} - Workflow is disabled."
                     )
                     continue
-                try:
-                    # get the actual workflow that can be triggered
-                    self.logger.info("Getting workflow from store")
-                    workflow = self.workflow_store.get_workflow(
-                        tenant_id, workflow_model.id
-                    )
-                    self.logger.info("Got workflow from store")
-                except ProviderConfigurationException:
-                    self.logger.exception(
-                        "Workflow have a provider that is not configured",
-                        extra={
-                            "workflow_id": workflow_model.id,
-                            "tenant_id": tenant_id,
-                        },
-                    )
-                    continue
-                except Exception:
-                    self.logger.exception(
-                        "Error getting workflow",
-                        extra={
-                            "workflow_id": workflow_model.id,
-                            "tenant_id": tenant_id,
-                        },
-                    )
+                workflow = self._get_workflow_from_store(tenant_id, workflow_model)
+                if workflow is None:
                     continue
+
                 for trigger in workflow.workflow_triggers:
                     # TODO: handle it better
                     if not trigger.get("type") == "alert":
@@ -371,7 +420,8 @@ def _run_workflow(
 
         return [errors, results]
 
-    def _get_workflow_results(self, workflow: Workflow):
+    @staticmethod
+    def _get_workflow_results(workflow: Workflow):
         """
         Get the results of the workflow from the DB.
 
@@ -381,8 +431,7 @@ def _get_workflow_results(self, workflow: Workflow):
 
         Returns:
             dict: The results of the workflow.
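+
+            A sketch of the returned shape (the action name is illustrative,
+            borrowed from the test below, not part of the contract):
+                {"mock-action": ['"incident: incident"\n']}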
""" - print("workflowssssss", workflow.workflow_actions) - print(workflow.workflow_steps) + workflow_results = { action.name: action.provider.results for action in workflow.workflow_actions } diff --git a/keep/workflowmanager/workflowscheduler.py b/keep/workflowmanager/workflowscheduler.py index 693b5000b..c7df0b534 100644 --- a/keep/workflowmanager/workflowscheduler.py +++ b/keep/workflowmanager/workflowscheduler.py @@ -15,7 +15,7 @@ from keep.api.core.db import get_enrichment, get_previous_execution_id from keep.api.core.db import get_workflow as get_workflow_db from keep.api.core.db import get_workflows_that_should_run -from keep.api.models.alert import AlertDto +from keep.api.models.alert import AlertDto, IncidentDto from keep.providers.providers_factory import ProviderConfigurationException from keep.workflowmanager.workflow import Workflow, WorkflowStrategy from keep.workflowmanager.workflowstore import WorkflowStore @@ -57,10 +57,11 @@ def _handle_interval_workflows(self): pass for workflow in workflows: self.logger.debug("Running workflow on background") + + workflow_execution_id = workflow.get("workflow_execution_id") + tenant_id = workflow.get("tenant_id") + workflow_id = workflow.get("workflow_id") try: - workflow_execution_id = workflow.get("workflow_execution_id") - tenant_id = workflow.get("tenant_id") - workflow_id = workflow.get("workflow_id") workflow = self.workflow_store.get_workflow(tenant_id, workflow_id) except ProviderConfigurationException: self.logger.exception( @@ -106,8 +107,13 @@ def _run_workflow( ): self.logger.info(f"Running workflow {workflow.workflow_id}...") try: - # set the event context, e.g. the event that triggered the workflow - workflow.context_manager.set_event_context(event_context) + if isinstance(event_context, AlertDto): + # set the event context, e.g. the event that triggered the workflow + workflow.context_manager.set_event_context(event_context) + else: + # set the incident context, e.g. the incident that triggered the workflow + workflow.context_manager.set_incident_context(event_context) + errors, _ = self.workflow_manager._run_workflow( workflow, workflow_execution_id ) @@ -216,6 +222,7 @@ def handle_manual_event_workflow( execution_number=unique_execution_number, fingerprint=alert.fingerprint, event_id=alert.event_id, + event_type="alert" ) self.logger.info(f"Workflow execution id: {workflow_execution_id}") # This is kinda WTF exception since create_workflow_execution shouldn't fail for manual @@ -313,13 +320,26 @@ def _handle_event_workflows(self): continue event = workflow_to_run.get("event") + triggered_by = workflow_to_run.get("triggered_by") if triggered_by == "manual": triggered_by_user = workflow_to_run.get("triggered_by_user") triggered_by = f"manually by {triggered_by_user}" + elif triggered_by.startswith("incident:"): + triggered_by = f"type:{triggered_by} name:{event.name} id:{event.id}" else: triggered_by = f"type:alert name:{event.name} id:{event.id}" + if isinstance(event, IncidentDto): + event_id = str(event.id) + event_type = "incident" + fingerprint = "incident:{}".format(event_id) + else: + event_id = event.event_id + event_type = "alert" + fingerprint = event.fingerprint + + # In manual, we create the workflow execution id sync so it could be tracked by the caller (UI) # In event (e.g. 
             # In manual, we create the workflow execution id sync so it could be tracked by the caller (UI)
             # In event (e.g. alarm), we will create it here
             if not workflow_execution_id:
@@ -333,16 +353,17 @@ def _handle_event_workflows(self):
             # else, we want to enforce that no workflow already run with the same fingerprint
             else:
                 workflow_execution_number = self._get_unique_execution_number(
-                    event.fingerprint
+                    fingerprint
                 )
                 workflow_execution_id = create_workflow_execution(
                     workflow_id=workflow_id,
                     tenant_id=tenant_id,
                     triggered_by=triggered_by,
                     execution_number=workflow_execution_number,
-                    fingerprint=event.fingerprint,
-                    event_id=event.event_id,
+                    fingerprint=fingerprint,
+                    event_id=event_id,
                     execution_id=execution_id,
+                    event_type=event_type,
                 )
             # If there is already running workflow from the same event
             except IntegrityError:
@@ -404,7 +425,7 @@ def _handle_event_workflows(self):
             # - the second one will wait for the next iteration
             # - on the next iteration, the second alert is enriched with the ticket_url
             # and will trigger a workflow that will update the ticket with "resolved"
-            if workflow_to_run.get("retry", False):
+            if workflow_to_run.get("retry", False) and isinstance(event, AlertDto):
                 try:
                     self.logger.info(
                         "Updating enrichments for workflow after retry",
diff --git a/tests/test_workflow_execution.py b/tests/test_workflow_execution.py
index b772391fd..af04791d4 100644
--- a/tests/test_workflow_execution.py
+++ b/tests/test_workflow_execution.py
@@ -4,10 +4,11 @@
 import pytest
 import pytz
+import time
 
 from keep.api.core.db import get_last_workflow_execution_by_workflow_id
 from keep.api.core.dependencies import SINGLE_TENANT_UUID
-from keep.api.models.alert import AlertDto, AlertStatus
+from keep.api.models.alert import AlertDto, AlertStatus, IncidentDto
 from keep.api.models.db.workflow import Workflow
 from keep.workflowmanager.workflowmanager import WorkflowManager
@@ -575,3 +576,135 @@ def test_workflow_execution_with_disabled_workflow(
 
     assert enabled_workflow_execution.status == "success"
     assert disabled_workflow_execution is None
+
+
+
+workflow_definition_4 = """workflow:
+id: incident-triggers-test-created-updated
+description: test incident triggers
+triggers:
+- type: incident
+  events:
+  - updated
+  - created
+name: created-updated
+owners: []
+services: []
+steps: []
+actions:
+- name: mock-action
+  provider:
+    type: console
+    with:
+      message: |
+        "incident: {{ incident.name }}"
+"""
+
+workflow_definition_5 = """workflow:
+id: incident-incident-triggers-test-deleted
+description: test incident triggers
+triggers:
+- type: incident
+  events:
+  - deleted
+name: deleted
+owners: []
+services: []
+steps: []
+actions:
+- name: mock-action
+  provider:
+    type: console
+    with:
+      message: |
+        "deleted incident: {{ incident.name }}"
+"""
+
+
+def test_workflow_incident_triggers(
+    db_session,
+    workflow_manager,
+):
+    workflow_created = Workflow(
+        id="incident-triggers-test-created-updated",
+        name="incident-triggers-test-created-updated",
+        tenant_id=SINGLE_TENANT_UUID,
+        description="Check that incident triggers work",
+        created_by="test@keephq.dev",
+        interval=0,
+        workflow_raw=workflow_definition_4,
+    )
+    db_session.add(workflow_created)
+    db_session.commit()
+
+    # Create an incident to feed through the incident triggers
+    incident = IncidentDto(
+        id="ba9ddbb9-3a83-40fc-9ace-1e026e08ca2b",
+        user_generated_name="incident",
+        alerts_count=0,
+        alert_sources=[],
+        services=[],
+        severity="critical",
+        is_predicted=False,
+        is_confirmed=True,
+    )
+
+    # Helper to wait for the triggered workflow execution
+
+    def wait_workflow_execution(workflow_id):
+        # Wait for the workflow execution to complete
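+        # A minimal polling sketch: the execution record is created
+        # asynchronously by the scheduler thread, so poll the DB until a run
+        # exists and reports success; the 1s interval and 30-try cap are
+        # arbitrary test-friendly bounds.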
+        workflow_execution = None
+        count = 0
+        status = None
+        while (workflow_execution is None or status != "success") and count < 30:
+            workflow_execution = get_last_workflow_execution_by_workflow_id(
+                SINGLE_TENANT_UUID, workflow_id
+            )
+            if workflow_execution is not None:
+                status = workflow_execution.status
+            time.sleep(1)
+            count += 1
+        return workflow_execution
+
+    workflow_manager.insert_incident(SINGLE_TENANT_UUID, incident, "created")
+    assert len(workflow_manager.scheduler.workflows_to_run) == 1
+
+    workflow_execution_created = wait_workflow_execution("incident-triggers-test-created-updated")
+    assert workflow_execution_created is not None
+    assert workflow_execution_created.status == "success"
+    assert workflow_execution_created.results['mock-action'] == ['"incident: incident"\n']
+    assert len(workflow_manager.scheduler.workflows_to_run) == 0
+
+    workflow_manager.insert_incident(SINGLE_TENANT_UUID, incident, "updated")
+    assert len(workflow_manager.scheduler.workflows_to_run) == 1
+    workflow_execution_updated = wait_workflow_execution("incident-triggers-test-created-updated")
+    assert workflow_execution_updated is not None
+    assert workflow_execution_updated.status == "success"
+    assert workflow_execution_updated.results['mock-action'] == ['"incident: incident"\n']
+
+    # incident-triggers-test-created-updated should not be triggered by "deleted"
+    workflow_manager.insert_incident(SINGLE_TENANT_UUID, incident, "deleted")
+    assert len(workflow_manager.scheduler.workflows_to_run) == 0
+
+    workflow_deleted = Workflow(
+        id="incident-triggers-test-deleted",
+        name="incident-triggers-test-deleted",
+        tenant_id=SINGLE_TENANT_UUID,
+        description="Check that incident triggers work",
+        created_by="test@keephq.dev",
+        interval=0,
+        workflow_raw=workflow_definition_5,
+    )
+    db_session.add(workflow_deleted)
+    db_session.commit()
+
+    workflow_manager.insert_incident(SINGLE_TENANT_UUID, incident, "deleted")
+    assert len(workflow_manager.scheduler.workflows_to_run) == 1
+
+    # incident-triggers-test-deleted should be triggered now
+    workflow_execution_deleted = wait_workflow_execution("incident-triggers-test-deleted")
+    assert len(workflow_manager.scheduler.workflows_to_run) == 0
+
+    assert workflow_execution_deleted is not None
+    assert workflow_execution_deleted.status == "success"
+    assert workflow_execution_deleted.results['mock-action'] == ['"deleted incident: incident"\n']
\ No newline at end of file

From 5db7c6e21604a67ef08da46a69f817e6ef454c5c Mon Sep 17 00:00:00 2001
From: Tal
Date: Thu, 19 Sep 2024 12:44:57 +0300
Subject: [PATCH 15/16] chore(logs): move some topology logs to debug (#1970)

---
 keep/api/bl/enrichments_bl.py           | 5 ++++-
 keep/api/routes/preset.py               | 9 +++++----
 keep/api/tasks/process_topology_task.py | 2 +-
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/keep/api/bl/enrichments_bl.py b/keep/api/bl/enrichments_bl.py
index 3cec23e0e..d8f18691c 100644
--- a/keep/api/bl/enrichments_bl.py
+++ b/keep/api/bl/enrichments_bl.py
@@ -278,7 +278,10 @@ def _check_alert_matches_rule(self, alert: AlertDto, rule: MappingRule) -> bool:
         )
 
         if not topology_service:
-            self.logger.warning("No topology service found to match on")
+            self.logger.debug(
+                "No topology service found to match on",
+                extra={"matcher_value": matcher_value},
+            )
         else:
             enrichments = topology_service.dict(exclude_none=True)
             # Remove redundant fields
diff --git a/keep/api/routes/preset.py b/keep/api/routes/preset.py
index 53fa09f85..45f46c983 100644
--- a/keep/api/routes/preset.py
+++ b/keep/api/routes/preset.py
@@ -102,6 +102,9 @@ def pull_data_from_providers(
             f"Pulling alerts from provider {provider.type} ({provider.id})",
             extra=extra,
         )
+        # Even if we fail to process some events, let's save the last pull time so we don't iterate this process over and over again.
+        update_provider_last_pull_time(tenant_id=tenant_id, provider_id=provider.id)
+
         provider_class = ProvidersFactory.get_provider(
             context_manager=context_manager,
             provider_id=provider.id,
@@ -121,7 +124,8 @@ def pull_data_from_providers(
             logger.info("Pulling topology data", extra=extra)
             topology_data = provider_class.pull_topology()
             logger.info(
-                "Pulling topology data finished, processing", extra=extra
+                "Pulling topology data finished, processing",
+                extra={**extra, "topology_length": len(topology_data)},
             )
             process_topology(
                 tenant_id, topology_data, provider.id, provider.type
@@ -155,9 +159,6 @@ def pull_data_from_providers(
                 f"Unknown error pulling from provider {provider.type} ({provider.id})",
                 extra=extra,
             )
-        finally:
-            # Even if we failed at processing some event, lets save the last pull time to not iterate this process over and over again.
-            update_provider_last_pull_time(tenant_id=tenant_id, provider_id=provider.id)
     logger.info(
         "Pulling data from providers completed",
         extra={
diff --git a/keep/api/tasks/process_topology_task.py b/keep/api/tasks/process_topology_task.py
index 5d65de035..b681dcd54 100644
--- a/keep/api/tasks/process_topology_task.py
+++ b/keep/api/tasks/process_topology_task.py
@@ -72,7 +72,7 @@ def process_topology(
         service_id = service_to_keep_service_id_map.get(service.service)
         depends_on_service_id = service_to_keep_service_id_map.get(dependency)
         if not service_id or not depends_on_service_id:
-            logger.warning(
+            logger.debug(
                 "Found a dangling service, skipping",
                 extra={"service": service.service, "dependency": dependency},
             )

From 879ce157c514315202dcf8213b9d757f54558150 Mon Sep 17 00:00:00 2001
From: Rajesh Jonnalagadda <38752904+rajeshj11@users.noreply.github.com>
Date: Thu, 19 Sep 2024 17:57:44 +0530
Subject: [PATCH 16/16] chore: fixed the disabled input issue and auto deploy issue (#1972)

---
 keep-ui/app/workflows/builder/builder-store.tsx | 4 ++--
 keep-ui/app/workflows/builder/editors.tsx       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/keep-ui/app/workflows/builder/builder-store.tsx b/keep-ui/app/workflows/builder/builder-store.tsx
index 3bc6fa0f3..202c32c19 100644
--- a/keep-ui/app/workflows/builder/builder-store.tsx
+++ b/keep-ui/app/workflows/builder/builder-store.tsx
@@ -303,10 +303,10 @@ const useStore = create((set, get) => ({
       });
     }
   },
-  setV2Properties: (properties) => set({ v2Properties: properties }),
+  setV2Properties: (properties) => set({ v2Properties: properties, canDeploy: false }),
   updateV2Properties: (properties) => {
     const updatedProperties = { ...get().v2Properties, ...properties };
-    set({ v2Properties: updatedProperties, changes: get().changes + 1 });
+    set({ v2Properties: updatedProperties, changes: get().changes + 1, canDeploy: false });
   },
   setSelectedNode: (id) => {
     set({
diff --git a/keep-ui/app/workflows/builder/editors.tsx b/keep-ui/app/workflows/builder/editors.tsx
index 1df59f1f4..cb34b2c51 100644
--- a/keep-ui/app/workflows/builder/editors.tsx
+++ b/keep-ui/app/workflows/builder/editors.tsx
@@ -413,7 +413,7 @@ function WorkflowEditorV2({
             }
             value={properties[key] || ""as string}
           />);
-      case "isabled":
+      case "disabled":
        return (