Skip to content

Commit

Permalink
v1.3.81
Browse files Browse the repository at this point in the history
  • Loading branch information
joeyorlando authored Dec 28, 2023
2 parents bfa7442 + 33fcb87 commit b6a4448
Show file tree
Hide file tree
Showing 59 changed files with 856 additions and 266 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ venv

yarn.lock
node_modules

test-results
8 changes: 8 additions & 0 deletions .prettierrc.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
overrides: [
{
files: ["*.yml", "*.yaml"],
options: {
singleQuote: false,
},
},
];
23 changes: 23 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,29 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

## v1.3.81 (2023-12-28)

### Added

- Support e2e tests in Tilt and Makefile ([#3516](https://github.com/grafana/oncall/pull/3516))
- Support PATCH method for outgoing webhooks by @ravishankar15 ([#3580](https://github.com/grafana/oncall/pull/3580))

### Changed

- Limit acknowledge reminders to stop repeating after 1 month @mderynck ([#3571](https://github.com/grafana/oncall/pull/3571))

### Fixed

- Check reason to skip notification in Slack to avoid task perform_notification retries @Ferril ([#3562](https://github.com/grafana/oncall/pull/3562))
- Fix alert group table columns validation @Ferril ([#3577](https://github.com/grafana/oncall/pull/3577))
- Fix posting message about rate limit to Slack @Ferril ([#3582](https://github.com/grafana/oncall/pull/3582))
- Fix issue with parsing sender email address from email message for inbound email integration endpoint @Ferril ([#3586](https://github.com/grafana/oncall/pull/3586))
- Fix PUT /api/v1/escalation_policies/id issue when updating `from_time` and `to_time` by @joeyorlando ([#3581](https://github.com/grafana/oncall/pull/3581))
- Fix issue where duplicate team options would show up in the teams dropdown for the `/escalate` Slack command
by @joeyorlando ([#3590](https://github.com/grafana/oncall/pull/3590))

## v1.3.80 (2023-12-14)

### Added
Expand Down
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,15 @@ engine-manage: ## run Django's `manage.py` script, inside of a docker container
## https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-makemigrations
$(call run_engine_docker_command,python manage.py $(CMD))

# E2E (Playwright) helpers — each target delegates to a yarn script defined in
# grafana-plugin/package.json, so they must be run from a machine with yarn installed.
test-e2e: ## run the e2e tests in headless mode
	yarn --cwd grafana-plugin test:e2e

test-e2e-watch: ## start e2e tests in watch mode
	yarn --cwd grafana-plugin test:e2e:watch

test-e2e-show-report: ## open last e2e test report
	yarn --cwd grafana-plugin playwright show-report

ui-test: ## run the UI tests
$(call run_ui_docker_command,yarn test)

Expand Down
51 changes: 49 additions & 2 deletions Tiltfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
load('ext://uibutton', 'cmd_button', 'location', 'text_input', 'bool_input')
running_under_parent_tiltfile = os.getenv("TILT_PARENT", "false") == "true"
# The user/pass that you will login to Grafana with
grafana_admin_user_pass = os.getenv("GRAFANA_ADMIN_USER_PASS", "oncall")
Expand Down Expand Up @@ -36,7 +37,7 @@ docker_build_sub(
"localhost:63628/oncall/engine:dev",
context="./engine",
cache_from=["grafana/oncall:latest", "grafana/oncall:dev"],
ignore=["./grafana-plugin/test-results/", "./grafana-plugin/dist/", "./grafana-plugin/e2e-tests/"],
ignore=["./test-results/", "./grafana-plugin/dist/", "./grafana-plugin/e2e-tests/"],
child_context=".",
target="dev",
extra_cmds=["ADD ./grafana-plugin/src/plugin.json /etc/grafana-plugin/src/plugin.json"],
Expand All @@ -54,10 +55,56 @@ local_resource(
"build-ui",
labels=["OnCallUI"],
cmd="cd grafana-plugin && yarn install && yarn build:dev",
serve_cmd="cd grafana-plugin && ONCALL_API_URL=http://oncall-dev-engine:8080 yarn watch",
serve_cmd="cd grafana-plugin && yarn watch",
allow_parallel=True,
)

# Manual-only e2e test runner: not started on `tilt up` (auto_init=False) and only
# run when triggered from the Tilt UI (TRIGGER_MODE_MANUAL). Depends on the UI build,
# Grafana, the provisioning configmap, and the engine being up first.
local_resource(
    "e2e-tests",
    labels=["E2eTests"],
    cmd="cd grafana-plugin && yarn test:e2e",
    trigger_mode=TRIGGER_MODE_MANUAL,
    auto_init=False,
    resource_deps=["build-ui", "grafana", "grafana-oncall-app-provisioning-configmap", "engine"]
)

# UI button on the e2e-tests resource: re-run the suite headlessly.
# The inputs are surfaced to the command as environment variables; only
# STOP_ON_FIRST_FAILURE is interpolated into argv here — presumably the
# test:e2e script reads BROWSERS and REPORTER from the environment
# (TODO confirm against grafana-plugin's playwright config).
cmd_button(
    name="E2E Tests - headless run",
    argv=["sh", "-c", "yarn --cwd ./grafana-plugin test:e2e $STOP_ON_FIRST_FAILURE"],
    text="Restart headless run",
    resource="e2e-tests",
    icon_name="replay",
    inputs=[
        text_input("BROWSERS", "Browsers (e.g. \"chromium,firefox,webkit\")", "chromium", "chromium,firefox,webkit"),
        bool_input("REPORTER", "Use HTML reporter", True, 'html', 'line'),
        bool_input("STOP_ON_FIRST_FAILURE", "Stop on first failure", True, "-x", ""),
    ]
)

# UI button: open Playwright's interactive watch mode for the e2e suite.
cmd_button(
    name="E2E Tests - open watch mode",
    argv=["sh", "-c", "yarn --cwd grafana-plugin test:e2e:watch"],
    text="Open watch mode",
    resource="e2e-tests",
    icon_name="visibility",
)

# UI button: serve the HTML report from the most recent Playwright run.
cmd_button(
    name="E2E Tests - show report",
    argv=["sh", "-c", "yarn --cwd grafana-plugin playwright show-report"],
    text="Show last HTML report",
    resource="e2e-tests",
    icon_name="assignment",
)

# UI button: force-kill any running e2e test process.
# NOTE(review): matches by command line (`pgrep -f test:e2e`), so this also kills a
# watch-mode session; if nothing matches, `kill` exits non-zero — presumably acceptable
# for a best-effort stop button (TODO confirm).
cmd_button(
    name="E2E Tests - stop current run",
    argv=["sh", "-c", "kill -9 $(pgrep -f test:e2e)"],
    text="Stop",
    resource="e2e-tests",
    icon_name="dangerous",
)

yaml = helm("helm/oncall", name=HELM_PREFIX, values=["./dev/helm-local.yml", "./dev/helm-local.dev.yml"])

k8s_yaml(yaml)
Expand Down
19 changes: 12 additions & 7 deletions dev/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -243,13 +243,18 @@ are run on pull request CI builds. New features should ideally include a new/mod

To run these tests locally simply do the following:

```bash
npx playwright install # install playwright dependencies
cp ./grafana-plugin/e2e-tests/.env.example ./grafana-plugin/e2e-tests/.env
# you may need to tweak the values in ./grafana-plugin/.env according to your local setup
cd grafana-plugin
yarn test:e2e
```
1. Install Playwright dependencies with `npx playwright install`
2. [Launch the environment](#launch-the-environment)
3. Then you interact with tests in 2 different ways:
1. Using `Tilt` - open _E2eTests_ section where you will find 4 buttons:
1. Restart headless run (you can configure browsers, reporter and failure allowance there)
2. Open watch mode
3. Show last HTML report
4. Stop (stops any pending e2e test process)
2. Using `make`:
1. `make test-e2e` to start headless run
2. `make test-e2e-watch` to open watch mode
3. `make test-e2e-show-report` to open last HTML report

## Helm unit tests

Expand Down
2 changes: 1 addition & 1 deletion dev/helm-local.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
base_url: localhost:30001
base_url: localhost:8080
base_url_protocol: http
env:
- name: GRAFANA_CLOUD_NOTIFICATIONS_ENABLED
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytz
from celery import uuid as celery_uuid
from dateutil.parser import parse
from django.utils import timezone
from django.utils.functional import cached_property
from rest_framework.exceptions import ValidationError

Expand Down Expand Up @@ -212,6 +213,12 @@ def pause_escalation(self) -> bool:
return False
return self.raw_escalation_snapshot.get("pause_escalation", False)

@property
def last_active_escalation_policy_order(self) -> typing.Optional[int]:
    """Order of the most recently executed escalation policy step.

    Returns None when there is no escalation snapshot, or when the snapshot
    has no "last_active_escalation_policy_order" entry.
    """
    snapshot = self.raw_escalation_snapshot
    return snapshot.get("last_active_escalation_policy_order") if snapshot else None

@property
def next_step_eta(self) -> typing.Optional[datetime.datetime]:
"""
Expand All @@ -223,6 +230,19 @@ def next_step_eta(self) -> typing.Optional[datetime.datetime]:
raw_next_step_eta = self.raw_escalation_snapshot.get("next_step_eta")
return None if not raw_next_step_eta else parse(raw_next_step_eta).replace(tzinfo=pytz.UTC)

def next_step_eta_is_valid(self) -> typing.Optional[bool]:
    """
    Sanity-check `next_step_eta` against the current time.

    `next_step_eta` should be pushed forward as escalation policy steps are
    executed, so it should never fall behind the current time (a 5 minute
    grace buffer is allowed). A stale value means an escalation policy step
    has been missed or is substantially delayed.

    Returns None when `next_step_eta` is unset, otherwise the boolean result
    of the time comparison.
    """
    eta = self.next_step_eta
    if eta is None:
        return None
    oldest_acceptable = timezone.now() - datetime.timedelta(minutes=5)
    return eta > oldest_acceptable

def update_next_step_eta(self, increase_by_timedelta: datetime.timedelta) -> typing.Optional[dict]:
"""
update next_step_eta field directly to avoid serialization overhead
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import typing

from celery.utils.log import get_task_logger
from django.utils import timezone

from apps.alerts.escalation_snapshot.serializers import EscalationSnapshotSerializer
from apps.alerts.models.alert_group_log_record import AlertGroupLogRecord
Expand Down Expand Up @@ -90,19 +89,6 @@ def executed_escalation_policy_snapshots(self) -> typing.List["EscalationPolicyS
return []
return self.escalation_policies_snapshots[: self.last_active_escalation_policy_order + 1]

def next_step_eta_is_valid(self) -> typing.Optional[bool]:
"""
`next_step_eta` should never be less than the current time (with a 5 minute buffer provided)
as this field should be updated as the escalation policy is executed over time. If it is, this means that
an escalation policy step has been missed, or is substantially delayed
if `next_step_eta` is `None` then `None` is returned, otherwise a boolean is returned
representing the result of the time comparision
"""
if self.next_step_eta is None:
return None
return self.next_step_eta > (timezone.now() - datetime.timedelta(minutes=5))

def save_to_alert_group(self) -> None:
self.alert_group.raw_escalation_snapshot = self.convert_to_dict()
self.alert_group.save(update_fields=["raw_escalation_snapshot"])
Expand Down
1 change: 1 addition & 0 deletions engine/apps/alerts/models/alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ def skip_escalation_in_slack(self):
AlertGroup.ACCOUNT_INACTIVE,
AlertGroup.RATE_LIMITED,
AlertGroup.CHANNEL_NOT_SPECIFIED,
AlertGroup.RESTRICTED_ACTION,
)

def is_alert_a_resolve_signal(self, alert):
Expand Down
7 changes: 7 additions & 0 deletions engine/apps/alerts/tasks/acknowledge_reminder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from datetime import timedelta
from functools import partial

from django.conf import settings
from django.db import transaction
from django.utils import timezone

from common.custom_celery_tasks import shared_dedicated_queue_retry_task

Expand Down Expand Up @@ -61,6 +63,11 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str
(alert_group.pk, unacknowledge_process_id), countdown=unacknowledge_timeout
)
else:
if alert_group.started_at < timezone.now() - timedelta(days=settings.ACKNOWLEDGE_REMINDER_TASK_EXPIRY_DAYS):
task_logger.info(
f"alert group {alert_group_pk} not renewing acknowledgement reminder, started_at is too old. {log_info}"
)
return
acknowledge_reminder_task.apply_async(
(alert_group.pk, unacknowledge_process_id), countdown=acknowledge_reminder_timeout
)
Expand Down
65 changes: 48 additions & 17 deletions engine/apps/alerts/tasks/check_escalation_finished.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import requests
from celery import shared_task
from django.conf import settings
from django.db.models import Avg, F, Max
from django.db.models import Avg, F, Max, Q
from django.utils import timezone

from apps.alerts.tasks.task_logger import task_logger
Expand All @@ -29,26 +29,26 @@ def send_alert_group_escalation_auditor_task_heartbeat() -> None:


def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
escalation_snapshot = alert_group.escalation_snapshot
raw_escalation_snapshot: dict = alert_group.raw_escalation_snapshot
alert_group_id = alert_group.id
base_msg = f"Alert group {alert_group_id}"

if not alert_group.escalation_chain_exists:
if not raw_escalation_snapshot:
msg = f"{base_msg} does not have an escalation snapshot associated with it, this should never occur"

task_logger.warning(msg)
raise AlertGroupEscalationPolicyExecutionAuditException(msg)

if not raw_escalation_snapshot.get("escalation_chain_snapshot"):
task_logger.info(
f"{base_msg} does not have an escalation chain associated with it, and therefore it is expected "
"that it will not have an escalation snapshot, skipping further validation"
)
return

if not escalation_snapshot:
msg = f"{base_msg} does not have an escalation snapshot associated with it, this should never occur"

task_logger.warning(msg)
raise AlertGroupEscalationPolicyExecutionAuditException(msg)

task_logger.info(f"{base_msg} has an escalation snapshot associated with it, auditing if it executed properly")

escalation_policies_snapshots = escalation_snapshot.escalation_policies_snapshots
escalation_policies_snapshots = raw_escalation_snapshot.get("escalation_policies_snapshots")

if not escalation_policies_snapshots:
task_logger.info(
Expand All @@ -59,18 +59,19 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
f"{base_msg}'s escalation snapshot has a populated escalation_policies_snapshots, continuing validation"
)

if escalation_snapshot.next_step_eta_is_valid() is False:
msg = (
f"{base_msg}'s escalation snapshot does not have a valid next_step_eta: {escalation_snapshot.next_step_eta}"
)
if alert_group.next_step_eta_is_valid() is False:
msg = f"{base_msg}'s escalation snapshot does not have a valid next_step_eta: {alert_group.next_step_eta}"

task_logger.warning(msg)
raise AlertGroupEscalationPolicyExecutionAuditException(msg)

task_logger.info(f"{base_msg}'s escalation snapshot has a valid next_step_eta: {escalation_snapshot.next_step_eta}")
task_logger.info(f"{base_msg}'s escalation snapshot has a valid next_step_eta: {alert_group.next_step_eta}")

executed_escalation_policy_snapshots = escalation_snapshot.executed_escalation_policy_snapshots
num_of_executed_escalation_policy_snapshots = len(executed_escalation_policy_snapshots)
num_of_executed_escalation_policy_snapshots = (
alert_group.last_active_escalation_policy_order + 1
if alert_group.last_active_escalation_policy_order is not None
else 0
)

if num_of_executed_escalation_policy_snapshots == 0:
task_logger.info(
Expand All @@ -81,9 +82,39 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
f"{base_msg}'s escalation snapshot has {num_of_executed_escalation_policy_snapshots} executed escalation policies"
)

check_personal_notifications_task.apply_async((alert_group_id,))

task_logger.info(f"{base_msg} passed the audit checks")


@shared_task
def check_personal_notifications_task(alert_group_id) -> None:
    """
    Audit that personal notifications for an alert group are completing.

    Compares the number of notifications triggered at least 5 minutes ago
    (giving in-flight notifications time to finish) against the number that
    have since completed with either success or failure; any surplus of
    triggered over completed is logged as uncompleted.
    """
    from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord

    log_records = UserNotificationPolicyLogRecord.objects

    triggered = log_records.filter(
        alert_group_id=alert_group_id,
        type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED,
        notification_step=UserNotificationPolicy.Step.NOTIFY,
        created_at__lte=timezone.now() - timezone.timedelta(minutes=5),
    ).count()

    # A notification counts as completed once it is recorded as either failed or successful.
    finished_types = Q(type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED) | Q(
        type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_SUCCESS
    )
    completed = log_records.filter(
        finished_types,
        alert_group_id=alert_group_id,
        notification_step=UserNotificationPolicy.Step.NOTIFY,
    ).count()

    base_msg = f"Alert group {alert_group_id}"
    delta = triggered - completed
    if delta > 0:
        # TODO: when success notifications are setup for every backend, raise exception here
        task_logger.info(f"{base_msg} has ({delta}) uncompleted personal notifications")
    else:
        task_logger.info(f"{base_msg} personal notifications check passed")


@shared_task
def check_escalation_finished_task() -> None:
"""
Expand Down
Loading

0 comments on commit b6a4448

Please sign in to comment.