Skip to content

Commit

Permalink
v1.3.81
Browse files Browse the repository at this point in the history
  • Loading branch information
joeyorlando authored Dec 28, 2023
2 parents bfa7442 + 33fcb87 commit b6a4448
Show file tree
Hide file tree
Showing 59 changed files with 856 additions and 266 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ venv

yarn.lock
node_modules

test-results
8 changes: 8 additions & 0 deletions .prettierrc.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
overrides: [
{
files: ["*.yml", "*.yaml"],
options: {
singleQuote: false,
},
},
];
23 changes: 23 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,29 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## Unreleased

## v1.3.81 (2023-12-28)

### Added

- Support e2e tests in Tilt and Makefile ([#3516](https://github.com/grafana/oncall/pull/3516))
- Support PATCH method for outgoing webhooks by @ravishankar15 ([#3580](https://github.com/grafana/oncall/pull/3580))

### Changed

- Limit acknowledge reminders to stop repeating after 1 month @mderynck ([#3571](https://github.com/grafana/oncall/pull/3571))

### Fixed

- Check reason to skip notification in Slack to avoid task perform_notification retries @Ferril ([#3562](https://github.com/grafana/oncall/pull/3562))
- Fix alert group table columns validation @Ferril ([#3577](https://github.com/grafana/oncall/pull/3577))
- Fix posting message about rate limit to Slack @Ferril ([#3582](https://github.com/grafana/oncall/pull/3582))
- Fix issue with parsing sender email address from email message for inbound email integration endpoint @Ferril ([#3586](https://github.com/grafana/oncall/pull/3586))
- Fix PUT /api/v1/escalation_policies/id issue when updating `from_time` and `to_time` by @joeyorlando ([#3581](https://github.com/grafana/oncall/pull/3581))
- Fix issue where duplicate team options would show up in the teams dropdown for the `/escalate` Slack command
by @joeyorlando ([#3590](https://github.com/grafana/oncall/pull/3590))

## v1.3.80 (2023-12-14)

### Added
Expand Down
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,15 @@ engine-manage: ## run Django's `manage.py` script, inside of a docker container
## https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-makemigrations
$(call run_engine_docker_command,python manage.py $(CMD))

# E2E (Playwright) helpers — each target delegates to a yarn script defined in
# grafana-plugin/package.json, so they must be run from a machine with yarn installed.
test-e2e: ## run the e2e tests in headless mode
	yarn --cwd grafana-plugin test:e2e

test-e2e-watch: ## start e2e tests in watch mode
	yarn --cwd grafana-plugin test:e2e:watch

test-e2e-show-report: ## open last e2e test report
	yarn --cwd grafana-plugin playwright show-report

ui-test: ## run the UI tests
$(call run_ui_docker_command,yarn test)

Expand Down
51 changes: 49 additions & 2 deletions Tiltfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
load('ext://uibutton', 'cmd_button', 'location', 'text_input', 'bool_input')
running_under_parent_tiltfile = os.getenv("TILT_PARENT", "false") == "true"
# The user/pass that you will login to Grafana with
grafana_admin_user_pass = os.getenv("GRAFANA_ADMIN_USER_PASS", "oncall")
Expand Down Expand Up @@ -36,7 +37,7 @@ docker_build_sub(
"localhost:63628/oncall/engine:dev",
context="./engine",
cache_from=["grafana/oncall:latest", "grafana/oncall:dev"],
ignore=["./grafana-plugin/test-results/", "./grafana-plugin/dist/", "./grafana-plugin/e2e-tests/"],
ignore=["./test-results/", "./grafana-plugin/dist/", "./grafana-plugin/e2e-tests/"],
child_context=".",
target="dev",
extra_cmds=["ADD ./grafana-plugin/src/plugin.json /etc/grafana-plugin/src/plugin.json"],
Expand All @@ -54,10 +55,56 @@ local_resource(
"build-ui",
labels=["OnCallUI"],
cmd="cd grafana-plugin && yarn install && yarn build:dev",
serve_cmd="cd grafana-plugin && ONCALL_API_URL=http://oncall-dev-engine:8080 yarn watch",
serve_cmd="cd grafana-plugin && yarn watch",
allow_parallel=True,
)

# Manual-only e2e test runner: not started on `tilt up` (auto_init=False) and only
# run when triggered from the Tilt UI (TRIGGER_MODE_MANUAL). Depends on the UI build,
# Grafana, the provisioning configmap, and the engine being up first.
local_resource(
    "e2e-tests",
    labels=["E2eTests"],
    cmd="cd grafana-plugin && yarn test:e2e",
    trigger_mode=TRIGGER_MODE_MANUAL,
    auto_init=False,
    resource_deps=["build-ui", "grafana", "grafana-oncall-app-provisioning-configmap", "engine"]
)

# UI button on the e2e-tests resource: re-run the suite headlessly.
# The inputs are surfaced to the command as environment variables; only
# STOP_ON_FIRST_FAILURE is interpolated into argv here — presumably the
# test:e2e script reads BROWSERS and REPORTER from the environment
# (TODO confirm against grafana-plugin's playwright config).
cmd_button(
    name="E2E Tests - headless run",
    argv=["sh", "-c", "yarn --cwd ./grafana-plugin test:e2e $STOP_ON_FIRST_FAILURE"],
    text="Restart headless run",
    resource="e2e-tests",
    icon_name="replay",
    inputs=[
        text_input("BROWSERS", "Browsers (e.g. \"chromium,firefox,webkit\")", "chromium", "chromium,firefox,webkit"),
        bool_input("REPORTER", "Use HTML reporter", True, 'html', 'line'),
        bool_input("STOP_ON_FIRST_FAILURE", "Stop on first failure", True, "-x", ""),
    ]
)

# UI button: open Playwright's interactive watch mode for the e2e suite.
cmd_button(
    name="E2E Tests - open watch mode",
    argv=["sh", "-c", "yarn --cwd grafana-plugin test:e2e:watch"],
    text="Open watch mode",
    resource="e2e-tests",
    icon_name="visibility",
)

# UI button: serve the HTML report from the most recent Playwright run.
cmd_button(
    name="E2E Tests - show report",
    argv=["sh", "-c", "yarn --cwd grafana-plugin playwright show-report"],
    text="Show last HTML report",
    resource="e2e-tests",
    icon_name="assignment",
)

# UI button: force-kill any running e2e test process.
# NOTE(review): matches by command line (`pgrep -f test:e2e`), so this also kills a
# watch-mode session; if nothing matches, `kill` exits non-zero — presumably acceptable
# for a best-effort stop button (TODO confirm).
cmd_button(
    name="E2E Tests - stop current run",
    argv=["sh", "-c", "kill -9 $(pgrep -f test:e2e)"],
    text="Stop",
    resource="e2e-tests",
    icon_name="dangerous",
)

yaml = helm("helm/oncall", name=HELM_PREFIX, values=["./dev/helm-local.yml", "./dev/helm-local.dev.yml"])

k8s_yaml(yaml)
Expand Down
19 changes: 12 additions & 7 deletions dev/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -243,13 +243,18 @@ are run on pull request CI builds. New features should ideally include a new/mod

To run these tests locally simply do the following:

```bash
npx playwright install # install playwright dependencies
cp ./grafana-plugin/e2e-tests/.env.example ./grafana-plugin/e2e-tests/.env
# you may need to tweak the values in ./grafana-plugin/.env according to your local setup
cd grafana-plugin
yarn test:e2e
```
1. Install Playwright dependencies with `npx playwright install`
2. [Launch the environment](#launch-the-environment)
3. Then you interact with tests in 2 different ways:
1. Using `Tilt` - open _E2eTests_ section where you will find 4 buttons:
1. Restart headless run (you can configure browsers, reporter and failure allowance there)
2. Open watch mode
3. Show last HTML report
4. Stop (stops any pending e2e test process)
2. Using `make`:
1. `make test-e2e` to start headless run
2. `make test-e2e-watch` to open watch mode
3. `make test-e2e-show-report` to open last HTML report

## Helm unit tests

Expand Down
2 changes: 1 addition & 1 deletion dev/helm-local.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
base_url: localhost:30001
base_url: localhost:8080
base_url_protocol: http
env:
- name: GRAFANA_CLOUD_NOTIFICATIONS_ENABLED
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytz
from celery import uuid as celery_uuid
from dateutil.parser import parse
from django.utils import timezone
from django.utils.functional import cached_property
from rest_framework.exceptions import ValidationError

Expand Down Expand Up @@ -212,6 +213,12 @@ def pause_escalation(self) -> bool:
return False
return self.raw_escalation_snapshot.get("pause_escalation", False)

@property
def last_active_escalation_policy_order(self) -> typing.Optional[int]:
    """Order of the most recently executed escalation policy step.

    Returns None when there is no escalation snapshot, or when the snapshot
    has no "last_active_escalation_policy_order" entry.
    """
    snapshot = self.raw_escalation_snapshot
    return snapshot.get("last_active_escalation_policy_order") if snapshot else None

@property
def next_step_eta(self) -> typing.Optional[datetime.datetime]:
"""
Expand All @@ -223,6 +230,19 @@ def next_step_eta(self) -> typing.Optional[datetime.datetime]:
raw_next_step_eta = self.raw_escalation_snapshot.get("next_step_eta")
return None if not raw_next_step_eta else parse(raw_next_step_eta).replace(tzinfo=pytz.UTC)

def next_step_eta_is_valid(self) -> typing.Optional[bool]:
    """
    Sanity-check `next_step_eta` against the current time.

    `next_step_eta` should be pushed forward as escalation policy steps are
    executed, so it should never fall behind the current time (a 5 minute
    grace buffer is allowed). A stale value means an escalation policy step
    has been missed or is substantially delayed.

    Returns None when `next_step_eta` is unset, otherwise the boolean result
    of the time comparison.
    """
    eta = self.next_step_eta
    if eta is None:
        return None
    oldest_acceptable = timezone.now() - datetime.timedelta(minutes=5)
    return eta > oldest_acceptable

def update_next_step_eta(self, increase_by_timedelta: datetime.timedelta) -> typing.Optional[dict]:
"""
update next_step_eta field directly to avoid serialization overhead
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import typing

from celery.utils.log import get_task_logger
from django.utils import timezone

from apps.alerts.escalation_snapshot.serializers import EscalationSnapshotSerializer
from apps.alerts.models.alert_group_log_record import AlertGroupLogRecord
Expand Down Expand Up @@ -90,19 +89,6 @@ def executed_escalation_policy_snapshots(self) -> typing.List["EscalationPolicyS
return []
return self.escalation_policies_snapshots[: self.last_active_escalation_policy_order + 1]

def next_step_eta_is_valid(self) -> typing.Optional[bool]:
"""
`next_step_eta` should never be less than the current time (with a 5 minute buffer provided)
as this field should be updated as the escalation policy is executed over time. If it is, this means that
an escalation policy step has been missed, or is substantially delayed
if `next_step_eta` is `None` then `None` is returned, otherwise a boolean is returned
representing the result of the time comparision
"""
if self.next_step_eta is None:
return None
return self.next_step_eta > (timezone.now() - datetime.timedelta(minutes=5))

def save_to_alert_group(self) -> None:
self.alert_group.raw_escalation_snapshot = self.convert_to_dict()
self.alert_group.save(update_fields=["raw_escalation_snapshot"])
Expand Down
1 change: 1 addition & 0 deletions engine/apps/alerts/models/alert_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,7 @@ def skip_escalation_in_slack(self):
AlertGroup.ACCOUNT_INACTIVE,
AlertGroup.RATE_LIMITED,
AlertGroup.CHANNEL_NOT_SPECIFIED,
AlertGroup.RESTRICTED_ACTION,
)

def is_alert_a_resolve_signal(self, alert):
Expand Down
7 changes: 7 additions & 0 deletions engine/apps/alerts/tasks/acknowledge_reminder.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from datetime import timedelta
from functools import partial

from django.conf import settings
from django.db import transaction
from django.utils import timezone

from common.custom_celery_tasks import shared_dedicated_queue_retry_task

Expand Down Expand Up @@ -61,6 +63,11 @@ def acknowledge_reminder_task(alert_group_pk: int, unacknowledge_process_id: str
(alert_group.pk, unacknowledge_process_id), countdown=unacknowledge_timeout
)
else:
if alert_group.started_at < timezone.now() - timedelta(days=settings.ACKNOWLEDGE_REMINDER_TASK_EXPIRY_DAYS):
task_logger.info(
f"alert group {alert_group_pk} not renewing acknowledgement reminder, started_at is too old. {log_info}"
)
return
acknowledge_reminder_task.apply_async(
(alert_group.pk, unacknowledge_process_id), countdown=acknowledge_reminder_timeout
)
Expand Down
65 changes: 48 additions & 17 deletions engine/apps/alerts/tasks/check_escalation_finished.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import requests
from celery import shared_task
from django.conf import settings
from django.db.models import Avg, F, Max
from django.db.models import Avg, F, Max, Q
from django.utils import timezone

from apps.alerts.tasks.task_logger import task_logger
Expand All @@ -29,26 +29,26 @@ def send_alert_group_escalation_auditor_task_heartbeat() -> None:


def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
escalation_snapshot = alert_group.escalation_snapshot
raw_escalation_snapshot: dict = alert_group.raw_escalation_snapshot
alert_group_id = alert_group.id
base_msg = f"Alert group {alert_group_id}"

if not alert_group.escalation_chain_exists:
if not raw_escalation_snapshot:
msg = f"{base_msg} does not have an escalation snapshot associated with it, this should never occur"

task_logger.warning(msg)
raise AlertGroupEscalationPolicyExecutionAuditException(msg)

if not raw_escalation_snapshot.get("escalation_chain_snapshot"):
task_logger.info(
f"{base_msg} does not have an escalation chain associated with it, and therefore it is expected "
"that it will not have an escalation snapshot, skipping further validation"
)
return

if not escalation_snapshot:
msg = f"{base_msg} does not have an escalation snapshot associated with it, this should never occur"

task_logger.warning(msg)
raise AlertGroupEscalationPolicyExecutionAuditException(msg)

task_logger.info(f"{base_msg} has an escalation snapshot associated with it, auditing if it executed properly")

escalation_policies_snapshots = escalation_snapshot.escalation_policies_snapshots
escalation_policies_snapshots = raw_escalation_snapshot.get("escalation_policies_snapshots")

if not escalation_policies_snapshots:
task_logger.info(
Expand All @@ -59,18 +59,19 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
f"{base_msg}'s escalation snapshot has a populated escalation_policies_snapshots, continuing validation"
)

if escalation_snapshot.next_step_eta_is_valid() is False:
msg = (
f"{base_msg}'s escalation snapshot does not have a valid next_step_eta: {escalation_snapshot.next_step_eta}"
)
if alert_group.next_step_eta_is_valid() is False:
msg = f"{base_msg}'s escalation snapshot does not have a valid next_step_eta: {alert_group.next_step_eta}"

task_logger.warning(msg)
raise AlertGroupEscalationPolicyExecutionAuditException(msg)

task_logger.info(f"{base_msg}'s escalation snapshot has a valid next_step_eta: {escalation_snapshot.next_step_eta}")
task_logger.info(f"{base_msg}'s escalation snapshot has a valid next_step_eta: {alert_group.next_step_eta}")

executed_escalation_policy_snapshots = escalation_snapshot.executed_escalation_policy_snapshots
num_of_executed_escalation_policy_snapshots = len(executed_escalation_policy_snapshots)
num_of_executed_escalation_policy_snapshots = (
alert_group.last_active_escalation_policy_order + 1
if alert_group.last_active_escalation_policy_order is not None
else 0
)

if num_of_executed_escalation_policy_snapshots == 0:
task_logger.info(
Expand All @@ -81,9 +82,39 @@ def audit_alert_group_escalation(alert_group: "AlertGroup") -> None:
f"{base_msg}'s escalation snapshot has {num_of_executed_escalation_policy_snapshots} executed escalation policies"
)

check_personal_notifications_task.apply_async((alert_group_id,))

task_logger.info(f"{base_msg} passed the audit checks")


@shared_task
def check_personal_notifications_task(alert_group_id) -> None:
    """
    Audit that personal notifications for an alert group are completing.

    Compares the number of notifications triggered at least 5 minutes ago
    (giving in-flight notifications time to finish) against the number that
    have since completed with either success or failure; any surplus of
    triggered over completed is logged as uncompleted.
    """
    from apps.base.models import UserNotificationPolicy, UserNotificationPolicyLogRecord

    log_records = UserNotificationPolicyLogRecord.objects

    triggered = log_records.filter(
        alert_group_id=alert_group_id,
        type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_TRIGGERED,
        notification_step=UserNotificationPolicy.Step.NOTIFY,
        created_at__lte=timezone.now() - timezone.timedelta(minutes=5),
    ).count()

    # A notification counts as completed once it is recorded as either failed or successful.
    finished_types = Q(type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_FAILED) | Q(
        type=UserNotificationPolicyLogRecord.TYPE_PERSONAL_NOTIFICATION_SUCCESS
    )
    completed = log_records.filter(
        finished_types,
        alert_group_id=alert_group_id,
        notification_step=UserNotificationPolicy.Step.NOTIFY,
    ).count()

    base_msg = f"Alert group {alert_group_id}"
    delta = triggered - completed
    if delta > 0:
        # TODO: when success notifications are setup for every backend, raise exception here
        task_logger.info(f"{base_msg} has ({delta}) uncompleted personal notifications")
    else:
        task_logger.info(f"{base_msg} personal notifications check passed")


@shared_task
def check_escalation_finished_task() -> None:
"""
Expand Down
Loading

0 comments on commit b6a4448

Please sign in to comment.