Skip to content

Commit

Permalink
Merge pull request #181 from appuio/map-alerts-to-syn-teams
Browse files Browse the repository at this point in the history
Automatic alert mapping from application (instance) to team
  • Loading branch information
bastjan authored Oct 3, 2023
2 parents 4a7adc7 + 51c9fb3 commit fc8a2b3
Show file tree
Hide file tree
Showing 36 changed files with 732 additions and 240 deletions.
4 changes: 2 additions & 2 deletions .cruft.json
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
{
"template": "https://github.com/projectsyn/commodore-component-template.git",
"commit": "d8afca0d957d69b362c2cb45e3f6faa13662dfe2",
"commit": "43e241e3b7daa4ad42a57889bb313e71098bf1dc",
"checkout": "main",
"context": {
"cookiecutter": {
"name": "OpenShift4 Monitoring",
"slug": "openshift4-monitoring",
"parameter_key": "openshift4_monitoring",
"test_cases": "capacity-alerts release-4.11 remote-write user-workload-monitoring capacity-alerts-with-node-labels vsphere team-label custom-rules release-4.12 release-4.13",
"test_cases": "capacity-alerts release-4.11 remote-write user-workload-monitoring capacity-alerts-with-node-labels vsphere custom-rules release-4.12 release-4.13 team-routing",
"add_lib": "y",
"add_pp": "n",
"add_golden": "y",
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,10 @@ jobs:
- user-workload-monitoring
- capacity-alerts-with-node-labels
- vsphere
- team-label
- custom-rules
- release-4.12
- release-4.13
- team-routing
defaults:
run:
working-directory: ${{ env.COMPONENT_NAME }}
Expand All @@ -62,10 +62,10 @@ jobs:
- user-workload-monitoring
- capacity-alerts-with-node-labels
- vsphere
- team-label
- custom-rules
- release-4.12
- release-4.13
- team-routing
defaults:
run:
working-directory: ${{ env.COMPONENT_NAME }}
Expand Down
2 changes: 1 addition & 1 deletion Makefile.vars.mk
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,4 @@ KUBENT_IMAGE ?= ghcr.io/doitintl/kube-no-trouble:latest
KUBENT_DOCKER ?= $(DOCKER_CMD) $(DOCKER_ARGS) $(root_volume) --entrypoint=/app/kubent $(KUBENT_IMAGE)

instance ?= capacity-alerts
test_instances = tests/capacity-alerts.yml tests/release-4.11.yml tests/remote-write.yml tests/user-workload-monitoring.yml tests/capacity-alerts-with-node-labels.yml tests/vsphere.yml tests/team-label.yml tests/custom-rules.yml tests/release-4.12.yml tests/release-4.13.yml
test_instances = tests/capacity-alerts.yml tests/release-4.11.yml tests/remote-write.yml tests/user-workload-monitoring.yml tests/capacity-alerts-with-node-labels.yml tests/vsphere.yml tests/custom-rules.yml tests/release-4.12.yml tests/release-4.13.yml tests/team-routing.yml
9 changes: 9 additions & 0 deletions class/defaults.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
parameters:
openshift4_monitoring:
fallback_team: null

=_metadata:
library_aliases:
prom.libsonnet: openshift4-monitoring-prom.libsonnet
Expand Down Expand Up @@ -99,6 +101,13 @@ parameters:
equal:
- namespace
- alertname
alertManagerAutoDiscovery:
enabled: true
debug_config_map: false
team_receiver_format: team_default_%s
additional_alert_matchers: []
prepend_routes: []
append_routes: []
alerts:
includeNamespaces:
- appuio.*
Expand Down
96 changes: 96 additions & 0 deletions component/alert-routing-discovery.libsonnet
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
local com = import 'lib/commodore.libjsonnet';
local kap = import 'lib/kapitan.libjsonnet';
local kube = import 'lib/kube.libjsonnet';
local prom = import 'lib/prom.libsonnet';

// Inventory as returned by the Kapitan helper library.
local inv = kap.inventory();
// Top-level parameters of the inventory; component parameters are looked up by key below.
local params = inv.parameters;

// Settings of this component's `alertManagerAutoDiscovery` parameter.
local adParams = params.openshift4_monitoring.alertManagerAutoDiscovery;

// Name of the receiver used as a sink. It is added to the Alertmanager config
// with a name only and is referenced by the catch-all routes in this file.
local nullReceiver = '__component_openshift4_monitoring_null';

// discoverNS returns the namespace for the given application, or null if no
// namespace can be found.
// It looks into the following places:
// - params.<app>.namespace
// - params.<app>.namespace.name
// It does respect aliased applications and looks in the instance first and then in the base application.
local discoverNS = function(app)
  // lookup returns the namespace configured under the top-level parameter key
  // `k`. It returns null if the key is missing, if its value isn't an object
  // (e.g. explicitly set to null), or if it holds no usable namespace.
  local lookup = function(k)
    local p = if std.objectHas(params, k) then params[k] else null;
    // Guard with isObject: std.objectHas raises an error on non-object values.
    local ns = if std.isObject(p) && std.objectHas(p, 'namespace') then p.namespace else null;
    if std.isString(ns) then
      ns
    else if std.isObject(ns) && std.objectHas(ns, 'name') && std.isString(ns.name) then
      ns.name
    else
      null;

  // The first key is tried first; a second key is only consulted if present
  // and the first lookup found nothing.
  local ks = prom.appKeys(app);
  local aliased = lookup(ks[0]);
  if aliased != null then
    aliased
  else if std.length(ks) == 2 then
    lookup(ks[1])
  else
    null;


// ownerOrFallbackTeam is `syn.owner` when that parameter exists, otherwise
// the component's `fallback_team` parameter.
local ownerOrFallbackTeam =
  local hasOwner = std.objectHas(params, 'syn') && std.objectHas(params.syn, 'owner');
  if !hasOwner then
    params.openshift4_monitoring.fallback_team
  else
    params.syn.owner;

// teamToNS maps each team to the namespaces of the applications assigned to it.
// Entries are appended in the order of `inv.applications`; an entry is whatever
// discoverNS yields for the application, which is null if nothing was found.
local teamToNS =
  local addApplication(acc, app) =
    local team = prom.teamForApplication(app);
    local namespace = discoverNS(app);
    acc { [team]+: [ namespace ] };
  std.foldl(addApplication, inv.applications, {});

// teamBasedRouting contains discovered routes for teams.
// The routes are set up with `continue: true` so we can route to multiple teams.
// The last route catches all alerts already routed to a team.
//
// Applications without a discoverable namespace contribute a null entry to
// teamToNS. Those entries are dropped here, and no route is generated from an
// empty namespace list: the resulting `namespace =~ ""` matcher would not
// select any discovered namespace and could instead catch alerts that carry
// no namespace at all.
local teamBasedRouting =
  // Namespaces per team with the null entries removed.
  local teamNamespaces = {
    [team]: std.filter(function(ns) ns != null, teamToNS[team])
    for team in std.objectFields(teamToNS)
  };
  // Only teams with at least one discovered namespace get a route.
  local routedTeams = std.filter(
    function(team) std.length(teamNamespaces[team]) > 0,
    std.objectFields(teamNamespaces)
  );
  local allNamespaces = std.flattenArrays([ teamNamespaces[team] for team in routedTeams ]);
  [
    {
      receiver: adParams.team_receiver_format % team,
      matchers: adParams.additional_alert_matchers + [
        'namespace =~ "%s"' % std.join('|', teamNamespaces[team]),
      ],
      continue: true,
    }
    for team in routedTeams
  ] + (
    if std.length(allNamespaces) > 0 then
      [ {
        // catch all alerts already routed to a team
        receiver: nullReceiver,
        matchers: adParams.additional_alert_matchers + [
          'namespace =~ "%s"' % std.join('|', allNamespaces),
        ],
        continue: false,
      } ]
    else
      // Nothing was routed to a team, so there is nothing to catch.
      []
  );

// alertmanagerConfig is the `alertManagerConfig` parameter extended with the
// null receiver and the generated routing tree. Routes are ordered as:
// prepend_routes, discovered team routes, append_routes, the routes already
// present in the parameter, and finally a route without matchers that points
// to the owner/fallback team, or to the null receiver if there is none.
local alertmanagerConfig =
  local baseConfig = params.openshift4_monitoring.alertManagerConfig;
  local configuredRoutes = std.get(baseConfig.route, 'routes', []);
  local finalRoute =
    if ownerOrFallbackTeam == null then
      { receiver: nullReceiver }
    else
      { receiver: adParams.team_receiver_format % ownerOrFallbackTeam };
  baseConfig {
    receivers+: [ { name: nullReceiver } ],
    route+: {
      routes:
        adParams.prepend_routes
        + teamBasedRouting
        + adParams.append_routes
        + configuredRoutes
        + [ finalRoute ],
    },
  };

// Per-application discovery results, only used to fill the debug ConfigMap.
local appNamespaces = std.foldl(
  function(acc, app) acc { [app]: discoverNS(app) },
  inv.applications,
  {}
);
local appTeams = std.foldl(
  function(acc, app) acc { [app]: prom.teamForApplication(app) },
  inv.applications,
  {}
);
// Applications for which discoverNS found no namespace.
local appsWithoutNamespace = std.filter(
  function(app) appNamespaces[app] == null,
  std.objectFields(appNamespaces)
);

// Exported fields:
// - debugConfigMap: ConfigMap exposing the discovery inputs and results.
// - alertmanagerConfig: the Alertmanager config including the generated routes.
{
  debugConfigMap: kube.ConfigMap('discovery-debug') {
    data: {
      applications: std.manifestJsonMinified(inv.applications),
      discovered_namespaces: std.manifestYamlDoc(appNamespaces),
      apps_without_namespaces: std.manifestYamlDoc(appsWithoutNamespace),
      discovered_teams: std.manifestYamlDoc(appTeams),
      proposed_routes: std.manifestYamlDoc(teamBasedRouting),
      alertmanager: std.manifestYamlDoc(alertmanagerConfig),
    },
  },
  alertmanagerConfig: alertmanagerConfig,
}
6 changes: 5 additions & 1 deletion component/main.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ local params = inv.parameters.openshift4_monitoring;
local rules = import 'rules.jsonnet';
local capacity = import 'capacity.libsonnet';

local alertDiscovery = import 'alert-routing-discovery.libsonnet';

local ns =
if params.namespace != 'openshift-monitoring' then
error 'Component openshift4-monitoring does not support values for parameter `namespace` other than "openshift-monitoring".'
Expand Down Expand Up @@ -105,9 +107,11 @@ local customRules =
namespace: ns,
},
stringData: {
'alertmanager.yaml': std.manifestYamlDoc(params.alertManagerConfig),
// Use the auto-discovered config only when the feature is enabled; otherwise
// fall back to the unmodified `alertManagerConfig` parameter. Both branches
// are serialized to a YAML string.
'alertmanager.yaml': std.manifestYamlDoc(
  if params.alertManagerAutoDiscovery.enabled then
    alertDiscovery.alertmanagerConfig
  else
    params.alertManagerConfig
),
},
},
[if params.alertManagerAutoDiscovery.enabled && params.alertManagerAutoDiscovery.debug_config_map then '99_discovery_debug_cm']: alertDiscovery.debugConfigMap,

[if params.enableAlertmanagerIsolationNetworkPolicy then '20_networkpolicy']: std.map(function(p) com.namespaced('openshift-monitoring', p), import 'networkpolicy.libsonnet'),
[if params.enableUserWorkload && params.enableUserWorkloadAlertmanagerIsolationNetworkPolicy then '20_user_workload_networkpolicy']: std.map(function(p) com.namespaced('openshift-user-workload-monitoring', p), import 'networkpolicy.libsonnet'),
rbac: import 'rbac.libsonnet',
Expand Down
70 changes: 70 additions & 0 deletions docs/modules/ROOT/pages/references/parameters.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,76 @@ A dictionary holding the configuration for the AlertManager.

See the https://docs.openshift.com/container-platform/latest/monitoring/cluster_monitoring/configuring-the-monitoring-stack.html#configuring-alertmanager[OpenShift docs] for available parameters.


== `alertManagerAutoDiscovery`

[horizontal]
type:: dictionary
default::
+
[source,yaml]
----
alertManagerAutoDiscovery:
enabled: true
debug_config_map: false
team_receiver_format: team_default_%s
additional_alert_matchers: []
prepend_routes: []
append_routes: []
----

`alertManagerAutoDiscovery` holds the configuration for the Alertmanager auto-discovery feature.

The auto-discovery routes alerts to the configured teams based on their namespaces and the top-level `syn.teams[*].instances` and `syn.owner` parameters.
Auto-discovery first creates a list of Commodore component instances by parsing the `applications` array using the same rules as Commodore itself (see also the https://syn.tools/commodore/reference/architecture.html#_component_instantiation[Commodore component instantiation documentation]).
For each discovered instance, the component then renders the instance parameters, and reads the component's namespace from field `namespace` or `namespace.name` in the rendered parameters.
Finally, routing rules are generated to route alerts from the discovered namespaces to the associated component instance's owning team.

.`syn` Team Example
[source,yaml]
----
syn:
owner: daring-donkeys
teams:
electric-elephants:
instances: [postgres]
----

The auto-discovery feature is enabled by default.
A ConfigMap can be enabled with `debug_config_map` to debug the auto-discovery feature.

The configuration is merged with the `alertManagerConfig` parameter.
Route receivers are generated for each team based on the `team_receiver_format` parameter.
The routes are ordered as follows:

[source]
----
alertManagerAutoDiscovery.prepend_routes + generated routes + alertManagerAutoDiscovery.append_routes + alertManagerConfig.route.routes + route all to syn.owner (or fallback_team)
----

`additional_alert_matchers` is a list of additional alert matchers to add to the generated routes.
This can be used to handle special cases where the auto-discovery feature does not work as expected.
For example, when a label on the alert indicates that it should go to a different team than the one its namespace suggests.

[source,yaml]
----
alertManagerAutoDiscovery:
additional_alert_matchers:
- 'syn_team = ""'
# becomes
- continue: true
matchers:
- syn_team = ""
- namespace =~ "my-ns"
receiver: team_default_lovable-lizards
- continue: false
matchers:
- syn_team = ""
- namespace =~ "my-ns"
receiver: __component_openshift4_monitoring_null
----


== `alerts`

[horizontal]
Expand Down
35 changes: 4 additions & 31 deletions lib/openshift4-monitoring-alert-patching.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
// arbitrary alert rules to adhere to the format required by the component's
// approach for allowing us to patch upstream rules.
local com = import 'lib/commodore.libjsonnet';
local prom = import 'lib/prom.libsonnet';

local inv = com.inventory();

local global_alert_params =
Expand All @@ -20,35 +22,6 @@ local global_alert_params =
customAnnotations: {},
};

local syn_team =
local instance = inv.parameters._instance;
local syn = if std.objectHas(inv.parameters, 'syn') then {
owner: std.get(inv.parameters.syn, 'owner', ''),
teams: std.get(inv.parameters.syn, 'teams', { teams: {} }),
} else { owner: '', teams: {} };
local team_instances = [
{
team: tn,
instances: std.get(syn.teams[tn], 'instances', []),
}
for tn in std.objectFields(syn.teams)
];
local team = std.foldl(
function(o, ti)
if std.member(ti.instances, instance) then
o + [ ti.team ]
else
o,
team_instances,
[]
);
if std.length(team) > 1 then
error "Multiple owners for instance '%s': %s" % [ instance, team ]
else if std.length(team) == 1 then
team[0]
else
syn.owner;

/**
* \brief filter alert rules in the provided group
*
Expand Down Expand Up @@ -140,7 +113,7 @@ local patchRule(rule, patches={}, patchName=true) =
then
rule.labels.syn_team
else
syn_team;
prom.teamForApplication(inv.parameters._instance);
rule {
// Change alert names so we don't get multiple alerts with the same
// name, as the logging operator deploys its own copy of these
Expand All @@ -153,7 +126,7 @@ local patchRule(rule, patches={}, patchName=true) =
syn_component: inv.parameters._instance,
// mark alert as belonging to the team in whose context the
// function is called.
[if syn_team_label != '' then 'syn_team']: syn_team_label,
[if syn_team_label != null then 'syn_team']: syn_team_label,
},
annotations+:
std.get(global_alert_params.customAnnotations, super.alert, {}),
Expand Down
Loading

0 comments on commit fc8a2b3

Please sign in to comment.