From 2dd550e1db687590abca17d5366074599a31a6bd Mon Sep 17 00:00:00 2001 From: Adrian Haas Date: Fri, 19 Jul 2024 14:07:28 +0200 Subject: [PATCH] Add support for OCP 4.15 --- class/defaults.yml | 17 +++---- class/openshift4-monitoring.yml | 43 ++++++++--------- .../prometheus_rules.yaml | 47 ++++++++++++++---- .../openshift4-monitoring/silence.yaml | 2 +- .../prometheus_rules.yaml | 47 ++++++++++++++---- .../openshift4-monitoring/silence.yaml | 2 +- .../openshift4-monitoring/cronjobs.yaml | 2 +- .../prometheus_rules.yaml | 2 +- .../openshift4-monitoring/silence.yaml | 2 +- .../prometheus_rules.yaml | 47 ++++++++++++++---- .../openshift4-monitoring/silence.yaml | 2 +- .../prometheus_rules.yaml | 2 +- .../openshift4-monitoring/silence.yaml | 2 +- .../prometheus_rules.yaml | 18 ++++--- .../openshift4-monitoring/silence.yaml | 2 +- .../prometheus_rules.yaml | 47 ++++++++++++++---- .../openshift4-monitoring/silence.yaml | 2 +- .../prometheus_rules.yaml | 48 +++++++++++++++---- .../openshift4-monitoring/silence.yaml | 2 +- .../prometheus_rules.yaml | 47 ++++++++++++++---- .../openshift4-monitoring/silence.yaml | 2 +- .../prometheus_rules.yaml | 47 ++++++++++++++---- .../openshift4-monitoring/silence.yaml | 2 +- 23 files changed, 310 insertions(+), 124 deletions(-) diff --git a/class/defaults.yml b/class/defaults.yml index 8a62e243..3347ed68 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -7,21 +7,15 @@ parameters: prom.libsonnet: openshift4-monitoring-prom.libsonnet alert-patching.libsonnet: openshift4-monitoring-alert-patching.libsonnet namespace: openshift-monitoring - # TODO: select based on reported OCP version once we have dynamic facts - manifests_version: release-4.14 - =_cluster_monitoring_operator_version_map: - release-4.13: release-4.13 - release-4.14: release-4.14 - =_etcd_operator_version_map: - release-4.13: release-4.13 - release-4.14: release-4.14 + manifests_version: release-4.15 # no release branches newer than 4.9 exist =_operator_lifecycle_manager_map: release-4.13: release-4.9 release-4.14: release-4.9 + release-4.15: release-4.9 jsonnetfile_parameters: - cmo_version: ${openshift4_monitoring:_cluster_monitoring_operator_version_map:${openshift4_monitoring:manifests_version}} - etcd_version: ${openshift4_monitoring:_etcd_operator_version_map:${openshift4_monitoring:manifests_version}} + cmo_version: ${openshift4_monitoring:manifests_version} + etcd_version: ${openshift4_monitoring:manifests_version} defaultConfig: nodeSelector: node-role.kubernetes.io/infra: '' @@ -211,6 +205,7 @@ parameters: expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > on (instance) (count by (instance) (node_cpu_info{}) * 100) release-4.13: {} release-4.14: {} + release-4.15: {} # Alerts to ignore for user workload monitoring ignoreUserWorkload: [] @@ -237,7 +232,7 @@ parameters: images: oc: image: quay.io/appuio/oc - tag: v4.14 + tag: v4.15 node_exporter: registry: quay.io repository: prometheus/node-exporter diff --git a/class/openshift4-monitoring.yml b/class/openshift4-monitoring.yml index 3d7b7066..4dc1b1f5 100644 --- a/class/openshift4-monitoring.yml +++ b/class/openshift4-monitoring.yml @@ -2,14 +2,6 @@ parameters: openshift4_monitoring: =_manifest_urls: kube-apiserver: - release-4.11: - api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.11/bindata/assets/alerts/api-usage.yaml - cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.11/bindata/assets/alerts/cpu-utilization.yaml - slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.11/bindata/assets/alerts/kube-apiserver-slos-basic.yaml - release-4.12: - api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.12/bindata/assets/alerts/api-usage.yaml - cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.12/bindata/assets/alerts/cpu-utilization.yaml - slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.12/bindata/assets/alerts/kube-apiserver-slos-basic.yaml release-4.13: api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/api-usage.yaml cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/cpu-utilization.yaml @@ -18,28 +10,20 @@ parameters: api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/api-usage.yaml cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/cpu-utilization.yaml slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/kube-apiserver-slos-basic.yaml + release-4.15: + api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/api-usage.yaml + cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/cpu-utilization.yaml + slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/kube-apiserver-slos-basic.yaml machine-api-operator: - release-4.11: - prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.11/install/0000_90_machine-api-operator_04_alertrules.yaml - release-4.12: - prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.12/install/0000_90_machine-api-operator_04_alertrules.yaml release-4.13: prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.13/install/0000_90_machine-api-operator_04_alertrules.yaml release-4.14: prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.14/install/0000_90_machine-api-operator_04_alertrules.yaml + release-4.15: + prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.15/install/0000_90_machine-api-operator_04_alertrules.yaml ovn-kubernetes: - release-4.11: - common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml - # We use the "self-hosted" variant of the control-plane alerts, so - # we don't have to worry about unresolved gotemplate references. - control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml - release-4.12: - common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml - # We use the "self-hosted" variant of the control-plane alerts, so - # we don't have to worry about unresolved gotemplate references. - control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml release-4.13: common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml # We use the "self-hosted" variant of the control-plane alerts, so @@ -56,11 +40,24 @@ parameters: # when selecting OVNKubernetes as the network plugin during # installation. control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/release-4.14/bindata/network/ovn-kubernetes/self-hosted/multi-zone-interconnect/alert-rules-control-plane.yaml + release-4.15: + common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml + # We handle the gotemplate stuff in Jsonnet for now, since Jinja + # can't deal with gotemplate expressions like `{{.OvnkubeMasterReplicas}}`. + # The only templates that are in the alerting rules can be handled + # with a simple string replace. + control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/release-4.15/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml + + cloud-credential-operator: + release-4.13: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.13/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml + release-4.14: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.14/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml + release-4.15: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.15/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml + kapitan: dependencies: - type: https - source: https://raw.githubusercontent.com/openshift/cloud-credential-operator/${openshift4_monitoring:manifests_version}/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml + source: ${openshift4_monitoring:_manifest_urls:cloud-credential-operator:${openshift4_monitoring:manifests_version}} output_path: dependencies/openshift4-monitoring/manifests/${openshift4_monitoring:manifests_version}/cloud-credential-operator.yaml # Download cluster-version-operator rules YAML to folder # `manifests_requiring_prerendering/`, because we cannot prerender diff --git a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index b8f2fdad..94eda30e 100644 --- a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -221,6 +221,25 @@ spec: syn_component: openshift4-monitoring - name: syn-cluster-operators rules: + - alert: SYN_CannotEvaluateConditionalUpdates + annotations: + description: Failure to evaluate conditional update matches means that + Cluster Version Operator cannot decide whether an update path is recommended + or not. + summary: Cluster Version Operator cannot evaluate conditional update matches + for {{ $value | humanizeDuration }}. + syn_component: openshift4-monitoring + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_ClusterOperatorDegraded annotations: description: The {{ $labels.name }} operator is degraded because {{ $labels.reason @@ -392,13 +411,17 @@ spec: syn_component: openshift4-monitoring - alert: SYN_HighOverallControlPlaneCPU annotations: - description: |- - On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes. - On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md - summary: CPU utilization across all control plane nodes is more than 60% - of the total available CPU. Control plane node outage may cause a cascading - failure; increase available CPU. + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. syn_component: openshift4-monitoring expr: | sum( @@ -426,7 +449,7 @@ spec: summary: etcd cluster database is running full. syn_component: openshift4-monitoring expr: | - (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m labels: severity: critical @@ -829,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1352,10 +1375,12 @@ spec: runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md summary: Clock not synchronising. syn_component: openshift4-monitoring - expr: | + expr: |- + ( min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: critical @@ -1367,7 +1392,8 @@ spec: 0.05s. Ensure NTP is configured correctly on this host. summary: Clock skew detected. syn_component: openshift4-monitoring - expr: | + expr: |- + ( ( node_timex_offset_seconds{job="node-exporter"} > 0.05 and @@ -1379,6 +1405,7 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: warning diff --git a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/capacity-alerts-with-node-labels/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: [] diff --git a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index b8f2fdad..94eda30e 100644 --- a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -221,6 +221,25 @@ spec: syn_component: openshift4-monitoring - name: syn-cluster-operators rules: + - alert: SYN_CannotEvaluateConditionalUpdates + annotations: + description: Failure to evaluate conditional update matches means that + Cluster Version Operator cannot decide whether an update path is recommended + or not. + summary: Cluster Version Operator cannot evaluate conditional update matches + for {{ $value | humanizeDuration }}. + syn_component: openshift4-monitoring + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_ClusterOperatorDegraded annotations: description: The {{ $labels.name }} operator is degraded because {{ $labels.reason @@ -392,13 +411,17 @@ spec: syn_component: openshift4-monitoring - alert: SYN_HighOverallControlPlaneCPU annotations: - description: |- - On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes. - On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md - summary: CPU utilization across all control plane nodes is more than 60% - of the total available CPU. Control plane node outage may cause a cascading - failure; increase available CPU. + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. syn_component: openshift4-monitoring expr: | sum( @@ -426,7 +449,7 @@ spec: summary: etcd cluster database is running full. syn_component: openshift4-monitoring expr: | - (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m labels: severity: critical @@ -829,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1352,10 +1375,12 @@ spec: runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md summary: Clock not synchronising. syn_component: openshift4-monitoring - expr: | + expr: |- + ( min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: critical @@ -1367,7 +1392,8 @@ spec: 0.05s. Ensure NTP is configured correctly on this host. summary: Clock skew detected. syn_component: openshift4-monitoring - expr: | + expr: |- + ( ( node_timex_offset_seconds{job="node-exporter"} > 0.05 and @@ -1379,6 +1405,7 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: warning diff --git a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/capacity-alerts/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: [] diff --git a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/cronjobs.yaml b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/cronjobs.yaml index ac862ee1..296bf7ac 100644 --- a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/cronjobs.yaml +++ b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/cronjobs.yaml @@ -99,7 +99,7 @@ spec: command: - /usr/local/bin/script.sh env: [] - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: job ports: [] diff --git a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 03c9afc6..9edd5259 100644 --- a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -834,7 +834,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",namespace!~"(openshift-adp)",job="kube-state-metrics"} diff --git a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/custom-rules/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: [] diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index d4432bde..86455112 100644 --- a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -355,6 +355,25 @@ spec: syn_component: openshift4-monitoring - name: syn-cluster-operators rules: + - alert: SYN_CannotEvaluateConditionalUpdates + annotations: + description: Failure to evaluate conditional update matches means that + Cluster Version Operator cannot decide whether an update path is recommended + or not. + summary: Cluster Version Operator cannot evaluate conditional update matches + for {{ $value | humanizeDuration }}. + syn_component: openshift4-monitoring + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_ClusterOperatorDegraded annotations: description: The {{ $labels.name }} operator is degraded because {{ $labels.reason @@ -526,13 +545,17 @@ spec: syn_component: openshift4-monitoring - alert: SYN_HighOverallControlPlaneCPU annotations: - description: |- - On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes. - On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md - summary: CPU utilization across all control plane nodes is more than 60% - of the total available CPU. Control plane node outage may cause a cascading - failure; increase available CPU. + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. syn_component: openshift4-monitoring expr: | sum( @@ -560,7 +583,7 @@ spec: summary: etcd cluster database is running full. syn_component: openshift4-monitoring expr: | - (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m labels: severity: critical @@ -963,7 +986,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1486,10 +1509,12 @@ spec: runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md summary: Clock not synchronising. syn_component: openshift4-monitoring - expr: | + expr: |- + ( min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: critical @@ -1501,7 +1526,8 @@ spec: 0.05s. Ensure NTP is configured correctly on this host. summary: Clock skew detected. syn_component: openshift4-monitoring - expr: | + expr: |- + ( ( node_timex_offset_seconds{job="node-exporter"} > 0.05 and @@ -1513,6 +1539,7 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: warning diff --git a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/ovn-kubernetes/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: [] diff --git a/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index a62709c3..ede3d6be 100644 --- a/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -831,7 +831,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} diff --git a/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/release-4.13/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: [] diff --git a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index b8f2fdad..3ef75985 100644 --- a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -392,13 +392,17 @@ spec: syn_component: openshift4-monitoring - alert: SYN_HighOverallControlPlaneCPU annotations: - description: |- - On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes. - On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md - summary: CPU utilization across all control plane nodes is more than 60% - of the total available CPU. Control plane node outage may cause a cascading - failure; increase available CPU. + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. syn_component: openshift4-monitoring expr: | sum( @@ -829,7 +833,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} diff --git a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/release-4.14/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: [] diff --git a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index b8f2fdad..94eda30e 100644 --- a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -221,6 +221,25 @@ spec: syn_component: openshift4-monitoring - name: syn-cluster-operators rules: + - alert: SYN_CannotEvaluateConditionalUpdates + annotations: + description: Failure to evaluate conditional update matches means that + Cluster Version Operator cannot decide whether an update path is recommended + or not. + summary: Cluster Version Operator cannot evaluate conditional update matches + for {{ $value | humanizeDuration }}. + syn_component: openshift4-monitoring + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_ClusterOperatorDegraded annotations: description: The {{ $labels.name }} operator is degraded because {{ $labels.reason @@ -392,13 +411,17 @@ spec: syn_component: openshift4-monitoring - alert: SYN_HighOverallControlPlaneCPU annotations: - description: |- - On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes. - On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md - summary: CPU utilization across all control plane nodes is more than 60% - of the total available CPU. Control plane node outage may cause a cascading - failure; increase available CPU. + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. syn_component: openshift4-monitoring expr: | sum( @@ -426,7 +449,7 @@ spec: summary: etcd cluster database is running full. syn_component: openshift4-monitoring expr: | - (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m labels: severity: critical @@ -829,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1352,10 +1375,12 @@ spec: runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md summary: Clock not synchronising. syn_component: openshift4-monitoring - expr: | + expr: |- + ( min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: critical @@ -1367,7 +1392,8 @@ spec: 0.05s. Ensure NTP is configured correctly on this host. summary: Clock skew detected. syn_component: openshift4-monitoring - expr: | + expr: |- + ( ( node_timex_offset_seconds{job="node-exporter"} > 0.05 and @@ -1379,6 +1405,7 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: warning diff --git a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/remote-write/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: [] diff --git a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index 726f5f95..3fca1e1d 100644 --- a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -233,6 +233,26 @@ spec: syn_team: clumsy-donkeys - name: syn-cluster-operators rules: + - alert: SYN_CannotEvaluateConditionalUpdates + annotations: + description: Failure to evaluate conditional update matches means that + Cluster Version Operator cannot decide whether an update path is recommended + or not. + summary: Cluster Version Operator cannot evaluate conditional update matches + for {{ $value | humanizeDuration }}. + syn_component: openshift4-monitoring + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring + syn_team: clumsy-donkeys - alert: SYN_ClusterOperatorDegraded annotations: description: The {{ $labels.name }} operator is degraded because {{ $labels.reason @@ -413,13 +433,17 @@ spec: syn_team: clumsy-donkeys - alert: SYN_HighOverallControlPlaneCPU annotations: - description: |- - On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes. - On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md - summary: CPU utilization across all control plane nodes is more than 60% - of the total available CPU. Control plane node outage may cause a cascading - failure; increase available CPU. + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. syn_component: openshift4-monitoring expr: | sum( @@ -448,7 +472,7 @@ spec: summary: etcd cluster database is running full. syn_component: openshift4-monitoring expr: | - (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m labels: severity: critical @@ -872,7 +896,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1425,10 +1449,12 @@ spec: runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md summary: Clock not synchronising. syn_component: openshift4-monitoring - expr: | + expr: |- + ( min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: critical @@ -1441,7 +1467,8 @@ spec: 0.05s. Ensure NTP is configured correctly on this host. summary: Clock skew detected. syn_component: openshift4-monitoring - expr: | + expr: |- + ( ( node_timex_offset_seconds{job="node-exporter"} > 0.05 and @@ -1453,6 +1480,7 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: warning diff --git a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/team-routing/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: [] diff --git a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index d0800206..6a2c3adf 100644 --- a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -221,6 +221,25 @@ spec: syn_component: openshift4-monitoring - name: syn-cluster-operators rules: + - alert: SYN_CannotEvaluateConditionalUpdates + annotations: + description: Failure to evaluate conditional update matches means that + Cluster Version Operator cannot decide whether an update path is recommended + or not. + summary: Cluster Version Operator cannot evaluate conditional update matches + for {{ $value | humanizeDuration }}. + syn_component: openshift4-monitoring + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_ClusterOperatorDegraded annotations: description: The {{ $labels.name }} operator is degraded because {{ $labels.reason @@ -392,13 +411,17 @@ spec: syn_component: openshift4-monitoring - alert: SYN_HighOverallControlPlaneCPU annotations: - description: |- - On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes. - On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md - summary: CPU utilization across all control plane nodes is more than 60% - of the total available CPU. Control plane node outage may cause a cascading - failure; increase available CPU. + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. syn_component: openshift4-monitoring expr: | sum( @@ -426,7 +449,7 @@ spec: summary: etcd cluster database is running full. syn_component: openshift4-monitoring expr: | - (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m labels: severity: critical @@ -829,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1352,10 +1375,12 @@ spec: runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md summary: Clock not synchronising. syn_component: openshift4-monitoring - expr: | + expr: |- + ( min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: critical @@ -1367,7 +1392,8 @@ spec: 0.05s. Ensure NTP is configured correctly on this host. summary: Clock skew detected. syn_component: openshift4-monitoring - expr: | + expr: |- + ( ( node_timex_offset_seconds{job="node-exporter"} > 0.05 and @@ -1379,6 +1405,7 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: warning diff --git a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/user-workload-monitoring/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: [] diff --git a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml index b9e0f802..fd44674f 100644 --- a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml +++ b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/prometheus_rules.yaml @@ -221,6 +221,25 @@ spec: syn_component: openshift4-monitoring - name: syn-cluster-operators rules: + - alert: SYN_CannotEvaluateConditionalUpdates + annotations: + description: Failure to evaluate conditional update matches means that + Cluster Version Operator cannot decide whether an update path is recommended + or not. + summary: Cluster Version Operator cannot evaluate conditional update matches + for {{ $value | humanizeDuration }}. + syn_component: openshift4-monitoring + expr: | + max by (version, condition, status, reason) + ( + ( + time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"} + ) >= 3600 + ) + labels: + severity: warning + syn: 'true' + syn_component: openshift4-monitoring - alert: SYN_ClusterOperatorDegraded annotations: description: The {{ $labels.name }} operator is degraded because {{ $labels.reason @@ -392,13 +411,17 @@ spec: syn_component: openshift4-monitoring - alert: SYN_HighOverallControlPlaneCPU annotations: - description: |- - On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes. - On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html + description: Given three control plane nodes, the overall CPU utilization + may only be about 2/3 of all available capacity. This is because if + a single control plane node fails, the remaining two must handle the + load of the cluster in order to be HA. If the cluster is using more + than 2/3 of all capacity, if one control plane node fails, the remaining + two are likely to fail when they take the load. To fix this, increase + the CPU and memory on your control plane nodes. runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md - summary: CPU utilization across all control plane nodes is more than 60% - of the total available CPU. Control plane node outage may cause a cascading - failure; increase available CPU. + summary: CPU utilization across all three control plane nodes is higher + than two control plane nodes can sustain; a single control plane node + outage may cause a cascading failure; increase available CPU. syn_component: openshift4-monitoring expr: | sum( @@ -426,7 +449,7 @@ spec: summary: etcd cluster database is running full. syn_component: openshift4-monitoring expr: | - (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95 + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m labels: severity: critical @@ -829,7 +852,7 @@ spec: syn_component: openshift4-monitoring expr: | ( - max without (revision) ( + max by(namespace, statefulset) ( kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} unless kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"} @@ -1352,10 +1375,12 @@ spec: runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md summary: Clock not synchronising. syn_component: openshift4-monitoring - expr: | + expr: |- + ( min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: critical @@ -1367,7 +1392,8 @@ spec: 0.05s. Ensure NTP is configured correctly on this host. summary: Clock skew detected. syn_component: openshift4-monitoring - expr: | + expr: |- + ( ( node_timex_offset_seconds{job="node-exporter"} > 0.05 and @@ -1379,6 +1405,7 @@ spec: and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) + ) and on() absent(up{job="ptp-monitor-service"}) for: 10m labels: severity: warning diff --git a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/silence.yaml b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/silence.yaml index c3e45f77..ccae3b65 100644 --- a/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/silence.yaml +++ b/tests/golden/vsphere/openshift4-monitoring/openshift4-monitoring/silence.yaml @@ -63,7 +63,7 @@ spec: configMapKeyRef: key: silences.json name: silence - image: quay.io/appuio/oc:v4.14 + image: quay.io/appuio/oc:v4.15 imagePullPolicy: IfNotPresent name: silence ports: []