Skip to content

Commit

Permalink
Add support for OCP 4.15
Browse files Browse the repository at this point in the history
  • Loading branch information
haasad committed Jul 24, 2024
1 parent 24f7fe7 commit 2dd550e
Show file tree
Hide file tree
Showing 23 changed files with 310 additions and 124 deletions.
17 changes: 6 additions & 11 deletions class/defaults.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,15 @@ parameters:
prom.libsonnet: openshift4-monitoring-prom.libsonnet
alert-patching.libsonnet: openshift4-monitoring-alert-patching.libsonnet
namespace: openshift-monitoring
# TODO: select based on reported OCP version once we have dynamic facts
manifests_version: release-4.14
=_cluster_monitoring_operator_version_map:
release-4.13: release-4.13
release-4.14: release-4.14
=_etcd_operator_version_map:
release-4.13: release-4.13
release-4.14: release-4.14
manifests_version: release-4.15
# operator-lifecycle-manager has no release branches newer than 4.9, so
# every supported version maps to release-4.9
=_operator_lifecycle_manager_map:
release-4.13: release-4.9
release-4.14: release-4.9
release-4.15: release-4.9
jsonnetfile_parameters:
cmo_version: ${openshift4_monitoring:_cluster_monitoring_operator_version_map:${openshift4_monitoring:manifests_version}}
etcd_version: ${openshift4_monitoring:_etcd_operator_version_map:${openshift4_monitoring:manifests_version}}
cmo_version: ${openshift4_monitoring:manifests_version}
etcd_version: ${openshift4_monitoring:manifests_version}
defaultConfig:
nodeSelector:
node-role.kubernetes.io/infra: ''
Expand Down Expand Up @@ -211,6 +205,7 @@ parameters:
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > on (instance) (count by (instance) (node_cpu_info{}) * 100)
release-4.13: {}
release-4.14: {}
release-4.15: {}
# Alerts to ignore for user workload monitoring
ignoreUserWorkload: []

Expand All @@ -237,7 +232,7 @@ parameters:
images:
oc:
image: quay.io/appuio/oc
tag: v4.14
tag: v4.15
node_exporter:
registry: quay.io
repository: prometheus/node-exporter
Expand Down
43 changes: 20 additions & 23 deletions class/openshift4-monitoring.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,6 @@ parameters:
openshift4_monitoring:
=_manifest_urls:
kube-apiserver:
release-4.11:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.11/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.11/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.11/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.12:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.12/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.12/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.12/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.13:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.13/bindata/assets/alerts/cpu-utilization.yaml
Expand All @@ -18,28 +10,20 @@ parameters:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.14/bindata/assets/alerts/kube-apiserver-slos-basic.yaml
release-4.15:
api-usage: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/api-usage.yaml
cpu-utilization: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/cpu-utilization.yaml
slos: https://raw.githubusercontent.com/openshift/cluster-kube-apiserver-operator/release-4.15/bindata/assets/alerts/kube-apiserver-slos-basic.yaml

machine-api-operator:
release-4.11:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.11/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.12:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.12/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.13:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.13/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.14:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.14/install/0000_90_machine-api-operator_04_alertrules.yaml
release-4.15:
prometheus: https://raw.githubusercontent.com/openshift/machine-api-operator/release-4.15/install/0000_90_machine-api-operator_04_alertrules.yaml

ovn-kubernetes:
release-4.11:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
# we don't have to worry about unresolved gotemplate references.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml
release-4.12:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
# we don't have to worry about unresolved gotemplate references.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml
release-4.13:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We use the "self-hosted" variant of the control-plane alerts, so
Expand All @@ -56,11 +40,24 @@ parameters:
# when selecting OVNKubernetes as the network plugin during
# installation.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/release-4.14/bindata/network/ovn-kubernetes/self-hosted/multi-zone-interconnect/alert-rules-control-plane.yaml
release-4.15:
common: https://raw.githubusercontent.com/openshift/cluster-network-operator/${openshift4_monitoring:manifests_version}/bindata/network/ovn-kubernetes/common/alert-rules.yaml
# We resolve the gotemplate expressions in Jsonnet for now, since Jinja
# can't deal with gotemplate expressions like `{{.OvnkubeMasterReplicas}}`.
# The only template expressions present in the alerting rules can be
# handled with a simple string replace.
control_plane: https://raw.githubusercontent.com/openshift/cluster-network-operator/release-4.15/bindata/network/ovn-kubernetes/self-hosted/alert-rules-control-plane.yaml

cloud-credential-operator:
release-4.13: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.13/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
release-4.14: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.14/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
release-4.15: https://raw.githubusercontent.com/openshift/cloud-credential-operator/release-4.15/manifests/0000_90_cloud-credential-operator_03_alertrules.yaml


kapitan:
dependencies:
- type: https
source: https://raw.githubusercontent.com/openshift/cloud-credential-operator/${openshift4_monitoring:manifests_version}/manifests/0000_90_cloud-credential-operator_04_alertrules.yaml
source: ${openshift4_monitoring:_manifest_urls:cloud-credential-operator:${openshift4_monitoring:manifests_version}}
output_path: dependencies/openshift4-monitoring/manifests/${openshift4_monitoring:manifests_version}/cloud-credential-operator.yaml
# Download cluster-version-operator rules YAML to folder
# `manifests_requiring_prerendering/`, because we cannot prerender
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,25 @@ spec:
syn_component: openshift4-monitoring
- name: syn-cluster-operators
rules:
- alert: SYN_CannotEvaluateConditionalUpdates
annotations:
description: Failure to evaluate conditional update matches means that
Cluster Version Operator cannot decide whether an update path is recommended
or not.
summary: Cluster Version Operator cannot evaluate conditional update matches
for {{ $value | humanizeDuration }}.
syn_component: openshift4-monitoring
expr: |
max by (version, condition, status, reason)
(
(
time()-cluster_version_conditional_update_condition_seconds{condition="Recommended", status="Unknown"}
) >= 3600
)
labels:
severity: warning
syn: 'true'
syn_component: openshift4-monitoring
- alert: SYN_ClusterOperatorDegraded
annotations:
description: The {{ $labels.name }} operator is degraded because {{ $labels.reason
Expand Down Expand Up @@ -392,13 +411,17 @@ spec:
syn_component: openshift4-monitoring
- alert: SYN_HighOverallControlPlaneCPU
annotations:
description: |-
On a multi-node cluster with three control plane nodes, the overall CPU utilization may only be about 2/3 of all available capacity. This is because if a single control plane node fails, the remaining two must handle the load of the cluster in order to be HA. If the cluster is using more than 2/3 of all capacity, if one control plane node fails, the remaining two are likely to fail when they take the load. To fix this, increase the CPU and memory on your control plane nodes.
On a single node OpenShift (SNO) cluster, this alert will also fire if the 2/3 of the CPU cores of the node are in use by any workload. This level of CPU utlization of an SNO cluster is probably not a problem under most circumstances, but high levels of utilization may result in degraded performance. To manage this alert or silence it in case of false positives see the following link: https://docs.openshift.com/container-platform/latest/monitoring/managing-alerts.html
description: Given three control plane nodes, the overall CPU utilization
may only be about 2/3 of all available capacity. This is because if
a single control plane node fails, the remaining two must handle the
load of the cluster in order to be HA. If the cluster is using more
than 2/3 of all capacity, if one control plane node fails, the remaining
two are likely to fail when they take the load. To fix this, increase
the CPU and memory on your control plane nodes.
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-kube-apiserver-operator/ExtremelyHighIndividualControlPlaneCPU.md
summary: CPU utilization across all control plane nodes is more than 60%
of the total available CPU. Control plane node outage may cause a cascading
failure; increase available CPU.
summary: CPU utilization across all three control plane nodes is higher
than two control plane nodes can sustain; a single control plane node
outage may cause a cascading failure; increase available CPU.
syn_component: openshift4-monitoring
expr: |
sum(
Expand Down Expand Up @@ -426,7 +449,7 @@ spec:
summary: etcd cluster database is running full.
syn_component: openshift4-monitoring
expr: |
(last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
(last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95
for: 10m
labels:
severity: critical
Expand Down Expand Up @@ -829,7 +852,7 @@ spec:
syn_component: openshift4-monitoring
expr: |
(
max without (revision) (
max by(namespace, statefulset) (
kube_statefulset_status_current_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
unless
kube_statefulset_status_update_revision{namespace=~"(appuio.*|cilium|default|kube-.*|openshift-.*|syn.*)",job="kube-state-metrics"}
Expand Down Expand Up @@ -1352,10 +1375,12 @@ spec:
runbook_url: https://github.com/openshift/runbooks/blob/master/alerts/cluster-monitoring-operator/NodeClockNotSynchronising.md
summary: Clock not synchronising.
syn_component: openshift4-monitoring
expr: |
expr: |-
(
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
and
node_timex_maxerror_seconds{job="node-exporter"} >= 16
) and on() absent(up{job="ptp-monitor-service"})
for: 10m
labels:
severity: critical
Expand All @@ -1367,7 +1392,8 @@ spec:
0.05s. Ensure NTP is configured correctly on this host.
summary: Clock skew detected.
syn_component: openshift4-monitoring
expr: |
expr: |-
(
(
node_timex_offset_seconds{job="node-exporter"} > 0.05
and
Expand All @@ -1379,6 +1405,7 @@ spec:
and
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
)
) and on() absent(up{job="ptp-monitor-service"})
for: 10m
labels:
severity: warning
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ spec:
configMapKeyRef:
key: silences.json
name: silence
image: quay.io/appuio/oc:v4.14
image: quay.io/appuio/oc:v4.15
imagePullPolicy: IfNotPresent
name: silence
ports: []
Expand Down
Loading

0 comments on commit 2dd550e

Please sign in to comment.