Update to OpenShift Logging 5.9 #141

Merged
merged 1 commit on Jul 22, 2024
3 changes: 3 additions & 0 deletions alerts.txt
@@ -1,14 +1,17 @@
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.6/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.6/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.7/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.7/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.8/config/prometheus/collector_alerts.yaml release-5.8/collector_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.9/config/prometheus/collector_alerts.yaml release-5.9/collector_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/config/prometheus/collector_alerts.yaml master/collector_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.6/files/prometheus_alerts.yml release-5.6/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.7/files/prometheus_alerts.yml release-5.7/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.8/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.9/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/master/files/prometheus_alerts.yml master/elasticsearch_operator_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/loki/release-5.6/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.6/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.7/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.7/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.8/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.8/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.9/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.9/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/main/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml master/lokistack_prometheus_alerts.yaml
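Each line in alerts.txt pairs an upstream raw URL with the local path the rule file is mirrored to, so the 5.9 bump mainly adds the two release-5.9 rows above (note that the release-5.9 elasticsearch-operator row still points at the upstream release-5.8 branch). The sync tooling itself is not part of this diff; the following is a minimal Python sketch assuming only the two-column, whitespace-separated format shown above — the function name and output directory are hypothetical.

# Hypothetical sync helper: download every "URL  target-path" pair listed in alerts.txt.
# Only the two-column format is taken from the file above; names and paths are assumptions.
import pathlib
import urllib.request

def sync_alerts(listing: str = 'alerts.txt', out_dir: str = 'extracted_alerts') -> None:
    for raw in pathlib.Path(listing).read_text().splitlines():
        line = raw.strip()
        if not line:
            continue  # blank separator lines between the operator sections
        url, target = line.split()
        dest = pathlib.Path(out_dir) / target  # e.g. extracted_alerts/release-5.9/collector_prometheus_alerts.yaml
        dest.parent.mkdir(parents=True, exist_ok=True)
        with urllib.request.urlopen(url) as resp:
            dest.write_bytes(resp.read())

if __name__ == '__main__':
    sync_alerts()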
2 changes: 1 addition & 1 deletion class/defaults.yml
@@ -8,7 +8,7 @@ parameters:
"False": {}

namespace: openshift-logging
version: '5.8'
version: '5.9'
channel: 'stable-${openshift4_logging:version}'
alerts: 'release-${openshift4_logging:version}'

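With version set to '5.9', the Reclass-style references directly above resolve to channel 'stable-5.9' and alerts 'release-5.9', which is what makes the component pick up the new release-5.9 alert files added in this PR. A quick illustration of that substitution, using plain Python string formatting to stand in for Reclass interpolation:

# Illustration only: Reclass resolves '${openshift4_logging:version}' during compilation;
# plain f-strings stand in for that interpolation here.
version = '5.9'
channel = f'stable-{version}'   # -> 'stable-5.9'
alerts = f'release-{version}'   # -> 'release-5.9'
print(channel, alerts)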
8 changes: 7 additions & 1 deletion component/alertrules.libsonnet
@@ -125,7 +125,13 @@ local prometheus_rules(name, groups, baseURL) = kube._Object('monitoring.coreos.

// Elasticstack alerts

local isVersion58 = if params.version == '5.8' || params.version == 'master' then true else false;
local isVersion58 =
  local major = std.split(params.version, '.')[0];
  local minor = std.split(params.version, '.')[1];
  if major == 'master' then true
  else if std.parseInt(major) >= 6 then true
  else if std.parseInt(major) == 5 && std.parseInt(minor) >= 8 then true
  else false;

local esStorageGroup = {
  name: 'elasticsearch_node_storage.alerts',
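The new isVersion58 definition replaces the old exact-match check ('5.8' or 'master') with a numeric comparison, so 5.9 and any future 6.x release keep the 5.8+ behaviour while 'master' is still treated as newest. For illustration only, a Python mirror of that logic and the values it yields for a few representative versions (the authoritative implementation is the Jsonnet above):

# Hypothetical Python mirror of the Jsonnet isVersion58 gate above.
def is_version_58_or_newer(version: str) -> bool:
    major = version.split('.')[0]
    if major == 'master':
        return True
    if int(major) >= 6:
        return True
    minor = version.split('.')[1]
    return int(major) == 5 and int(minor) >= 8

for v in ('5.7', '5.8', '5.9', '6.0', 'master'):
    print(v, is_version_58_or_newer(v))  # False, True, True, True, True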
@@ -0,0 +1,115 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: collector
  namespace: openshift-logging
spec:
  groups:
  - name: logging_collector.alerts
    rules:
    - alert: CollectorNodeDown
      annotations:
        message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
        summary: "Collector cannot be scraped"
      expr: |
        up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
      for: 10m
      labels:
        service: collector
        severity: critical
    - alert: CollectorHighErrorRate
      annotations:
        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
      expr: |
        100 * (
          collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
          /
          collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
        ) > 0.001
      for: 15m
      labels:
        service: collector
        severity: critical
    - alert: CollectorVeryHighErrorRate
      annotations:
        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
      expr: |
        100 * (
          collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
          /
          collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
        ) > 0.05
      for: 15m
      labels:
        service: collector
        severity: critical
    - alert: FluentdQueueLengthIncreasing
      annotations:
        message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously."
        summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}."
      expr: |
        sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 )
      for: 1h
      labels:
        service: collector
        severity: Warning
    - alert: ElasticsearchDeprecation
      annotations:
        message: "The OpenShift Elasticsearch Operator is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to using the OpenShift Elasticsearch Operator to manage the default log storage, you can use the Loki Operator."
        summary: "Detected Elasticsearch as the in-cluster storage which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
      for: 5m
      labels:
        service: storage
        severity: Warning
        namespace: openshift-logging
    - alert: FluentdDeprecation
      annotations:
        message: "Fluentd is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to Fluentd, you can use Vector instead."
        summary: "Detected Fluentd as the collector which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
      for: 5m
      labels:
        service: collector
        severity: Warning
        namespace: openshift-logging
    - alert: KibanaDeprecation
      annotations:
        message: "The Kibana web console is now deprecated and is planned to be removed in a future logging release."
        summary: "Detected Kibana as the visualization which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
      for: 5m
      labels:
        service: visualization
        severity: Warning
        namespace: openshift-logging
    - alert: DiskBufferUsage
      annotations:
        message: "Collectors potentially consuming too much node disk, {{ $value }}% "
        summary: "Detected consuming too much node disk on $labels.hostname host"
      expr: |
        (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)')
        / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15
      for: 5m
      labels:
        service: collector
        severity: Warning
  - name: logging_clusterlogging_telemetry.rules
    rules:
    - expr: |
        sum by(cluster)(log_collected_bytes_total)
      record: cluster:log_collected_bytes_total:sum
    - expr: |
        sum by(cluster)(log_logged_bytes_total)
      record: cluster:log_logged_bytes_total:sum
    - expr: |
        sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m]))
      record: collector:log_num_errors:sum_rate
    - expr: |
        sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m]))
      record: collector:received_events:sum_rate
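The two collector:*:sum_rate recording rules at the end feed the error-rate alerts above: per pod, the error rate divided by the received-events rate, scaled to a percentage, must stay below 0.001 (high) and 0.05 (very high). A worked example of that arithmetic, with invented sample rates:

# Invented sample rates for one collector pod: 2000 events/s received, 0.05 errors/s.
received_events_per_s = 2000.0
errors_per_s = 0.05

error_pct = 100 * (errors_per_s / received_events_per_s)  # 0.0025
print(error_pct > 0.001)  # True  -> CollectorHighErrorRate fires after 15m
print(error_pct > 0.05)   # False -> CollectorVeryHighErrorRate stays silent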
@@ -0,0 +1,224 @@
---
"groups":
- "name": logging_elasticsearch.alerts
  "rules":
  - "alert": ElasticsearchClusterNotHealthy
    "annotations":
      "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
      "summary": "Cluster health status is RED"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red"
    "expr": |
      sum by (cluster) (es_cluster_status == 2)
    "for": 7m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchClusterNotHealthy
    "annotations":
      "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated."
      "summary": "Cluster health status is YELLOW"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow"
    "expr": |
      sum by (cluster) (es_cluster_status == 1)
    "for": 20m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchWriteRequestsRejectionJumps
    "annotations":
      "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
      "summary": "High Write Rejection Ratio - {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps"
    "expr": |
      round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
      "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
      "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark."
      "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchJVMHeapUseHigh
    "annotations":
      "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "JVM Heap usage on the node is high"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": AggregatedLoggingSystemCPUHigh
    "annotations":
      "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "System CPU usage is high"
      "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_os_cpu_percent) > 90
    "for": 1m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchProcessCPUHigh
    "annotations":
      "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "ES process CPU usage is high"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_process_cpu_percent) > 90
    "for": 1m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchDiskSpaceRunningLow
    "annotations":
      "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h."
      "summary": "Cluster low on disk space"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low"
    "expr": |
      sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchHighFileDescriptorUsage
    "annotations":
      "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour."
      "summary": "Cluster low on file descriptors"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high"
    "expr": |
      predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchOperatorCSVNotSuccessful
    "annotations":
      "message": "Elasticsearch Operator CSV has not reconciled succesfully."
      "summary": "Elasticsearch Operator CSV Not Successful"
    "expr": |
      csv_succeeded{name =~ "elasticsearch-operator.*"} == 0
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
      "summary": "Disk Low Watermark is predicted to be reached within next 6h."
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) /
            predict_linear(es_fs_path_total_bytes[3h], 6 * 3600)
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
      "summary": "Disk High Watermark is predicted to be reached within next 6h."
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) /
            predict_linear(es_fs_path_total_bytes[3h], 6 * 3600)
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark."
      "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h."
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) /
            predict_linear(es_fs_path_total_bytes[3h], 6 * 3600)
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": warning