Update to OpenShift Logging 5.9 #141

Merged
merged 1 commit on Jul 22, 2024
3 changes: 3 additions & 0 deletions alerts.txt
@@ -1,14 +1,17 @@
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.6/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.6/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.7/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.7/fluentd_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.8/config/prometheus/collector_alerts.yaml release-5.8/collector_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.9/config/prometheus/collector_alerts.yaml release-5.9/collector_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/config/prometheus/collector_alerts.yaml master/collector_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.6/files/prometheus_alerts.yml release-5.6/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.7/files/prometheus_alerts.yml release-5.7/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.8/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.9/elasticsearch_operator_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/elasticsearch-operator/master/files/prometheus_alerts.yml master/elasticsearch_operator_prometheus_alerts.yaml

https://raw.githubusercontent.com/openshift/loki/release-5.6/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.6/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.7/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.7/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.8/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.8/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/release-5.9/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.9/lokistack_prometheus_alerts.yaml
https://raw.githubusercontent.com/openshift/loki/main/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml master/lokistack_prometheus_alerts.yaml
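Each line in alerts.txt pairs an upstream raw URL with the local path the rule file is mirrored to, so the 5.9 bump mainly adds the two release-5.9 rows above (note that the release-5.9 elasticsearch-operator row still points at the upstream release-5.8 branch). The sync tooling itself is not part of this diff; the following is a minimal Python sketch assuming only the two-column, whitespace-separated format shown above — the function name and output directory are hypothetical.

# Hypothetical sync helper: download every "URL  target-path" pair listed in alerts.txt.
# Only the two-column format is taken from the file above; names and paths are assumptions.
import pathlib
import urllib.request

def sync_alerts(listing: str = 'alerts.txt', out_dir: str = 'extracted_alerts') -> None:
    for raw in pathlib.Path(listing).read_text().splitlines():
        line = raw.strip()
        if not line:
            continue  # blank separator lines between the operator sections
        url, target = line.split()
        dest = pathlib.Path(out_dir) / target  # e.g. extracted_alerts/release-5.9/collector_prometheus_alerts.yaml
        dest.parent.mkdir(parents=True, exist_ok=True)
        with urllib.request.urlopen(url) as resp:
            dest.write_bytes(resp.read())

if __name__ == '__main__':
    sync_alerts()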
2 changes: 1 addition & 1 deletion class/defaults.yml
@@ -8,7 +8,7 @@ parameters:
"False": {}

namespace: openshift-logging
version: '5.8'
version: '5.9'
channel: 'stable-${openshift4_logging:version}'
alerts: 'release-${openshift4_logging:version}'

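With version set to '5.9', the Reclass-style references directly above resolve to channel 'stable-5.9' and alerts 'release-5.9', which is what makes the component pick up the new release-5.9 alert files added in this PR. A quick illustration of that substitution, using plain Python string formatting to stand in for Reclass interpolation:

# Illustration only: Reclass resolves '${openshift4_logging:version}' during compilation;
# plain f-strings stand in for that interpolation here.
version = '5.9'
channel = f'stable-{version}'   # -> 'stable-5.9'
alerts = f'release-{version}'   # -> 'release-5.9'
print(channel, alerts)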
8 changes: 7 additions & 1 deletion component/alertrules.libsonnet
@@ -125,7 +125,13 @@ local prometheus_rules(name, groups, baseURL) = kube._Object('monitoring.coreos.

// Elasticstack alerts

local isVersion58 = if params.version == '5.8' || params.version == 'master' then true else false;
local isVersion58 =
  local major = std.split(params.version, '.')[0];
  local minor = std.split(params.version, '.')[1];
  if major == 'master' then true
  else if std.parseInt(major) >= 6 then true
  else if std.parseInt(major) == 5 && std.parseInt(minor) >= 8 then true
  else false;

local esStorageGroup = {
  name: 'elasticsearch_node_storage.alerts',
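The new isVersion58 definition replaces the old exact-match check ('5.8' or 'master') with a numeric comparison, so 5.9 and any future 6.x release keep the 5.8+ behaviour while 'master' is still treated as newest. For illustration only, a Python mirror of that logic and the values it yields for a few representative versions (the authoritative implementation is the Jsonnet above):

# Hypothetical Python mirror of the Jsonnet isVersion58 gate above.
def is_version_58_or_newer(version: str) -> bool:
    major = version.split('.')[0]
    if major == 'master':
        return True
    if int(major) >= 6:
        return True
    minor = version.split('.')[1]
    return int(major) == 5 and int(minor) >= 8

for v in ('5.7', '5.8', '5.9', '6.0', 'master'):
    print(v, is_version_58_or_newer(v))  # False, True, True, True, True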
@@ -0,0 +1,115 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: collector
  namespace: openshift-logging
spec:
  groups:
  - name: logging_collector.alerts
    rules:
    - alert: CollectorNodeDown
      annotations:
        message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m."
        summary: "Collector cannot be scraped"
      expr: |
        up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0
      for: 10m
      labels:
        service: collector
        severity: critical
    - alert: CollectorHighErrorRate
      annotations:
        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high"
      expr: |
        100 * (
          collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
          /
          collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
        ) > 0.001
      for: 15m
      labels:
        service: collector
        severity: critical
    - alert: CollectorVeryHighErrorRate
      annotations:
        message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component."
        summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high"
      expr: |
        100 * (
          collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
          /
          collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"}
        ) > 0.05
      for: 15m
      labels:
        service: collector
        severity: critical
    - alert: FluentdQueueLengthIncreasing
      annotations:
        message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously."
        summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}."
      expr: |
        sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 )
      for: 1h
      labels:
        service: collector
        severity: Warning
    - alert: ElasticsearchDeprecation
      annotations:
        message: "The OpenShift Elasticsearch Operator is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to using the OpenShift Elasticsearch Operator to manage the default log storage, you can use the Loki Operator."
        summary: "Detected Elasticsearch as the in-cluster storage which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging",label_component='elasticsearch'}) > 0
      for: 5m
      labels:
        service: storage
        severity: Warning
        namespace: openshift-logging
    - alert: FluentdDeprecation
      annotations:
        message: "Fluentd is deprecated and is planned to be removed in a future release. Red Hat provides bug fixes and support for this feature during the current release lifecycle, but this feature no longer receives enhancements. As an alternative to Fluentd, you can use Vector instead."
        summary: "Detected Fluentd as the collector which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging", label_implementation='fluentd', label_app_kubernetes_io_managed_by="cluster-logging-operator"}) > 0
      for: 5m
      labels:
        service: collector
        severity: Warning
        namespace: openshift-logging
    - alert: KibanaDeprecation
      annotations:
        message: "The Kibana web console is now deprecated and is planned to be removed in a future logging release."
        summary: "Detected Kibana as the visualization which is deprecated and will be removed in a future release."
      expr: |
        sum(kube_pod_labels{namespace="openshift-logging",label_component='kibana'}) > 0
      for: 5m
      labels:
        service: visualization
        severity: Warning
        namespace: openshift-logging
    - alert: DiskBufferUsage
      annotations:
        message: "Collectors potentially consuming too much node disk, {{ $value }}% "
        summary: "Detected consuming too much node disk on $labels.hostname host"
      expr: |
        (label_replace(sum by(hostname) (vector_buffer_byte_size{component_kind='sink', buffer_type='disk'}), 'instance', '$1', 'hostname', '(.*)')
        / on(instance) group_left() sum by(instance) (node_filesystem_size_bytes{mountpoint='/var'})) * 100 > 15
      for: 5m
      labels:
        service: collector
        severity: Warning
  - name: logging_clusterlogging_telemetry.rules
    rules:
    - expr: |
        sum by(cluster)(log_collected_bytes_total)
      record: cluster:log_collected_bytes_total:sum
    - expr: |
        sum by(cluster)(log_logged_bytes_total)
      record: cluster:log_logged_bytes_total:sum
    - expr: |
        sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m]))
      record: collector:log_num_errors:sum_rate
    - expr: |
        sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m]))
      record: collector:received_events:sum_rate
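The two collector:*:sum_rate recording rules at the end feed the error-rate alerts above: per pod, the error rate divided by the received-events rate, scaled to a percentage, must stay below 0.001 (high) and 0.05 (very high). A worked example of that arithmetic, with invented sample rates:

# Invented sample rates for one collector pod: 2000 events/s received, 0.05 errors/s.
received_events_per_s = 2000.0
errors_per_s = 0.05

error_pct = 100 * (errors_per_s / received_events_per_s)  # 0.0025
print(error_pct > 0.001)  # True  -> CollectorHighErrorRate fires after 15m
print(error_pct > 0.05)   # False -> CollectorVeryHighErrorRate stays silent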
@@ -0,0 +1,224 @@
---
"groups":
- "name": logging_elasticsearch.alerts
  "rules":
  - "alert": ElasticsearchClusterNotHealthy
    "annotations":
      "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet."
      "summary": "Cluster health status is RED"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red"
    "expr": |
      sum by (cluster) (es_cluster_status == 2)
    "for": 7m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchClusterNotHealthy
    "annotations":
      "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated."
      "summary": "Cluster health status is YELLOW"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow"
    "expr": |
      sum by (cluster) (es_cluster_status == 1)
    "for": 20m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchWriteRequestsRejectionJumps
    "annotations":
      "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed."
      "summary": "High Write Rejection Ratio - {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps"
    "expr": |
      round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
      "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
      "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark."
      "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            es_fs_path_available_bytes /
            es_fs_path_total_bytes
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct
    "for": 5m
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchJVMHeapUseHigh
    "annotations":
      "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "JVM Heap usage on the node is high"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": AggregatedLoggingSystemCPUHigh
    "annotations":
      "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "System CPU usage is high"
      "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_os_cpu_percent) > 90
    "for": 1m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchProcessCPUHigh
    "annotations":
      "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%."
      "summary": "ES process CPU usage is high"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High"
    "expr": |
      sum by (cluster, instance, node) (es_process_cpu_percent) > 90
    "for": 1m
    "labels":
      "namespace": openshift-logging
      "severity": info

  - "alert": ElasticsearchDiskSpaceRunningLow
    "annotations":
      "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h."
      "summary": "Cluster low on disk space"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low"
    "expr": |
      sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": critical

  - "alert": ElasticsearchHighFileDescriptorUsage
    "annotations":
      "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour."
      "summary": "Cluster low on file descriptors"
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high"
    "expr": |
      predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchOperatorCSVNotSuccessful
    "annotations":
      "message": "Elasticsearch Operator CSV has not reconciled succesfully."
      "summary": "Elasticsearch Operator CSV Not Successful"
    "expr": |
      csv_succeeded{name =~ "elasticsearch-operator.*"} == 0
    "for": 10m
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node."
      "summary": "Disk Low Watermark is predicted to be reached within next 6h."
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) /
            predict_linear(es_fs_path_total_bytes[3h], 6 * 3600)
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node."
      "summary": "Disk High Watermark is predicted to be reached within next 6h."
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) /
            predict_linear(es_fs_path_total_bytes[3h], 6 * 3600)
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": warning

  - "alert": ElasticsearchNodeDiskWatermarkReached
    "annotations":
      "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark."
      "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h."
      "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached"
    "expr": |
      sum by (instance, pod) (
        round(
          (1 - (
            predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) /
            predict_linear(es_fs_path_total_bytes[3h], 6 * 3600)
          )
        ) * 100, 0.001)
      ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct
    "for": 1h
    "labels":
      "namespace": openshift-logging
      "severity": warning