From 2a308e721a8655609014d96b142800596ac8ac48 Mon Sep 17 00:00:00 2001 From: Stephan Feurer Date: Mon, 8 Jan 2024 15:08:27 +0100 Subject: [PATCH] Support OpenShift Logging 5.8 --- alerts.txt | 5 +- class/defaults.yml | 4 +- component/alertrules.libsonnet | 10 +- .../master/collector_prometheus_alerts.yaml | 71 ++++++ .../master/fluentd_prometheus_alerts.yaml | 64 ----- .../collector_prometheus_alerts.yaml | 71 ++++++ ...sticsearch_operator_prometheus_alerts.yaml | 224 ++++++++++++++++++ .../lokistack_prometheus_alerts.yaml | 177 ++++++++++++++ component/loki.libsonnet | 16 +- .../ROOT/pages/references/parameters.adoc | 2 +- .../openshift4-logging/20_subscriptions.yaml | 4 +- .../60_collector_alerts.yaml | 61 +++++ .../60_elasticsearch_alerts.yaml | 50 ---- .../openshift4-logging/20_subscriptions.yaml | 4 +- .../openshift4-logging/50_loki_stack.yaml | 2 +- .../60_collector_alerts.yaml | 61 +++++ .../60_collector_alerts.yaml | 61 +++++ .../60_elasticsearch_alerts.yaml | 50 ---- .../openshift4-logging/20_subscriptions.yaml | 4 +- .../60_collector_alerts.yaml | 61 +++++ .../60_elasticsearch_alerts.yaml | 50 ---- tests/master.yml | 4 +- 22 files changed, 819 insertions(+), 237 deletions(-) create mode 100644 component/extracted_alerts/master/collector_prometheus_alerts.yaml delete mode 100644 component/extracted_alerts/master/fluentd_prometheus_alerts.yaml create mode 100644 component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml create mode 100644 component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml create mode 100644 component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml create mode 100644 tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml create mode 100644 tests/golden/lokistack/openshift4-logging/openshift4-logging/60_collector_alerts.yaml create mode 100644 tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml create mode 100644 tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml diff --git a/alerts.txt b/alerts.txt index b34b388..bb2bdfb 100644 --- a/alerts.txt +++ b/alerts.txt @@ -1,11 +1,14 @@ https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.6/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.6/fluentd_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.7/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert release-5.7/fluentd_prometheus_alerts.yaml -https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/internal/metrics/alerts/fluentd.go.FluentdPrometheusAlert master/fluentd_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/cluster-logging-operator/release-5.8/config/prometheus/collector_alerts.yaml release-5.8/collector_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/cluster-logging-operator/master/config/prometheus/collector_alerts.yaml master/collector_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.6/files/prometheus_alerts.yml release-5.6/elasticsearch_operator_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.7/files/prometheus_alerts.yml release-5.7/elasticsearch_operator_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/elasticsearch-operator/release-5.8/files/prometheus_alerts.yml release-5.8/elasticsearch_operator_prometheus_alerts.yaml 
https://raw.githubusercontent.com/openshift/elasticsearch-operator/master/files/prometheus_alerts.yml master/elasticsearch_operator_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/loki/release-5.6/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.6/lokistack_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/loki/release-5.7/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.7/lokistack_prometheus_alerts.yaml +https://raw.githubusercontent.com/openshift/loki/release-5.8/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml release-5.8/lokistack_prometheus_alerts.yaml https://raw.githubusercontent.com/openshift/loki/main/operator/internal/manifests/internal/alerts/prometheus-alerts.yaml master/lokistack_prometheus_alerts.yaml diff --git a/class/defaults.yml b/class/defaults.yml index e8250c4..ea95628 100644 --- a/class/defaults.yml +++ b/class/defaults.yml @@ -8,7 +8,7 @@ parameters: "False": {} namespace: openshift-logging - version: '5.7' + version: '5.8' channel: 'stable-${openshift4_logging:version}' alerts: 'release-${openshift4_logging:version}' @@ -29,7 +29,7 @@ parameters: endpoint: '' bucketnames: '${cluster:name}-logstore' spec: - size: 1x.extra-small + size: 1x.demo storage: schemas: - version: v12 diff --git a/component/alertrules.libsonnet b/component/alertrules.libsonnet index 21e523b..853a579 100644 --- a/component/alertrules.libsonnet +++ b/component/alertrules.libsonnet @@ -125,14 +125,17 @@ local prometheus_rules(name, groups, baseURL) = kube._Object('monitoring.coreos. // Elasticstack alerts +local isVersion58 = if params.version == '5.8' || params.version == 'master' then true else false; + local esStorageGroup = { name: 'elasticsearch_node_storage.alerts', rules: [ predictESStorage ], }; +local fluentdGroup = if !isVersion58 then loadFile('fluentd_prometheus_alerts.yaml')[0].groups else []; local esGroups = loadFile('elasticsearch_operator_prometheus_alerts.yaml')[0].groups + - loadFile('fluentd_prometheus_alerts.yaml')[0].groups + + fluentdGroup + [ if predict_storage_alert.enabled then esStorageGroup, ]; @@ -143,7 +146,12 @@ local esBaseURL = 'https://github.com/openshift/elasticsearch-operator/blob/mast local lokiGroups = loadFile('lokistack_prometheus_alerts.yaml')[0].groups; local lokiBaseURL = 'https://github.com/grafana/loki/blob/main/operator/docs/lokistack/sop.md'; +// Collector alerts + +local collectorGroups = loadFile('collector_prometheus_alerts.yaml')[0].spec.groups; + { [if elasticsearch.enabled then '60_elasticsearch_alerts']: prometheus_rules('syn-elasticsearch-logging-rules', esGroups, esBaseURL), [if loki.enabled then '60_lokistack_alerts']: prometheus_rules('syn-loki-logging-rules', lokiGroups, lokiBaseURL), + [if isVersion58 then '60_collector_alerts']: prometheus_rules('syn-collector-rules', collectorGroups, ''), } diff --git a/component/extracted_alerts/master/collector_prometheus_alerts.yaml b/component/extracted_alerts/master/collector_prometheus_alerts.yaml new file mode 100644 index 0000000..c4f1663 --- /dev/null +++ b/component/extracted_alerts/master/collector_prometheus_alerts.yaml @@ -0,0 +1,71 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: collector + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: CollectorNodeDown + annotations: + message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." 
+ summary: "Collector cannot be scraped" + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + - alert: CollectorHighErrorRate + annotations: + message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." + summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high" + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.001 + for: 15m + labels: + service: collector + severity: critical + - alert: CollectorVeryHighErrorRate + annotations: + message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." + summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high" + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.05 + for: 15m + labels: + service: collector + severity: critical + - alert: FluentdQueueLengthIncreasing + annotations: + message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." + summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." + expr: | + sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) + for: 1h + labels: + service: collector + severity: Warning + - name: logging_clusterlogging_telemetry.rules + rules: + - expr: | + sum by(cluster)(log_collected_bytes_total) + record: cluster:log_collected_bytes_total:sum + - expr: | + sum by(cluster)(log_logged_bytes_total) + record: cluster:log_logged_bytes_total:sum + - expr: | + sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m])) + record: collector:log_num_errors:sum_rate + - expr: | + sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m])) + record: collector:received_events:sum_rate diff --git a/component/extracted_alerts/master/fluentd_prometheus_alerts.yaml b/component/extracted_alerts/master/fluentd_prometheus_alerts.yaml deleted file mode 100644 index 7772c47..0000000 --- a/component/extracted_alerts/master/fluentd_prometheus_alerts.yaml +++ /dev/null @@ -1,64 +0,0 @@ - -"groups": -- "name": "logging_fluentd.alerts" - "rules": - - "alert": "FluentdNodeDown" - "annotations": - "message": "Prometheus could not scrape fluentd {{ $labels.container }} for more than 10m." 
- "summary": "Fluentd cannot be scraped" - "expr": | - up{job = "collector", container = "collector"} == 0 or absent(up{job="collector", container="collector"}) == 1 - "for": "10m" - "labels": - "service": "collector" - "severity": "critical" - namespace: "openshift-logging" - - "alert": "FluentdQueueLengthIncreasing" - "annotations": - "message": "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." - "summary": "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." - "expr": | - sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) - "for": "1h" - "labels": - "service": "collector" - "severity": "Warning" - namespace: "openshift-logging" - - alert: FluentDHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. - summary: FluentD output errors are high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 10 - for: 15m - labels: - severity: warning - namespace: "openshift-logging" - - alert: FluentDVeryHighErrorRate - annotations: - message: |- - {{ $value }}% of records have resulted in an error by fluentd {{ $labels.instance }}. - summary: FluentD output errors are very high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 25 - for: 15m - labels: - severity: critical - namespace: "openshift-logging" -- "name": "logging_clusterlogging_telemetry.rules" - "rules": - - "expr": | - sum by(cluster)(log_collected_bytes_total) - "record": "cluster:log_collected_bytes_total:sum" - - "expr": | - sum by(cluster)(log_logged_bytes_total) - "record": "cluster:log_logged_bytes_total:sum" diff --git a/component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml new file mode 100644 index 0000000..c4f1663 --- /dev/null +++ b/component/extracted_alerts/release-5.8/collector_prometheus_alerts.yaml @@ -0,0 +1,71 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: collector + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: CollectorNodeDown + annotations: + message: "Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod }} collector component for more than 10m." + summary: "Collector cannot be scraped" + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + - alert: CollectorHighErrorRate + annotations: + message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." 
+ summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are high" + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.001 + for: 15m + labels: + service: collector + severity: critical + - alert: CollectorVeryHighErrorRate + annotations: + message: "{{ $value }}% of records have resulted in an error by {{ $labels.namespace }}/{{ $labels.pod }} collector component." + summary: "{{ $labels.namespace }}/{{ $labels.pod }} collector component errors are very high" + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.05 + for: 15m + labels: + service: collector + severity: critical + - alert: FluentdQueueLengthIncreasing + annotations: + message: "For the last hour, fluentd {{ $labels.pod }} output '{{ $labels.plugin_id }}' average buffer queue length has increased continuously." + summary: "Fluentd pod {{ $labels.pod }} is unable to keep up with traffic over time for forwarder output {{ $labels.plugin_id }}." + expr: | + sum by (pod,plugin_id) ( 0 * (deriv(fluentd_output_status_emit_records[1m] offset 1h))) + on(pod,plugin_id) ( deriv(fluentd_output_status_buffer_queue_length[10m]) > 0 and delta(fluentd_output_status_buffer_queue_length[1h]) > 1 ) + for: 1h + labels: + service: collector + severity: Warning + - name: logging_clusterlogging_telemetry.rules + rules: + - expr: | + sum by(cluster)(log_collected_bytes_total) + record: cluster:log_collected_bytes_total:sum + - expr: | + sum by(cluster)(log_logged_bytes_total) + record: cluster:log_logged_bytes_total:sum + - expr: | + sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_errors_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_num_errors[2m])) + record: collector:log_num_errors:sum_rate + - expr: | + sum by(pod, namespace, app_kubernetes_io_part_of)(rate(vector_component_received_events_total[2m])) or sum by(pod, namespace, app_kubernetes_io_part_of)(rate(fluentd_output_status_emit_records[2m])) + record: collector:received_events:sum_rate diff --git a/component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml new file mode 100644 index 0000000..8f79010 --- /dev/null +++ b/component/extracted_alerts/release-5.8/elasticsearch_operator_prometheus_alerts.yaml @@ -0,0 +1,224 @@ +--- +"groups": +- "name": logging_elasticsearch.alerts + "rules": + - "alert": ElasticsearchClusterNotHealthy + "annotations": + "message": "Cluster {{ $labels.cluster }} health status has been RED for at least 7m. Cluster does not accept writes, shards may be missing or master node hasn't been elected yet." + "summary": "Cluster health status is RED" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Red" + "expr": | + sum by (cluster) (es_cluster_status == 2) + "for": 7m + "labels": + "namespace": openshift-logging + "severity": critical + + - "alert": ElasticsearchClusterNotHealthy + "annotations": + "message": "Cluster {{ $labels.cluster }} health status has been YELLOW for at least 20m. Some shard replicas are not allocated." 
+ "summary": "Cluster health status is YELLOW" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Cluster-Health-is-Yellow" + "expr": | + sum by (cluster) (es_cluster_status == 1) + "for": 20m + "labels": + "namespace": openshift-logging + "severity": warning + + - "alert": ElasticsearchWriteRequestsRejectionJumps + "annotations": + "message": "High Write Rejection Ratio at {{ $labels.node }} node in {{ $labels.cluster }} cluster. This node may not be keeping up with the indexing speed." + "summary": "High Write Rejection Ratio - {{ $value }}%" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Write-Requests-Rejection-Jumps" + "expr": | + round( writing:reject_ratio:rate2m * 100, 0.001 ) > 5 + "for": 10m + "labels": + "namespace": openshift-logging + "severity": warning + + - "alert": ElasticsearchNodeDiskWatermarkReached + "annotations": + "message": "Disk Low Watermark Reached at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." + "summary": "Disk Low Watermark Reached - disk saturation is {{ $value }}%" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" + "expr": | + sum by (instance, pod) ( + round( + (1 - ( + es_fs_path_available_bytes / + es_fs_path_total_bytes + ) + ) * 100, 0.001) + ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct + "for": 5m + "labels": + "namespace": openshift-logging + "severity": info + + - "alert": ElasticsearchNodeDiskWatermarkReached + "annotations": + "message": "Disk High Watermark Reached at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. Make sure more disk space is added to the node or drop old indices allocated to this node." + "summary": "Disk High Watermark Reached - disk saturation is {{ $value }}%" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" + "expr": | + sum by (instance, pod) ( + round( + (1 - ( + es_fs_path_available_bytes / + es_fs_path_total_bytes + ) + ) * 100, 0.001) + ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct + "for": 5m + "labels": + "namespace": openshift-logging + "severity": critical + + - "alert": ElasticsearchNodeDiskWatermarkReached + "annotations": + "message": "Disk Flood Stage Watermark Reached at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." + "summary": "Disk Flood Stage Watermark Reached - disk saturation is {{ $value }}%" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" + "expr": | + sum by (instance, pod) ( + round( + (1 - ( + es_fs_path_available_bytes / + es_fs_path_total_bytes + ) + ) * 100, 0.001) + ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct + "for": 5m + "labels": + "namespace": openshift-logging + "severity": critical + + - "alert": ElasticsearchJVMHeapUseHigh + "annotations": + "message": "JVM Heap usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." 
+ "summary": "JVM Heap usage on the node is high" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-JVM-Heap-Use-is-High" + "expr": | + sum by (cluster, instance, node) (es_jvm_mem_heap_used_percent) > 75 + "for": 10m + "labels": + "namespace": openshift-logging + "severity": info + + - "alert": AggregatedLoggingSystemCPUHigh + "annotations": + "message": "System CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." + "summary": "System CPU usage is high" + "runbook_url": "[[.RunbookBaseURL]]#Aggregated-Logging-System-CPU-is-High" + "expr": | + sum by (cluster, instance, node) (es_os_cpu_percent) > 90 + "for": 1m + "labels": + "namespace": openshift-logging + "severity": info + + - "alert": ElasticsearchProcessCPUHigh + "annotations": + "message": "ES process CPU usage on the node {{ $labels.node }} in {{ $labels.cluster }} cluster is {{ $value }}%." + "summary": "ES process CPU usage is high" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Process-CPU-is-High" + "expr": | + sum by (cluster, instance, node) (es_process_cpu_percent) > 90 + "for": 1m + "labels": + "namespace": openshift-logging + "severity": info + + - "alert": ElasticsearchDiskSpaceRunningLow + "annotations": + "message": "Cluster {{ $labels.cluster }} is predicted to be out of disk space within the next 6h." + "summary": "Cluster low on disk space" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Disk-Space-is-Running-Low" + "expr": | + sum(predict_linear(es_fs_path_available_bytes[6h], 6 * 3600)) < 0 + "for": 1h + "labels": + "namespace": openshift-logging + "severity": critical + + - "alert": ElasticsearchHighFileDescriptorUsage + "annotations": + "message": "Cluster {{ $labels.cluster }} is predicted to be out of file descriptors within the next hour." + "summary": "Cluster low on file descriptors" + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-FileDescriptor-Usage-is-high" + "expr": | + predict_linear(es_process_file_descriptors_max_number[1h], 3600) - predict_linear(es_process_file_descriptors_open_number[1h], 3600) < 0 + "for": 10m + "labels": + "namespace": openshift-logging + "severity": warning + + - "alert": ElasticsearchOperatorCSVNotSuccessful + "annotations": + "message": "Elasticsearch Operator CSV has not reconciled succesfully." + "summary": "Elasticsearch Operator CSV Not Successful" + "expr": | + csv_succeeded{name =~ "elasticsearch-operator.*"} == 0 + "for": 10m + "labels": + "namespace": openshift-logging + "severity": warning + + - "alert": ElasticsearchNodeDiskWatermarkReached + "annotations": + "message": "Disk Low Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Shards can not be allocated to this node anymore. You should consider adding more disk to the node." + "summary": "Disk Low Watermark is predicted to be reached within next 6h." + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Low-Watermark-Reached" + "expr": | + sum by (instance, pod) ( + round( + (1 - ( + predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / + predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) + ) + ) * 100, 0.001) + ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_low_pct + "for": 1h + "labels": + "namespace": openshift-logging + "severity": warning + + - "alert": ElasticsearchNodeDiskWatermarkReached + "annotations": + "message": "Disk High Watermark is predicted to be reached within the next 6h at {{ $labels.pod }} pod. Some shards will be re-allocated to different nodes if possible. 
Make sure more disk space is added to the node or drop old indices allocated to this node." + "summary": "Disk High Watermark is predicted to be reached within next 6h." + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-High-Watermark-Reached" + "expr": | + sum by (instance, pod) ( + round( + (1 - ( + predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / + predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) + ) + ) * 100, 0.001) + ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_high_pct + "for": 1h + "labels": + "namespace": openshift-logging + "severity": warning + + - "alert": ElasticsearchNodeDiskWatermarkReached + "annotations": + "message": "Disk Flood Stage Watermark is predicted to be reached within the next 6h at {{ $labels.pod }}. Every index having a shard allocated on this node is enforced a read-only block. The index block must be released manually when the disk utilization falls below the high watermark." + "summary": "Disk Flood Stage Watermark is predicted to be reached within next 6h." + "runbook_url": "[[.RunbookBaseURL]]#Elasticsearch-Node-Disk-Flood-Watermark-Reached" + "expr": | + sum by (instance, pod) ( + round( + (1 - ( + predict_linear(es_fs_path_available_bytes[3h], 6 * 3600) / + predict_linear(es_fs_path_total_bytes[3h], 6 * 3600) + ) + ) * 100, 0.001) + ) > on(instance, pod) es_cluster_routing_allocation_disk_watermark_flood_stage_pct + "for": 1h + "labels": + "namespace": openshift-logging + "severity": warning \ No newline at end of file diff --git a/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml b/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml new file mode 100644 index 0000000..f378c49 --- /dev/null +++ b/component/extracted_alerts/release-5.8/lokistack_prometheus_alerts.yaml @@ -0,0 +1,177 @@ +--- +groups: +- name: logging_loki.alerts + rules: + - alert: LokiRequestErrors + annotations: + message: |- + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + summary: "At least 10% of requests are responded by 5xx server errors." + runbook_url: "[[ .RunbookURL ]]#Loki-Request-Errors" + expr: | + sum( + job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code=~"5.."} + ) by (job, namespace, route) + / + sum( + job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m + ) by (job, namespace, route) + * 100 + > 10 + for: 15m + labels: + severity: critical + - alert: LokiStackWriteRequestErrors + annotations: + message: |- + {{ printf "%.2f" $value }}% of write requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. + summary: "At least 10% of write requests to the lokistack-gateway are responded with 5xx server errors." + runbook_url: "[[ .RunbookURL ]]#LokiStack-Write-Request-Errors" + expr: | + sum( + code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler="push"} + ) by (job, namespace) + / + sum( + code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler="push"} + ) by (job, namespace) + * 100 + > 10 + for: 15m + labels: + severity: critical + - alert: LokiStackReadRequestErrors + annotations: + message: |- + {{ printf "%.2f" $value }}% of query requests from {{ $labels.job }} in {{ $labels.namespace }} are returned with server errors. + summary: "At least 10% of query requests to the lokistack-gateway are responded with 5xx server errors." 
+ runbook_url: "[[ .RunbookURL ]]#LokiStack-Read-Request-Errors" + expr: | + sum( + code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{code=~"5..", handler=~"query|query_range|label|labels|label_values"} + ) by (job, namespace) + / + sum( + code_handler_job_namespace:lokistack_gateway_http_requests:irate1m{handler=~"query|query_range|label|labels|label_values"} + ) by (job, namespace) + * 100 + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + message: |- + {{ $labels.job }} is experiencing an increase of {{ $value }} panics. + summary: "A panic was triggered." + runbook_url: "[[ .RunbookURL ]]#Loki-Request-Panics" + expr: | + sum( + increase( + loki_panic_total[10m] + ) + ) by (job, namespace) + > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + message: |- + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + summary: "The 99th percentile is experiencing high latency (higher than 1 second)." + runbook_url: "[[ .RunbookURL ]]#Loki-Request-Latency" + expr: | + histogram_quantile(0.99, + sum( + irate( + loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[1m] + ) + ) by (job, le, namespace, route) + ) + > 1 + for: 15m + labels: + severity: critical + - alert: LokiTenantRateLimit + annotations: + message: |- + {{ $labels.job }} {{ $labels.route }} is experiencing 429 errors. + summary: "At least 10% of requests are responded with the rate limit error code." + runbook_url: "[[ .RunbookURL ]]#Loki-Tenant-Rate-Limit" + expr: | + sum( + job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m{status_code="429"} + ) by (job, namespace, route) + / + sum( + job_namespace_route_statuscode:loki_request_duration_seconds_count:irate1m + ) by (job, namespace, route) + * 100 + > 10 + for: 15m + labels: + severity: warning + - alert: LokiStorageSlowWrite + annotations: + message: |- + The storage path is experiencing slow write response rates. + summary: "The storage path is experiencing slow write response rates." + runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Write" + expr: | + histogram_quantile(0.99, + sum( + job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="WRITE"} + ) by (job, le, namespace) + ) + > 1 + for: 15m + labels: + severity: warning + - alert: LokiStorageSlowRead + annotations: + message: |- + The storage path is experiencing slow read response rates. + summary: "The storage path is experiencing slow read response rates." + runbook_url: "[[ .RunbookURL ]]#Loki-Storage-Slow-Read" + expr: | + histogram_quantile(0.99, + sum( + job_le_namespace_operation:loki_boltdb_shipper_request_duration_seconds_bucket:rate5m{operation="Shipper.Query"} + ) by (job, le, namespace) + ) + > 5 + for: 15m + labels: + severity: warning + - alert: LokiWritePathHighLoad + annotations: + message: |- + The write path is experiencing high load. + summary: "The write path is experiencing high load, causing backpressure storage flushing." + runbook_url: "[[ .RunbookURL ]]#Loki-Write-Path-High-Load" + expr: | + sum( + loki_ingester_wal_replay_flushing + ) by (job, namespace) + > 0 + for: 15m + labels: + severity: warning + - alert: LokiReadPathHighLoad + annotations: + message: |- + The read path is experiencing high load. + summary: "The read path has high volume of queries, causing longer response times." 
+ runbook_url: "[[ .RunbookURL ]]#Loki-Read-Path-High-Load" + expr: | + histogram_quantile(0.99, + sum( + rate( + loki_logql_querystats_latency_seconds_bucket[5m] + ) + ) by (job, le, namespace) + ) + > 30 + for: 15m + labels: + severity: warning diff --git a/component/loki.libsonnet b/component/loki.libsonnet index 8b072e6..01ee290 100644 --- a/component/loki.libsonnet +++ b/component/loki.libsonnet @@ -12,35 +12,35 @@ local loki = inv.parameters.openshift4_logging.components.lokistack; local lokistack_spec = { template: { compactor: { - [if loki.spec.size == '1x.extra-small' then 'replicas']: 1, + [if loki.spec.size == '1x.demo' then 'replicas']: 1, nodeSelector: { 'node-role.kubernetes.io/infra': '' }, }, distributor: { - [if loki.spec.size == '1x.extra-small' then 'replicas']: 2, + [if loki.spec.size == '1x.demo' then 'replicas']: 2, nodeSelector: { 'node-role.kubernetes.io/infra': '' }, }, gateway: { - [if loki.spec.size == '1x.extra-small' then 'replicas']: 1, + [if loki.spec.size == '1x.demo' then 'replicas']: 1, nodeSelector: { 'node-role.kubernetes.io/infra': '' }, }, indexGateway: { - [if loki.spec.size == '1x.extra-small' then 'replicas']: 2, + [if loki.spec.size == '1x.demo' then 'replicas']: 2, nodeSelector: { 'node-role.kubernetes.io/infra': '' }, }, ingester: { - [if loki.spec.size == '1x.extra-small' then 'replicas']: 2, + [if loki.spec.size == '1x.demo' then 'replicas']: 2, nodeSelector: { 'node-role.kubernetes.io/infra': '' }, }, querier: { - [if loki.spec.size == '1x.extra-small' then 'replicas']: 1, + [if loki.spec.size == '1x.demo' then 'replicas']: 1, nodeSelector: { 'node-role.kubernetes.io/infra': '' }, }, queryFrontend: { - [if loki.spec.size == '1x.extra-small' then 'replicas']: 1, + [if loki.spec.size == '1x.demo' then 'replicas']: 1, nodeSelector: { 'node-role.kubernetes.io/infra': '' }, }, ruler: { - [if loki.spec.size == '1x.extra-small' then 'replicas']: 1, + [if loki.spec.size == '1x.demo' then 'replicas']: 1, nodeSelector: { 'node-role.kubernetes.io/infra': '' }, }, }, diff --git a/docs/modules/ROOT/pages/references/parameters.adoc b/docs/modules/ROOT/pages/references/parameters.adoc index 1e01228..5911554 100644 --- a/docs/modules/ROOT/pages/references/parameters.adoc +++ b/docs/modules/ROOT/pages/references/parameters.adoc @@ -18,7 +18,7 @@ The namespace in which to install the operator. [horizontal] type:: string -default:: `5.7` +default:: `5.8` The logging stack version to deploy. This parameter is used in the default values for parameters `channel` and `alerts`. 
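For context on the component changes above: `version` feeds the reference defaults `channel: 'stable-${openshift4_logging:version}'` and `alerts: 'release-${openshift4_logging:version}'` in class/defaults.yml, so a cluster that only bumps the version automatically picks up the matching subscription channel and the 5.8 alert payloads extracted via alerts.txt. A minimal hierarchy sketch, assuming the usual Commodore parameters layout; the comments only spell out what the references resolve to:

parameters:
  openshift4_logging:
    version: '5.8'
    # channel resolves to 'stable-5.8' and alerts to 'release-5.8' through
    # the reference defaults; override either key only when the subscription
    # channel or the alert payload must diverge from the deployed version.
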
diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml index ceb25df..f9b7b56 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -7,7 +7,7 @@ metadata: name: cluster-logging namespace: openshift-logging spec: - channel: stable-5.7 + channel: stable-5.8 config: resources: limits: @@ -29,7 +29,7 @@ metadata: name: elasticsearch-operator namespace: openshift-operators-redhat spec: - channel: stable-5.7 + channel: stable-5.8 config: resources: limits: diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml new file mode 100644 index 0000000..11efa15 --- /dev/null +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -0,0 +1,61 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: syn-collector-rules + name: syn-collector-rules + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: SYN_CollectorNodeDown + annotations: + message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + }} collector component for more than 10m. + summary: Collector cannot be scraped + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging + - alert: SYN_CollectorHighErrorRate + annotations: + message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace + }}/{{ $labels.pod }} collector component.' + summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component + errors are high' + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.001 + for: 15m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging + - alert: SYN_CollectorVeryHighErrorRate + annotations: + message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace + }}/{{ $labels.pod }} collector component.' 
+ summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component + errors are very high' + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.05 + for: 15m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging diff --git a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml index c92e81f..ab921b9 100644 --- a/tests/golden/defaults/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml +++ b/tests/golden/defaults/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml @@ -183,56 +183,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging - - name: logging_fluentd.alerts - rules: - - alert: SYN_FluentdNodeDown - annotations: - message: Prometheus could not scrape fluentd {{ $labels.container }} for - more than 10m. - summary: Fluentd cannot be scraped - expr: | - up{job = "collector", container = "collector"} == 0 or absent(up{job="collector", container="collector"}) == 1 - for: 10m - labels: - namespace: openshift-logging - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentDHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by fluentd - {{ $labels.instance }}.' - summary: FluentD output errors are high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 10 - for: 15m - labels: - namespace: openshift-logging - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentDVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by fluentd - {{ $labels.instance }}.' 
- summary: FluentD output errors are very high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 25 - for: 15m - labels: - namespace: openshift-logging - severity: critical - syn: 'true' - syn_component: openshift4-logging - name: elasticsearch_node_storage.alerts rules: - alert: SYN_ElasticsearchExpectNodeToReachDiskWatermark diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/20_subscriptions.yaml index f2fd2e1..2c1caee 100644 --- a/tests/golden/lokistack/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/lokistack/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -7,7 +7,7 @@ metadata: name: cluster-logging namespace: openshift-logging spec: - channel: stable-5.7 + channel: stable-5.8 config: resources: limits: @@ -29,7 +29,7 @@ metadata: name: loki-operator namespace: openshift-operators-redhat spec: - channel: stable-5.7 + channel: stable-5.8 config: resources: limits: diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_stack.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_stack.yaml index fab1024..33d0ed7 100644 --- a/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_stack.yaml +++ b/tests/golden/lokistack/openshift4-logging/openshift4-logging/50_loki_stack.yaml @@ -12,7 +12,7 @@ spec: ingestion: ingestionBurstSize: 9 ingestionRate: 5 - size: 1x.extra-small + size: 1x.demo storage: schemas: - effectiveDate: '2022-06-01' diff --git a/tests/golden/lokistack/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/lokistack/openshift4-logging/openshift4-logging/60_collector_alerts.yaml new file mode 100644 index 0000000..11efa15 --- /dev/null +++ b/tests/golden/lokistack/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -0,0 +1,61 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: syn-collector-rules + name: syn-collector-rules + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: SYN_CollectorNodeDown + annotations: + message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + }} collector component for more than 10m. + summary: Collector cannot be scraped + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging + - alert: SYN_CollectorHighErrorRate + annotations: + message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace + }}/{{ $labels.pod }} collector component.' + summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component + errors are high' + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.001 + for: 15m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging + - alert: SYN_CollectorVeryHighErrorRate + annotations: + message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace + }}/{{ $labels.pod }} collector component.' 
+ summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component + errors are very high' + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.05 + for: 15m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml new file mode 100644 index 0000000..11efa15 --- /dev/null +++ b/tests/golden/master/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -0,0 +1,61 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: syn-collector-rules + name: syn-collector-rules + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: SYN_CollectorNodeDown + annotations: + message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + }} collector component for more than 10m. + summary: Collector cannot be scraped + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging + - alert: SYN_CollectorHighErrorRate + annotations: + message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace + }}/{{ $labels.pod }} collector component.' + summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component + errors are high' + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.001 + for: 15m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging + - alert: SYN_CollectorVeryHighErrorRate + annotations: + message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace + }}/{{ $labels.pod }} collector component.' + summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component + errors are very high' + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.05 + for: 15m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging diff --git a/tests/golden/master/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml b/tests/golden/master/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml index c92e81f..ab921b9 100644 --- a/tests/golden/master/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml +++ b/tests/golden/master/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml @@ -183,56 +183,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging - - name: logging_fluentd.alerts - rules: - - alert: SYN_FluentdNodeDown - annotations: - message: Prometheus could not scrape fluentd {{ $labels.container }} for - more than 10m. 
- summary: Fluentd cannot be scraped - expr: | - up{job = "collector", container = "collector"} == 0 or absent(up{job="collector", container="collector"}) == 1 - for: 10m - labels: - namespace: openshift-logging - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentDHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by fluentd - {{ $labels.instance }}.' - summary: FluentD output errors are high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 10 - for: 15m - labels: - namespace: openshift-logging - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentDVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by fluentd - {{ $labels.instance }}.' - summary: FluentD output errors are very high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 25 - for: 15m - labels: - namespace: openshift-logging - severity: critical - syn: 'true' - syn_component: openshift4-logging - name: elasticsearch_node_storage.alerts rules: - alert: SYN_ElasticsearchExpectNodeToReachDiskWatermark diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml index ceb25df..f9b7b56 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/20_subscriptions.yaml @@ -7,7 +7,7 @@ metadata: name: cluster-logging namespace: openshift-logging spec: - channel: stable-5.7 + channel: stable-5.8 config: resources: limits: @@ -29,7 +29,7 @@ metadata: name: elasticsearch-operator namespace: openshift-operators-redhat spec: - channel: stable-5.7 + channel: stable-5.8 config: resources: limits: diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml new file mode 100644 index 0000000..11efa15 --- /dev/null +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_collector_alerts.yaml @@ -0,0 +1,61 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + annotations: {} + labels: + name: syn-collector-rules + name: syn-collector-rules + namespace: openshift-logging +spec: + groups: + - name: logging_collector.alerts + rules: + - alert: SYN_CollectorNodeDown + annotations: + message: Prometheus could not scrape {{ $labels.namespace }}/{{ $labels.pod + }} collector component for more than 10m. + summary: Collector cannot be scraped + expr: | + up{app_kubernetes_io_component = "collector", app_kubernetes_io_part_of = "cluster-logging"} == 0 + for: 10m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging + - alert: SYN_CollectorHighErrorRate + annotations: + message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace + }}/{{ $labels.pod }} collector component.' 
+ summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component + errors are high' + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.001 + for: 15m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging + - alert: SYN_CollectorVeryHighErrorRate + annotations: + message: '{{ $value }}% of records have resulted in an error by {{ $labels.namespace + }}/{{ $labels.pod }} collector component.' + summary: '{{ $labels.namespace }}/{{ $labels.pod }} collector component + errors are very high' + expr: | + 100 * ( + collector:log_num_errors:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + / + collector:received_events:sum_rate{app_kubernetes_io_part_of = "cluster-logging"} + ) > 0.05 + for: 15m + labels: + service: collector + severity: critical + syn: 'true' + syn_component: openshift4-logging diff --git a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml index c92e81f..ab921b9 100644 --- a/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml +++ b/tests/golden/multilineerr/openshift4-logging/openshift4-logging/60_elasticsearch_alerts.yaml @@ -183,56 +183,6 @@ spec: severity: warning syn: 'true' syn_component: openshift4-logging - - name: logging_fluentd.alerts - rules: - - alert: SYN_FluentdNodeDown - annotations: - message: Prometheus could not scrape fluentd {{ $labels.container }} for - more than 10m. - summary: Fluentd cannot be scraped - expr: | - up{job = "collector", container = "collector"} == 0 or absent(up{job="collector", container="collector"}) == 1 - for: 10m - labels: - namespace: openshift-logging - service: collector - severity: critical - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentDHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by fluentd - {{ $labels.instance }}.' - summary: FluentD output errors are high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 10 - for: 15m - labels: - namespace: openshift-logging - severity: warning - syn: 'true' - syn_component: openshift4-logging - - alert: SYN_FluentDVeryHighErrorRate - annotations: - message: '{{ $value }}% of records have resulted in an error by fluentd - {{ $labels.instance }}.' - summary: FluentD output errors are very high - expr: | - 100 * ( - sum by(instance)(rate(fluentd_output_status_num_errors[2m])) - / - sum by(instance)(rate(fluentd_output_status_emit_records[2m])) - ) > 25 - for: 15m - labels: - namespace: openshift-logging - severity: critical - syn: 'true' - syn_component: openshift4-logging - name: elasticsearch_node_storage.alerts rules: - alert: SYN_ElasticsearchExpectNodeToReachDiskWatermark diff --git a/tests/master.yml b/tests/master.yml index 5487d90..be9ca18 100644 --- a/tests/master.yml +++ b/tests/master.yml @@ -24,6 +24,4 @@ parameters: openshift4_logging: channel: 'stable' - components: - elasticsearch: - alerts: 'master' + alerts: 'master'
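
Taken together, the golden outputs above show which PrometheusRule manifests the component now renders: 60_elasticsearch_alerts and 60_lokistack_alerts follow the `elasticsearch.enabled` and `loki.enabled` conditionals in alertrules.libsonnet, while 60_collector_alerts is emitted only for version 5.8 or master (replacing the removed fluentd alert group). A sketch of a Loki-only cluster configuration; the `enabled` flags and the exact `components` keys are assumed from those conditionals and the defaults, not spelled out in this patch:

parameters:
  openshift4_logging:
    version: '5.8'          # gates 60_collector_alerts.yaml (5.8 and master only)
    components:
      elasticsearch:
        enabled: false      # assumed flag: skips 60_elasticsearch_alerts.yaml
      lokistack:
        enabled: true       # assumed flag: renders 60_lokistack_alerts.yaml
        spec:
          size: 1x.demo     # 5.8 default, replacing 1x.extra-small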