Name:         prometheus-kube-prometheus-stack-prometheus-rulefiles-0
Namespace:    monitoring
Labels:       managed-by=prometheus-operator
              prometheus-name=kube-prometheus-stack-prometheus
Annotations:

Data
====
monitoring-kube-prometheus-stack-k8s.rules.container-cpu-usage-seconds-tot-d702030f-29c8-40da-ba4d-a2bc234e72d4.yaml:
----
groups:
- name: k8s.rules.container_cpu_usage_seconds_total
  rules:
  - expr: |-
      sum by (cluster, namespace, pod, container) (
        irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
      ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
        1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
      )
    record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
monitoring-kube-prometheus-stack-kube-7cada89b-3612-42db-be91-80845b41640c.yaml:
----
groups:
- name: kubernetes-apps
  rules:
  - alert: KubePodCrashLooping
    annotations:
      description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").'
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
      summary: Pod is crash looping.
    expr: |
      max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1
    for: 15m
    labels:
      severity: P3
  - alert: KubePodNotReady
    annotations:
      description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
      summary: Pod has been in a non-ready state for more than 15 minutes.
    expr: |
      sum by (namespace, pod, cluster) (
        max by(namespace, pod, cluster) (
          kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
        ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
          1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
        )
      ) > 0
    for: 15m
    labels:
      severity: P3
  - alert: KubeDeploymentGenerationMismatch
    annotations:
      description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match, this indicates that the Deployment has failed but has not been rolled back.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
      summary: Deployment generation mismatch due to possible roll-back
    expr: |
      kube_deployment_status_observed_generation{job="kube-state-metrics"}
        !=
      kube_deployment_metadata_generation{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: P3
  - alert: KubeDeploymentReplicasMismatch
    annotations:
      description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
      summary: Deployment has not matched the expected number of replicas.
    expr: |
      (
        kube_deployment_spec_replicas{job="kube-state-metrics"}
          >
        kube_deployment_status_replicas_available{job="kube-state-metrics"}
      ) and (
        changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m])
          ==
        0
      )
    for: 15m
    labels:
      severity: P3
  - alert: KubeDeploymentRolloutStuck
    annotations:
      description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment }} is not progressing for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentrolloutstuck
      summary: Deployment rollout is not progressing.
    expr: |
      kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics"} != 0
    for: 15m
    labels:
      severity: P3
  - alert: KubeStatefulSetReplicasMismatch
    annotations:
      description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
      summary: StatefulSet has not matched the expected number of replicas.
    expr: |
      (
        kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
          !=
        kube_statefulset_replicas{job="kube-state-metrics"}
      ) and (
        changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m])
          ==
        0
      )
    for: 15m
    labels:
      severity: P3
  - alert: KubeStatefulSetGenerationMismatch
    annotations:
      description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
      summary: StatefulSet generation mismatch due to possible roll-back
    expr: |
      kube_statefulset_status_observed_generation{job="kube-state-metrics"}
        !=
      kube_statefulset_metadata_generation{job="kube-state-metrics"}
    for: 15m
    labels:
      severity: P3
  - alert: KubeStatefulSetUpdateNotRolledOut
    annotations:
      description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
      summary: StatefulSet update has not been rolled out.
    expr: |
      (
        max by(namespace, statefulset, job, cluster) (
          kube_statefulset_status_current_revision{job="kube-state-metrics"}
            unless
          kube_statefulset_status_update_revision{job="kube-state-metrics"}
        )
          * on(namespace, statefulset, job, cluster)
        (
          kube_statefulset_replicas{job="kube-state-metrics"}
            !=
          kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
        )
      ) and on(namespace, statefulset, job, cluster)
      (
        changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m])
          ==
        0
      )
    for: 15m
    labels:
      severity: P3
  - alert: KubeDaemonSetRolloutStuck
    annotations:
      description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15m.
      runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      summary: DaemonSet rollout is stuck.
expr: | ( ( kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} ) or ( kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} != 0 ) or ( kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} ) or ( kube_daemonset_status_number_available{job="kube-state-metrics"} != kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} ) ) and ( changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"}[5m]) == 0 ) for: 15m labels: severity: P3 - alert: KubeContainerWaiting annotations: description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container {{ $labels.container}} has been in waiting state for longer than 1 hour. (reason: "{{ $labels.reason }}").' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting summary: Pod container waiting longer than 1 hour expr: | kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics"} > 0 for: 1h labels: severity: P3 - alert: KubeDaemonSetNotScheduled annotations: description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled summary: DaemonSet pods are not scheduled. expr: | kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 for: 10m labels: severity: P3 - alert: KubeDaemonSetMisScheduled annotations: description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled summary: DaemonSet pods are misscheduled. expr: | kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 for: 15m labels: severity: P3 - alert: KubeJobNotCompleted annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than {{ "43200" | humanizeDuration }} to complete. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobnotcompleted summary: Job did not complete in time expr: | time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"} and kube_job_status_active{job="kube-state-metrics"} > 0) > 43200 labels: severity: P3 - alert: KubeJobFailed annotations: description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. Removing failed job after investigation should clear this alert. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed summary: Job failed to complete. expr: | kube_job_failed{job="kube-state-metrics"} > 0 for: 15m labels: severity: P4 - alert: KubeHpaReplicasMismatch annotations: description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has not matched the desired number of replicas for longer than 15 minutes. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch summary: HPA has not matched desired number of replicas. expr: | (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"} != kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}) and (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} > kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"}) and (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} < kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}) and changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0 for: 15m labels: severity: P3 - alert: KubeHpaMaxedOut annotations: description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} has been running at max replicas for longer than 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout summary: HPA is running at max replicas expr: | kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"} == kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"} for: 15m labels: severity: P3 - alert: KubePdbNotEnoughHealthyPods annotations: description: PDB {{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} expects {{ $value }} more healthy pods. The desired number of healthy pods has not been met for at least 15m. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepdbnotenoughhealthypods summary: PDB does not have enough healthy pods. expr: | ( kube_poddisruptionbudget_status_desired_healthy{job="kube-state-metrics"} - kube_poddisruptionbudget_status_current_healthy{job="kube-state-metrics"} ) > 0 for: 15m labels: severity: P3 - name: kubernetes-resources rules: - alert: KubeCPUOvercommit annotations: description: Cluster has overcommitted CPU resource requests for Pods by {{ printf "%.2f" $value }} CPU shares and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit summary: Cluster has overcommitted CPU resource requests. expr: | # Non-HA clusters. ( ( sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 0 ) and count(max by (node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3 ) or # HA clusters. ( sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - ( # Skip clusters with only one allocatable node. ( sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) ) > 0 ) > 0 ) for: 10m labels: severity: P3 - alert: KubeMemoryOvercommit annotations: description: Cluster has overcommitted memory resource requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node failure. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit summary: Cluster has overcommitted memory resource requests. expr: | # Non-HA clusters. 
( ( sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 0 ) and count(max by (node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3 ) or # HA clusters. ( sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - ( # Skip clusters with only one allocatable node. ( sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) ) > 0 ) > 0 ) for: 10m labels: severity: P3 - alert: KubeCPUQuotaOvercommit annotations: description: Cluster has overcommitted CPU resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit summary: Cluster has overcommitted CPU resource requests. expr: | sum ( min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"}) ) / sum ( kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"} ) > 1.5 for: 5m labels: severity: P3 - alert: KubeMemoryQuotaOvercommit annotations: description: Cluster has overcommitted memory resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit summary: Cluster has overcommitted memory resource requests. expr: | sum ( min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"}) ) / sum ( kube_node_status_allocatable{resource="memory", job="kube-state-metrics"} ) > 1.5 for: 5m labels: severity: P3 - alert: KubeQuotaAlmostFull annotations: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull summary: Namespace quota is going to be full. expr: | kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) > 0.9 < 1 for: 15m labels: severity: P5 - alert: KubeQuotaFullyUsed annotations: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused summary: Namespace quota is fully used. expr: | kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) == 1 for: 15m labels: severity: P5 - alert: KubeQuotaExceeded annotations: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage }} of its {{ $labels.resource }} quota. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded summary: Namespace quota has exceeded the limits. expr: | kube_resourcequota{job="kube-state-metrics", type="used"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) > 1 for: 15m labels: severity: P3 - alert: CPUThrottlingHigh annotations: description: '{{ $value | humanizePercentage }} throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.' 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh summary: Processes experience elevated CPU throttling. expr: | sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) / on (cluster, namespace, pod, container, instance) group_left sum(increase(container_cpu_cfs_periods_total{job="cadvisor", }[5m])) without (id, metrics_path, name, image, endpoint, job, node) > ( 25 / 100 ) for: 15m labels: severity: P5 - name: kubernetes-storage rules: - alert: KubePersistentVolumeFillingUp annotations: description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: | ( kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} ) < 0.03 and kubelet_volume_stats_used_bytes{job="kubelet"} > 0 unless on(cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1m labels: severity: P1 - alert: KubePersistentVolumeFillingUp annotations: description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value | humanizePercentage }} is available. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup summary: PersistentVolume is filling up. expr: | ( kubelet_volume_stats_available_bytes{job="kubelet"} / kubelet_volume_stats_capacity_bytes{job="kubelet"} ) < 0.15 and kubelet_volume_stats_used_bytes{job="kubelet"} > 0 and predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1h labels: severity: P3 - alert: KubePersistentVolumeInodesFillingUp annotations: description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeinodesfillingup summary: PersistentVolumeInodes are filling up. 
expr: | ( kubelet_volume_stats_inodes_free{job="kubelet"} / kubelet_volume_stats_inodes{job="kubelet"} ) < 0.03 and kubelet_volume_stats_inodes_used{job="kubelet"} > 0 unless on(cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1m labels: severity: P1 - alert: KubePersistentVolumeInodesFillingUp annotations: description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is expected to run out of inodes within four days. Currently {{ $value | humanizePercentage }} of its inodes are free. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeinodesfillingup summary: PersistentVolumeInodes are filling up. expr: | ( kubelet_volume_stats_inodes_free{job="kubelet"} / kubelet_volume_stats_inodes{job="kubelet"} ) < 0.15 and kubelet_volume_stats_inodes_used{job="kubelet"} > 0 and predict_linear(kubelet_volume_stats_inodes_free{job="kubelet"}[6h], 4 * 24 * 3600) < 0 unless on(cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on(cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1h labels: severity: P3 - alert: KubePersistentVolumeErrors annotations: description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors summary: PersistentVolume is having issues with provisioning. expr: | kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 for: 5m labels: severity: P1 - name: kubernetes-system rules: - alert: KubeVersionMismatch annotations: description: There are {{ $value }} different semantic versions of Kubernetes components running. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch summary: Different semantic versions of Kubernetes components running. expr: | count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1 for: 15m labels: severity: P3 - alert: KubeClientErrors annotations: description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ $value | humanizePercentage }} errors. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors summary: Kubernetes API server client is experiencing errors. expr: | (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace) / sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace)) > 0.01 for: 15m labels: severity: P3 - name: kube-apiserver-slos rules: - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: | sum by(cluster) (apiserver_request:burnrate1h) > (14.40 * 0.01000) and on(cluster) sum by(cluster) (apiserver_request:burnrate5m) > (14.40 * 0.01000) for: 2m labels: long: 1h severity: P1 short: 5m - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: | sum by(cluster) (apiserver_request:burnrate6h) > (6.00 * 0.01000) and on(cluster) sum by(cluster) (apiserver_request:burnrate30m) > (6.00 * 0.01000) for: 15m labels: long: 6h severity: P1 short: 30m - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: | sum by(cluster) (apiserver_request:burnrate1d) > (3.00 * 0.01000) and on(cluster) sum by(cluster) (apiserver_request:burnrate2h) > (3.00 * 0.01000) for: 1h labels: long: 1d severity: P3 short: 2h - alert: KubeAPIErrorBudgetBurn annotations: description: The API server is burning too much error budget. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn summary: The API server is burning too much error budget. expr: | sum by(cluster) (apiserver_request:burnrate3d) > (1.00 * 0.01000) and on(cluster) sum by(cluster) (apiserver_request:burnrate6h) > (1.00 * 0.01000) for: 3h labels: long: 3d severity: P3 short: 6h - name: kubernetes-system-apiserver rules: - alert: KubeClientCertificateExpiration annotations: description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 for: 5m labels: severity: P3 - alert: KubeClientCertificateExpiration annotations: description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration summary: Client certificate is about to expire. expr: | histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 and on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 for: 5m labels: severity: P1 - alert: KubeAggregatedAPIErrors annotations: description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name }} has reported {{ $labels.reason }} errors. 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapierrors summary: Kubernetes aggregated API has reported errors. expr: | sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0 for: 10m labels: severity: P3 - alert: KubeAggregatedAPIDown annotations: description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace }} has been only {{ $value | humanize }}% available over the last 10m. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapidown summary: Kubernetes aggregated API is down. expr: | (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85 for: 5m labels: severity: P3 - alert: KubeAPIDown annotations: description: KubeAPI has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown summary: Target disappeared from Prometheus target discovery. expr: | absent(up{job="apiserver"} == 1) for: 15m labels: severity: P1 - alert: KubeAPITerminatedRequests annotations: description: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapiterminatedrequests summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage }} of its incoming requests. expr: | sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20 for: 5m labels: severity: P3 - name: kubernetes-system-kubelet rules: - alert: KubeNodeNotReady annotations: description: '{{ $labels.node }} has been unready for more than 15 minutes.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready summary: Node is not ready. expr: | kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 and on (cluster, node) kube_node_spec_unschedulable{job="kube-state-metrics"} == 0 for: 15m labels: severity: P3 - alert: KubeNodePressure annotations: description: '{{ $labels.node }} has active Condition {{ $labels.condition }}. This is caused by resource usage exceeding eviction thresholds.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodepressure summary: Node has as active Condition. expr: | kube_node_status_condition{job="kube-state-metrics",condition=~"(MemoryPressure|DiskPressure|PIDPressure)",status="true"} == 1 and on (cluster, node) kube_node_spec_unschedulable{job="kube-state-metrics"} == 0 for: 10m labels: severity: P5 - alert: KubeNodeUnreachable annotations: description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable summary: Node is unreachable. 
expr: | (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1 for: 15m labels: severity: P3 - alert: KubeletTooManyPods annotations: description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage }} of its Pod capacity. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods summary: Kubelet is running at capacity. expr: | ( max by (cluster, instance) ( kubelet_running_pods{job="kubelet"} > 1 ) * on (cluster, instance) group_left(node) max by (cluster, instance, node) ( kubelet_node_name{job="kubelet"} ) ) / on (cluster, node) group_left() max by (cluster, node) ( kube_node_status_capacity{job="kube-state-metrics", resource="pods"} != 1 ) > 0.95 for: 15m labels: severity: P5 - alert: KubeNodeReadinessFlapping annotations: description: The readiness status of node {{ $labels.node }} has changed {{ $value }} times in the last 15 minutes. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping summary: Node readiness status is flapping. expr: | sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2 and on (cluster, node) kube_node_spec_unschedulable{job="kube-state-metrics"} == 0 for: 15m labels: severity: P3 - alert: KubeNodeEviction annotations: description: Node {{ $labels.node }} is evicting Pods due to {{ $labels.eviction_signal }}. Eviction occurs when eviction thresholds are crossed, typically caused by Pods exceeding RAM/ephemeral-storage limits. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeeviction summary: Node is evicting pods. expr: | sum(rate(kubelet_evictions{job="kubelet"}[15m])) by(cluster, eviction_signal, instance) * on (cluster, instance) group_left(node) max by (cluster, instance, node) ( kubelet_node_name{job="kubelet"} ) > 0 for: 0s labels: severity: P5 - alert: KubeletPlegDurationHigh annotations: description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{ $value }} seconds on node {{ $labels.node }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist. expr: | node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10 for: 5m labels: severity: P3 - alert: KubeletPodStartUpLatencyHigh annotations: description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds on node {{ $labels.node }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh summary: Kubelet Pod startup latency is too high. 
expr: | histogram_quantile(0.99, sum by (cluster, instance, le) ( topk by (cluster, instance, le, operation_type) (1, rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m]) ) ) ) * on(cluster, instance) group_left(node) topk by (cluster, instance, node) (1, kubelet_node_name{job="kubelet"} ) > 60 for: 15m labels: severity: P3 - alert: KubeletClientCertificateExpiration annotations: description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration summary: Kubelet client certificate is about to expire. expr: | kubelet_certificate_manager_client_ttl_seconds < 604800 labels: severity: P3 - alert: KubeletClientCertificateExpiration annotations: description: Client certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration summary: Kubelet client certificate is about to expire. expr: | kubelet_certificate_manager_client_ttl_seconds < 86400 labels: severity: P1 - alert: KubeletServerCertificateExpiration annotations: description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration summary: Kubelet server certificate is about to expire. expr: | kubelet_certificate_manager_server_ttl_seconds < 604800 labels: severity: P3 - alert: KubeletServerCertificateExpiration annotations: description: Server certificate for Kubelet on node {{ $labels.node }} expires in {{ $value | humanizeDuration }}. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration summary: Kubelet server certificate is about to expire. expr: | kubelet_certificate_manager_server_ttl_seconds < 86400 labels: severity: P1 - alert: KubeletClientCertificateRenewalErrors annotations: description: Kubelet on node {{ $labels.node }} has failed to renew its client certificate ({{ $value | humanize }} errors in the last 5 minutes). runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors summary: Kubelet has failed to renew its client certificate. expr: | increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0 for: 15m labels: severity: P3 - alert: KubeletServerCertificateRenewalErrors annotations: description: Kubelet on node {{ $labels.node }} has failed to renew its server certificate ({{ $value | humanize }} errors in the last 5 minutes). runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors summary: Kubelet has failed to renew its server certificate. expr: | increase(kubelet_server_expiration_renew_errors[5m]) > 0 for: 15m labels: severity: P3 - alert: KubeletDown annotations: description: Kubelet has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown summary: Target disappeared from Prometheus target discovery. 
expr: | absent(up{job="kubelet"} == 1) for: 15m labels: severity: P1 - name: kubernetes-system-scheduler rules: - alert: KubeSchedulerDown annotations: description: KubeScheduler has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown summary: Target disappeared from Prometheus target discovery. expr: | absent(up{job="kube-scheduler"} == 1) for: 15m labels: severity: P1 - name: kubernetes-system-controller-manager rules: - alert: KubeControllerManagerDown annotations: description: KubeControllerManager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown summary: Target disappeared from Prometheus target discovery. expr: | absent(up{job="kube-controller-manager"} == 1) for: 15m labels: severity: P1 - name: kubernetes-system-kube-proxy rules: - alert: KubeProxyDown annotations: description: KubeProxy has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeproxydown summary: Target disappeared from Prometheus target discovery. expr: | absent(up{job="kube-proxy"} == 1) for: 15m labels: severity: P1 - interval: 3m name: kube-apiserver-availability.rules rules: - expr: | avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30 record: code_verb:apiserver_request_total:increase30d - expr: | sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"}) labels: verb: read record: code:apiserver_request_total:increase30d - expr: | sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) labels: verb: write record: code:apiserver_request_total:increase30d - expr: | sum by (cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h])) record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h - expr: | sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30) record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d - expr: | sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le="+Inf"}) record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h - expr: | sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le="+Inf"}) record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d - expr: | 1 - ( ( # write too slow sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"} or vector(0)) ) + ( # read too slow sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - ( sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"} or vector(0)) + sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"} or 
vector(0)) + sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"} or vector(0)) ) ) + # errors sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0)) ) / sum by (cluster) (code:apiserver_request_total:increase30d) labels: verb: all record: apiserver_request:availability30d - expr: | 1 - ( sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"}) - ( # too slow sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"} or vector(0)) + sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"} or vector(0)) + sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"} or vector(0)) ) + # errors sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0)) ) / sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"}) labels: verb: read record: apiserver_request:availability30d - expr: | 1 - ( ( # too slow sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"}) - sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"} or vector(0)) ) + # errors sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0)) ) / sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"}) labels: verb: write record: apiserver_request:availability30d - expr: | sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read record: code_resource:apiserver_request_total:rate5m - expr: | sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: code_resource:apiserver_request_total:rate5m - expr: | sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h])) record: code_verb:apiserver_request_total:increase1h - expr: | sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h])) record: code_verb:apiserver_request_total:increase1h - expr: | sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h])) record: code_verb:apiserver_request_total:increase1h - expr: | sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) record: code_verb:apiserver_request_total:increase1h - name: kube-apiserver-burnrate.rules rules: - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d])) - ( ( sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1d])) or vector(0) ) + sum by (cluster) 
(rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1d])) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1d])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d])) labels: verb: read record: apiserver_request:burnrate1d - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h])) - ( ( sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1h])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1h])) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1h])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h])) labels: verb: read record: apiserver_request:burnrate1h - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h])) - ( ( sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[2h])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[2h])) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[2h])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h])) labels: verb: read record: apiserver_request:burnrate2h - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m])) - ( ( sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[30m])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[30m])) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[30m])) ) ) + # errors sum by (cluster) 
(rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m])) labels: verb: read record: apiserver_request:burnrate30m - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d])) - ( ( sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[3d])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[3d])) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[3d])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d])) labels: verb: read record: apiserver_request:burnrate3d - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m])) - ( ( sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[5m])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[5m])) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[5m])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m])) labels: verb: read record: apiserver_request:burnrate5m - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h])) - ( ( sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[6h])) or vector(0) ) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[6h])) + sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[6h])) ) ) + # errors sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h])) labels: verb: read record: apiserver_request:burnrate6h - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d])) - 
sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1d])) ) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d])) labels: verb: write record: apiserver_request:burnrate1d - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h])) - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1h])) ) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h])) labels: verb: write record: apiserver_request:burnrate1h - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h])) - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[2h])) ) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h])) labels: verb: write record: apiserver_request:burnrate2h - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m])) - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[30m])) ) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m])) labels: verb: write record: apiserver_request:burnrate30m - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d])) - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[3d])) ) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d])) labels: verb: write record: apiserver_request:burnrate3d - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m])) - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[5m])) ) + sum by (cluster) 
(rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m])) labels: verb: write record: apiserver_request:burnrate5m - expr: | ( ( # too slow sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h])) - sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[6h])) ) + sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h])) ) / sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h])) labels: verb: write record: apiserver_request:burnrate6h - name: kube-apiserver-histogram.rules rules: - expr: | histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 labels: quantile: "0.99" verb: read record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0 labels: quantile: "0.99" verb: write record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile - name: k8s.rules.container_cpu_usage_seconds_total rules: - expr: | sum by (cluster, namespace, pod, container) ( rate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m - expr: | sum by (cluster, namespace, pod, container) ( irate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m]) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) ( 1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate - name: k8s.rules.container_memory_working_set_bytes rules: - expr: | container_memory_working_set_bytes{job="cadvisor", image!=""} * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_working_set_bytes - name: k8s.rules.container_memory_rss rules: - expr: | container_memory_rss{job="cadvisor", image!=""} * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_rss - name: k8s.rules.container_memory_cache rules: - expr: | container_memory_cache{job="cadvisor", image!=""} * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_cache - name: k8s.rules.container_memory_swap rules: - expr: | container_memory_swap{job="cadvisor", image!=""} * on (cluster, namespace, pod) 
group_left(node) topk by(cluster, namespace, pod) (1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_swap - name: k8s.rules.container_memory_requests rules: - expr: | kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests - expr: | sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_memory:kube_pod_container_resource_requests:sum - name: k8s.rules.container_cpu_requests rules: - expr: | kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests - expr: | sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_cpu:kube_pod_container_resource_requests:sum - name: k8s.rules.container_memory_limits rules: - expr: | kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits - expr: | sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_memory:kube_pod_container_resource_limits:sum - name: k8s.rules.container_cpu_limits rules: - expr: | kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits - expr: | sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_cpu:kube_pod_container_resource_limits:sum - name: k8s.rules.pod_owner rules: - expr: | max by (cluster, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" ) * on (cluster, replicaset, namespace) group_left(owner_name) topk by(cluster, replicaset, namespace) ( 1, max by (cluster, 
replicaset, namespace, owner_name) ( kube_replicaset_owner{job="kube-state-metrics", owner_kind=""} ) ), "workload", "$1", "replicaset", "(.*)" ) ) labels: workload_type: replicaset record: namespace_workload_pod:kube_pod_owner:relabel - expr: | max by (cluster, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" ) * on(replicaset, namespace, cluster) group_left(owner_name) topk by(cluster, replicaset, namespace) ( 1, max by (cluster, replicaset, namespace, owner_name) ( kube_replicaset_owner{job="kube-state-metrics", owner_kind="Deployment"} ) ), "workload", "$1", "owner_name", "(.*)" ) ) labels: workload_type: deployment record: namespace_workload_pod:kube_pod_owner:relabel - expr: | max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ) ) labels: workload_type: daemonset record: namespace_workload_pod:kube_pod_owner:relabel - expr: | max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)") ) labels: workload_type: statefulset record: namespace_workload_pod:kube_pod_owner:relabel - expr: | group by (cluster, namespace, workload, pod) ( label_join( group by (cluster, namespace, job_name, pod, owner_name) ( label_join( kube_pod_owner{job="kube-state-metrics", owner_kind="Job"} , "job_name", "", "owner_name") ) * on (cluster, namespace, job_name) group_left() group by (cluster, namespace, job_name) ( kube_job_owner{job="kube-state-metrics", owner_kind=~"Pod|"} ) , "workload", "", "owner_name") ) labels: workload_type: job record: namespace_workload_pod:kube_pod_owner:relabel - expr: | max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="", owner_name=""}, "workload", "$1", "pod", "(.+)") ) labels: workload_type: barepod record: namespace_workload_pod:kube_pod_owner:relabel - expr: | max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="Node"}, "workload", "$1", "pod", "(.+)") ) labels: workload_type: staticpod record: namespace_workload_pod:kube_pod_owner:relabel - expr: | group by (cluster, namespace, workload, workload_type, pod) ( label_join( label_join( group by (cluster, namespace, job_name, pod) ( label_join( kube_pod_owner{job="kube-state-metrics", owner_kind="Job"} , "job_name", "", "owner_name") ) * on (cluster, namespace, job_name) group_left(owner_kind, owner_name) group by (cluster, namespace, job_name, owner_kind, owner_name) ( kube_job_owner{job="kube-state-metrics", owner_kind!="Pod", owner_kind!=""} ) , "workload", "", "owner_name") , "workload_type", "", "owner_kind") OR label_replace( label_replace( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"} , "replicaset", "$1", "owner_name", "(.+)" ) * on(cluster, namespace, replicaset) group_left(owner_kind, owner_name) group by (cluster, namespace, replicaset, owner_kind, owner_name) ( kube_replicaset_owner{job="kube-state-metrics", owner_kind!="Deployment", owner_kind!=""} ) , "workload", "$1", "owner_name", "(.+)") OR label_replace( group by (cluster, namespace, pod, owner_name, owner_kind) ( kube_pod_owner{job="kube-state-metrics", owner_kind!="ReplicaSet", owner_kind!="DaemonSet", owner_kind!="StatefulSet", owner_kind!="Job", owner_kind!="Node", 
owner_kind!=""} ) , "workload", "$1", "owner_name", "(.+)" ) , "workload_type", "$1", "owner_kind", "(.+)") ) record: namespace_workload_pod:kube_pod_owner:relabel - name: kube-scheduler.rules rules: - expr: | histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: quantile: "0.99" record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: quantile: "0.99" record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: quantile: "0.99" record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.9, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: quantile: "0.9" record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: quantile: "0.9" record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: quantile: "0.9" record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.5, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: quantile: "0.5" record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: quantile: "0.5" record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile - expr: | histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) labels: quantile: "0.5" record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile - name: node.rules rules: - expr: | topk by(cluster, namespace, pod) (1, max by (cluster, node, namespace, pod) ( label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)") )) record: 'node_namespace_pod:kube_pod_info:' - expr: | count by (cluster, node) ( node_cpu_seconds_total{mode="idle",job="node-exporter"} * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1, node_namespace_pod:kube_pod_info:) ) record: node:node_num_cpu:sum - expr: | sum( node_memory_MemAvailable_bytes{job="node-exporter"} or ( node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"} ) ) by (cluster) record: :node_memory_MemAvailable_bytes:sum - expr: | avg by (cluster, node) ( sum without (mode) ( rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m]) ) ) record: node:node_cpu_utilization:ratio_rate5m - expr: | avg by 
(cluster) ( node:node_cpu_utilization:ratio_rate5m ) record: cluster:node_cpu:ratio_rate5m - name: kubelet.rules rules: - expr: | histogram_quantile( 0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left (node) max by (cluster, instance, node) (kubelet_node_name{job="kubelet"}) ) labels: quantile: "0.99" record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - expr: | histogram_quantile( 0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left (node) max by (cluster, instance, node) (kubelet_node_name{job="kubelet"}) ) labels: quantile: "0.9" record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile - expr: | histogram_quantile( 0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left (node) max by (cluster, instance, node) (kubelet_node_name{job="kubelet"}) ) labels: quantile: "0.5" record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile monitoring-kube-prometheus-stack-kube-prometheus-node-recording.rules-2074211c-5e93-41aa-9e0d-39008979ef05.yaml: ---- groups: - name: kube-prometheus-node-recording.rules rules: - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance) record: instance:node_cpu:rate:sum - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) record: instance:node_network_receive_bytes:rate:sum - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) record: instance:node_network_transmit_bytes:rate:sum - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance) record: instance:node_cpu:ratio - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) record: cluster:node_cpu:sum_rate5m - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu)) record: cluster:node_cpu:ratio monitoring-kube-prometheus-stack-alertmanager-676e321f-306c-46d5-a69f-2fec31495b32.yaml: ---- groups: - name: alertmanager.rules rules: - alert: AlertmanagerFailedReload annotations: description: Configuration has failed to load for {{$labels.instance}}. summary: Reloading an Alertmanager configuration has failed. expr: | # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(alertmanager_config_last_reload_successful{job="kube-prometheus-stack-alertmanager"}[5m]) == 0 for: 10m labels: severity: P1 - alert: AlertmanagerMembersInconsistent annotations: description: Alertmanager {{$labels.instance}} has only found {{ $value }} members of the {{$labels.job}} cluster. summary: A member of an Alertmanager cluster has not found all other cluster members. expr: | # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. 
max_over_time(alertmanager_cluster_members{job="kube-prometheus-stack-alertmanager"}[5m]) < on (namespace,service,cluster) group_left count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="kube-prometheus-stack-alertmanager"}[5m])) for: 15m labels: severity: P1 - alert: AlertmanagerFailedToSendAlerts annotations: description: Alertmanager {{$labels.instance}} failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}. summary: An Alertmanager instance failed to send notifications. expr: | ( rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-alertmanager"}[5m]) / ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-alertmanager"}[5m]) ) > 0.01 for: 5m labels: severity: P3 - alert: AlertmanagerClusterFailedToSendAlerts annotations: description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. expr: | min by (namespace,service,cluster, integration) ( rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-alertmanager", integration=~`.*`}[5m]) / ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-alertmanager", integration=~`.*`}[5m]) ) > 0.01 for: 5m labels: severity: P1 - alert: AlertmanagerClusterFailedToSendAlerts annotations: description: The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage }}. summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. expr: | min by (namespace,service,cluster, integration) ( rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-alertmanager", integration!~`.*`}[5m]) / ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-alertmanager", integration!~`.*`}[5m]) ) > 0.01 for: 5m labels: severity: P3 - alert: AlertmanagerConfigInconsistent annotations: description: Alertmanager instances within the {{$labels.job}} cluster have different configurations. summary: Alertmanager instances within the same cluster have different configurations. expr: | count by (namespace,service,cluster) ( count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="kube-prometheus-stack-alertmanager"}) ) != 1 for: 20m labels: severity: P1 - alert: AlertmanagerClusterDown annotations: description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have been up for less than half of the last 5m.' summary: Half or more of the Alertmanager instances within the same cluster are down. expr: | ( count by (namespace,service,cluster) ( avg_over_time(up{job="kube-prometheus-stack-alertmanager"}[5m]) < 0.5 ) / count by (namespace,service,cluster) ( up{job="kube-prometheus-stack-alertmanager"} ) ) >= 0.5 for: 5m labels: severity: P1 - alert: AlertmanagerClusterCrashlooping annotations: description: '{{ $value | humanizePercentage }} of Alertmanager instances within the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.' summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. 
expr: | ( count by (namespace,service,cluster) ( changes(process_start_time_seconds{job="kube-prometheus-stack-alertmanager"}[10m]) > 4 ) / count by (namespace,service,cluster) ( up{job="kube-prometheus-stack-alertmanager"} ) ) >= 0.5 for: 5m labels: severity: P1 monitoring-kube-prometheus-stack-ceph-a47e52d7-da8f-44bf-a5d5-e2f1f83e44a8.yaml: ---- groups: - name: cluster health - name: mon rules: - alert: CephMonDownQuorumAtRisk annotations: description: '{{ $min := printf "floor(count(ceph_mon_metadata{cluster=''%s''}) / 2) + 1" .Labels.cluster | query | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active. Without quorum the cluster will become inoperable, affecting all services and connected clients. The following monitors are down: {{- range printf "(ceph_mon_quorum_status{cluster=''%s''} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}' documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down summary: Monitor quorum is at risk on cluster {{ $labels.cluster }} expr: | ( (ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) ( count(ceph_mon_quorum_status == 1) by(cluster)== bool (floor(count(ceph_mon_metadata) by(cluster) / 2) + 1) ) ) == 1 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.3.1 severity: P3 type: ceph_default - alert: CephMonDown annotations: description: '{{ $down := printf "count(ceph_mon_quorum_status{cluster=''%s''} == 0)" .Labels.cluster | query | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum is still intact, but the loss of an additional monitor will make your cluster inoperable. The following monitors are down: {{- range printf "(ceph_mon_quorum_status{cluster=''%s''} == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}' documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down summary: One or more monitors down on cluster {{ $labels.cluster }} expr: | (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1))) for: 30s labels: severity: P4 type: ceph_default - alert: CephMonDiskspaceCritical annotations: description: The free space available to a monitor's store is critically low. You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. 
Your monitor hosts are; {{- range query "ceph_mon_metadata"}} - {{ .Labels.hostname }} {{- end }} documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit summary: Filesystem space on at least one monitor is critically low on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.3.2 severity: P1 type: ceph_default - alert: CephMonDiskspaceLow annotations: description: The space available to a monitor's store is approaching full (>70% is the default). You should increase the space available to the monitor(s). The default directory is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories under /var/lib/rook and other directories on the same filesystem, often /var/log and /var/tmp are culprits. Your monitor hosts are; {{- range query "ceph_mon_metadata"}} - {{ .Labels.hostname }} {{- end }} documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low summary: Drive space on at least one monitor is approaching full on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="MON_DISK_LOW"} == 1 for: 5m labels: severity: P3 type: ceph_default - alert: CephMonClockSkew annotations: description: Ceph monitors rely on closely synchronized time to maintain quorum and cluster consistency. This event indicates that the time on at least one mon has drifted too far from the lead mon. Review cluster status with ceph -s. This will show which monitors are affected. Check the time sync status on each monitor host with 'ceph time-sync-status' and the state and peers of your ntpd or chrony daemon. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew summary: Clock skew detected among monitors on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 for: 1m labels: severity: P3 type: ceph_default - name: osd rules: - alert: CephOSDDownHigh annotations: description: '{{ $value | humanize }}% or {{ with printf "count (ceph_osd_up{cluster=''%s''} == 0)" .Labels.cluster | query }}{{ . | first | value }}{{ end }} of {{ with printf "count (ceph_osd_up{cluster=''%s''})" .Labels.cluster | query }}{{ . | first | value }}{{ end }} OSDs are down (>= 10%). 
The following OSDs are down: {{- range printf "(ceph_osd_up{cluster=''%s''} * on(cluster, ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}' summary: More than 10% of OSDs are down on cluster {{ $labels.cluster }} expr: count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up) * 100 >= 10 labels: oid: 1.3.6.1.4.1.50495.1.2.1.4.1 severity: P1 type: ceph_default - alert: CephOSDHostDown annotations: description: 'The following OSDs are down: {{- range printf "(ceph_osd_up{cluster=''%s''} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}' summary: An OSD host is offline on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 for: 5m labels: oid: 1.3.6.1.4.1.50495.1.2.1.4.8 severity: P3 type: ceph_default - alert: CephOSDDown annotations: description: '{{ $num := printf "count(ceph_osd_up{cluster=''%s''} == 0) " .Labels.cluster | query | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range printf "(ceph_osd_up{cluster=''%s''} * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}' documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down summary: An OSD has been marked down on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="OSD_DOWN"} == 1 for: 5m labels: oid: 1.3.6.1.4.1.50495.1.2.1.4.2 severity: P3 type: ceph_default - alert: CephOSDNearFull annotations: description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull summary: OSD(s) running low on free space (NEARFULL) on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 for: 5m labels: oid: 1.3.6.1.4.1.50495.1.2.1.4.3 severity: P3 type: ceph_default - alert: CephOSDFull annotations: description: An OSD has reached the FULL threshold. Writes to pools that share the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full summary: OSD full, writes blocked on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="OSD_FULL"} > 0 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.4.6 severity: P1 type: ceph_default - alert: CephOSDBackfillFull annotations: description: An OSD has reached the BACKFILL FULL threshold. This will prevent rebalance operations from completing. Use 'ceph health detail' and 'ceph osd df' to identify the problem. To resolve, add capacity to the affected OSD's failure domain, restore down/out OSDs, or delete unwanted data. 
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull summary: OSD(s) too full for backfill operations on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0 for: 1m labels: severity: P3 type: ceph_default - alert: CephOSDTooManyRepairs annotations: description: Reads from an OSD have used a secondary PG to return data to the client, indicating a potential failing drive. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs summary: OSD reports a high number of read errors on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1 for: 30s labels: severity: P3 type: ceph_default - alert: CephOSDTimeoutsPublicNetwork annotations: description: OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network for latency or loss issues. Use 'ceph health detail' to show the affected OSDs. summary: Network issues delaying OSD heartbeats (public network) on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1 for: 1m labels: severity: P4 type: ceph_default - alert: CephOSDTimeoutsClusterNetwork annotations: description: OSD heartbeats on the cluster's 'cluster' network (backend) are slow. Investigate the network for latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs. summary: Network issues delaying OSD heartbeats (cluster network) on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1 for: 1m labels: severity: P4 type: ceph_default - alert: CephOSDInternalDiskSizeMismatch annotations: description: One or more OSDs have an internal inconsistency between metadata and the size of the device. This could lead to the OSD(s) crashing in future. You should redeploy the affected OSDs. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch summary: OSD size inconsistency error on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1 for: 1m labels: severity: P3 type: ceph_default - alert: CephDeviceFailurePredicted annotations: description: The device health module has determined that one or more devices will fail soon. To review device status use 'ceph device ls'. To show a specific device use 'ceph device info '. Mark the OSD out so that data may migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace the device, and redeploy the OSD. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2 summary: Device(s) predicted to fail soon on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1 for: 1m labels: severity: P3 type: ceph_default - alert: CephDeviceFailurePredictionTooHigh annotations: description: The device health module has determined that devices predicted to fail can not be remediated automatically, since too many OSDs would be removed from the cluster to ensure performance and availability. Prevent data integrity issues by adding new OSDs so that data may be relocated. 
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany summary: Too many devices are predicted to fail on cluster {{ $labels.cluster }}, unable to resolve expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.4.7 severity: P1 type: ceph_default - alert: CephDeviceFailureRelocationIncomplete annotations: description: "The device health module has determined that one or more devices will fail soon, but the normal process of relocating the data on the device to other OSDs in the cluster is blocked. \nEnsure that the cluster has available free space. It may be necessary to add capacity to the cluster to allow data from the failing device to successfully migrate, or to enable the balancer." documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use summary: Device failure is predicted, but unable to relocate data on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1 for: 1m labels: severity: P3 type: ceph_default - alert: CephOSDFlapping annotations: description: OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up {{ $value | humanize }} times once a minute for 5 minutes. This may indicate a network issue (latency, packet loss, MTU mismatch) on the cluster network, or the public network if no cluster network is deployed. Check the network stats on the listed host(s). documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds summary: Network issues are causing OSDs to flap (mark each other down) on cluster {{ $labels.cluster }} expr: (rate(ceph_osd_up[5m]) * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) * 60 > 1 labels: oid: 1.3.6.1.4.1.50495.1.2.1.4.4 severity: P3 type: ceph_default - alert: CephOSDReadErrors annotations: description: An OSD has encountered read errors, but the OSD has recovered by retrying the reads. This may indicate an issue with hardware or the kernel. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors summary: Device read errors detected on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 for: 30s labels: severity: P3 type: ceph_default - name: mds rules: - alert: CephFilesystemDamaged annotations: description: Filesystem metadata has been corrupted. Data may be inaccessible. Analyze metrics from the MDS daemon admin socket, or escalate to support. documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages summary: CephFS filesystem is damaged on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="MDS_DAMAGE"} > 0 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.5.1 severity: P1 type: ceph_default - alert: CephFilesystemOffline annotations: description: All MDS ranks are unavailable. The MDS daemons managing metadata are down, rendering the filesystem offline. documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down summary: CephFS filesystem is offline on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.5.3 severity: P1 type: ceph_default - alert: CephFilesystemDegraded annotations: description: One or more metadata daemons (MDS ranks) are failed or in a damaged state. At best the filesystem is partially available, at worst the filesystem is completely unusable. 
documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded summary: CephFS filesystem is degraded on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="FS_DEGRADED"} > 0 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.5.4 severity: P1 type: ceph_default - alert: CephFilesystemMDSRanksLow annotations: description: The filesystem's 'max_mds' setting defines the number of MDS ranks in the filesystem. The current number of active MDS daemons is less than this value. documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max summary: Ceph MDS daemon count is lower than configured on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0 for: 1m labels: severity: P3 type: ceph_default - alert: CephFilesystemInsufficientStandby annotations: description: The minimum number of standby daemons required by standby_count_wanted is less than the current number of standby daemons. Adjust the standby count or increase the number of MDS daemons. documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby summary: Ceph filesystem standby daemons too few on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0 for: 1m labels: severity: P3 type: ceph_default - alert: CephFilesystemFailureNoStandby annotations: description: An MDS daemon has failed, leaving only one active rank and no available standby. Investigate the cause of the failure or add a standby MDS. documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds summary: MDS daemon failed, no further standby available on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.5.5 severity: P1 type: ceph_default - alert: CephFilesystemReadOnly annotations: description: The filesystem has switched to READ ONLY due to an unexpected error when writing to the metadata pool. Either analyze the output from the MDS daemon admin socket, or escalate to support. documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages summary: CephFS filesystem in read only mode due to write error(s) on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.5.2 severity: P1 type: ceph_default - name: mgr rules: - alert: CephMgrModuleCrash annotations: description: One or more mgr modules have crashed and have yet to be acknowledged by an administrator. A crashed module may impact functionality within the cluster. Use the 'ceph crash' command to determine which module has failed, and archive it to acknowledge the failure. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash summary: A manager module has recently crashed on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1 for: 5m labels: oid: 1.3.6.1.4.1.50495.1.2.1.6.1 severity: P1 type: ceph_default - alert: CephMgrPrometheusModuleInactive annotations: description: The mgr/prometheus module at {{ $labels.instance }} is unreachable. This could mean that the module has been disabled or the mgr daemon itself is down. Without the mgr/prometheus module metrics and alerts will no longer function. Open a shell to an admin node or toolbox pod and use 'ceph -s' to to determine whether the mgr is active. 
If the mgr is not active, restart it, otherwise you can determine module status with 'ceph mgr module ls'. If it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'. summary: The mgr/prometheus module is not available expr: up{job="ceph"} == 0 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.6.2 severity: P4 type: ceph_default - name: pgs rules: - alert: CephPGsInactive annotations: description: '{{ $value }} PGs have been inactive for more than 5 minutes in pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write requests.' summary: One or more placement groups are inactive on cluster {{ $labels.cluster }} expr: ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total - ceph_pg_active) > 0 for: 5m labels: oid: 1.3.6.1.4.1.50495.1.2.1.7.1 severity: P1 type: ceph_default - alert: CephPGsUnclean annotations: description: '{{ $value }} PGs have been unclean for more than 15 minutes in pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure.' summary: One or more placement groups are marked unclean on cluster {{ $labels.cluster }} expr: ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total - ceph_pg_clean) > 0 for: 15m labels: oid: 1.3.6.1.4.1.50495.1.2.1.7.2 severity: P3 type: ceph_default - alert: CephPGsDamaged annotations: description: During data consistency checks (scrub), at least one PG has been flagged as being damaged or inconsistent. Check to see which PG is affected, and attempt a manual repair if necessary. To list problematic placement groups, use 'rados list-inconsistent-pg '. To repair PGs use the 'ceph pg repair ' command. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged summary: Placement group damaged, manual intervention needed on cluster {{ $labels.cluster }} expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1 for: 5m labels: oid: 1.3.6.1.4.1.50495.1.2.1.7.4 severity: P1 type: ceph_default - alert: CephPGRecoveryAtRisk annotations: description: Data redundancy is at risk since one or more OSDs are at or above the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs, or delete unwanted data. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full summary: OSDs are too full for recovery on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.7.5 severity: P1 type: ceph_default - alert: CephPGUnavailableBlockingIO annotations: description: Data availability is reduced, impacting the cluster's ability to service I/O. One or more placement groups (PGs) are in a state that blocks I/O. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability summary: PG is unavailable on cluster {{ $labels.cluster }}, blocking I/O expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.7.3 severity: P1 type: ceph_default - alert: CephPGBackfillAtRisk annotations: description: Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add more capacity, or delete unwanted data. 
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full summary: Backfill operations are blocked due to lack of free space on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.7.6 severity: P1 type: ceph_default - alert: CephPGNotScrubbed annotations: description: 'One or more PGs have not been scrubbed recently. Scrubs check metadata integrity, protecting against bit-rot. They check that metadata is consistent across data replicas. When PGs miss their scrub interval, it may indicate that the scrub window is too small, or PGs were not in a ''clean'' state during the scrub window. You can manually initiate a scrub with: ceph pg scrub ' documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed summary: Placement group(s) have not been scrubbed on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1 for: 5m labels: severity: P3 type: ceph_default - alert: CephPGsHighPerOSD annotations: description: |- The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting). Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs summary: Placement groups per OSD is too high on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1 for: 1m labels: severity: P3 type: ceph_default - alert: CephPGNotDeepScrubbed annotations: description: One or more PGs have not been deep scrubbed recently. Deep scrubs protect against bit-rot. They compare data replicas to ensure consistency. When PGs miss their deep scrub interval, it may indicate that the window is too small or PGs were not in a 'clean' state during the deep-scrub window. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed summary: Placement group(s) have not been deep scrubbed on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1 for: 5m labels: severity: P3 type: ceph_default - name: nodes rules: - alert: CephNodeRootFilesystemFull annotations: description: 'Root volume is dangerously full: {{ $value | humanize }}% free.' summary: Root filesystem is dangerously full expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"} * 100 < 5 for: 5m labels: oid: 1.3.6.1.4.1.50495.1.2.1.8.1 severity: P1 type: ceph_default - alert: CephNodeNetworkPacketErrors annotations: description: Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}. 
summary: One or more NICs reports packet errors on cluster {{ $labels.cluster }} expr: | ( rate(node_network_receive_errs_total{device!="lo"}[1m]) + rate(node_network_transmit_errs_total{device!="lo"}[1m]) ) / ( rate(node_network_receive_packets_total{device!="lo"}[1m]) + rate(node_network_transmit_packets_total{device!="lo"}[1m]) ) >= 0.0001 or ( rate(node_network_receive_errs_total{device!="lo"}[1m]) + rate(node_network_transmit_errs_total{device!="lo"}[1m]) ) >= 10 labels: oid: 1.3.6.1.4.1.50495.1.2.1.8.3 severity: P3 type: ceph_default - alert: CephNodeNetworkBondDegraded annotations: description: Bond {{ $labels.master }} is degraded on Node {{ $labels.instance }}. summary: Degraded Bond on Node {{ $labels.instance }} on cluster {{ $labels.cluster }} expr: | node_bonding_slaves - node_bonding_active != 0 labels: severity: P3 type: ceph_default - alert: CephNodeInconsistentMTU annotations: description: Node {{ $labels.instance }} has a different MTU size ({{ $value }}) than the median of devices named {{ $labels.device }}. summary: MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster }} expr: node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) )or node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == scalar( min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) ) labels: severity: P3 type: ceph_default - name: pools rules: - alert: CephPoolGrowthWarning annotations: description: Pool '{{ $labels.name }}' will be full in less than 5 days assuming the average fill-up rate of the past 48 hours. summary: Pool growth rate may soon exceed capacity on cluster {{ $labels.cluster }} expr: (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(cluster,pool_id, instance) group_right() ceph_pool_metadata) >= 95 labels: oid: 1.3.6.1.4.1.50495.1.2.1.9.2 severity: P3 type: ceph_default - alert: CephPoolBackfillFull annotations: description: A pool is approaching the near full threshold, which will prevent recovery/backfill operations from completing. Consider adding more capacity. summary: Free space in a pool is too low for recovery/backfill on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0 labels: severity: P3 type: ceph_default - alert: CephPoolFull annotations: description: A pool has reached its MAX quota, or OSDs supporting the pool have reached the FULL threshold. Until this is resolved, writes to the pool will be blocked. Pool Breakdown (top 5) {{- range printf "topk(5, sort_desc(ceph_pool_percent_used{cluster='%s'} * on(cluster,pool_id) group_right ceph_pool_metadata))" .Labels.cluster | query }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. 
ceph osd pool set quota <pool_name> max_bytes <bytes>) documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full summary: Pool is full - writes are blocked on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="POOL_FULL"} > 0 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.9.1 severity: P1 type: ceph_default - alert: CephPoolNearFull annotations: description: A pool has exceeded the warning (percent full) threshold, or OSDs supporting the pool have reached the NEARFULL threshold. Writes may continue, but you are at risk of the pool going read-only if more capacity isn't made available. Determine the affected pool with 'ceph df detail', looking at QUOTA BYTES and STORED. Increase the pool's quota, or add capacity to the cluster first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>). Also ensure that the balancer is active. summary: One or more Ceph pools are nearly full on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0 for: 5m labels: severity: P3 type: ceph_default - name: healthchecks rules: - alert: CephSlowOps annotations: description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded)' documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops summary: OSD operations are slow to complete on cluster {{ $labels.cluster }} expr: ceph_healthcheck_slow_ops > 0 for: 30s labels: severity: P3 type: ceph_default - alert: CephDaemonSlowOps annotations: description: '{{ $labels.ceph_daemon }} operations are taking too long to process (complaint time exceeded)' documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops summary: '{{ $labels.ceph_daemon }} operations are slow to complete on cluster {{ $labels.cluster }}' expr: ceph_daemon_health_metrics{type="SLOW_OPS"} > 0 for: 30s labels: severity: P3 type: ceph_default - name: cephadm rules: - alert: CephadmUpgradeFailed annotations: description: The cephadm cluster upgrade process has failed. The cluster remains in an undetermined state. Please review the cephadm logs to understand the nature of the issue. summary: Ceph version upgrade has failed on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.11.2 severity: P1 type: ceph_default - alert: CephadmDaemonFailed annotations: description: A daemon managed by cephadm is no longer active. Determine which daemon is down with 'ceph health detail'. You may start daemons with 'ceph orch daemon start <daemon_id>'. summary: A Ceph daemon managed by cephadm is down on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.11.1 severity: P1 type: ceph_default - alert: CephadmPaused annotations: description: Cluster management has been paused manually. This will prevent the orchestrator from performing service management and reconciliation. If this is not intentional, resume cephadm operations with 'ceph orch resume'. documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused summary: Orchestration tasks via cephadm are PAUSED on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0 for: 1m labels: severity: P3 type: ceph_default - name: hardware rules: - alert: HardwareStorageError annotations: description: Some storage devices are in error. Check `ceph health detail`. 
summary: Storage devices error(s) detected on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.13.1 severity: P1 type: ceph_default - alert: HardwareMemoryError annotations: description: DIMM error(s) detected. Check `ceph health detail`. summary: DIMM error(s) detected on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.13.2 severity: P1 type: ceph_default - alert: HardwareProcessorError annotations: description: Processor error(s) detected. Check `ceph health detail`. summary: Processor error(s) detected on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.13.3 severity: P1 type: ceph_default - alert: HardwareNetworkError annotations: description: Network error(s) detected. Check `ceph health detail`. summary: Network error(s) detected on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.13.4 severity: P1 type: ceph_default - alert: HardwarePowerError annotations: description: Power supply error(s) detected. Check `ceph health detail`. summary: Power supply error(s) detected on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="HARDWARE_POWER"} > 0 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.13.5 severity: P1 type: ceph_default - alert: HardwareFanError annotations: description: Fan error(s) detected. Check `ceph health detail`. summary: Fan error(s) detected on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="HARDWARE_FANS"} > 0 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.13.6 severity: P1 type: ceph_default - name: PrometheusServer rules: - alert: PrometheusJobMissing annotations: description: The prometheus job that scrapes from Ceph is no longer defined, this will effectively mean you'll have no metrics or alerts for the cluster. Please review the job definitions in the prometheus.yml file of the prometheus instance. summary: The scrape job for Ceph is missing from Prometheus expr: absent(up{job="ceph"}) for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.12.1 severity: P1 type: ceph_default - name: rados rules: - alert: CephObjectMissing annotations: description: The latest version of a RADOS object can not be found, even though all OSDs are up. I/O requests for this object from clients will block (hang). Resolving this issue may require the object to be rolled back to a prior version manually, and manually verified. documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound summary: Object(s) marked UNFOUND on cluster {{ $labels.cluster }} expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() group_right(cluster) (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by(cluster)) == 1 for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.10.1 severity: P1 type: ceph_default - name: generic rules: - alert: CephDaemonCrash annotations: description: One or more daemons have crashed recently, and need to be acknowledged. This notification ensures that software crashes do not go unseen. To acknowledge a crash, use the 'ceph crash archive ' command. 
documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash summary: One or more Ceph daemons have crashed, and are pending acknowledgement on cluster {{ $labels.cluster }} expr: ceph_health_detail{name="RECENT_CRASH"} == 1 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.1.2 severity: P1 type: ceph_default - name: rbdmirror rules: - alert: CephRBDMirrorImagesPerDaemonHigh annotations: description: The number of image replications per daemon is not supposed to exceed the threshold of 100. summary: The number of image replications is now above 100 on cluster {{ $labels.cluster }} expr: sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots) > 100 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.10.2 severity: P1 type: ceph_default - alert: CephRBDMirrorImagesNotInSync annotations: description: Both local and remote RBD mirror images should be in sync. summary: Some of the RBD mirror images are not in sync with their remote counterparts on cluster {{ $labels.cluster }} expr: sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.10.3 severity: P1 type: ceph_default - alert: CephRBDMirrorImagesNotInSyncVeryHigh annotations: description: More than 10% of the images have synchronization problems. summary: The number of unsynchronized images is very high on cluster {{ $labels.cluster }} expr: count by (ceph_daemon, cluster) ((topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp)) != 0) > (sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots)*.1) for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.10.4 severity: P1 type: ceph_default - alert: CephRBDMirrorImageTransferBandwidthHigh annotations: description: Detected a heavy increase in bandwidth for RBD replications (over 80%) in the last 30 min. This might not be a problem, but it is good to review the number of images being replicated simultaneously. summary: The replication network usage on cluster {{ $labels.cluster }} has increased by over 80% in the last 30 minutes. Review the number of images being replicated. This alert will be cleared automatically after 30 minutes. expr: rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80 for: 1m labels: oid: 1.3.6.1.4.1.50495.1.2.1.10.5 severity: P3 type: ceph_default - name: nvmeof rules: - alert: NVMeoFSubsystemNamespaceLimit annotations: description: Subsystems have a max namespace limit defined at creation time. This alert means that no more namespaces can be added to {{ $labels.nqn }}. summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces on cluster {{ $labels.cluster }}' expr: (count by(nqn, cluster, instance) (ceph_nvmeof_subsystem_namespace_metadata)) >= on(nqn, instance) group_right(cluster) ceph_nvmeof_subsystem_namespace_limit for: 1m labels: severity: P3 type: ceph_default - alert: NVMeoFMultipleNamespacesOfRBDImage annotations: description: Each NVMeoF namespace must have a unique RBD pool and image across all different gateway groups. 
summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be reused for multiple NVMeoF namespaces' expr: count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name) (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata)) > 1 for: 1m labels: severity: P3 type: ceph_default - alert: NVMeoFTooManyGateways annotations: description: You may create many gateways, but 32 is the tested limit. summary: Max supported gateways exceeded on cluster {{ $labels.cluster }} expr: count(ceph_nvmeof_gateway_info) by (cluster) > 32.00 for: 1m labels: severity: P3 type: ceph_default - alert: NVMeoFMaxGatewayGroupSize annotations: description: You may create many gateways in a gateway group, but 8 is the tested limit. summary: Max gateways within a gateway group ({{ $labels.group }}) exceeded on cluster {{ $labels.cluster }} expr: count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00 for: 1m labels: severity: P3 type: ceph_default - alert: NVMeoFMaxGatewayGroups annotations: description: You may create many gateway groups, but 4 is the tested limit. summary: Max gateway groups exceeded on cluster {{ $labels.cluster }} expr: count(count by (group, cluster) (ceph_nvmeof_gateway_info)) by (cluster) > 4.00 for: 1m labels: severity: P3 type: ceph_default - alert: NVMeoFSingleGateway annotations: description: Although a single member gateway group is valid, it should only be used for test purposes. summary: The gateway group {{ $labels.group }} consists of a single gateway - HA is not possible on cluster {{ $labels.cluster }} expr: count(ceph_nvmeof_gateway_info) by(cluster,group) == 1 for: 5m labels: severity: P3 type: ceph_default - alert: NVMeoFHighGatewayCPU annotations: description: Typically, high CPU may indicate degraded performance. Consider increasing the number of reactor cores. summary: CPU used by {{ $labels.instance }} NVMe-oF Gateway is high on cluster {{ $labels.cluster }} expr: label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*") > 80.00 for: 10m labels: severity: P3 type: ceph_default - alert: NVMeoFGatewayOpenSecurity annotations: description: It is good practice to ensure subsystems use host security to reduce the risk of unexpected data loss. summary: Subsystem {{ $labels.nqn }} has been defined without host level security on cluster {{ $labels.cluster }} expr: ceph_nvmeof_subsystem_metadata{allow_any_host="yes"} for: 5m labels: severity: P3 type: ceph_default - alert: NVMeoFTooManySubsystems annotations: description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of 128 subsystems. Current count: {{ $value }}.' summary: The number of subsystems defined to the NVMeoF gateway has reached or exceeded the supported maximum on cluster {{ $labels.cluster }} expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*?)(?::.*)?")) >= 128.00 for: 1m labels: severity: P3 type: ceph_default - alert: NVMeoFTooManyNamespaces annotations: description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded the supported maximum of 2048 namespaces. Current count: {{ $value }}.' 
summary: The number of namespaces defined to the NVMeoF gateway has reached or exceeded the supported maximum on cluster {{ $labels.cluster }} expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*?)(?::.*)?")) >= 2048.00 for: 1m labels: severity: P3 type: ceph_default - alert: NVMeoFVersionMismatch annotations: description: This may indicate an issue with deployment. Check cephadm logs summary: Too many different NVMe-oF gateway releases active on cluster {{ $labels.cluster }} expr: count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster) > 1 for: 1h labels: severity: P3 type: ceph_default - alert: NVMeoFHighClientCount annotations: description: The supported limit for clients connecting to a subsystem is 128 summary: The number of clients connected to {{ $labels.nqn }} is too high on cluster {{ $labels.cluster }} expr: ceph_nvmeof_subsystem_host_count > 128.00 for: 1m labels: severity: P3 type: ceph_default - alert: NVMeoFMissingListener annotations: description: For every subsystem, each gateway should have a listener to balance traffic between gateways. summary: No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem expr: ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0 for: 10m labels: severity: P3 type: ceph_default - alert: NVMeoFZeroListenerSubsystem annotations: description: NVMeoF gateway configuration incomplete; one of the subsystems has zero listeners. summary: No listeners added to {{ $labels.nqn }} subsystem expr: sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0 for: 10m labels: severity: P3 type: ceph_default - alert: NVMeoFHighHostCPU annotations: description: High CPU on a gateway host can lead to CPU contention and performance degradation summary: The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host }}) on cluster {{ $labels.cluster }} expr: 100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*")) * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*")))) >= 80.00 for: 10m labels: severity: P3 type: ceph_default - alert: NVMeoFInterfaceDown annotations: description: A NIC used by one or more subsystems is in a down state summary: Network interface {{ $labels.device }} is down on cluster {{ $labels.cluster }} expr: ceph_nvmeof_subsystem_listener_iface_info{operstate="down"} for: 30s labels: oid: 1.3.6.1.4.1.50495.1.2.1.14.1 severity: P3 type: ceph_default - alert: NVMeoFInterfaceDuplex annotations: description: Until this is resolved, performance from the gateway will be degraded summary: Network interface {{ $labels.device }} is not running in full-duplex mode on cluster {{ $labels.cluster }} expr: ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"} for: 30s labels: severity: P3 type: ceph_default - alert: NVMeoFHighReadLatency annotations: description: High latencies may indicate a constraint within the cluster e.g. CPU, network.
Please investigate summary: The average read latency over the last 5 mins has reached 10 ms or more on {{ $labels.gateway }} expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m]) / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*") > 0.01 for: 5m labels: severity: P3 type: ceph_default - alert: NVMeoFHighWriteLatency annotations: description: High latencies may indicate a constraint within the cluster e.g. CPU, network. Please investigate summary: The average write latency over the last 5 mins has reached 20 ms or more on {{ $labels.gateway }} expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m]) / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),"gateway","$1","instance","(.*):.*") > 0.02 for: 5m labels: severity: P3 type: ceph_default - alert: NVMeoFHostKeepAliveTimeout annotations: description: Host was disconnected due to host keep alive timeout summary: Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from subsystem ({{ $labels.nqn }}) in last 24 hours expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0 for: 1m labels: severity: P3 type: ceph_default - name: cluster health detail rules: - alert: CephHealthDetailError annotations: description: Health check {{ $labels.name }} has been HEALTH_ERROR for more than 5 minutes. Please check 'ceph health detail' for more information. summary: Ceph is in the ERROR state expr: ceph_health_detail{severity="HEALTH_ERROR"} == 1 for: 5m labels: severity: P1 - alert: CephHealthDetailWarning annotations: description: Health check {{ $labels.name }} has been HEALTH_WARN for more than 15 minutes. Please check 'ceph health detail' for more information. summary: Ceph is in the WARNING state expr: ceph_health_detail{severity="HEALTH_WARN"} == 1 for: 15m labels: severity: P3 monitoring-kube-prometheus-stack-node-network-9971bd0c-d600-432f-8729-4406d8d0e445.yaml: ---- groups: - name: node-network rules: - alert: NodeNetworkInterfaceFlapping annotations: description: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }} runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping summary: Network interface is often changing its status expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2 for: 2m labels: severity: warning monitoring-kube-prometheus-stack-ipmi-exporter-2652942a-6fa8-4768-9918-61ecab49b1c4.yaml: ---- groups: - name: rules rules: - alert: IpmiCollectorDown expr: ipmi_up == 0 for: 15m labels: severity: P3 - name: collectors-state-warning rules: - alert: IpmiCurrent expr: ipmi_current_state == 1 labels: severity: P3 - alert: IpmiFanSpeed expr: ipmi_fan_speed_state == 1 labels: severity: P3 - alert: IpmiPower expr: ipmi_power_state == 1 labels: severity: P3 - alert: IpmiSensor expr: ipmi_sensor_state == 1 labels: severity: P3 - alert: IpmiTemperature expr: ipmi_temperature_state == 1 labels: severity: P3 - alert: IpmiVoltage expr: ipmi_voltage_state == 1 labels: severity: P3 - name: collectors-state-critical rules: - alert: IpmiCurrent expr: ipmi_current_state == 2 labels: severity: P1 - alert: IpmiFanSpeed expr: ipmi_fan_speed_state == 2 labels: severity: P1 - alert: IpmiPower expr: ipmi_power_state == 2 labels: severity: P1 - alert: IpmiSensor expr: ipmi_sensor_state{name!="TPM Presence"} == 2 labels: severity: P1 - alert: IpmiTemperature expr: ipmi_temperature_state == 2 
labels: severity: P1 - alert: IpmiVoltage expr: ipmi_voltage_state == 2 labels: severity: P1 monitoring-kube-prometheus-stack-mysqld-f5155160-3a68-46d9-b7fb-3b6bf3e084c0.yaml: ---- groups: - name: MySQLdAlerts - name: GaleraAlerts rules: - alert: MySQLGaleraNotReady annotations: description: '{{$labels.job}} on {{$labels.instance}} is not ready.' summary: Galera cluster node not ready. expr: mysql_global_status_wsrep_ready != 1 for: 5m labels: severity: P3 - alert: MySQLGaleraOutOfSync annotations: description: The Galera node {{ $labels.instance }} has wsrep_local_state={{ $value }} which is not the expected value of 4 (Synced). The node is not in Donor state (2) and wsrep_desync is not enabled, indicating an unexpected loss of cluster sync. Normal behavior is wsrep_local_state=4 for all nodes not actively serving as SST donors. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#mysqlgaleraoutofsync summary: 'Percona XtraDB Cluster: Galera node not in sync with cluster' expr: | (mysql_global_status_wsrep_local_state != 4 and mysql_global_status_wsrep_local_state != 2 and mysql_global_variables_wsrep_desync == 0) for: 15m labels: severity: P3 - alert: MySQLGaleraDonorFallingBehind annotations: description: '{{$labels.job}} on {{$labels.instance}} is a donor (hotbackup) and is falling behind (queue size {{$value}}).' summary: XtraDB cluster donor node falling behind. expr: (mysql_global_status_wsrep_local_state == 2 and mysql_global_status_wsrep_local_recv_queue > 100) for: 5m labels: severity: P3 - alert: MySQLReplicationNotRunning annotations: description: Replication on {{$labels.instance}} (IO or SQL) has been down for more than 2 minutes. summary: Replication is not running. expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running == 0 for: 2m labels: severity: P1 - alert: MySQLReplicationLag annotations: description: Replication on {{$labels.instance}} has fallen behind and is not recovering. summary: MySQL slave replication is lagging. expr: (instance:mysql_slave_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_slave_lag_seconds[5m], 60 * 2) > 0) for: 1m labels: severity: P1 - alert: MySQLHeartbeatLag annotations: description: The heartbeat is lagging on {{$labels.instance}} and is not recovering. summary: MySQL heartbeat is lagging. expr: (instance:mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_heartbeat_lag_seconds[5m], 60 * 2) > 0) for: 1m labels: severity: P1 - alert: MySQLInnoDBLogWaits annotations: description: The innodb logs are waiting for disk at a rate of {{$value}} / second summary: MySQL innodb log writes stalling. expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10 labels: severity: P3 - name: mysqld-extras rules: - alert: MysqlTooManyConnections expr: | max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 for: 1m labels: severity: P3 - alert: MysqlHighThreadsRunning expr: | max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 for: 1m labels: severity: P3 - alert: MysqlSlowQueries expr: | increase(mysql_global_status_slow_queries[1m]) > 0 for: 2m labels: severity: P3 - alert: MysqlClusterDown annotations: description: '{{ $labels.instance }} replica is down.' summary: Percona XtraDB Cluster replica is down expr: mysql_up == 0 for: 5m labels: severity: P5 - alert: MysqlClusterDown annotations: description: '{{ $value }}% of replicas are online.' 
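# Reading the MySQLReplicationLag and MySQLHeartbeatLag expressions above: the alert requires the
# recorded lag to already exceed 30s AND predict_linear() over the last 5m of samples to still be
# positive 2 minutes ahead (60 * 2 seconds), i.e. the lag is not trending back to zero. A rough,
# hypothetical illustration: lag rising from 40s to 50s over 5 minutes extrapolates to about 54s
# two minutes later (> 0), so the alert fires; lag falling from 40s toward 5s extrapolates below
# zero within that horizon, so the condition is not met.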
summary: Percona XtraDB Cluster replicas are down expr: round(count(mysql_up==1) / count(mysql_up) * 100) <= 50 for: 5m labels: severity: P3 - alert: MysqlClusterDown annotations: description: All replicas are down. summary: Percona XtraDB Cluster is down expr: count(mysql_up==0) == count(mysql_up) for: 1m labels: severity: P1 - name: mysqld_rules rules: - expr: mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay record: instance:mysql_slave_lag_seconds - expr: mysql_heartbeat_now_timestamp_seconds - mysql_heartbeat_stored_timestamp_seconds record: instance:mysql_heartbeat_lag_seconds - expr: sum without (command) (rate(mysql_global_status_commands_total{command=~"(commit|rollback)"}[5m])) record: job:mysql_transactions:rate5m monitoring-kube-prometheus-stack-goldpinger-55a86a3c-f495-477f-aacf-37243b06a211.yaml: ---- groups: - name: goldpinger rules: - alert: GoldpingerHighUnhealthyRatio annotations: description: 'More than 10% of nodes (current: {{ $value | humanizePercentage }}) are reporting as unhealthy for at least 5 minutes. Normal operation expects 0% unhealthy nodes.' runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#goldpingerhighunhealthyratio summary: 'Goldpinger: high percentage of cluster nodes unhealthy' expr: | ( sum(goldpinger_nodes_health_total{status="unhealthy"}) / sum(goldpinger_nodes_health_total) ) > 0.1 for: 5m labels: severity: P2 - alert: GoldpingerNodeUnreachable annotations: description: 'Node with IP {{ $labels.host_ip }} has a median ping latency above 1s from more than 50% (current: {{ $value | humanizePercentage }}) of Goldpinger instances. Normal operation expects all nodes to be reachable with sub-10ms latency.' runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#goldpingernodeunreachable summary: 'Goldpinger: node unreachable by majority of cluster' expr: | ( count by (host_ip) ( histogram_quantile(0.5, sum by (instance, host_ip, le) ( rate(goldpinger_peers_response_time_s_bucket{call_type="ping"}[5m]) ) ) > 1.0 ) / scalar(count(goldpinger_cluster_health_total)) ) > 0.5 for: 5m labels: severity: P2 - alert: GoldpingerHighPeerLatency annotations: description: The 95th percentile of peer-to-peer latency is {{ $value | humanizeDuration }}, which exceeds the threshold of 500ms. Normal latency is typically below 10ms. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#goldpingerhighpeerlatency summary: 'Goldpinger: high cluster-wide peer latency' expr: | histogram_quantile(0.95, sum by (le) ( rate(goldpinger_peers_response_time_s_bucket{call_type="ping"}[5m]) ) ) > 0.5 for: 15m labels: severity: P3 - alert: GoldpingerHighErrorRate annotations: description: 'More than 5% (current: {{ $value | humanizePercentage }}) of Goldpinger ping attempts are failing. Normal operation expects less than 0.1% error rate.' runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#goldpingerhigherrorrate summary: 'Goldpinger: high ping error rate' expr: | ( sum(rate(goldpinger_errors_total{type="ping"}[5m])) / sum(rate(goldpinger_stats_total{action="ping",group="made"}[5m])) ) > 0.05 for: 15m labels: severity: P3 monitoring-kube-prometheus-stack-kube-state-metrics-d075df17-b1b3-44f8-8e8f-e4546bc53387.yaml: ---- groups: - name: kube-state-metrics rules: - alert: KubeStateMetricsListErrors annotations: description: kube-state-metrics is experiencing errors at an elevated rate in list operations. 
This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors summary: kube-state-metrics is experiencing errors in list operations. expr: |- (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) / sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster)) > 0.01 for: 15m labels: severity: critical - alert: KubeStateMetricsWatchErrors annotations: description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors summary: kube-state-metrics is experiencing errors in watch operations. expr: |- (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster) / sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster)) > 0.01 for: 15m labels: severity: critical - alert: KubeStateMetricsShardingMismatch annotations: description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch summary: kube-state-metrics sharding is misconfigured. expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0 for: 15m labels: severity: critical - alert: KubeStateMetricsShardsMissing annotations: description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing summary: kube-state-metrics shards are missing. expr: |- 2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1 - sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster) != 0 for: 15m labels: severity: critical monitoring-kube-prometheus-stack-coredns-b6e9a3a8-ed0f-40d0-bc32-d1343ec17fc9.yaml: ---- groups: - name: coredns rules: - alert: CoreDNSCriticalErrorBudgetBurn annotations: description: The CoreDNS SERVFAIL rate is {{ $value | humanizePercentage }} over the last hour, which exceeds the 1.44% burn-rate threshold (14.4x against 99.9% SLO). At this rate, the 30-day error budget exhausts in under 2.1 days. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednscriticalerrorbudgetburn summary: 'CoreDNS: SERVFAIL rate rapidly consuming error budget' expr: | ( sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[1h])) / sum(rate(coredns_dns_responses_total{job="coredns"}[1h])) ) > 0.0144 and ( sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[5m])) / sum(rate(coredns_dns_responses_total{job="coredns"}[5m])) ) > 0.0144 and sum(rate(coredns_dns_responses_total{job="coredns"}[5m])) > 1 for: 2m labels: severity: P1 - alert: CoreDNSHighErrorBudgetBurn annotations: description: The CoreDNS SERVFAIL rate is {{ $value | humanizePercentage }} over the last 6 hours, which exceeds the 0.6% burn-rate threshold (6x against 99.9% SLO). At this rate, the 30-day error budget exhausts in under 5 days. 
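# A worked example for the KubeStateMetricsShardsMissing expression above (illustrative figures):
# with 4 total shards the expected bitmask is 2^4 - 1 = 15. If shards 0, 1 and 3 are present,
# sum(2^shard_ordinal) = 1 + 2 + 8 = 11, and 15 - 11 = 4 != 0, so the alert fires; the non-zero
# result encodes the missing shard (bit value 4, i.e. ordinal 2).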
runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednshigherrorbudgetburn summary: 'CoreDNS: sustained SERVFAIL rate depleting error budget' expr: | ( sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[6h])) / sum(rate(coredns_dns_responses_total{job="coredns"}[6h])) ) > 0.006 and ( sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[30m])) / sum(rate(coredns_dns_responses_total{job="coredns"}[30m])) ) > 0.006 and sum(rate(coredns_dns_responses_total{job="coredns"}[30m])) > 1 for: 5m labels: severity: P2 - alert: CoreDNSModerateErrorBudgetBurn annotations: description: The CoreDNS SERVFAIL rate is {{ $value | humanizePercentage }} over the last day, which exceeds the 0.3% burn-rate threshold (3x against 99.9% SLO). At this rate, the 30-day error budget exhausts in under 10 days. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednsmoderateerrorbudgetburn summary: 'CoreDNS: ongoing SERVFAIL rate steadily consuming error budget' expr: | ( sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[1d])) / sum(rate(coredns_dns_responses_total{job="coredns"}[1d])) ) > 0.003 and ( sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[2h])) / sum(rate(coredns_dns_responses_total{job="coredns"}[2h])) ) > 0.003 and sum(rate(coredns_dns_responses_total{job="coredns"}[2h])) > 1 for: 15m labels: severity: P3 - alert: CoreDNSLowErrorBudgetBurn annotations: description: The CoreDNS SERVFAIL rate is {{ $value | humanizePercentage }} over the last 3 days, which exceeds the 0.1% burn-rate threshold (1x against 99.9% SLO). At this rate, the 30-day error budget exhausts before the window resets. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednslowerrorbudgetburn summary: 'CoreDNS: low-level SERVFAIL rate eroding error budget' expr: | ( sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[3d])) / sum(rate(coredns_dns_responses_total{job="coredns"}[3d])) ) > 0.001 and ( sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[6h])) / sum(rate(coredns_dns_responses_total{job="coredns"}[6h])) ) > 0.001 and sum(rate(coredns_dns_responses_total{job="coredns"}[6h])) > 1 for: 1h labels: severity: P4 - alert: CoreDNSDown annotations: description: CoreDNS has disappeared from Prometheus target discovery for more than 15 minutes. This could indicate a crashed CoreDNS pod or a misconfigured scrape target. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednsdown summary: 'CoreDNS: instance has disappeared from Prometheus target discovery' expr: absent(up{job="coredns"} == 1) for: 15m labels: severity: P3 monitoring-kube-prometheus-stack-prometheus-operator-2974a559-2ef4-46bb-9feb-287be77d4f42.yaml: ---- groups: - name: prometheus-operator rules: - alert: PrometheusOperatorListErrors annotations: description: Errors while performing List operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors summary: Errors while performing list operations in controller. 
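# The four CoreDNS burn-rate thresholds above (and the matching NGINX Ingress group later in this
# ConfigMap) follow the multi-window burn-rate pattern: each threshold is burn_rate x (1 - SLO).
# With a 99.9% SLO the error budget is 0.1%, so 14.4x => 1.44%, 6x => 0.6%, 3x => 0.3% and
# 1x => 0.1%; each long window is paired with a short window so the alert clears quickly once
# the error rate drops.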
expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m]))) > 0.4 for: 15m labels: severity: warning - alert: PrometheusOperatorWatchErrors annotations: description: Errors while performing watch operations in controller {{$labels.controller}} in {{$labels.namespace}} namespace. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors summary: Errors while performing watch operations in controller. expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])) / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]))) > 0.4 for: 15m labels: severity: warning - alert: PrometheusOperatorSyncFailed annotations: description: Controller {{ $labels.controller }} in {{ $labels.namespace }} namespace fails to reconcile {{ $value }} objects. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed summary: Last controller reconciliation failed expr: min_over_time(prometheus_operator_syncs{status="failed",job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning - alert: PrometheusOperatorReconcileErrors annotations: description: '{{ $value | humanizePercentage }} of reconciling operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors summary: Errors while reconciling objects. expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning - alert: PrometheusOperatorStatusUpdateErrors annotations: description: '{{ $value | humanizePercentage }} of status update operations failed for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorstatusupdateerrors summary: Errors while updating objects status. expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]))) / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]))) > 0.1 for: 10m labels: severity: warning - alert: PrometheusOperatorNodeLookupErrors annotations: description: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors summary: Errors while reconciling Prometheus. 
expr: rate(prometheus_operator_node_address_lookup_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning - alert: PrometheusOperatorNotReady annotations: description: Prometheus operator in {{ $labels.namespace }} namespace isn't ready to reconcile {{ $labels.controller }} resources. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready summary: Prometheus operator not ready expr: min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]) == 0) for: 5m labels: severity: warning - alert: PrometheusOperatorRejectedResources annotations: description: Prometheus operator in {{ $labels.namespace }} namespace rejected {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }} resources. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources summary: Resources rejected by Prometheus operator expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]) > 0 for: 5m labels: severity: warning monitoring-kube-prometheus-stack-node-exporter-local-273656ff-e3ba-4489-a864-45233d764689.yaml: ---- groups: - name: node rules: - alert: NodeHighLoadAverage expr: node_load5 / count(node_cpu_seconds_total{mode="system"}) without (cpu, mode) > 1.5 for: 30m labels: severity: P3 - alert: NodeHighCpuUsage expr: sum by(instance)(irate(node_cpu_seconds_total{mode='idle'}[5m])) < 1 for: 2m labels: severity: P3 - alert: NodeLowEntropy expr: node_entropy_available_bits / node_entropy_pool_size_bits < 0.20 for: 5m labels: severity: P5 - name: network rules: - alert: NodeNetworkMulticast annotations: description: This can result in high software interrupt load on the node which can bring network performance down. 
runbook_url: https://github.com/vexxhost/atmosphere/tree/main/roles/kube_prometheus_stack#NodeNetworkMulticast summary: 'High multicast traffic on node {{ $labels.instance }}: {{ $value }} packets/sec' expr: rate(node_network_receive_multicast_total[1m]) > 1000 for: 5m labels: severity: P1 - name: softnet rules: - alert: SingleNodeSoftNetBacklog expr: count(node:softnet:backlog:1m > 5000) > (count(node:softnet:backlog:1m) * 0) for: 1m labels: severity: P3 - alert: MultipleNodesSoftNetBacklog expr: count(node:softnet:backlog:1m > 5000) > (count(node:softnet:backlog:1m) * 0.5) for: 1m labels: severity: P2 - alert: MajorityNodesSoftNetBacklog expr: count(node:softnet:backlog:1m > 5000) > (count(node:softnet:backlog:1m) * 0.75) for: 1m labels: severity: P1 - alert: SingleNodeSoftNetDropped expr: count(node:softnet:dropped:1m > 0) > (count(node:softnet:dropped:1m) * 0) for: 1m labels: severity: P3 - alert: MultipleNodesSoftNetDropped expr: count(node:softnet:dropped:1m > 0) > (count(node:softnet:dropped:1m) * 0.5) for: 1m labels: severity: P2 - alert: MajorityNodesSoftNetDropped expr: count(node:softnet:dropped:1m > 0) > (count(node:softnet:dropped:1m) * 0.75) for: 1m labels: severity: P1 - name: softnet.rules rules: - expr: sum(node_softnet_backlog_len) by (instance) record: node:softnet:backlog:1m - expr: sum(rate(node_softnet_dropped_total[1m])) by (instance) record: node:softnet:dropped:1m monitoring-kube-prometheus-stack-openstack-e2faee54-6cb8-417e-82d5-8b4f7bcd7fcc.yaml: ---- groups: - name: cinder rules: - alert: CinderAgentDisabled annotations: description: A Cinder agent has been administratively disabled for more than 24 hours. summary: Cinder agent disabled expr: openstack_cinder_agent_state{adminState!="enabled"} > 0 for: 24h labels: severity: P5 - alert: CinderAgentDown annotations: description: A Cinder agent has been down for more than 15 minutes. summary: Cinder agent down expr: openstack_cinder_agent_state != 1 for: 15m labels: severity: P3 - alert: CinderAgentGroupDown annotations: description: All instances of a specific Cinder agent have been down for more than 5 minutes. summary: Cinder agent group down expr: min by (exported_service) (openstack_cinder_agent_state) == 0 for: 5m labels: severity: P2 - alert: CinderVolumeError annotations: description: A Cinder volume is in an error state. summary: Cinder volume error expr: openstack_cinder_volume_status{status=~"error.*"} > 0 for: 24h labels: severity: P4 - name: neutron rules: - alert: NeutronAgentDisabled annotations: description: A Neutron agent has been administratively disabled for more than 24 hours. summary: Neutron agent disabled expr: openstack_neutron_agent_state{adminState!="up"} > 0 for: 24h labels: severity: P5 - alert: NeutronAgentDown annotations: description: A Neutron agent has been down for more than 15 minutes. summary: Neutron agent down expr: openstack_neutron_agent_state != 1 for: 15m labels: severity: P3 - alert: NeutronNetworkOutOfIPs annotations: description: The network {{ $labels.network_id }} is currently at {{ $value }}% utilization. If the IP addresses run out, it will impact the provisioning of new ports. 
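# The softnet alerts above all use the same pattern: count(node:softnet:backlog:1m > 5000) counts
# how many nodes currently exceed the threshold, and the right-hand side scales the total node
# count by a fraction. With, say, 10 reporting nodes (hypothetical), "* 0" fires as soon as 1 node
# is affected, "* 0.5" once more than 5 nodes are affected, and "* 0.75" once more than 7 nodes
# are affected, escalating from P3 to P2 to P1.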
summary: '[{{ $labels.network_id }}] Network running out of IPs' expr: (sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""}) and on (network_id) label_replace(openstack_neutron_network{is_external="true", is_shared="true"}, "network_id", "$1", "id", "(.*)")) / (sum by (network_id) (openstack_neutron_network_ip_availabilities_total{project_id!=""}) and on (network_id) label_replace(openstack_neutron_network{is_external="true", is_shared="true"}, "network_id", "$1", "id", "(.*)")) * 100 > 80 for: 6h labels: severity: P3 - alert: NeutronRouterMultipleActiveL3Agents annotations: description: The router with ID {{ $labels.router_id }} has {{ $value }} L3 agents in active state, which can cause network resets and traffic drops. summary: Neutron HA router has multiple active L3 agents expr: sum by (router_id) (openstack_neutron_l3_agent_of_router{ha_state="active"}) > 1 for: 5m labels: severity: P3 - name: neutron-port-bindings rules: - alert: NeutronPortBindingFailed annotations: description: At least one Neutron port has failed to bind. summary: Neutron Port Binding Failed expr: count(neutron_port{binding_vif_type="binding_failed"}) > 0 for: 5m labels: severity: P4 - alert: NeutronPortBindingFailed annotations: description: More than 5% of Neutron ports have failed to bind. summary: Neutron Port Binding Failed expr: (count(neutron_port{binding_vif_type="binding_failed"}) / count(neutron_port)) > 0.05 for: 5m labels: severity: P3 - alert: NeutronPortBindingFailed annotations: description: More than 50% of Neutron ports have failed to bind. summary: Neutron Port Binding Failed expr: (count(neutron_port{binding_vif_type="binding_failed"}) / count(neutron_port)) > 0.5 for: 5m labels: severity: P2 - name: nova rules: - alert: NovaServiceDisabled annotations: description: A Nova service has been administratively disabled for more than 24 hours. summary: Nova service disabled expr: openstack_nova_agent_state{adminState!="enabled"} > 0 for: 24h labels: severity: P5 - alert: NovaServiceDown annotations: description: A Nova service has been down for more than 15 minutes. summary: Nova service down expr: openstack_nova_agent_state != 1 for: 15m labels: severity: P3 - alert: NovaServiceGroupDown annotations: description: All instances of a specific Nova service have been down for more than 5 minutes. summary: Nova service group down expr: sum by (exported_service) (openstack_nova_agent_state) == 0 for: 5m labels: severity: P2 - alert: NovaServerTaskStateStuck annotations: description: Nova server with ID {{ $labels.id }} stuck in {{ $labels.task_state }} state for more than 1 hour summary: Nova server stuck in task state expr: openstack_nova_server_task_state > 0 for: 1h labels: severity: P3 - alert: NovaInstanceError annotations: description: A Nova server is in an error state. summary: Nova server error expr: openstack_nova_server_status{status="ERROR"} > 0 for: 24h labels: severity: P4 - alert: NovaFailureRisk annotations: description: The cloud capacity will be at {{ $value }} in the event of a single hypervisor failure, which puts the cloud at risk of being unable to recover should any hypervisor fail. Please ensure that an adequate amount of infrastructure is assigned to this deployment to prevent this.
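# A rough reading of the NovaCapacity expression that follows (hypothetical figures): memory used
# across enabled nova-compute services is divided by 90% of the memory available to those same
# services. With 1.8 TiB used out of 2.4 TiB available, 1.8 / (2.4 * 0.9) is roughly 83%, above
# the 75% threshold, so the alert would fire after the 6h hold period. The
# "0 * openstack_nova_agent_state{...,adminState="enabled"}" join only restricts the sums to hosts
# whose nova-compute service is enabled; adding zero does not change the values.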
summary: '[nova] Failure risk' expr: (sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) - max(openstack_nova_memory_used_bytes)) / sum(openstack_nova_memory_available_bytes-openstack_nova_memory_used_bytes) * 100 < 0.25 for: 6h labels: severity: P3 - alert: NovaCapacity annotations: description: The cloud capacity is currently at `{{ $value }}`, which means there is a risk of running out of capacity due to the timeline required to add new nodes. Please ensure that an adequate amount of infrastructure is assigned to this deployment to prevent this. summary: '[nova] Capacity risk' expr: sum ( openstack_nova_memory_used_bytes + on(hostname) group_left(adminState) (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) ) / sum ( openstack_nova_memory_available_bytes*0.90 + on(hostname) group_left(adminState) (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"}) ) * 100 > 75 for: 6h labels: severity: P3 - name: nova-build-requests rules: - alert: NovaStuckBuildRequest annotations: description: 'Instance ID {{ $labels.instance_uuid }} (project: {{ $labels.project_id }}) has been stuck in build request state for more than 1 hour.' summary: Nova build request stuck in queue for more than 1 hour expr: openstack_nova_api_build_request > 0 for: 1h labels: severity: P4 - alert: NovaStuckBuildRequestIncreasing annotations: description: Build request count rate is increasing across the cluster. summary: Nova build request is increasing expr: rate(nova:build_requests:sum[5m]) > 0 for: 15m labels: severity: P3 - name: octavia rules: - alert: OctaviaLoadBalancerMultipleMaster annotations: description: Load balancer with ID {{ $labels.loadbalancer_id }} has multiple MASTER Amphorae for more than 15 minutes. summary: Octavia load balancer has multiple MASTER Amphorae expr: count by(loadbalancer_id) (openstack_loadbalancer_amphora_status{role="MASTER"}) > 1 for: 15m labels: severity: P3 - alert: OctaviaLoadBalancerNotActive annotations: description: Load balancer with ID {{ $labels.id }} stuck in non-active state for more than 15 minutes. summary: Octavia load balancer not active expr: count by (id,name) (openstack_loadbalancer_loadbalancer_status{provisioning_status!="ACTIVE"}) > 0 for: 15m labels: severity: P3 - alert: OctaviaAmphoraError annotations: description: Amphora with ID {{ $labels.id }} stuck in error state for more than 15 minutes. summary: Octavia Amphora in error state expr: count by (id,name) (openstack_loadbalancer_amphora_status{status="ERROR"}) > 0 for: 15m labels: severity: P3 - alert: OctaviaAmphoraNotOperational annotations: description: Amphora with ID {{ $labels.id }} stuck in non-operational state for more than 1 hour. summary: Octavia Amphora not operational expr: count by (id,name) (openstack_loadbalancer_amphora_status{status!~"READY|ALLOCATED|DELETED"}) > 0 for: 1h labels: severity: P3 - name: recording rules: - expr: sum(openstack_nova_api_build_request) record: nova:build_requests:sum monitoring-kube-prometheus-stack-prometheus-045c15d6-dc31-44cb-87e8-4a5a2b78ed59.yaml: ---- groups: - name: prometheus rules: - alert: PrometheusBadConfig annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig summary: Failed Prometheus configuration reload.
expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(prometheus_config_last_reload_successful{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) == 0 for: 10m labels: severity: critical - alert: PrometheusSDRefreshFailure annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to refresh SD with mechanism {{$labels.mechanism}}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheussdrefreshfailure summary: Failed Prometheus SD refresh. expr: increase(prometheus_sd_refresh_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[10m]) > 0 for: 20m labels: severity: warning - alert: PrometheusNotificationQueueRunningFull annotations: description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull summary: Prometheus alert notification queue predicted to run full in less than 30m. expr: |- # Without min_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( predict_linear(prometheus_notifications_queue_length{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m], 60 * 30) > min_over_time(prometheus_notifications_queue_capacity{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) ) for: 15m labels: severity: warning - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers annotations: description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager. expr: |- ( rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) ) * 100 > 1 for: 15m labels: severity: warning - alert: PrometheusNotConnectedToAlertmanagers annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers summary: Prometheus is not connected to any Alertmanagers. expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(prometheus_notifications_alertmanagers_discovered{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) < 1 for: 10m labels: severity: warning - alert: PrometheusTSDBReloadsFailing annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing summary: Prometheus has issues reloading blocks from disk. 
expr: increase(prometheus_tsdb_reloads_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning - alert: PrometheusTSDBCompactionsFailing annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing summary: Prometheus has issues compacting blocks. expr: increase(prometheus_tsdb_compactions_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h]) > 0 for: 4h labels: severity: warning - alert: PrometheusNotIngestingSamples annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples summary: Prometheus is not ingesting samples. expr: |- ( sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) <= 0 and ( sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="kube-prometheus-stack-prometheus",namespace="monitoring"}) > 0 or sum without(rule_group) (prometheus_rule_group_rules{job="kube-prometheus-stack-prometheus",namespace="monitoring"}) > 0 ) ) for: 10m labels: severity: warning - alert: PrometheusDuplicateTimestamps annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with different values but duplicated timestamp. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps summary: Prometheus is dropping samples with duplicate timestamps. expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning - alert: PrometheusOutOfOrderTimestamps annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps summary: Prometheus drops samples with out-of-order timestamps. expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning - alert: PrometheusRemoteStorageFailures annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{ $labels.url }} runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures summary: Prometheus fails to send samples to remote storage. 
expr: |- ( (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) / ( (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) + (rate(prometheus_remote_storage_succeeded_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) ) ) * 100 > 1 for: 15m labels: severity: critical - alert: PrometheusRemoteWriteBehind annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind summary: Prometheus remote write is behind. expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) - ignoring(remote_name, url) group_right max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) ) > 120 for: 15m labels: severity: critical - alert: PrometheusRemoteWriteDesiredShards annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{ $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="kube-prometheus-stack-prometheus",namespace="monitoring"}` $labels.instance | query | first | value }}. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards summary: Prometheus remote write desired shards calculation wants to run more than configured max shards. expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. ( max_over_time(prometheus_remote_storage_shards_desired{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > max_over_time(prometheus_remote_storage_shards_max{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) ) for: 15m labels: severity: warning - alert: PrometheusRuleFailures annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures summary: Prometheus is failing rule evaluations. expr: increase(prometheus_rule_evaluation_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: critical - alert: PrometheusMissingRuleEvaluations annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m. 
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations summary: Prometheus is missing rule evaluations due to slow rule group evaluation. expr: increase(prometheus_rule_group_iterations_missed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning - alert: PrometheusTargetLimitHit annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because the number of targets exceeded the configured target_limit. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit summary: Prometheus has dropped targets because some scrape configs have exceeded the targets limit. expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning - alert: PrometheusLabelLimitHit annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{ printf "%.0f" $value }} targets because some samples exceeded the configured label_limit, label_name_length_limit or label_value_length_limit. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit summary: Prometheus has dropped targets because some scrape configs have exceeded the labels limit. expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning - alert: PrometheusScrapeBodySizeLimitHit annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured body_size_limit. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit summary: Prometheus has dropped some targets that exceeded body size limit. expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning - alert: PrometheusScrapeSampleLimitHit annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{ printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded the configured sample_limit. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit summary: Prometheus has failed scrapes that have exceeded the configured sample limit. expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0 for: 15m labels: severity: warning - alert: PrometheusTargetSyncFailure annotations: description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}} have failed to sync because invalid configuration was supplied.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure summary: Prometheus has failed to sync targets. expr: increase(prometheus_target_sync_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[30m]) > 0 for: 5m labels: severity: critical - alert: PrometheusHighQueryLoad annotations: description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has less than 20% available capacity in its query engine for the last 15 minutes. 
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload summary: Prometheus is reaching its maximum capacity serving concurrent requests. expr: avg_over_time(prometheus_engine_queries{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) / max_over_time(prometheus_engine_queries_concurrent_max{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) > 0.8 for: 15m labels: severity: warning - alert: PrometheusErrorSendingAlertsToAnyAlertmanager annotations: description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager. expr: |- min without (alertmanager) ( rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring",alertmanager!~``}[5m]) / rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring",alertmanager!~``}[5m]) ) * 100 > 3 for: 15m labels: severity: critical monitoring-kube-prometheus-stack-k8s.rules.container-memory-cache-43c31095-42d2-4cc7-b385-78d61463618e.yaml: ---- groups: - name: k8s.rules.container_memory_cache rules: - expr: |- container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_cache monitoring-kube-prometheus-stack-nginx-db028596-4299-49c0-922a-d7e2bd6f4373.yaml: ---- groups: - name: nginx-ingress rules: - alert: NginxIngressCriticalErrorBudgetBurn annotations: description: The service {{ $labels.service }} error rate is {{ $value | humanizePercentage }} over the last hour, which exceeds the 1.44% burn-rate threshold (14.4x against 99.9% SLO). At this rate, the 30-day error budget exhausts in under 2.1 days. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nginxingresscriticalerrorbudgetburn summary: 'NGINX Ingress: elevated 5xx errors rapidly consuming error budget' expr: | ( sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[1h])) / sum by (service) (rate(nginx_ingress_controller_requests[1h])) ) > 0.0144 and ( sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[5m])) / sum by (service) (rate(nginx_ingress_controller_requests[5m])) ) > 0.0144 and sum by (service) (rate(nginx_ingress_controller_requests[5m])) > 1 for: 2m labels: severity: P2 - alert: NginxIngressHighErrorBudgetBurn annotations: description: The service {{ $labels.service }} error rate is {{ $value | humanizePercentage }} over the last 6 hours, which exceeds the 0.6% burn-rate threshold (6x against 99.9% SLO). At this rate, the 30-day error budget exhausts in under 5 days. 
runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nginxingresshigherrorbudgetburn summary: 'NGINX Ingress: sustained 5xx errors depleting error budget' expr: | ( sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[6h])) / sum by (service) (rate(nginx_ingress_controller_requests[6h])) ) > 0.006 and ( sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[30m])) / sum by (service) (rate(nginx_ingress_controller_requests[30m])) ) > 0.006 and sum by (service) (rate(nginx_ingress_controller_requests[30m])) > 1 for: 5m labels: severity: P2 - alert: NginxIngressModerateErrorBudgetBurn annotations: description: The service {{ $labels.service }} error rate is {{ $value | humanizePercentage }} over the last day, which exceeds the 0.3% burn-rate threshold (3x against 99.9% SLO). At this rate, the 30-day error budget exhausts in under 10 days. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nginxingressmoderateerrorbudgetburn summary: 'NGINX Ingress: ongoing 5xx errors steadily consuming error budget' expr: | ( sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[1d])) / sum by (service) (rate(nginx_ingress_controller_requests[1d])) ) > 0.003 and ( sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[2h])) / sum by (service) (rate(nginx_ingress_controller_requests[2h])) ) > 0.003 and sum by (service) (rate(nginx_ingress_controller_requests[2h])) > 1 for: 15m labels: severity: P3 - alert: NginxIngressLowErrorBudgetBurn annotations: description: The service {{ $labels.service }} error rate is {{ $value | humanizePercentage }} over the last 3 days, which exceeds the 0.1% burn-rate threshold (1x against 99.9% SLO). At this rate, the 30-day error budget exhausts before the window resets. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nginxingresslowerrorbudgetburn summary: 'NGINX Ingress: low-level 5xx errors eroding error budget' expr: | ( sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[3d])) / sum by (service) (rate(nginx_ingress_controller_requests[3d])) ) > 0.001 and ( sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[6h])) / sum by (service) (rate(nginx_ingress_controller_requests[6h])) ) > 0.001 and sum by (service) (rate(nginx_ingress_controller_requests[6h])) > 1 for: 1h labels: severity: P4 monitoring-kube-prometheus-stack-kube-prometheus-general.rules-5b21151b-0b35-4448-9c7d-2a663d89264b.yaml: ---- groups: - name: kube-prometheus-general.rules rules: - expr: count without(instance, pod, node) (up == 1) record: count:up1 - expr: count without(instance, pod, node) (up == 0) record: count:up0 monitoring-kube-prometheus-stack-memcached-626c8eb9-63b3-4b62-9f61-be96a71978d8.yaml: ---- groups: - name: memcached rules: - alert: MemcachedDown annotations: description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} is down for more than 15 minutes. summary: Memcached instance is down. expr: | memcached_up == 0 for: 15m labels: severity: P1 - alert: MemcachedConnectionLimitApproaching annotations: description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes. summary: Memcached max connection limit is approaching. 
expr: | (memcached_current_connections / memcached_max_connections * 100) > 80 for: 15m labels: severity: P3 - alert: MemcachedConnectionLimitApproaching annotations: description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes. summary: Memcached connections at critical level. expr: | (memcached_current_connections / memcached_max_connections * 100) > 95 for: 15m labels: severity: P1 - alert: MemcachedOutOfMemoryErrors annotations: description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} has OutOfMemory errors for at least 15 minutes, current rate is {{ printf "%0.0f" $value }} summary: Memcached has OutOfMemory errors. expr: | sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0 for: 15m labels: severity: P3 monitoring-kube-prometheus-stack-rabbitmq-87f01b75-8cb3-4599-b1a6-d928f0005894.yaml: ---- groups: - name: alarms rules: - alert: RabbitmqAlarmFreeDiskSpace expr: rabbitmq_alarms_free_disk_space_watermark == 1 labels: severity: P1 - alert: RabbitmqAlarmMemoryUsedWatermark expr: rabbitmq_alarms_memory_used_watermark == 1 labels: severity: P1 - alert: RabbitmqAlarmFileDescriptorLimit expr: rabbitmq_alarms_file_descriptor_limit == 1 labels: severity: P1 - name: limits rules: - alert: RabbitmqMemoryHigh expr: rabbitmq:usage:memory > 0.80 labels: severity: P3 - alert: RabbitmqMemoryHigh expr: rabbitmq:usage:memory > 0.95 labels: severity: P1 - alert: RabbitmqFileDescriptorsUsage expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.80 labels: severity: P3 - alert: RabbitmqFileDescriptorsUsage expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.95 labels: severity: P1 - alert: RabbitmqTcpSocketsUsage expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.80 labels: severity: P3 - alert: RabbitmqTcpSocketsUsage expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.95 labels: severity: P1 - name: msgs rules: - alert: RabbitmqUnackedMessages expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000 for: 5m labels: severity: P3 - alert: RabbitmqUnackedMessages expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000 for: 1h labels: severity: P1 - name: recording rules: - expr: sum without (job) ( rabbitmq_process_resident_memory_bytes ) / sum without ( container, pod, job, namespace, node, resource, uid, unit ) ( label_replace( cluster:namespace:pod_memory:active:kube_pod_container_resource_limits, "instance", "$1", "pod", "(.*)" ) ) labels: job: rabbitmq record: rabbitmq:usage:memory monitoring-kube-prometheus-stack-k8s.rules.container-memory-rss-ffaad072-8ece-40e1-b966-6309fc2552ab.yaml: ---- groups: - name: k8s.rules.container_memory_rss rules: - expr: |- container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_rss monitoring-kube-prometheus-stack-k8s.rules.pod-owner-9b62044e-11b6-4f12-b975-055ae830d9d1.yaml: ---- groups: - name: k8s.rules.pod_owner rules: - expr: |- max by (cluster, namespace, workload, pod) ( label_replace( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, "replicaset", "$1", "owner_name", "(.*)" ) * on (replicaset, namespace) group_left(owner_name) topk by (replicaset, namespace) ( 1, max by (replicaset, 
namespace, owner_name) ( kube_replicaset_owner{job="kube-state-metrics"} ) ), "workload", "$1", "owner_name", "(.*)" ) ) labels: workload_type: deployment record: namespace_workload_pod:kube_pod_owner:relabel - expr: |- max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, "workload", "$1", "owner_name", "(.*)" ) ) labels: workload_type: daemonset record: namespace_workload_pod:kube_pod_owner:relabel - expr: |- max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, "workload", "$1", "owner_name", "(.*)" ) ) labels: workload_type: statefulset record: namespace_workload_pod:kube_pod_owner:relabel - expr: |- max by (cluster, namespace, workload, pod) ( label_replace( kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}, "workload", "$1", "owner_name", "(.*)" ) ) labels: workload_type: job record: namespace_workload_pod:kube_pod_owner:relabel monitoring-kube-prometheus-stack-k8s.rules.container-memory-working-set-by-5917acce-fc97-40b7-8697-71cf49973cc9.yaml: ---- groups: - name: k8s.rules.container_memory_working_set_bytes rules: - expr: |- container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_working_set_bytes monitoring-kube-prometheus-stack-k8s.rules.container-resource-e695dec9-5e10-45c4-9c42-ce9afb086c77.yaml: ---- groups: - name: k8s.rules.container_resource rules: - expr: |- kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests - expr: |- sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"} ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_memory:kube_pod_container_resource_requests:sum - expr: |- kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests - expr: |- sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"} ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_cpu:kube_pod_container_resource_requests:sum - expr: |- kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits - expr: |- sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( 
kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"} ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_memory:kube_pod_container_resource_limits:sum - expr: |- kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( (kube_pod_status_phase{phase=~"Pending|Running"} == 1) ) record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits - expr: |- sum by (namespace, cluster) ( sum by (namespace, pod, cluster) ( max by (namespace, pod, container, cluster) ( kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"} ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) ( kube_pod_status_phase{phase=~"Pending|Running"} == 1 ) ) ) record: namespace_cpu:kube_pod_container_resource_limits:sum monitoring-kube-prometheus-stack-config-reloaders-ad93ba53-8edb-41dd-ba2f-1219ab0b267e.yaml: ---- groups: - name: config-reloaders rules: - alert: ConfigReloaderSidecarErrors annotations: description: |- Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace. As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors summary: config-reloader sidecar has not had a successful reload for 10m expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0 for: 10m labels: severity: warning monitoring-kube-prometheus-stack-k8s.rules.container-memory-swap-e122affb-0f94-4a8b-8b9b-c7efaa7e944d.yaml: ---- groups: - name: k8s.rules.container_memory_swap rules: - expr: |- container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""} * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""}) ) record: node_namespace_pod_container:container_memory_swap monitoring-kube-prometheus-stack-node-f7dc3f3e-034e-444f-9c96-bbd73f67eb98.yaml: ---- groups: - name: node-exporter rules: - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up. summary: Filesystem is predicted to run out of space within the next 24 hours. expr: | ( node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 and predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h labels: severity: P3 - alert: NodeFilesystemSpaceFillingUp annotations: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up fast. summary: Filesystem is predicted to run out of space within the next 4 hours. 
expr: | ( node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 and predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h labels: severity: P1 - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. summary: Filesystem has less than 5% space left. expr: | ( node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 30m labels: severity: P3 - alert: NodeFilesystemAlmostOutOfSpace annotations: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left. summary: Filesystem has less than 3% space left. expr: | ( node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 30m labels: severity: P1 - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up. summary: Filesystem is predicted to run out of inodes within the next 24 hours. expr: | ( node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 and predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h labels: severity: P3 - alert: NodeFilesystemFilesFillingUp annotations: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left and is filling up fast. summary: Filesystem is predicted to run out of inodes within the next 4 hours. expr: | ( node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 and predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h labels: severity: P1 - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. summary: Filesystem has less than 5% inodes left. 
expr: | ( node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h labels: severity: P3 - alert: NodeFilesystemAlmostOutOfFiles annotations: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available inodes left. summary: Filesystem has less than 3% inodes left. expr: | ( node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 ) for: 1h labels: severity: P1 - alert: NodeNetworkReceiveErrs annotations: description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.' summary: Network interface is reporting many receive errors. expr: | rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01 for: 1h labels: severity: P3 - alert: NodeNetworkTransmitErrs annotations: description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.' summary: Network interface is reporting many transmit errors. expr: | rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01 for: 1h labels: severity: P3 - alert: NodeHighNumberConntrackEntriesUsed annotations: description: '{{ $labels.instance }} {{ $value | humanizePercentage }} of conntrack entries are used.' summary: Number of conntrack entries is getting close to the limit. expr: | (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75 labels: severity: P3 - alert: NodeTextFileCollectorScrapeError annotations: description: Node Exporter text file collector on {{ $labels.instance }} failed to scrape. summary: Node Exporter text file collector failed to scrape. expr: | node_textfile_scrape_error{job="node-exporter"} == 1 labels: severity: P3 - alert: NodeClockSkewDetected annotations: description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host. summary: Clock skew detected. expr: | ( node_timex_offset_seconds{job="node-exporter"} > 0.05 and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 ) or ( node_timex_offset_seconds{job="node-exporter"} < -0.05 and deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 ) for: 10m labels: severity: P3 - alert: NodeClockNotSynchronising annotations: description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP is configured on this host. summary: Clock not synchronising. expr: | min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 and node_timex_maxerror_seconds{job="node-exporter"} >= 16 for: 10m labels: severity: P3 - alert: NodeRAIDDegraded annotations: description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically. summary: RAID array is degraded. 
expr: | node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0 for: 15m labels: severity: P1 - alert: NodeRAIDDiskFailure annotations: description: At least one device in RAID array at {{ $labels.instance }} failed. Array '{{ $labels.device }}' needs attention and possibly a disk swap. summary: Failed device in RAID array. expr: | node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0 labels: severity: P3 - alert: NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. summary: Kernel is predicted to exhaust file descriptors limit soon. expr: | ( node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 ) for: 15m labels: severity: P3 - alert: NodeFileDescriptorLimit annotations: description: File descriptors limit at {{ $labels.instance }} is currently at {{ printf "%.2f" $value }}%. summary: Kernel is predicted to exhaust file descriptors limit soon. expr: | ( node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 ) for: 15m labels: severity: P1 - alert: NodeCPUHighUsage annotations: description: | CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes and is currently at {{ printf "%.2f" $value }}%. summary: High CPU usage. expr: | sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!~"idle|iowait"}[2m]))) * 100 > 90 for: 15m labels: severity: P5 - alert: NodeSystemSaturation annotations: description: | System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes and is currently at {{ printf "%.2f" $value }}. This might indicate resource saturation on this instance and can cause it to become unresponsive. summary: System saturated, load per core is very high. expr: | node_load1{job="node-exporter"} / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 for: 15m labels: severity: P3 - alert: NodeMemoryMajorPagesFaults annotations: description: | Memory major page faults are occurring at a very high rate at {{ $labels.instance }}: more than 500 major page faults per second for the last 15 minutes, currently at {{ printf "%.2f" $value }}. Please check that there is enough memory available on this instance. summary: Memory major page faults are occurring at a very high rate. expr: | rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 for: 15m labels: severity: P3 - alert: NodeMemoryHighUtilization annotations: description: | Memory is filling up at {{ $labels.instance }}: usage has been above 90% for the last 15 minutes and is currently at {{ printf "%.2f" $value }}%. summary: Host is running out of memory. expr: | 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90 for: 15m labels: severity: P3 - alert: NodeSystemdServiceFailed annotations: description: Systemd service {{ $labels.name }} has entered failed state at {{ $labels.instance }}. summary: Systemd service has entered failed state. 
expr: | node_systemd_unit_state{job="node-exporter", state="failed"} == 1 for: 5m labels: severity: P3 - alert: NodeSystemdServiceCrashlooping annotations: description: Systemd service {{ $labels.name }} has been restarted too many times at {{ $labels.instance }} in the last 15 minutes. Please check whether the service is crash looping. summary: Systemd service keeps restarting, possibly crash looping. expr: | increase(node_systemd_service_restart_total{job="node-exporter"}[5m]) > 2 for: 15m labels: severity: P3 - alert: NodeBondingDegraded annotations: description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in a degraded state due to one or more slave failures. summary: Bonding interface is degraded. expr: | (node_bonding_slaves{job="node-exporter"} - node_bonding_active{job="node-exporter"}) != 0 for: 5m labels: severity: P3 - name: node-exporter-extras rules: - alert: NodeTimeSkewDetected annotations: description: Node {{ $labels.instance }} has a time difference of {{ $value }} seconds relative to the Prometheus server. summary: Node {{ $labels.instance }} has a time difference. expr: | abs(timestamp(node_time_seconds{job="node-exporter"}) - node_time_seconds{job="node-exporter"}) > 1 for: 5m labels: severity: P3 - alert: NodeDiskHighLatency annotations: description: Average IO latency on {{ $labels.device }} at {{ $labels.instance }} is {{ $value | humanizeDuration }} over the last 5 minutes, which exceeds the threshold of 20ms. Normal SSD latency is below 1ms and normal HDD latency is below 15ms. runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nodediskhighlatency summary: 'Node disk: high IO latency affecting workloads' expr: | ( ( rate(node_disk_read_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + rate(node_disk_write_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) ) / ( rate(node_disk_reads_completed_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + rate(node_disk_writes_completed_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) ) ) > 0.02 and ( rate(node_disk_reads_completed_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + rate(node_disk_writes_completed_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) ) > 0 for: 1h labels: severity: P4 - name: node-exporter.rules rules: - expr: | count without (cpu, mode) ( node_cpu_seconds_total{job="node-exporter",mode="idle"} ) record: instance:node_num_cpu:sum - expr: | 1 - avg without (cpu) ( sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m])) ) record: instance:node_cpu_utilisation:rate5m - expr: | ( node_load1{job="node-exporter"} / instance:node_num_cpu:sum{job="node-exporter"} ) record: instance:node_load1_per_cpu:ratio - expr: | 1 - ( ( node_memory_MemAvailable_bytes{job="node-exporter"} or ( node_memory_Buffers_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Slab_bytes{job="node-exporter"} ) ) / node_memory_MemTotal_bytes{job="node-exporter"} ) record: instance:node_memory_utilisation:ratio - expr: | rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) record: instance:node_vmstat_pgmajfault:rate5m - 
expr: | rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) record: instance_device:node_disk_io_time_seconds:rate5m - expr: | rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) record: instance_device:node_disk_io_time_weighted_seconds:rate5m - expr: | sum without (device) ( rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m]) ) record: instance:node_network_receive_bytes_excluding_lo:rate5m - expr: | sum without (device) ( rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m]) ) record: instance:node_network_transmit_bytes_excluding_lo:rate5m - expr: | sum without (device) ( rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m]) ) record: instance:node_network_receive_drop_excluding_lo:rate5m - expr: | sum without (device) ( rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m]) ) record: instance:node_network_transmit_drop_excluding_lo:rate5m monitoring-kube-prometheus-stack-etcd-a25d1705-78b3-4f41-904f-aaecfb24f223.yaml: ---- groups: - name: etcd rules: - alert: etcdMembersDown annotations: description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).' summary: etcd cluster members are down. expr: |- max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 ) ) > 0 for: 10m labels: severity: critical - alert: etcdInsufficientMembers annotations: description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value }}).' summary: etcd cluster has insufficient number of members. expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2) for: 3m labels: severity: critical - alert: etcdNoLeader annotations: description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} has no leader.' summary: etcd cluster has no leader. expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 for: 1m labels: severity: critical - alert: etcdHighNumberOfLeaderChanges annotations: description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.' summary: etcd cluster has high number of leader changes. expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4 for: 5m labels: severity: warning - alert: etcdGRPCRequestsSlow annotations: description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method }} method.' summary: etcd grpc requests are slow expr: |- histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15 for: 10m labels: severity: critical - alert: etcdMemberCommunicationSlow annotations: description: 'etcd cluster "{{ $labels.job }}": member communication with {{ $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance }}.' 
summary: etcd cluster member communication is slow. expr: |- histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15 for: 10m labels: severity: warning - alert: etcdHighNumberOfFailedProposals annotations: description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures within the last 30 minutes on etcd instance {{ $labels.instance }}.' summary: etcd cluster has high number of proposal failures. expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 for: 15m labels: severity: warning - alert: etcdHighFsyncDurations annotations: description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' summary: etcd cluster 99th percentile fsync durations are too high. expr: |- histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.5 for: 10m labels: severity: warning - alert: etcdHighFsyncDurations annotations: description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.' summary: etcd cluster 99th percentile fsync durations are too high. expr: |- histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 1 for: 10m labels: severity: critical - alert: etcdHighCommitDurations annotations: description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations {{ $value }}s on etcd instance {{ $labels.instance }}.' summary: etcd cluster 99th percentile commit durations are too high. expr: |- histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.25 for: 10m labels: severity: warning - alert: etcdDatabaseQuotaLowSpace annotations: description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined quota on etcd instance {{ $labels.instance }}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.' summary: etcd cluster database is running full. expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > 95 for: 10m labels: severity: critical - alert: etcdExcessiveDatabaseGrowth annotations: description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{ $labels.instance }}, please check as it might be disruptive.' summary: etcd cluster database growing very fast. expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) > etcd_server_quota_backend_bytes{job=~".*etcd.*"} for: 10m labels: severity: warning - alert: etcdDatabaseHighFragmentationRatio annotations: description: 'etcd cluster "{{ $labels.job }}": database size in use on instance {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.' runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation summary: etcd database size in use is less than 50% of the actual allocated storage. 
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 for: 10m labels: severity: warning monitoring-kube-prometheus-stack-general.rules-6f378df2-c29d-456f-8007-f0d3b11cebb2.yaml: ---- groups: - name: general.rules rules: - alert: TargetDown annotations: description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) BY (cluster, job, namespace, service)) > 10 for: 10m labels: severity: warning - alert: Watchdog annotations: description: | This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog summary: An alert that should always be firing to certify that Alertmanager is working properly. expr: vector(1) labels: severity: none - alert: InfoInhibitor annotations: description: | This is an alert that is used to inhibit info alerts. By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with other alerts. This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a severity of 'warning' or 'critical' starts firing on the same namespace. This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor summary: Info-level alert inhibition. expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname != "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 labels: severity: none BinaryData ==== Events: