apiVersion: v1
data:
  monitoring-kube-prometheus-stack-alertmanager-7b2e9f9e-ce25-4a8e-9701-c5a8f41046ab.yaml: |
    groups:
    - name: alertmanager.rules
      rules:
      - alert: AlertmanagerFailedReload
        annotations:
          description: Configuration has failed to load for {{$labels.instance}}.
          summary: Reloading an Alertmanager configuration has failed.
        expr: |
          # Without max_over_time, failed scrapes could create false negatives, see
          # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
          max_over_time(alertmanager_config_last_reload_successful{job="kube-prometheus-stack-alertmanager"}[5m]) == 0
        for: 10m
        labels:
          severity: P1
      - alert: AlertmanagerMembersInconsistent
        annotations:
          description: Alertmanager {{$labels.instance}} has only found {{ $value }} members
            of the {{$labels.job}} cluster.
          summary: A member of an Alertmanager cluster has not found all other cluster
            members.
        expr: |
          # Without max_over_time, failed scrapes could create false negatives, see
          # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
            max_over_time(alertmanager_cluster_members{job="kube-prometheus-stack-alertmanager"}[5m])
          < on (namespace,service,cluster) group_left
            count by (namespace,service,cluster) (max_over_time(alertmanager_cluster_members{job="kube-prometheus-stack-alertmanager"}[5m]))
        for: 15m
        labels:
          severity: P1
      - alert: AlertmanagerFailedToSendAlerts
        annotations:
          description: Alertmanager {{$labels.instance}} failed to send {{ $value | humanizePercentage
            }} of notifications to {{ $labels.integration }}.
          summary: An Alertmanager instance failed to send notifications.
        expr: |
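          # Failure ratio per integration over 5m. In recent Alertmanager versions the failed
          # counter is split by a 'reason' label, so 'ignoring (reason) group_left' matches it
          # back onto the per-integration totals; 0.01 corresponds to a 1% failure rate.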
          (
            rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-alertmanager"}[5m])
          /
            ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-alertmanager"}[5m])
          )
          > 0.01
        for: 5m
        labels:
          severity: P3
      - alert: AlertmanagerClusterFailedToSendAlerts
        annotations:
          description: The minimum notification failure rate to {{ $labels.integration
            }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage
            }}.
          summary: All Alertmanager instances in a cluster failed to send notifications
            to a critical integration.
        expr: |
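          # The integration=~`.*` matcher appears to be the rendered form of the upstream
          # "critical integrations" selector. Taking 'min by (...)' means the alert only fires
          # when every instance in the cluster is failing for that integration.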
          min by (namespace,service,cluster, integration) (
            rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-alertmanager", integration=~`.*`}[5m])
          /
            ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-alertmanager", integration=~`.*`}[5m])
          )
          > 0.01
        for: 5m
        labels:
          severity: P1
      - alert: AlertmanagerClusterFailedToSendAlerts
        annotations:
          description: The minimum notification failure rate to {{ $labels.integration
            }} sent from any instance in the {{$labels.job}} cluster is {{ $value | humanizePercentage
            }}.
          summary: All Alertmanager instances in a cluster failed to send notifications
            to a non-critical integration.
        expr: |
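          # Note: integration!~`.*` matches no series, so this non-critical variant can never
          # fire; all integrations are handled by the critical-integration rule above.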
          min by (namespace,service,cluster, integration) (
            rate(alertmanager_notifications_failed_total{job="kube-prometheus-stack-alertmanager", integration!~`.*`}[5m])
          /
            ignoring (reason) group_left rate(alertmanager_notifications_total{job="kube-prometheus-stack-alertmanager", integration!~`.*`}[5m])
          )
          > 0.01
        for: 5m
        labels:
          severity: P3
      - alert: AlertmanagerConfigInconsistent
        annotations:
          description: Alertmanager instances within the {{$labels.job}} cluster have
            different configurations.
          summary: Alertmanager instances within the same cluster have different configurations.
        expr: |
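          # count_values buckets instances by their config hash; more than one distinct hash
          # per namespace/service/cluster means the Alertmanagers run different configurations.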
          count by (namespace,service,cluster) (
            count_values by (namespace,service,cluster) ("config_hash", alertmanager_config_hash{job="kube-prometheus-stack-alertmanager"})
          )
          != 1
        for: 20m
        labels:
          severity: P1
      - alert: AlertmanagerClusterDown
        annotations:
          description: '{{ $value | humanizePercentage }} of Alertmanager instances within
            the {{$labels.job}} cluster have been up for less than half of the last 5m.'
          summary: Half or more of the Alertmanager instances within the same cluster
            are down.
        expr: |
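          # Numerator: instances that were up for less than half of the last 5m.
          # Denominator: all scraped instances. Fires when at least half the cluster is down.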
          (
            count by (namespace,service,cluster) (
              avg_over_time(up{job="kube-prometheus-stack-alertmanager"}[5m]) < 0.5
            )
          /
            count by (namespace,service,cluster) (
              up{job="kube-prometheus-stack-alertmanager"}
            )
          )
          >= 0.5
        for: 5m
        labels:
          severity: P1
      - alert: AlertmanagerClusterCrashlooping
        annotations:
          description: '{{ $value | humanizePercentage }} of Alertmanager instances within
            the {{$labels.job}} cluster have restarted at least 5 times in the last 10m.'
          summary: Half or more of the Alertmanager instances within the same cluster
            are crashlooping.
        expr: |
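          # changes(process_start_time_seconds[10m]) > 4 means at least 5 restarts in 10m;
          # the ratio fires when at least half of the cluster is crashlooping.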
          (
            count by (namespace,service,cluster) (
              changes(process_start_time_seconds{job="kube-prometheus-stack-alertmanager"}[10m]) > 4
            )
          /
            count by (namespace,service,cluster) (
              up{job="kube-prometheus-stack-alertmanager"}
            )
          )
          >= 0.5
        for: 5m
        labels:
          severity: P1
  monitoring-kube-prometheus-stack-ceph-8ecbb771-5600-4869-baee-e56d6fde0db4.yaml: |
    groups:
    - name: cluster health
    - name: mon
      rules:
      - alert: CephMonDownQuorumAtRisk
        annotations:
          description: '{{ $min := printf "floor(count(ceph_mon_metadata{cluster=''%s''})
            / 2) + 1" .Labels.cluster | query | first | value }}Quorum requires a majority
            of monitors (x {{ $min }}) to be active. Without quorum the cluster will become
            inoperable, affecting all services and connected clients. The following monitors
            are down: {{- range printf "(ceph_mon_quorum_status{cluster=''%s''} == 0)
            + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" .Labels.cluster
            | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}'
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
          summary: Monitor quorum is at risk on cluster {{ $labels.cluster }}
        expr: |
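          # Fires only while MON_DOWN is raised AND the number of mons still in quorum equals
          # the bare minimum majority (floor(total / 2) + 1), i.e. losing one more monitor
          # would break quorum.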
          (
            (ceph_health_detail{name="MON_DOWN"} == 1) * on() group_right(cluster) (
              count(ceph_mon_quorum_status == 1) by(cluster)== bool (floor(count(ceph_mon_metadata) by(cluster) / 2) + 1)
            )
          ) == 1
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.3.1
          severity: P3
          type: ceph_default
      - alert: CephMonDown
        annotations:
          description: '{{ $down := printf "count(ceph_mon_quorum_status{cluster=''%s''}
            == 0)" .Labels.cluster | query | first | value }}{{ $s := "" }}{{ if gt $down
            1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. Quorum
            is still intact, but the loss of an additional monitor will make your cluster
            inoperable. The following monitors are down: {{- range printf "(ceph_mon_quorum_status{cluster=''%s''}
            == 0) + on(cluster,ceph_daemon) group_left(hostname) (ceph_mon_metadata *
            0)" .Labels.cluster | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname
            }} {{- end }}'
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-down
          summary: One or more monitors down on cluster {{ $labels.cluster }}
        expr: |
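          # The number of mons out of quorum is still within what the cluster can tolerate
          # (total minus the minimum majority), so quorum is intact but at reduced redundancy.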
          (count by (cluster) (ceph_mon_quorum_status == 0)) <= (count by (cluster) (ceph_mon_metadata) - floor((count by (cluster) (ceph_mon_metadata) / 2 + 1)))
        for: 30s
        labels:
          severity: P4
          type: ceph_default
      - alert: CephMonDiskspaceCritical
        annotations:
          description: The free space available to a monitor's store is critically low.
            You should increase the space available to the monitor(s). The default directory
            is /var/lib/ceph/mon-*/data/store.db on traditional deployments, and /var/lib/rook/mon-*/data/store.db
            on the mon pod's worker node for Rook. Look for old, rotated versions of *.log
            and MANIFEST*. Do NOT touch any *.sst files. Also check any other directories
            under /var/lib/rook and other directories on the same filesystem, often /var/log
            and /var/tmp are culprits. Your monitor hosts are; {{- range query "ceph_mon_metadata"}}
            - {{ .Labels.hostname }} {{- end }}
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-crit
          summary: Filesystem space on at least one monitor is critically low on cluster
            {{ $labels.cluster }}
        expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.3.2
          severity: P1
          type: ceph_default
      - alert: CephMonDiskspaceLow
        annotations:
          description: The space available to a monitor's store is approaching full (>70%
            is the default). You should increase the space available to the monitor(s).
            The default directory is /var/lib/ceph/mon-*/data/store.db on traditional
            deployments, and /var/lib/rook/mon-*/data/store.db on the mon pod's worker
            node for Rook. Look for old, rotated versions of *.log and MANIFEST*. Do
            NOT touch any *.sst files. Also check any other directories under /var/lib/rook
            and other directories on the same filesystem, often /var/log and /var/tmp
            are culprits. Your monitor hosts are: {{- range query "ceph_mon_metadata"}}
            - {{ .Labels.hostname }} {{- end }}
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-disk-low
          summary: Drive space on at least one monitor is approaching full on cluster
            {{ $labels.cluster }}
        expr: ceph_health_detail{name="MON_DISK_LOW"} == 1
        for: 5m
        labels:
          severity: P3
          type: ceph_default
      - alert: CephMonClockSkew
        annotations:
          description: Ceph monitors rely on closely synchronized time to maintain quorum
            and cluster consistency. This event indicates that the time on at least one
            mon has drifted too far from the lead mon. Review cluster status with ceph
            -s. This will show which monitors are affected. Check the time sync status
            on each monitor host with 'ceph time-sync-status' and the state and peers
            of your ntpd or chrony daemon.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#mon-clock-skew
          summary: Clock skew detected among monitors on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1
        for: 1m
        labels:
          severity: P3
          type: ceph_default
    - name: osd
      rules:
      - alert: CephOSDDownHigh
        annotations:
          description: '{{ $value | humanize }}% or {{ with printf "count (ceph_osd_up{cluster=''%s''}
            == 0)" .Labels.cluster | query }}{{ . | first | value }}{{ end }} of {{ with
            printf "count (ceph_osd_up{cluster=''%s''})" .Labels.cluster | query }}{{
            . | first | value }}{{ end }} OSDs are down (>= 10%). The following OSDs are
            down: {{- range printf "(ceph_osd_up{cluster=''%s''} * on(cluster, ceph_daemon)
            group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster | query }} -
            {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}'
          summary: More than 10% of OSDs are down on cluster {{ $labels.cluster }}
        expr: count by (cluster) (ceph_osd_up == 0) / count by (cluster) (ceph_osd_up)
          * 100 >= 10
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.4.1
          severity: P1
          type: ceph_default
      - alert: CephOSDHostDown
        annotations:
          description: 'The following OSDs are down: {{- range printf "(ceph_osd_up{cluster=''%s''}
            * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster
            | query }} - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} {{- end }}'
          summary: An OSD host is offline on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1
        for: 5m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.4.8
          severity: P3
          type: ceph_default
      - alert: CephOSDDown
        annotations:
          description: '{{ $num := printf "count(ceph_osd_up{cluster=''%s''} == 0) " .Labels.cluster
            | query | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{
            end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }}
            {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range printf "(ceph_osd_up{cluster=''%s''}
            * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" .Labels.cluster
            | query }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }}'
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-down
          summary: An OSD has been marked down on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="OSD_DOWN"} == 1
        for: 5m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.4.2
          severity: P3
          type: ceph_default
      - alert: CephOSDNearFull
        annotations:
          description: One or more OSDs have reached the NEARFULL threshold. Use 'ceph
            health detail' and 'ceph osd df' to identify the problem. To resolve, add
            capacity to the affected OSD's failure domain, restore down/out OSDs, or delete
            unwanted data.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-nearfull
          summary: OSD(s) running low on free space (NEARFULL) on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="OSD_NEARFULL"} == 1
        for: 5m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.4.3
          severity: P3
          type: ceph_default
      - alert: CephOSDFull
        annotations:
          description: An OSD has reached the FULL threshold. Writes to pools that share
            the affected OSD will be blocked. Use 'ceph health detail' and 'ceph osd df'
            to identify the problem. To resolve, add capacity to the affected OSD's failure
            domain, restore down/out OSDs, or delete unwanted data.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-full
          summary: OSD full, writes blocked on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="OSD_FULL"} > 0
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.4.6
          severity: P1
          type: ceph_default
      - alert: CephOSDBackfillFull
        annotations:
          description: An OSD has reached the BACKFILL FULL threshold. This will prevent
            rebalance operations from completing. Use 'ceph health detail' and 'ceph osd
            df' to identify the problem. To resolve, add capacity to the affected OSD's
            failure domain, restore down/out OSDs, or delete unwanted data.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-backfillfull
          summary: OSD(s) too full for backfill operations on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: CephOSDTooManyRepairs
        annotations:
          description: Reads from an OSD have used a secondary PG to return data to the
            client, indicating a potential failing drive.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#osd-too-many-repairs
          summary: OSD reports a high number of read errors on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1
        for: 30s
        labels:
          severity: P3
          type: ceph_default
      - alert: CephOSDTimeoutsPublicNetwork
        annotations:
          description: OSD heartbeats on the cluster's 'public' network (frontend) are
            running slow. Investigate the network for latency or loss issues. Use 'ceph
            health detail' to show the affected OSDs.
          summary: Network issues delaying OSD heartbeats (public network) on cluster
            {{ $labels.cluster }}
        expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1
        for: 1m
        labels:
          severity: P4
          type: ceph_default
      - alert: CephOSDTimeoutsClusterNetwork
        annotations:
          description: OSD heartbeats on the cluster's 'cluster' network (backend) are
            slow. Investigate the network for latency issues on this subnet. Use 'ceph
            health detail' to show the affected OSDs.
          summary: Network issues delaying OSD heartbeats (cluster network) on cluster
            {{ $labels.cluster }}
        expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1
        for: 1m
        labels:
          severity: P4
          type: ceph_default
      - alert: CephOSDInternalDiskSizeMismatch
        annotations:
          description: One or more OSDs have an internal inconsistency between metadata
            and the size of the device. This could lead to the OSD(s) crashing in future.
            You should redeploy the affected OSDs.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-disk-size-mismatch
          summary: OSD size inconsistency error on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: CephDeviceFailurePredicted
        annotations:
          description: The device health module has determined that one or more devices
            will fail soon. To review device status use 'ceph device ls'. To show a specific
            device use 'ceph device info <dev id>'. Mark the OSD out so that data may
            migrate to other OSDs. Once the OSD has drained, destroy the OSD, replace
            the device, and redeploy the OSD.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#id2
          summary: Device(s) predicted to fail soon on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: CephDeviceFailurePredictionTooHigh
        annotations:
          description: The device health module has determined that devices predicted
            to fail can not be remediated automatically, since too many OSDs would be
            removed from the cluster to ensure performance and availability. Prevent data
            integrity issues by adding new OSDs so that data may be relocated.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-toomany
          summary: Too many devices are predicted to fail on cluster {{ $labels.cluster
            }}, unable to resolve
        expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.4.7
          severity: P1
          type: ceph_default
      - alert: CephDeviceFailureRelocationIncomplete
        annotations:
          description: "The device health module has determined that one or more devices
            will fail soon, but the normal process of relocating the data on the device
            to other OSDs in the cluster is blocked. \nEnsure that the cluster has available
            free space. It may be necessary to add capacity to the cluster to allow data
            from the failing device to successfully migrate, or to enable the balancer."
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#device-health-in-use
          summary: Device failure is predicted, but unable to relocate data on cluster
            {{ $labels.cluster }}
        expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: CephOSDFlapping
        annotations:
          description: OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked
            down and back up {{ $value | humanize }} times a minute for 5 minutes.
            This may indicate a network issue (latency, packet loss, MTU mismatch) on
            the cluster network, or the public network if no cluster network is deployed.
            Check the network stats on the listed host(s).
          documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd#flapping-osds
          summary: Network issues are causing OSDs to flap (mark each other down) on cluster
            {{ $labels.cluster }}
        expr: (rate(ceph_osd_up[5m]) * on(cluster,ceph_daemon) group_left(hostname) ceph_osd_metadata)
          * 60 > 1
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.4.4
          severity: P3
          type: ceph_default
      - alert: CephOSDReadErrors
        annotations:
          description: An OSD has encountered read errors, but the OSD has recovered by
            retrying the reads. This may indicate an issue with hardware or the kernel.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#bluestore-spurious-read-errors
          summary: Device read errors detected on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1
        for: 30s
        labels:
          severity: P3
          type: ceph_default
    - name: mds
      rules:
      - alert: CephFilesystemDamaged
        annotations:
          description: Filesystem metadata has been corrupted. Data may be inaccessible.
            Analyze metrics from the MDS daemon admin socket, or escalate to support.
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
          summary: CephFS filesystem is damaged on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="MDS_DAMAGE"} > 0
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.5.1
          severity: P1
          type: ceph_default
      - alert: CephFilesystemOffline
        annotations:
          description: All MDS ranks are unavailable. The MDS daemons managing metadata
            are down, rendering the filesystem offline.
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-all-down
          summary: CephFS filesystem is offline on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="MDS_ALL_DOWN"} > 0
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.5.3
          severity: P1
          type: ceph_default
      - alert: CephFilesystemDegraded
        annotations:
          description: One or more metadata daemons (MDS ranks) are failed or in a damaged
            state. At best the filesystem is partially available, at worst the filesystem
            is completely unusable.
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-degraded
          summary: CephFS filesystem is degraded on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="FS_DEGRADED"} > 0
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.5.4
          severity: P1
          type: ceph_default
      - alert: CephFilesystemMDSRanksLow
        annotations:
          description: The filesystem's 'max_mds' setting defines the number of MDS ranks
            in the filesystem. The current number of active MDS daemons is less than this
            value.
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-up-less-than-max
          summary: Ceph MDS daemon count is lower than configured on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="MDS_UP_LESS_THAN_MAX"} > 0
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: CephFilesystemInsufficientStandby
        annotations:
          description: The current number of standby daemons is less than the minimum
            required by standby_count_wanted. Adjust the standby count or increase the
            number of MDS daemons.
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#mds-insufficient-standby
          summary: Ceph filesystem standby daemons too few on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="MDS_INSUFFICIENT_STANDBY"} > 0
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: CephFilesystemFailureNoStandby
        annotations:
          description: An MDS daemon has failed, leaving only one active rank and no available
            standby. Investigate the cause of the failure or add a standby MDS.
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#fs-with-failed-mds
          summary: MDS daemon failed, no further standby available on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="FS_WITH_FAILED_MDS"} > 0
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.5.5
          severity: P1
          type: ceph_default
      - alert: CephFilesystemReadOnly
        annotations:
          description: The filesystem has switched to READ ONLY due to an unexpected error
            when writing to the metadata pool. Either analyze the output from the MDS
            daemon admin socket, or escalate to support.
          documentation: https://docs.ceph.com/en/latest/cephfs/health-messages#cephfs-health-messages
          summary: CephFS filesystem in read only mode due to write error(s) on cluster
            {{ $labels.cluster }}
        expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.5.2
          severity: P1
          type: ceph_default
    - name: mgr
      rules:
      - alert: CephMgrModuleCrash
        annotations:
          description: One or more mgr modules have crashed and have yet to be acknowledged
            by an administrator. A crashed module may impact functionality within the
            cluster. Use the 'ceph crash' command to determine which module has failed,
            and archive it to acknowledge the failure.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#recent-mgr-module-crash
          summary: A manager module has recently crashed on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1
        for: 5m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.6.1
          severity: P1
          type: ceph_default
      - alert: CephMgrPrometheusModuleInactive
        annotations:
          description: The mgr/prometheus module at {{ $labels.instance }} is unreachable.
            This could mean that the module has been disabled or the mgr daemon itself
            is down. Without the mgr/prometheus module, metrics and alerts will no longer
            function. Open a shell to an admin node or toolbox pod and use 'ceph -s'
            to determine whether the mgr is active. If the mgr is not active, restart
            it, otherwise you can determine module status with 'ceph mgr module ls'. If
            it is not listed as enabled, enable it with 'ceph mgr module enable prometheus'.
          summary: The mgr/prometheus module is not available
        expr: up{job="ceph"} == 0
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.6.2
          severity: P4
          type: ceph_default
    - name: pgs
      rules:
      - alert: CephPGsInactive
        annotations:
          description: '{{ $value }} PGs have been inactive for more than 5 minutes in
            pool {{ $labels.name }}. Inactive placement groups are not able to serve read/write
            requests.'
          summary: One or more placement groups are inactive on cluster {{ $labels.cluster
            }}
        expr: ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total
          - ceph_pg_active) > 0
        for: 5m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.7.1
          severity: P1
          type: ceph_default
      - alert: CephPGsUnclean
        annotations:
          description: '{{ $value }} PGs have been unclean for more than 15 minutes in
            pool {{ $labels.name }}. Unclean PGs have not recovered from a previous failure.'
          summary: One or more placement groups are marked unclean on cluster {{ $labels.cluster
            }}
        expr: ceph_pool_metadata * on(cluster,pool_id,instance) group_left() (ceph_pg_total
          - ceph_pg_clean) > 0
        for: 15m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.7.2
          severity: P3
          type: ceph_default
      - alert: CephPGsDamaged
        annotations:
          description: During data consistency checks (scrub), at least one PG has been
            flagged as being damaged or inconsistent. Check to see which PG is affected,
            and attempt a manual repair if necessary. To list problematic placement groups,
            use 'rados list-inconsistent-pg <pool>'. To repair PGs use the 'ceph pg repair
            <pg_num>' command.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-damaged
          summary: Placement group damaged, manual intervention needed on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1
        for: 5m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.7.4
          severity: P1
          type: ceph_default
      - alert: CephPGRecoveryAtRisk
        annotations:
          description: Data redundancy is at risk since one or more OSDs are at or above
            the 'full' threshold. Add more capacity to the cluster, restore down/out OSDs,
            or delete unwanted data.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-recovery-full
          summary: OSDs are too full for recovery on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.7.5
          severity: P1
          type: ceph_default
      - alert: CephPGUnavailableBlockingIO
        annotations:
          description: Data availability is reduced, impacting the cluster's ability to
            service I/O. One or more placement groups (PGs) are in a state that blocks
            I/O.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-availability
          summary: PG is unavailable on cluster {{ $labels.cluster }}, blocking I/O
        expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
          == 1
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.7.3
          severity: P1
          type: ceph_default
      - alert: CephPGBackfillAtRisk
        annotations:
          description: Data redundancy may be at risk due to lack of free space within
            the cluster. One or more OSDs have reached the 'backfillfull' threshold. Add
            more capacity, or delete unwanted data.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-backfill-full
          summary: Backfill operations are blocked due to lack of free space on cluster
            {{ $labels.cluster }}
        expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.7.6
          severity: P1
          type: ceph_default
      - alert: CephPGNotScrubbed
        annotations:
          description: 'One or more PGs have not been scrubbed recently. Scrubs check
            metadata integrity, protecting against bit-rot. They check that metadata is
            consistent across data replicas. When PGs miss their scrub interval, it may
            indicate that the scrub window is too small, or PGs were not in a ''clean''
            state during the scrub window. You can manually initiate a scrub with: ceph
            pg scrub <pgid>'
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-scrubbed
          summary: Placement group(s) have not been scrubbed on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
        for: 5m
        labels:
          severity: P3
          type: ceph_default
      - alert: CephPGsHighPerOSD
        annotations:
          description: |-
            The number of placement groups per OSD is too high (exceeds the mon_max_pg_per_osd setting).
             Check that the pg_autoscaler has not been disabled for any pools with 'ceph osd pool autoscale-status', and that the profile selected is appropriate. You may also adjust the target_size_ratio of a pool to guide the autoscaler based on the expected relative size of the pool ('ceph osd pool set cephfs.cephfs.meta target_size_ratio .1') or set the pg_autoscaler mode to 'warn' and adjust pg_num appropriately for one or more pools.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#too-many-pgs
          summary: Placement groups per OSD is too high on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="TOO_MANY_PGS"} == 1
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: CephPGNotDeepScrubbed
        annotations:
          description: One or more PGs have not been deep scrubbed recently. Deep scrubs
            protect against bit-rot. They compare data replicas to ensure consistency.
            When PGs miss their deep scrub interval, it may indicate that the window is
            too small or PGs were not in a 'clean' state during the deep-scrub window.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pg-not-deep-scrubbed
          summary: Placement group(s) have not been deep scrubbed on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
        for: 5m
        labels:
          severity: P3
          type: ceph_default
    - name: nodes
      rules:
      - alert: CephNodeRootFilesystemFull
        annotations:
          description: 'Root volume is dangerously full: {{ $value | humanize }}% free.'
          summary: Root filesystem is dangerously full
        expr: node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}
          * 100 < 5
        for: 5m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.8.1
          severity: P1
          type: ceph_default
      - alert: CephNodeNetworkPacketErrors
        annotations:
          description: Node {{ $labels.instance }} experiences packet errors > 0.01% or
            > 10 packets/s on interface {{ $labels.device }}.
          summary: One or more NICs reports packet errors on cluster {{ $labels.cluster
            }}
        expr: |
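          # Per-interface error ratio across receive+transmit (>= 0.01%), or an absolute error
          # rate of >= 10 packets/s, excluding the loopback device.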
          (
            rate(node_network_receive_errs_total{device!="lo"}[1m]) +
            rate(node_network_transmit_errs_total{device!="lo"}[1m])
          ) / (
            rate(node_network_receive_packets_total{device!="lo"}[1m]) +
            rate(node_network_transmit_packets_total{device!="lo"}[1m])
          ) >= 0.0001 or (
            rate(node_network_receive_errs_total{device!="lo"}[1m]) +
            rate(node_network_transmit_errs_total{device!="lo"}[1m])
          ) >= 10
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.8.3
          severity: P3
          type: ceph_default
      - alert: CephNodeNetworkBondDegraded
        annotations:
          description: Bond {{ $labels.master }} is degraded on Node {{ $labels.instance
            }}.
          summary: Degraded Bond on Node {{ $labels.instance }} on cluster {{ $labels.cluster
            }}
        expr: |
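          # A non-zero difference between configured and active slaves means at least one bond
          # member link is down.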
          node_bonding_slaves - node_bonding_active != 0
        labels:
          severity: P3
          type: ceph_default
      - alert: CephNodeInconsistentMTU
        annotations:
          description: Node {{ $labels.instance }} has a different MTU size ({{ $value
            }}) than the median of devices named {{ $labels.device }}.
          summary: MTU settings across Ceph hosts are inconsistent on cluster {{ $labels.cluster
            }}
        expr: |
          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
            scalar(
              max by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
                quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
            )
          or
          node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
            scalar(
              min by (cluster,device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
                quantile by (cluster,device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
            )
        labels:
          severity: P3
          type: ceph_default
    - name: pools
      rules:
      - alert: CephPoolGrowthWarning
        annotations:
          description: Pool '{{ $labels.name }}' will be full in less than 5 days assuming
            the average fill-up rate of the past 48 hours.
          summary: Pool growth rate may soon exceed capacity on cluster {{ $labels.cluster
            }}
        expr: (predict_linear(ceph_pool_percent_used[2d], 3600 * 24 * 5) * on(cluster,pool_id,
          instance) group_right() ceph_pool_metadata) >= 95
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.9.2
          severity: P3
          type: ceph_default
      - alert: CephPoolBackfillFull
        annotations:
          description: A pool is approaching the near full threshold, which will prevent
            recovery/backfill operations from completing. Consider adding more capacity.
          summary: Free space in a pool is too low for recovery/backfill on cluster {{
            $labels.cluster }}
        expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0
        labels:
          severity: P3
          type: ceph_default
      - alert: CephPoolFull
        annotations:
          description: A pool has reached its MAX quota, or OSDs supporting the pool have
            reached the FULL threshold. Until this is resolved, writes to the pool will
            be blocked. Pool Breakdown (top 5) {{- range printf "topk(5, sort_desc(ceph_pool_percent_used{cluster='%s'}
            * on(cluster,pool_id) group_right ceph_pool_metadata))" .Labels.cluster |
            query }} - {{ .Labels.name }} at {{ .Value }}% {{- end }} Increase the pool's
            quota, or add capacity to the cluster first then increase the pool's quota
            (e.g. ceph osd pool set quota <pool_name> max_bytes <bytes>)
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#pool-full
          summary: Pool is full - writes are blocked on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="POOL_FULL"} > 0
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.9.1
          severity: P1
          type: ceph_default
      - alert: CephPoolNearFull
        annotations:
          description: A pool has exceeded the warning (percent full) threshold, or OSDs
            supporting the pool have reached the NEARFULL threshold. Writes may continue,
            but you are at risk of the pool going read-only if more capacity isn't made
            available. Determine the affected pool with 'ceph df detail', looking at QUOTA
            BYTES and STORED. Increase the pool's quota, or add capacity to the cluster
            first then increase the pool's quota (e.g. ceph osd pool set quota <pool_name>
            max_bytes <bytes>). Also ensure that the balancer is active.
          summary: One or more Ceph pools are nearly full on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0
        for: 5m
        labels:
          severity: P3
          type: ceph_default
    - name: healthchecks
      rules:
      - alert: CephSlowOps
        annotations:
          description: '{{ $value }} OSD requests are taking too long to process (osd_op_complaint_time
            exceeded)'
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
          summary: OSD operations are slow to complete on cluster {{ $labels.cluster }}
        expr: ceph_healthcheck_slow_ops > 0
        for: 30s
        labels:
          severity: P3
          type: ceph_default
      - alert: CephDaemonSlowOps
        annotations:
          description: '{{ $labels.ceph_daemon }} operations are taking too long to process
            (complaint time exceeded)'
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#slow-ops
          summary: '{{ $labels.ceph_daemon }} operations are slow to complete on cluster
            {{ $labels.cluster }}'
        expr: ceph_daemon_health_metrics{type="SLOW_OPS"} > 0
        for: 30s
        labels:
          severity: P3
          type: ceph_default
    - name: cephadm
      rules:
      - alert: CephadmUpgradeFailed
        annotations:
          description: The cephadm cluster upgrade process has failed. The cluster remains
            in an undetermined state. Please review the cephadm logs to understand the
            nature of the issue.
          summary: Ceph version upgrade has failed on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.11.2
          severity: P1
          type: ceph_default
      - alert: CephadmDaemonFailed
        annotations:
          description: A daemon managed by cephadm is no longer active. Determine which
            daemon is down with 'ceph health detail'. You may start daemons with 'ceph
            orch daemon start <daemon_id>'.
          summary: A ceph daemon managed by cephadm is down on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.11.1
          severity: P1
          type: ceph_default
      - alert: CephadmPaused
        annotations:
          description: Cluster management has been paused manually. This prevents the
            orchestrator from performing service management and reconciliation. If this
            is not intentional, resume cephadm operations with 'ceph orch resume'.
          documentation: https://docs.ceph.com/en/latest/cephadm/operations#cephadm-paused
          summary: Orchestration tasks via cephadm are PAUSED on cluster {{ $labels.cluster
            }}
        expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0
        for: 1m
        labels:
          severity: P3
          type: ceph_default
    - name: hardware
      rules:
      - alert: HardwareStorageError
        annotations:
          description: Some storage devices are in error. Check `ceph health detail`.
          summary: Storage devices error(s) detected on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="HARDWARE_STORAGE"} > 0
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.13.1
          severity: P1
          type: ceph_default
      - alert: HardwareMemoryError
        annotations:
          description: DIMM error(s) detected. Check `ceph health detail`.
          summary: DIMM error(s) detected on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="HARDWARE_MEMORY"} > 0
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.13.2
          severity: P1
          type: ceph_default
      - alert: HardwareProcessorError
        annotations:
          description: Processor error(s) detected. Check `ceph health detail`.
          summary: Processor error(s) detected on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="HARDWARE_PROCESSOR"} > 0
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.13.3
          severity: P1
          type: ceph_default
      - alert: HardwareNetworkError
        annotations:
          description: Network error(s) detected. Check `ceph health detail`.
          summary: Network error(s) detected on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="HARDWARE_NETWORK"} > 0
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.13.4
          severity: P1
          type: ceph_default
      - alert: HardwarePowerError
        annotations:
          description: Power supply error(s) detected. Check `ceph health detail`.
          summary: Power supply error(s) detected on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="HARDWARE_POWER"} > 0
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.13.5
          severity: P1
          type: ceph_default
      - alert: HardwareFanError
        annotations:
          description: Fan error(s) detected. Check `ceph health detail`.
          summary: Fan error(s) detected on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="HARDWARE_FANS"} > 0
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.13.6
          severity: P1
          type: ceph_default
    - name: PrometheusServer
      rules:
      - alert: PrometheusJobMissing
        annotations:
          description: The Prometheus job that scrapes from Ceph is no longer defined.
            This effectively means you'll have no metrics or alerts for the cluster. Please
            review the job definitions in the prometheus.yml file of the Prometheus instance.
          summary: The scrape job for Ceph is missing from Prometheus
        expr: absent(up{job="ceph"})
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.12.1
          severity: P1
          type: ceph_default
    - name: rados
      rules:
      - alert: CephObjectMissing
        annotations:
          description: The latest version of a RADOS object can not be found, even though
            all OSDs are up. I/O requests for this object from clients will block (hang).
            Resolving this issue may require the object to be rolled back to a prior version
            manually, and manually verified.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks#object-unfound
          summary: Object(s) marked UNFOUND on cluster {{ $labels.cluster }}
        expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() group_right(cluster)
          (count(ceph_osd_up == 1) by (cluster) == bool count(ceph_osd_metadata) by(cluster))
          == 1
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.10.1
          severity: P1
          type: ceph_default
    - name: generic
      rules:
      - alert: CephDaemonCrash
        annotations:
          description: One or more daemons have crashed recently, and need to be acknowledged.
            This notification ensures that software crashes do not go unseen. To acknowledge
            a crash, use the 'ceph crash archive <id>' command.
          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-crash
          summary: One or more Ceph daemons have crashed, and are pending acknowledgement
            on cluster {{ $labels.cluster }}
        expr: ceph_health_detail{name="RECENT_CRASH"} == 1
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.1.2
          severity: P1
          type: ceph_default
    - name: rbdmirror
      rules:
      - alert: CephRBDMirrorImagesPerDaemonHigh
        annotations:
          description: The number of image replications per daemon is not supposed to
            exceed the threshold of 100.
          summary: Number of image replications is now above 100 on cluster {{ $labels.cluster
            }}
        expr: sum by (cluster, ceph_daemon, namespace) (ceph_rbd_mirror_snapshot_image_snapshots)
          > 100
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.10.2
          severity: P1
          type: ceph_default
      - alert: CephRBDMirrorImagesNotInSync
        annotations:
          description: Both local and remote RBD mirror images should be in sync.
          summary: Some of the RBD mirror images are not in sync with their remote counterparts
            on cluster {{ $labels.cluster }}
        expr: sum by (cluster, ceph_daemon, image, namespace, pool) (topk by (cluster,
          ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp)
          - topk by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp))
          != 0
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.10.3
          severity: P1
          type: ceph_default
      - alert: CephRBDMirrorImagesNotInSyncVeryHigh
        annotations:
          description: More than 10% of the images have synchronization problems.
          summary: Number of unsynchronized images is very high on cluster {{ $labels.cluster
            }}
        expr: count by (ceph_daemon, cluster) ((topk by (cluster, ceph_daemon, image,
          namespace, pool) (1, ceph_rbd_mirror_snapshot_image_local_timestamp) - topk
          by (cluster, ceph_daemon, image, namespace, pool) (1, ceph_rbd_mirror_snapshot_image_remote_timestamp))
          != 0) > (sum by (ceph_daemon, cluster) (ceph_rbd_mirror_snapshot_snapshots)*.1)
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.10.4
          severity: P1
          type: ceph_default
      - alert: CephRBDMirrorImageTransferBandwidthHigh
        annotations:
          description: Detected a heavy increase in bandwidth for RBD replications (over
            80%) in the last 30 min. This might not be a problem, but it is good to review
            the number of images being replicated simultaneously.
          summary: The replication network usage on cluster {{ $labels.cluster }} has
            increased by over 80% in the last 30 minutes. Review the number of images
            being replicated. This alert will be cleared automatically after 30 minutes.
        expr: rate(ceph_rbd_mirror_journal_replay_bytes[30m]) > 0.80
        for: 1m
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.10.5
          severity: P3
          type: ceph_default
    - name: nvmeof
      rules:
      - alert: NVMeoFSubsystemNamespaceLimit
        annotations:
          description: Subsystems have a max namespace limit defined at creation time.
            This alert means that no more namespaces can be added to {{ $labels.nqn }}
          summary: '{{ $labels.nqn }} subsystem has reached its maximum number of namespaces
            on cluster {{ $labels.cluster }}'
        expr: (count by(nqn, cluster, instance) (ceph_nvmeof_subsystem_namespace_metadata))
          >= on(nqn, instance) group_right(cluster) ceph_nvmeof_subsystem_namespace_limit
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFMultipleNamespacesOfRBDImage
        annotations:
          description: Each NVMeoF namespace must have a unique RBD pool and image, across
            all different gateway groups.
          summary: 'RBD image {{ $labels.pool_name }}/{{ $labels.rbd_name }} cannot be
            reused for multiple NVMeoF namespaces'
        expr: count by(pool_name, rbd_name) (count by(bdev_name, pool_name, rbd_name)
          (ceph_nvmeof_bdev_metadata and on (bdev_name) ceph_nvmeof_subsystem_namespace_metadata))
          > 1
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFTooManyGateways
        annotations:
          description: You may create many gateways, but 32 is the tested limit
          summary: Max supported gateways exceeded on cluster {{ $labels.cluster }}
        expr: count(ceph_nvmeof_gateway_info) by (cluster) > 32.00
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFMaxGatewayGroupSize
        annotations:
          description: You may create many gateways in a gateway group, but 8 is the tested
            limit
          summary: Max gateways within a gateway group ({{ $labels.group }}) exceeded
            on cluster {{ $labels.cluster }}
        expr: count(ceph_nvmeof_gateway_info) by (cluster,group) > 8.00
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFMaxGatewayGroups
        annotations:
          description: You may create many gateway groups, but 4 is the tested limit
          summary: Max gateway groups exceeded on cluster {{ $labels.cluster }}
        expr: count(count by (group, cluster) (ceph_nvmeof_gateway_info)) by (cluster)
          > 4.00
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFSingleGateway
        annotations:
          description: Although a single-member gateway group is valid, it should only
            be used for test purposes.
          summary: The gateway group {{ $labels.group }} consists of a single gateway
            - HA is not possible on cluster {{ $labels.cluster }}
        expr: count(ceph_nvmeof_gateway_info) by(cluster,group) == 1
        for: 5m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFHighGatewayCPU
        annotations:
          description: Typically, high CPU may indicate degraded performance. Consider
            increasing the number of reactor cores
          summary: CPU used by {{ $labels.instance }} NVMe-oF Gateway is high on cluster
            {{ $labels.cluster }}
        expr: label_replace(avg by(instance, cluster) (rate(ceph_nvmeof_reactor_seconds_total{mode="busy"}[1m])),"instance","$1","instance","(.*):.*")
          > 80.00
        for: 10m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFGatewayOpenSecurity
        annotations:
          description: It is good practice to ensure subsystems use host security to reduce
            the risk of unexpected data loss
          summary: Subsystem {{ $labels.nqn }} has been defined without host level security
            on cluster {{ $labels.cluster }}
        expr: ceph_nvmeof_subsystem_metadata{allow_any_host="yes"}
        for: 5m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFTooManySubsystems
        annotations:
          description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded
            the supported maximum of 128 subsystems. Current count: {{ $value }}.'
          summary: The number of subsystems defined to the NVMeoF gateway has reached or
            exceeded the supported maximum on cluster {{ $labels.cluster }}
        expr: count by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_metadata,"gateway_host","$1","instance","(.*?)(?::.*)?"))
          >= 128.00
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFTooManyNamespaces
        annotations:
          description: 'NVMeoF gateway {{ $labels.gateway_host }} has reached or exceeded
            the supported maximum of 2048 namespaces. Current count: {{ $value }}.'
          summary: The number of namespaces defined to the NVMeoF gateway has reached or
            exceeded the supported maximum on cluster {{ $labels.cluster }}
        expr: sum by(gateway_host, cluster) (label_replace(ceph_nvmeof_subsystem_namespace_count,"gateway_host","$1","instance","(.*?)(?::.*)?"))
          >= 2048.00
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFVersionMismatch
        annotations:
          description: This may indicate an issue with the deployment. Check the cephadm logs
          summary: Too many different NVMe-oF gateway releases active on cluster {{ $labels.cluster
            }}
        expr: count(count(ceph_nvmeof_gateway_info) by (cluster, version)) by (cluster)
          > 1
        for: 1h
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFHighClientCount
        annotations:
          description: The supported limit for clients connecting to a subsystem is 128
          summary: The number of clients connected to {{ $labels.nqn }} is too high on
            cluster {{ $labels.cluster }}
        expr: ceph_nvmeof_subsystem_host_count > 128.00
        for: 1m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFMissingListener
        annotations:
          description: For every subsystem, each gateway should have a listener to balance
            traffic between gateways.
          summary: No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{
            $labels.nqn }} subsystem
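        # Fires per gateway: this gateway has no listener for a subsystem that does have
        # listeners on other gateways (a subsystem with no listeners at all is covered by
        # NVMeoFZeroListenerSubsystem below).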
        expr: ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count)
          by (nqn) > 0
        for: 10m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFZeroListenerSubsystem
        annotations:
          description: NVMeoF gateway configuration is incomplete; one of the subsystems
            has zero listeners.
          summary: No listeners added to {{ $labels.nqn }} subsystem
        expr: sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0
        for: 10m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFHighHostCPU
        annotations:
          description: High CPU on a gateway host can lead to CPU contention and performance
            degradation
          summary: The CPU is high ({{ $value }}%) on NVMeoF Gateway host ({{ $labels.host
            }}) on cluster {{ $labels.cluster }}
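        # 100 minus the average idle CPU percentage, joined with ceph_nvmeof_gateway_info
        # so only hosts running an NVMe-oF gateway are evaluated; the :port suffix is
        # stripped so the host labels line up.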
        expr: 100-((100*(avg by(cluster,host) (label_replace(rate(node_cpu_seconds_total{mode="idle"}[5m]),"host","$1","instance","(.*):.*"))
          * on(cluster, host) group_right label_replace(ceph_nvmeof_gateway_info,"host","$1","instance","(.*):.*"))))
          >= 80.00
        for: 10m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFInterfaceDown
        annotations:
          description: A NIC used by one or more subsystems is in a down state
          summary: Network interface {{ $labels.device }} is down on cluster {{ $labels.cluster
            }}
        expr: ceph_nvmeof_subsystem_listener_iface_info{operstate="down"}
        for: 30s
        labels:
          oid: 1.3.6.1.4.1.50495.1.2.1.14.1
          severity: P3
          type: ceph_default
      - alert: NVMeoFInterfaceDuplex
        annotations:
          description: Until this is resolved, performance from the gateway will be degraded
          summary: Network interface {{ $labels.device }} is not running in full duplex
            mode on cluster {{ $labels.cluster }}
        expr: ceph_nvmeof_subsystem_listener_iface_info{duplex!="full"}
        for: 30s
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFHighReadLatency
        annotations:
          description: High latencies may indicate a constraint within the cluster, e.g.
            CPU or network. Please investigate
          summary: The average read latency over the last 5 mins has reached 10 ms or
            more on {{ $labels.gateway }}
        expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_read_seconds_total[1m])
          / rate(ceph_nvmeof_bdev_reads_completed_total[1m])))),"gateway","$1","instance","(.*):.*")
          > 0.01
        for: 5m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFHighWriteLatency
        annotations:
          description: High latencies may indicate a constraint within the cluster, e.g.
            CPU or network. Please investigate
          summary: The average write latency over the last 5 mins has reached 20 ms or
            more on {{ $labels.gateway }}
        expr: label_replace((avg by(instance) ((rate(ceph_nvmeof_bdev_write_seconds_total[5m])
          / rate(ceph_nvmeof_bdev_writes_completed_total[5m])))),"gateway","$1","instance","(.*):.*")
          > 0.02
        for: 5m
        labels:
          severity: P3
          type: ceph_default
      - alert: NVMeoFHostKeepAliveTimeout
        annotations:
          description: Host was disconnected due to a keep-alive timeout
          summary: Host ({{ $labels.host_nqn }}) was disconnected {{ $value }} times from
            subsystem ({{ $labels.nqn }}) in the last 24 hours
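        # changes() over 24h is divided by 2 on the assumption that each timeout toggles
        # the metric twice (set and clear); ceil() still counts a single transition as one
        # disconnect.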
        expr: ceil(changes(ceph_nvmeof_host_keepalive_timeout[24h:]) / 2) > 0
        for: 1m
        labels:
          severity: P3
          type: ceph_default
    - name: cluster health detail
      rules:
      - alert: CephHealthDetailError
        annotations:
          description: Health check {{ $labels.name }} has been HEALTH_ERROR for more
            than 5 minutes. Please check 'ceph health detail' for more information.
          summary: Ceph is in the ERROR state
        expr: ceph_health_detail{severity="HEALTH_ERROR"} == 1
        for: 5m
        labels:
          severity: P1
      - alert: CephHealthDetailWarning
        annotations:
          description: Health check {{ $labels.name }} has been HEALTH_WARN for more than
            15 minutes. Please check 'ceph health detail' for more information.
          summary: Ceph is in the WARNING state
        expr: ceph_health_detail{severity="HEALTH_WARN"} == 1
        for: 15m
        labels:
          severity: P3
  monitoring-kube-prometheus-stack-config-reloaders-712b2a77-4a16-496f-adbd-5bc45971adc5.yaml: |
    groups:
    - name: config-reloaders
      rules:
      - alert: ConfigReloaderSidecarErrors
        annotations:
          description: |-
            Errors encountered while the {{$labels.pod}} config-reloader sidecar attempts to sync config in {{$labels.namespace}} namespace.
            As a result, configuration for service running in {{$labels.pod}} may be stale and cannot be updated anymore.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/configreloadersidecarerrors
          summary: config-reloader sidecar has not had a successful reload for 10m
        expr: max_over_time(reloader_last_reload_successful{namespace=~".+"}[5m]) == 0
        for: 10m
        labels:
          severity: warning
  monitoring-kube-prometheus-stack-coredns-256321af-52e4-44df-bc9d-c1d787df4fd7.yaml: |
    groups:
    - name: coredns
      rules:
      - alert: CoreDNSCriticalErrorBudgetBurn
        annotations:
          description: The CoreDNS SERVFAIL rate is {{ $value | humanizePercentage }}
            over the last hour, which exceeds the 1.44% burn-rate threshold (14.4x against
            99.9% SLO). At this rate, the 30-day error budget exhausts in under 2.1 days.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednscriticalerrorbudgetburn
          summary: 'CoreDNS: SERVFAIL rate rapidly consuming error budget'
        expr: |
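          # Multi-window burn-rate alert against the 99.9% SLO: both the 1h and 5m
          # SERVFAIL ratios must exceed 14.4x the 0.1% error budget (14.4 * 0.001 = 0.0144),
          # and there must be at least 1 query/s of traffic so the ratio is meaningful.
          # The following CoreDNS burn-rate alerts repeat this pattern with longer windows
          # and lower thresholds.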
          (
          sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[1h]))
          /
          sum(rate(coredns_dns_responses_total{job="coredns"}[1h]))
          ) > 0.0144
          and
          (
          sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[5m]))
          /
          sum(rate(coredns_dns_responses_total{job="coredns"}[5m]))
          ) > 0.0144
          and
          sum(rate(coredns_dns_responses_total{job="coredns"}[5m])) > 1
        for: 2m
        labels:
          severity: P1
      - alert: CoreDNSHighErrorBudgetBurn
        annotations:
          description: The CoreDNS SERVFAIL rate is {{ $value | humanizePercentage }}
            over the last 6 hours, which exceeds the 0.6% burn-rate threshold (6x against
            99.9% SLO). At this rate, the 30-day error budget exhausts in under 5 days.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednshigherrorbudgetburn
          summary: 'CoreDNS: sustained SERVFAIL rate depleting error budget'
        expr: |
          (
          sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[6h]))
          /
          sum(rate(coredns_dns_responses_total{job="coredns"}[6h]))
          ) > 0.006
          and
          (
          sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[30m]))
          /
          sum(rate(coredns_dns_responses_total{job="coredns"}[30m]))
          ) > 0.006
          and
          sum(rate(coredns_dns_responses_total{job="coredns"}[30m])) > 1
        for: 5m
        labels:
          severity: P2
      - alert: CoreDNSModerateErrorBudgetBurn
        annotations:
          description: The CoreDNS SERVFAIL rate is {{ $value | humanizePercentage }}
            over the last day, which exceeds the 0.3% burn-rate threshold (3x against
            99.9% SLO). At this rate, the 30-day error budget exhausts in under 10 days.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednsmoderateerrorbudgetburn
          summary: 'CoreDNS: ongoing SERVFAIL rate steadily consuming error budget'
        expr: |
          (
          sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[1d]))
          /
          sum(rate(coredns_dns_responses_total{job="coredns"}[1d]))
          ) > 0.003
          and
          (
          sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[2h]))
          /
          sum(rate(coredns_dns_responses_total{job="coredns"}[2h]))
          ) > 0.003
          and
          sum(rate(coredns_dns_responses_total{job="coredns"}[2h])) > 1
        for: 15m
        labels:
          severity: P3
      - alert: CoreDNSLowErrorBudgetBurn
        annotations:
          description: The CoreDNS SERVFAIL rate is {{ $value | humanizePercentage }}
            over the last 3 days, which exceeds the 0.1% burn-rate threshold (1x against
            99.9% SLO). At this rate, the 30-day error budget exhausts before the window
            resets.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednslowerrorbudgetburn
          summary: 'CoreDNS: low-level SERVFAIL rate eroding error budget'
        expr: |
          (
          sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[3d]))
          /
          sum(rate(coredns_dns_responses_total{job="coredns"}[3d]))
          ) > 0.001
          and
          (
          sum(rate(coredns_dns_responses_total{job="coredns",rcode="SERVFAIL"}[6h]))
          /
          sum(rate(coredns_dns_responses_total{job="coredns"}[6h]))
          ) > 0.001
          and
          sum(rate(coredns_dns_responses_total{job="coredns"}[6h])) > 1
        for: 1h
        labels:
          severity: P4
      - alert: CoreDNSDown
        annotations:
          description: CoreDNS has disappeared from Prometheus target discovery for more
            than 15 minutes. This could indicate a crashed CoreDNS pod or a misconfigured
            scrape target.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#corednsdown
          summary: 'CoreDNS: instance has disappeared from Prometheus target discovery'
        expr: absent(up{job="coredns"} == 1)
        for: 15m
        labels:
          severity: P3
  monitoring-kube-prometheus-stack-etcd-95d7c95b-8288-4ed3-aeb4-8dc4f9b84e64.yaml: |
    groups:
    - name: etcd
      rules:
      - alert: etcdMembersDown
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
            }}).'
          summary: etcd cluster members are down.
        expr: |-
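          # A member counts as down when its "up" series is 0, or when peers report a
          # sustained failure rate sending to it (the "To" label identifies the
          # unreachable member).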
          max without (endpoint) (
            sum without (instance) (up{job=~".*etcd.*"} == bool 0)
          or
            count without (To) (
              sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
            )
          )
          > 0
        for: 10m
        labels:
          severity: critical
      - alert: etcdInsufficientMembers
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
            }}).'
          summary: etcd cluster has insufficient number of members.
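        # Quorum check: the number of members that are up must be at least (n + 1) / 2,
        # otherwise the cluster cannot form a majority.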
        expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
          without (instance) + 1) / 2)
        for: 3m
        labels:
          severity: critical
      - alert: etcdNoLeader
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
            has no leader.'
          summary: etcd cluster has no leader.
        expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
        for: 1m
        labels:
          severity: critical
      - alert: etcdHighNumberOfLeaderChanges
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
            within the last 15 minutes. Frequent elections may be a sign of insufficient
            resources, high network latency, or disruptions by other components and should
            be investigated.'
          summary: etcd cluster has high number of leader changes.
        expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
          or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
          >= 4
        for: 5m
        labels:
          severity: warning
      - alert: etcdGRPCRequestsSlow
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
            is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
            }} method.'
          summary: etcd gRPC requests are slow.
        expr: |-
          histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
          > 0.15
        for: 10m
        labels:
          severity: critical
      - alert: etcdMemberCommunicationSlow
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": member communication with {{
            $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
            }}.'
          summary: etcd cluster member communication is slow.
        expr: |-
          histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
      - alert: etcdHighNumberOfFailedProposals
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
            within the last 30 minutes on etcd instance {{ $labels.instance }}.'
          summary: etcd cluster has high number of proposal failures.
        expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
        for: 15m
        labels:
          severity: warning
      - alert: etcdHighFsyncDurations
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
            are {{ $value }}s on etcd instance {{ $labels.instance }}.'
          summary: etcd cluster 99th percentile fsync durations are too high.
        expr: |-
          histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
          > 0.5
        for: 10m
        labels:
          severity: warning
      - alert: etcdHighFsyncDurations
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
            are {{ $value }}s on etcd instance {{ $labels.instance }}.'
          summary: etcd cluster 99th percentile fsync durations are too high.
        expr: |-
          histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
          > 1
        for: 10m
        labels:
          severity: critical
      - alert: etcdHighCommitDurations
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
            are {{ $value }}s on etcd instance {{ $labels.instance }}.'
          summary: etcd cluster 99th percentile commit durations are too high.
        expr: |-
          histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
          > 0.25
        for: 10m
        labels:
          severity: warning
      - alert: etcdDatabaseQuotaLowSpace
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined
            quota on etcd instance {{ $labels.instance }}; please defragment or increase
            the quota, as writes to etcd will be disabled when it is full.'
          summary: etcd cluster database is running full.
        expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
          last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
          95
        for: 10m
        labels:
          severity: critical
      - alert: etcdExcessiveDatabaseGrowth
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": the database is predicted to run
            out of disk space within the next four hours, based on writes observed over
            the past four hours on etcd instance {{ $labels.instance }}. Please check, as
            this might be disruptive.'
          summary: etcd cluster database growing very fast.
        expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
          > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
        for: 10m
        labels:
          severity: warning
      - alert: etcdDatabaseHighFragmentationRatio
        annotations:
          description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
            {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
            allocated disk space. Please run defragmentation (e.g. etcdctl defrag) to
            reclaim the unused fragmented disk space.'
          runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
          summary: etcd database size in use is less than 50% of the actual allocated
            storage.
        expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
          / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
          and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
        for: 10m
        labels:
          severity: warning
  monitoring-kube-prometheus-stack-general.rules-cfb4b4f9-5160-4626-8c6c-7b56c76536f9.yaml: |
    groups:
    - name: general.rules
      rules:
      - alert: TargetDown
        annotations:
          description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
            }} targets in {{ $labels.namespace }} namespace are down.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
          summary: One or more targets are unreachable.
        expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
          BY (cluster, job, namespace, service)) > 10
        for: 10m
        labels:
          severity: warning
      - alert: Watchdog
        annotations:
          description: |
            This is an alert meant to ensure that the entire alerting pipeline is functional.
            This alert is always firing; therefore, it should always be firing in Alertmanager
            and always fire against a receiver. There are integrations with various notification
            mechanisms that send a notification when this alert is not firing. For example the
            "DeadMansSnitch" integration in PagerDuty.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
          summary: An alert that should always be firing to certify that Alertmanager
            is working properly.
        expr: vector(1)
        labels:
          severity: none
      - alert: InfoInhibitor
        annotations:
          description: |
            This is an alert that is used to inhibit info alerts.
            By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
            other alerts.
            This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
            severity of 'warning' or 'critical' starts firing on the same namespace.
            This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
          summary: Info-level alert inhibition.
        expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
          "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
        labels:
          severity: none
  monitoring-kube-prometheus-stack-goldpinger-f9e62ea9-1ecf-46d1-8f95-99af4ffe6318.yaml: |
    groups:
    - name: goldpinger
      rules:
      - alert: GoldpingerHighUnhealthyRatio
        annotations:
          description: 'More than 10% of nodes (current: {{ $value | humanizePercentage
            }}) are reporting as unhealthy for at least 5 minutes. Normal operation expects
            0% unhealthy nodes.'
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#goldpingerhighunhealthyratio
          summary: 'Goldpinger: high percentage of cluster nodes unhealthy'
        expr: |
          (
            sum(goldpinger_nodes_health_total{status="unhealthy"})
            /
            sum(goldpinger_nodes_health_total)
          ) > 0.1
        for: 5m
        labels:
          severity: P2
      - alert: GoldpingerNodeUnreachable
        annotations:
          description: 'Node with IP {{ $labels.host_ip }} has a median ping latency above
            1s from more than 50% (current: {{ $value | humanizePercentage }}) of Goldpinger
            instances. Normal operation expects all nodes to be reachable with sub-10ms
            latency.'
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#goldpingernodeunreachable
          summary: 'Goldpinger: node unreachable by majority of cluster'
        expr: |
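          # Numerator: per node (host_ip), the number of Goldpinger instances whose median
          # ping latency to it exceeds 1s. Denominator: the total number of reporting
          # instances (count of goldpinger_cluster_health_total series). Fires when more
          # than half of the instances see the node as slow or unreachable.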
          (
            count by (host_ip) (
              histogram_quantile(0.5,
                sum by (instance, host_ip, le) (
                  rate(goldpinger_peers_response_time_s_bucket{call_type="ping"}[5m])
                )
              ) > 1.0
            )
            /
            scalar(count(goldpinger_cluster_health_total))
          ) > 0.5
        for: 5m
        labels:
          severity: P2
      - alert: GoldpingerHighPeerLatency
        annotations:
          description: The 95th percentile of peer-to-peer latency is {{ $value | humanizeDuration
            }}, which exceeds the threshold of 500ms. Normal latency is typically below
            10ms.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#goldpingerhighpeerlatency
          summary: 'Goldpinger: high cluster-wide peer latency'
        expr: |
          histogram_quantile(0.95,
            sum by (le) (
              rate(goldpinger_peers_response_time_s_bucket{call_type="ping"}[5m])
            )
          ) > 0.5
        for: 15m
        labels:
          severity: P3
      - alert: GoldpingerHighErrorRate
        annotations:
          description: 'More than 5% (current: {{ $value | humanizePercentage }}) of Goldpinger
            ping attempts are failing. Normal operation expects less than 0.1% error rate.'
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#goldpingerhigherrorrate
          summary: 'Goldpinger: high ping error rate'
        expr: |
          (
            sum(rate(goldpinger_errors_total{type="ping"}[5m]))
            /
            sum(rate(goldpinger_stats_total{action="ping",group="made"}[5m]))
          ) > 0.05
        for: 15m
        labels:
          severity: P3
  monitoring-kube-prometheus-stack-ipmi-exporter-b45b0bdb-cd2b-42ab-a003-a9c4dc06dac8.yaml: |
    groups:
    - name: rules
      rules:
      - alert: IpmiCollectorDown
        expr: ipmi_up == 0
        for: 15m
        labels:
          severity: P3
    - name: collectors-state-warning
      rules:
      - alert: IpmiCurrent
        expr: ipmi_current_state == 1
        labels:
          severity: P3
      - alert: IpmiFanSpeed
        expr: ipmi_fan_speed_state == 1
        labels:
          severity: P3
      - alert: IpmiPower
        expr: ipmi_power_state == 1
        labels:
          severity: P3
      - alert: IpmiSensor
        expr: ipmi_sensor_state == 1
        labels:
          severity: P3
      - alert: IpmiTemperature
        expr: ipmi_temperature_state == 1
        labels:
          severity: P3
      - alert: IpmiVoltage
        expr: ipmi_voltage_state == 1
        labels:
          severity: P3
    - name: collectors-state-critical
      rules:
      - alert: IpmiCurrent
        expr: ipmi_current_state == 2
        labels:
          severity: P1
      - alert: IpmiFanSpeed
        expr: ipmi_fan_speed_state == 2
        labels:
          severity: P1
      - alert: IpmiPower
        expr: ipmi_power_state == 2
        labels:
          severity: P1
      - alert: IpmiSensor
        expr: ipmi_sensor_state{name!="TPM Presence"} == 2
        labels:
          severity: P1
      - alert: IpmiTemperature
        expr: ipmi_temperature_state == 2
        labels:
          severity: P1
      - alert: IpmiVoltage
        expr: ipmi_voltage_state == 2
        labels:
          severity: P1
    - name: ipmi-exporter-sel
      rules:
      - alert: IpmiUncorrectableMemoryError
        annotations:
          description: The metric ipmi_sel_events_latest_timestamp{name="uncorrectable_memory_error"}
            for {{ $labels.instance }} indicates the most recent uncorrectable memory
            SEL event is {{ $value | humanizeDuration }} old, which is within the alert
            window of 24 hours. Normal behavior is that no uncorrectable memory SEL events
            occur, or the latest such event is older than 24 hours.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#ipmiuncorrectablememoryerror
          summary: 'IPMI: recent uncorrectable memory error may affect host stability
            on {{ $labels.instance }}'
        expr: time() - max by (instance) (ipmi_sel_events_latest_timestamp{name="uncorrectable_memory_error"})
          < 86400
        for: 2m
        labels:
          severity: P1
      - alert: IpmiUnrecoverableCpuError
        annotations:
          description: The metric ipmi_sel_events_latest_timestamp{name="unrecoverable_cpu_error"}
            for {{ $labels.instance }} indicates the most recent unrecoverable CPU SEL
            event is {{ $value | humanizeDuration }} old, which is within the alert window
            of 24 hours. Normal behavior is that no unrecoverable CPU SEL events occur,
            or the latest such event is older than 24 hours.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#ipmiunrecoverablecpuerror
          summary: 'IPMI: recent unrecoverable CPU error may affect host stability on
            {{ $labels.instance }}'
        expr: time() - max by (instance) (ipmi_sel_events_latest_timestamp{name="unrecoverable_cpu_error"})
          < 86400
        for: 2m
        labels:
          severity: P1
  monitoring-kube-prometheus-stack-k8s.rules.container-cpu-usage-seconds-tot-67a370f2-1430-4dc0-8c97-905d51026884.yaml: |
    groups:
    - name: k8s.rules.container_cpu_usage_seconds_total
      rules:
      - expr: |-
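          # Per-container CPU usage joined with kube_pod_info to attach the "node" label;
          # topk(1, ...) keeps a single kube_pod_info series per pod so the join does not
          # duplicate samples. The memory recording rules below use the same join.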
          sum by (cluster, namespace, pod, container) (
            irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
          ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
            1, max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
  monitoring-kube-prometheus-stack-k8s.rules.container-memory-cache-e5446a97-645f-4721-b342-84d7654959e4.yaml: |
    groups:
    - name: k8s.rules.container_memory_cache
      rules:
      - expr: |-
          container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
            max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_cache
  monitoring-kube-prometheus-stack-k8s.rules.container-memory-rss-0444e196-4f65-4258-8196-99d9a181af02.yaml: |
    groups:
    - name: k8s.rules.container_memory_rss
      rules:
      - expr: |-
          container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
            max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_rss
  monitoring-kube-prometheus-stack-k8s.rules.container-memory-swap-f1fcb46c-f937-44a3-80d8-f3581748d5e4.yaml: |
    groups:
    - name: k8s.rules.container_memory_swap
      rules:
      - expr: |-
          container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
            max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_swap
  monitoring-kube-prometheus-stack-k8s.rules.container-memory-working-set-by-a961303f-4085-4290-ad39-c64e74e636bc.yaml: |
    groups:
    - name: k8s.rules.container_memory_working_set_bytes
      rules:
      - expr: |-
          container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (1,
            max by (cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_working_set_bytes
  monitoring-kube-prometheus-stack-k8s.rules.container-resource-603f93af-88c5-43e2-845c-2dcea091af33.yaml: |
    groups:
    - name: k8s.rules.container_resource
      rules:
      - expr: |-
          kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}  * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
      - expr: |-
          sum by (namespace, cluster) (
              sum by (namespace, pod, cluster) (
                  max by (namespace, pod, container, cluster) (
                    kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
                  ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
                  )
              )
          )
        record: namespace_memory:kube_pod_container_resource_requests:sum
      - expr: |-
          kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}  * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
      - expr: |-
          sum by (namespace, cluster) (
              sum by (namespace, pod, cluster) (
                  max by (namespace, pod, container, cluster) (
                    kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
                  ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
                  )
              )
          )
        record: namespace_cpu:kube_pod_container_resource_requests:sum
      - expr: |-
          kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}  * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
      - expr: |-
          sum by (namespace, cluster) (
              sum by (namespace, pod, cluster) (
                  max by (namespace, pod, container, cluster) (
                    kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
                  ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
                  )
              )
          )
        record: namespace_memory:kube_pod_container_resource_limits:sum
      - expr: |-
          kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}  * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
           (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
           )
        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
      - expr: |-
          sum by (namespace, cluster) (
              sum by (namespace, pod, cluster) (
                  max by (namespace, pod, container, cluster) (
                    kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
                  ) * on (namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
                  )
              )
          )
        record: namespace_cpu:kube_pod_container_resource_limits:sum
  monitoring-kube-prometheus-stack-k8s.rules.pod-owner-bf14ea1a-5586-40da-a3c5-ee5291c498a2.yaml: |
    groups:
    - name: k8s.rules.pod_owner
      rules:
      - expr: |-
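          # Two-step relabel for ReplicaSet-owned pods: owner_name becomes "replicaset",
          # then the ReplicaSet's own owner (the Deployment) becomes the "workload" label.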
          max by (cluster, namespace, workload, pod) (
            label_replace(
              label_replace(
                kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
                "replicaset", "$1", "owner_name", "(.*)"
              ) * on (replicaset, namespace) group_left(owner_name) topk by (replicaset, namespace) (
                1, max by (replicaset, namespace, owner_name) (
                  kube_replicaset_owner{job="kube-state-metrics"}
                )
              ),
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: deployment
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |-
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: daemonset
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |-
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: statefulset
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |-
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="Job"},
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: job
        record: namespace_workload_pod:kube_pod_owner:relabel
  monitoring-kube-prometheus-stack-kube-68319dbb-d9d9-496f-ad52-90c1fd48d8a1.yaml: |
    groups:
    - name: kubernetes-apps
      rules:
      - alert: KubePodCrashLooping
        annotations:
          description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
            }}) is in waiting state (reason: "CrashLoopBackOff").'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
          summary: Pod is crash looping.
        expr: |
          max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics"}[5m]) >= 1
        for: 15m
        labels:
          severity: P3
      - alert: KubePodNotReady
        annotations:
          description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
            state for longer than 15 minutes.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
          summary: Pod has been in a non-ready state for more than 15 minutes.
        expr: |
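          # Pending/Unknown pods joined with their owner; owner_kind!="Job" excludes
          # Job-owned pods, whose lifecycle is covered by the Job alerts instead.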
          sum by (namespace, pod, cluster) (
            max by(namespace, pod, cluster) (
              kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}
            ) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
              1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
            )
          ) > 0
        for: 15m
        labels:
          severity: P3
      - alert: KubeDeploymentGenerationMismatch
        annotations:
          description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
            }} does not match; this indicates that the Deployment has failed but has not
            been rolled back.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
          summary: Deployment generation mismatch due to possible roll-back
        expr: |
          kube_deployment_status_observed_generation{job="kube-state-metrics"}
            !=
          kube_deployment_metadata_generation{job="kube-state-metrics"}
        for: 15m
        labels:
          severity: P3
      - alert: KubeDeploymentReplicasMismatch
        annotations:
          description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
            not matched the expected number of replicas for longer than 15 minutes.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
          summary: Deployment has not matched the expected number of replicas.
        expr: |
          (
            kube_deployment_spec_replicas{job="kube-state-metrics"}
              >
            kube_deployment_status_replicas_available{job="kube-state-metrics"}
          ) and (
            changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[10m])
              ==
            0
          )
        for: 15m
        labels:
          severity: P3
      - alert: KubeDeploymentRolloutStuck
        annotations:
          description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
            }} is not progressing for longer than 15 minutes.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentrolloutstuck
          summary: Deployment rollout is not progressing.
        expr: |
          kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics"}
          != 0
        for: 15m
        labels:
          severity: P3
      - alert: KubeStatefulSetReplicasMismatch
        annotations:
          description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
            not matched the expected number of replicas for longer than 15 minutes.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
          summary: StatefulSet has not matched the expected number of replicas.
        expr: |
          (
            kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
              !=
            kube_statefulset_replicas{job="kube-state-metrics"}
          ) and (
            changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[10m])
              ==
            0
          )
        for: 15m
        labels:
          severity: P3
      - alert: KubeStatefulSetGenerationMismatch
        annotations:
          description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
            }} does not match; this indicates that the StatefulSet has failed but has
            not been rolled back.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
          summary: StatefulSet generation mismatch due to possible roll-back
        expr: |
          kube_statefulset_status_observed_generation{job="kube-state-metrics"}
            !=
          kube_statefulset_metadata_generation{job="kube-state-metrics"}
        for: 15m
        labels:
          severity: P3
      - alert: KubeStatefulSetUpdateNotRolledOut
        annotations:
          description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
            has not been rolled out.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
          summary: StatefulSet update has not been rolled out.
        expr: |
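          # Fires when the current revision differs from the update revision, not all
          # replicas run the updated revision, and no rollout progress was observed in the
          # last 5 minutes.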
          (
            max by(namespace, statefulset, job, cluster) (
              kube_statefulset_status_current_revision{job="kube-state-metrics"}
                unless
              kube_statefulset_status_update_revision{job="kube-state-metrics"}
            )
              * on(namespace, statefulset, job, cluster)
            (
              kube_statefulset_replicas{job="kube-state-metrics"}
                !=
              kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
            )
          )  and on(namespace, statefulset, job, cluster) (
            changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m])
              ==
            0
          )
        for: 15m
        labels:
          severity: P3
      - alert: KubeDaemonSetRolloutStuck
        annotations:
          description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
            finished or progressed for at least 15m.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
          summary: DaemonSet rollout is stuck.
        expr: |
          (
            (
              kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"}
                !=
              kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
            ) or (
              kube_daemonset_status_number_misscheduled{job="kube-state-metrics"}
                !=
              0
            ) or (
              kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"}
                !=
              kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
            ) or (
              kube_daemonset_status_number_available{job="kube-state-metrics"}
                !=
              kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
            )
          ) and (
            changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics"}[5m])
              ==
            0
          )
        for: 15m
        labels:
          severity: P3
      - alert: KubeContainerWaiting
        annotations:
          description: 'pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on
            container {{ $labels.container }} has been in a waiting state for longer than
            1 hour (reason: "{{ $labels.reason }}").'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontainerwaiting
          summary: Pod container waiting longer than 1 hour
        expr: |
          kube_pod_container_status_waiting_reason{reason!="CrashLoopBackOff", job="kube-state-metrics"} > 0
        for: 1h
        labels:
          severity: P3
      - alert: KubeDaemonSetNotScheduled
        annotations:
          description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
            }} are not scheduled.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
          summary: DaemonSet pods are not scheduled.
        expr: |
          kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
            -
          kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
        for: 10m
        labels:
          severity: P3
      - alert: KubeDaemonSetMisScheduled
        annotations:
          description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
            }} are running where they are not supposed to run.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
          summary: DaemonSet pods are misscheduled.
        expr: |
          kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
        for: 15m
        labels:
          severity: P3
      - alert: KubeJobNotCompleted
        annotations:
          description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
            than {{ "43200" | humanizeDuration }} to complete.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobnotcompleted
          summary: Job did not complete in time
        expr: |
          time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics"}
            and
          kube_job_status_active{job="kube-state-metrics"} > 0) > 43200
        labels:
          severity: P3
      - alert: KubeJobFailed
        annotations:
          description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
            Removing failed job after investigation should clear this alert.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
          summary: Job failed to complete.
        expr: |
          kube_job_failed{job="kube-state-metrics"}  > 0
        for: 15m
        labels:
          severity: P4
      - alert: KubeHpaReplicasMismatch
        annotations:
          description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler  }}
            has not matched the desired number of replicas for longer than 15 minutes.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
          summary: HPA has not matched desired number of replicas.
        expr: |
          (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics"}
            !=
          kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"})
            and
          (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
            >
          kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics"})
            and
          (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
            <
          kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"})
            and
          changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}[15m]) == 0
        for: 15m
        labels:
          severity: P3
      - alert: KubeHpaMaxedOut
        annotations:
          description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler  }}
            has been running at max replicas for longer than 15 minutes.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
          summary: HPA is running at max replicas
        expr: |
          kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics"}
            ==
          kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics"}
        for: 15m
        labels:
          severity: P3
      - alert: KubePdbNotEnoughHealthyPods
        annotations:
          description: PDB {{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} expects
            {{ $value }} more healthy pods. The desired number of healthy pods has not
            been met for at least 15m.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepdbnotenoughhealthypods
          summary: PDB does not have enough healthy pods.
        expr: |
          (
            kube_poddisruptionbudget_status_desired_healthy{job="kube-state-metrics"}
            -
            kube_poddisruptionbudget_status_current_healthy{job="kube-state-metrics"}
          )
          > 0
        for: 15m
        labels:
          severity: P3
    - name: kubernetes-resources
      rules:
      - alert: KubeCPUOvercommit
        annotations:
          description: Cluster has overcommitted CPU resource requests for Pods by {{
            printf "%.2f" $value }} CPU shares and cannot tolerate node failure.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
          summary: Cluster has overcommitted CPU resource requests.
        expr: |
          # Non-HA clusters.
          (
            (
              sum(namespace_cpu:kube_pod_container_resource_requests:sum{})
              -
              sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) > 0
            )
            and
            count(max by (node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
          )
          or
          # HA clusters.
          (
            sum(namespace_cpu:kube_pod_container_resource_requests:sum{})
            -
            (
              # Skip clusters with only one allocatable node.
              (
                sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})
                -
                max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})
              ) > 0
            ) > 0
          )
        for: 10m
        labels:
          severity: P3
      - alert: KubeMemoryOvercommit
        annotations:
          description: Cluster has overcommitted memory resource requests for Pods by
            {{ $value | humanize }} bytes and cannot tolerate node failure.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryovercommit
          summary: Cluster has overcommitted memory resource requests.
        expr: |
          # Non-HA clusters.
          (
            (
              sum(namespace_memory:kube_pod_container_resource_requests:sum{})
              -
              sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) > 0
            )
            and
            count(max by (node) (kube_node_role{job="kube-state-metrics", role="control-plane"})) < 3
          )
          or
          # HA clusters.
          (
            sum(namespace_memory:kube_pod_container_resource_requests:sum{})
            -
            (
              # Skip clusters with only one allocatable node.
              (
                sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})
                -
                max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})
              ) > 0
            ) > 0
          )
        for: 10m
        labels:
          severity: P3
      - alert: KubeCPUQuotaOvercommit
        annotations:
          description: Cluster has overcommitted CPU resource requests for Namespaces.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuquotaovercommit
          summary: Cluster has overcommitted CPU resource requests.
        expr: |
          sum (
            min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})
          )
          /
          sum (
            kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}
          ) > 1.5
        for: 5m
        labels:
          severity: P3
      - alert: KubeMemoryQuotaOvercommit
        annotations:
          description: Cluster has overcommitted memory resource requests for Namespaces.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememoryquotaovercommit
          summary: Cluster has overcommitted memory resource requests.
        expr: |
          sum (
            min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})
          )
          /
          sum (
            kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}
          ) > 1.5
        for: 5m
        labels:
          severity: P3
      - alert: KubeQuotaAlmostFull
        annotations:
          description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
            }} of its {{ $labels.resource }} quota.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaalmostfull
          summary: Namespace quota is going to be full.
        expr: |
          kube_resourcequota{job="kube-state-metrics", type="used"}
            / ignoring(instance, job, type)
          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
            > 0.9 < 1
        for: 15m
        labels:
          severity: P5
      - alert: KubeQuotaFullyUsed
        annotations:
          description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
            }} of its {{ $labels.resource }} quota.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotafullyused
          summary: Namespace quota is fully used.
        expr: |
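          # Quota usage is exactly 100% of the hard limit.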
          kube_resourcequota{job="kube-state-metrics", type="used"}
            / ignoring(instance, job, type)
          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
            == 1
        for: 15m
        labels:
          severity: P5
      - alert: KubeQuotaExceeded
        annotations:
          description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
            }} of its {{ $labels.resource }} quota.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
          summary: Namespace quota has exceeded the limits.
        expr: |
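          # Quota usage is above 100% of the hard limit.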
          kube_resourcequota{job="kube-state-metrics", type="used"}
            / ignoring(instance, job, type)
          (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
            > 1
        for: 15m
        labels:
          severity: P3
      - alert: CPUThrottlingHigh
        annotations:
          description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
            {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
            }}.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
          summary: Processes experience elevated CPU throttling.
        expr: |
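          # More than 25% of the container's CPU CFS periods were throttled over the last 5m.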
          sum(increase(container_cpu_cfs_throttled_periods_total{container!="", job="cadvisor"}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
            / on (cluster, namespace, pod, container, instance) group_left
          sum(increase(container_cpu_cfs_periods_total{job="cadvisor"}[5m])) without (id, metrics_path, name, image, endpoint, job, node)
            > ( 25 / 100 )
        for: 15m
        labels:
          severity: P5
    - name: kubernetes-storage
      rules:
      - alert: KubePersistentVolumeFillingUp
        annotations:
          description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
            }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
            {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
          summary: PersistentVolume is filling up.
        expr: |
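          # Less than 3% of the volume's capacity is free, excluding read-only and explicitly excluded PVCs.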
          (
            kubelet_volume_stats_available_bytes{job="kubelet"}
              /
            kubelet_volume_stats_capacity_bytes{job="kubelet"}
          ) < 0.03
          and
          kubelet_volume_stats_used_bytes{job="kubelet"} > 0
          unless on(cluster, namespace, persistentvolumeclaim)
          kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1
          unless on(cluster, namespace, persistentvolumeclaim)
          kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
        for: 1m
        labels:
          severity: P1
      - alert: KubePersistentVolumeFillingUp
        annotations:
          description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
            }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
            {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
            | humanizePercentage }} is available.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefillingup
          summary: PersistentVolume is filling up.
        expr: |
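          # Less than 15% free and, extrapolating the 6h trend, expected to run out of space within four days.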
          (
            kubelet_volume_stats_available_bytes{job="kubelet"}
              /
            kubelet_volume_stats_capacity_bytes{job="kubelet"}
          ) < 0.15
          and
          kubelet_volume_stats_used_bytes{job="kubelet"} > 0
          and
          predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
          unless on(cluster, namespace, persistentvolumeclaim)
          kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1
          unless on(cluster, namespace, persistentvolumeclaim)
          kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
        for: 1h
        labels:
          severity: P3
      - alert: KubePersistentVolumeInodesFillingUp
        annotations:
          description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
            }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
            {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeinodesfillingup
          summary: PersistentVolumeInodes are filling up.
        expr: |
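          # Less than 3% of the volume's inodes are free, excluding read-only and explicitly excluded PVCs.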
          (
            kubelet_volume_stats_inodes_free{job="kubelet"}
              /
            kubelet_volume_stats_inodes{job="kubelet"}
          ) < 0.03
          and
          kubelet_volume_stats_inodes_used{job="kubelet"} > 0
          unless on(cluster, namespace, persistentvolumeclaim)
          kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1
          unless on(cluster, namespace, persistentvolumeclaim)
          kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
        for: 1m
        labels:
          severity: P1
      - alert: KubePersistentVolumeInodesFillingUp
        annotations:
          description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
            }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
            {{ . }} {{- end }} is expected to run out of inodes within four days. Currently
            {{ $value | humanizePercentage }} of its inodes are free.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeinodesfillingup
          summary: PersistentVolumeInodes are filling up.
        expr: |
          (
            kubelet_volume_stats_inodes_free{job="kubelet"}
              /
            kubelet_volume_stats_inodes{job="kubelet"}
          ) < 0.15
          and
          kubelet_volume_stats_inodes_used{job="kubelet"} > 0
          and
          predict_linear(kubelet_volume_stats_inodes_free{job="kubelet"}[6h], 4 * 24 * 3600) < 0
          unless on(cluster, namespace, persistentvolumeclaim)
          kube_persistentvolumeclaim_access_mode{access_mode="ReadOnlyMany"} == 1
          unless on(cluster, namespace, persistentvolumeclaim)
          kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
        for: 1h
        labels:
          severity: P3
      - alert: KubePersistentVolumeErrors
        annotations:
          description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
            -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
          summary: PersistentVolume is having issues with provisioning.
        expr: |
          kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
        for: 5m
        labels:
          severity: P1
    - name: kubernetes-system
      rules:
      - alert: KubeVersionMismatch
        annotations:
          description: There are {{ $value }} different semantic versions of Kubernetes
            components running.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
          summary: Different semantic versions of Kubernetes components running.
        expr: |
          count by (cluster) (count by (git_version, cluster) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"git_version","$1","git_version","(v[0-9]*.[0-9]*).*"))) > 1
        for: 15m
        labels:
          severity: P3
      - alert: KubeClientErrors
        annotations:
          description: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
            }}' is experiencing {{ $value | humanizePercentage }} errors.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
          summary: Kubernetes API server client is experiencing errors.
        expr: |
          (sum(rate(rest_client_requests_total{job="apiserver",code=~"5.."}[5m])) by (cluster, instance, job, namespace)
            /
          sum(rate(rest_client_requests_total{job="apiserver"}[5m])) by (cluster, instance, job, namespace))
          > 0.01
        for: 15m
        labels:
          severity: P3
    - name: kube-apiserver-slos
      rules:
      - alert: KubeAPIErrorBudgetBurn
        annotations:
          description: The API server is burning too much error budget.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
          summary: The API server is burning too much error budget.
        expr: |
          sum by(cluster) (apiserver_request:burnrate1h) > (14.40 * 0.01000)
          and on(cluster)
          sum by(cluster) (apiserver_request:burnrate5m) > (14.40 * 0.01000)
        for: 2m
        labels:
          long: 1h
          severity: P1
          short: 5m
      - alert: KubeAPIErrorBudgetBurn
        annotations:
          description: The API server is burning too much error budget.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
          summary: The API server is burning too much error budget.
        expr: |
          sum by(cluster) (apiserver_request:burnrate6h) > (6.00 * 0.01000)
          and on(cluster)
          sum by(cluster) (apiserver_request:burnrate30m) > (6.00 * 0.01000)
        for: 15m
        labels:
          long: 6h
          severity: P1
          short: 30m
      - alert: KubeAPIErrorBudgetBurn
        annotations:
          description: The API server is burning too much error budget.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
          summary: The API server is burning too much error budget.
        expr: |
          sum by(cluster) (apiserver_request:burnrate1d) > (3.00 * 0.01000)
          and on(cluster)
          sum by(cluster) (apiserver_request:burnrate2h) > (3.00 * 0.01000)
        for: 1h
        labels:
          long: 1d
          severity: P3
          short: 2h
      - alert: KubeAPIErrorBudgetBurn
        annotations:
          description: The API server is burning too much error budget.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorbudgetburn
          summary: The API server is burning too much error budget.
        expr: |
          sum by(cluster) (apiserver_request:burnrate3d) > (1.00 * 0.01000)
          and on(cluster)
          sum by(cluster) (apiserver_request:burnrate6h) > (1.00 * 0.01000)
        for: 3h
        labels:
          long: 3d
          severity: P3
          short: 6h
    - name: kubernetes-system-apiserver
      rules:
      - alert: KubeClientCertificateExpiration
        annotations:
          description: A client certificate used to authenticate to the Kubernetes API
            server is expiring in less than 7.0 days.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
          summary: Client certificate is about to expire.
        expr: |
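          # 1st percentile of client certificate remaining lifetime observed by the apiserver is below 7 days (604800s).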
          histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
          and
          on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
        for: 5m
        labels:
          severity: P3
      - alert: KubeClientCertificateExpiration
        annotations:
          description: A client certificate used to authenticate to the Kubernetes API
            server is expiring in less than 24.0 hours.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
          summary: Client certificate is about to expire.
        expr: |
          histogram_quantile(0.01, sum without (namespace, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
          and
          on(job, cluster, instance) apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
        for: 5m
        labels:
          severity: P1
      - alert: KubeAggregatedAPIErrors
        annotations:
          description: Kubernetes aggregated API {{ $labels.instance }}/{{ $labels.name
            }} has reported {{ $labels.reason }} errors.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapierrors
          summary: Kubernetes aggregated API has reported errors.
        expr: |
          sum by(cluster, instance, name, reason)(increase(aggregator_unavailable_apiservice_total{job="apiserver"}[1m])) > 0
        for: 10m
        labels:
          severity: P3
      - alert: KubeAggregatedAPIDown
        annotations:
          description: Kubernetes aggregated API {{ $labels.name }}/{{ $labels.namespace
            }} has been only {{ $value | humanize }}% available over the last 10m.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeaggregatedapidown
          summary: Kubernetes aggregated API is down.
        expr: |
          (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice{job="apiserver"}[10m]))) * 100 < 85
        for: 5m
        labels:
          severity: P3
      - alert: KubeAPIDown
        annotations:
          description: KubeAPI has disappeared from Prometheus target discovery.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
          summary: Target disappeared from Prometheus target discovery.
        expr: |
          absent(up{job="apiserver"} == 1)
        for: 15m
        labels:
          severity: P1
      - alert: KubeAPITerminatedRequests
        annotations:
          description: The kubernetes apiserver has terminated {{ $value | humanizePercentage
            }} of its incoming requests.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapiterminatedrequests
          summary: The kubernetes apiserver has terminated {{ $value | humanizePercentage
            }} of its incoming requests.
        expr: |
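          # Share of incoming apiserver requests that were terminated exceeds 20% over the last 10m.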
          sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum by(cluster) (rate(apiserver_request_total{job="apiserver"}[10m])) + sum by(cluster) (rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
        for: 5m
        labels:
          severity: P3
    - name: kubernetes-system-kubelet
      rules:
      - alert: KubeNodeNotReady
        annotations:
          description: '{{ $labels.node }} has been unready for more than 15 minutes.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
          summary: Node is not ready.
        expr: |
          kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
          and on (cluster, node)
          kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
        for: 15m
        labels:
          severity: P3
      - alert: KubeNodePressure
        annotations:
          description: '{{ $labels.node }} has an active Condition {{ $labels.condition }}.
            This is caused by resource usage exceeding eviction thresholds.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodepressure
          summary: Node has an active Condition.
        expr: |
          kube_node_status_condition{job="kube-state-metrics",condition=~"(MemoryPressure|DiskPressure|PIDPressure)",status="true"} == 1
          and on (cluster, node)
          kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
        for: 10m
        labels:
          severity: P5
      - alert: KubeNodeUnreachable
        annotations:
          description: '{{ $labels.node }} is unreachable and some workloads may be rescheduled.'
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
          summary: Node is unreachable.
        expr: |
          (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
        for: 15m
        labels:
          severity: P3
      - alert: KubeletTooManyPods
        annotations:
          description: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
            }} of its Pod capacity.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
          summary: Kubelet is running at capacity.
        expr: |
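          # Running pods per kubelet (joined with its node via kubelet_node_name) divided by the node's pod capacity is above 95%.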
          (
            max by (cluster, instance) (
              kubelet_running_pods{job="kubelet"} > 1
            )
            * on (cluster, instance) group_left(node)
            max by (cluster, instance, node) (
              kubelet_node_name{job="kubelet"}
            )
          )
          / on (cluster, node) group_left()
          max by (cluster, node) (
            kube_node_status_capacity{job="kube-state-metrics", resource="pods"} != 1
          ) > 0.95
        for: 15m
        labels:
          severity: P5
      - alert: KubeNodeReadinessFlapping
        annotations:
          description: The readiness status of node {{ $labels.node }} has changed {{
            $value }} times in the last 15 minutes.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
          summary: Node readiness status is flapping.
        expr: |
          sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2
          and on (cluster, node)
          kube_node_spec_unschedulable{job="kube-state-metrics"} == 0
        for: 15m
        labels:
          severity: P3
      - alert: KubeNodeEviction
        annotations:
          description: Node {{ $labels.node }} is evicting Pods due to {{ $labels.eviction_signal
            }}. Eviction occurs when eviction thresholds are crossed, typically caused
            by Pods exceeding RAM/ephemeral-storage limits.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeeviction
          summary: Node is evicting pods.
        expr: |
          sum(rate(kubelet_evictions{job="kubelet"}[15m])) by(cluster, eviction_signal, instance)
          * on (cluster, instance) group_left(node)
          max by (cluster, instance, node) (
            kubelet_node_name{job="kubelet"}
          )
          > 0
        for: 0s
        labels:
          severity: P5
      - alert: KubeletPlegDurationHigh
        annotations:
          description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile
            duration of {{ $value }} seconds on node {{ $labels.node }}.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
          summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
        expr: |
          node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
        for: 5m
        labels:
          severity: P3
      - alert: KubeletPodStartUpLatencyHigh
        annotations:
          description: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
            on node {{ $labels.node }}.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
          summary: Kubelet Pod startup latency is too high.
        expr: |
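          # 99th percentile of pod worker duration per kubelet, joined with the node name, is above 60s.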
          histogram_quantile(0.99,
            sum by (cluster, instance, le) (
              topk by (cluster, instance, le, operation_type) (1,
                rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet"}[5m])
              )
            )
          )
          * on(cluster, instance) group_left(node)
          topk by (cluster, instance, node) (1,
            kubelet_node_name{job="kubelet"}
          )
          > 60
        for: 15m
        labels:
          severity: P3
      - alert: KubeletClientCertificateExpiration
        annotations:
          description: Client certificate for Kubelet on node {{ $labels.node }} expires
            in {{ $value | humanizeDuration }}.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
          summary: Kubelet client certificate is about to expire.
        expr: |
          kubelet_certificate_manager_client_ttl_seconds < 604800
        labels:
          severity: P3
      - alert: KubeletClientCertificateExpiration
        annotations:
          description: Client certificate for Kubelet on node {{ $labels.node }} expires
            in {{ $value | humanizeDuration }}.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificateexpiration
          summary: Kubelet client certificate is about to expire.
        expr: |
          kubelet_certificate_manager_client_ttl_seconds < 86400
        labels:
          severity: P1
      - alert: KubeletServerCertificateExpiration
        annotations:
          description: Server certificate for Kubelet on node {{ $labels.node }} expires
            in {{ $value | humanizeDuration }}.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
          summary: Kubelet server certificate is about to expire.
        expr: |
          kubelet_certificate_manager_server_ttl_seconds < 604800
        labels:
          severity: P3
      - alert: KubeletServerCertificateExpiration
        annotations:
          description: Server certificate for Kubelet on node {{ $labels.node }} expires
            in {{ $value | humanizeDuration }}.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificateexpiration
          summary: Kubelet server certificate is about to expire.
        expr: |
          kubelet_certificate_manager_server_ttl_seconds < 86400
        labels:
          severity: P1
      - alert: KubeletClientCertificateRenewalErrors
        annotations:
          description: Kubelet on node {{ $labels.node }} has failed to renew its client
            certificate ({{ $value | humanize }} errors in the last 5 minutes).
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletclientcertificaterenewalerrors
          summary: Kubelet has failed to renew its client certificate.
        expr: |
          increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
        for: 15m
        labels:
          severity: P3
      - alert: KubeletServerCertificateRenewalErrors
        annotations:
          description: Kubelet on node {{ $labels.node }} has failed to renew its server
            certificate ({{ $value | humanize }} errors in the last 5 minutes).
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletservercertificaterenewalerrors
          summary: Kubelet has failed to renew its server certificate.
        expr: |
          increase(kubelet_server_expiration_renew_errors[5m]) > 0
        for: 15m
        labels:
          severity: P3
      - alert: KubeletDown
        annotations:
          description: Kubelet has disappeared from Prometheus target discovery.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
          summary: Target disappeared from Prometheus target discovery.
        expr: |
          absent(up{job="kubelet"} == 1)
        for: 15m
        labels:
          severity: P1
    - name: kubernetes-system-scheduler
      rules:
      - alert: KubeSchedulerDown
        annotations:
          description: KubeScheduler has disappeared from Prometheus target discovery.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
          summary: Target disappeared from Prometheus target discovery.
        expr: |
          absent(up{job="kube-scheduler"} == 1)
        for: 15m
        labels:
          severity: P1
    - name: kubernetes-system-controller-manager
      rules:
      - alert: KubeControllerManagerDown
        annotations:
          description: KubeControllerManager has disappeared from Prometheus target discovery.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
          summary: Target disappeared from Prometheus target discovery.
        expr: |
          absent(up{job="kube-controller-manager"} == 1)
        for: 15m
        labels:
          severity: P1
    - name: kubernetes-system-kube-proxy
      rules:
      - alert: KubeProxyDown
        annotations:
          description: KubeProxy has disappeared from Prometheus target discovery.
          runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeproxydown
          summary: Target disappeared from Prometheus target discovery.
        expr: |
          absent(up{job="kube-proxy"} == 1)
        for: 15m
        labels:
          severity: P1
    - interval: 3m
      name: kube-apiserver-availability.rules
      rules:
      - expr: |
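          # Approximate the 30d increase from the average hourly increase (24 hours * 30 days).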
          avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
        record: code_verb:apiserver_request_total:increase30d
      - expr: |
          sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
        labels:
          verb: read
        record: code:apiserver_request_total:increase30d
      - expr: |
          sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
        labels:
          verb: write
        record: code:apiserver_request_total:increase30d
      - expr: |
          sum by (cluster, verb, scope, le) (increase(apiserver_request_sli_duration_seconds_bucket[1h]))
        record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h
      - expr: |
          sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
        record: cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d
      - expr: |
          sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase1h{le="+Inf"})
        record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase1h
      - expr: |
          sum by (cluster, verb, scope) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{le="+Inf"})
        record: cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d
      - expr: |
          1 - (
            (
              # write too slow
              sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
              -
              sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"} or vector(0))
            ) +
            (
              # read too slow
              sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
              -
              (
                sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"} or vector(0))
                +
                sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"} or vector(0))
                +
                sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"} or vector(0))
              )
            ) +
            # errors
            sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
          )
          /
          sum by (cluster) (code:apiserver_request_total:increase30d)
        labels:
          verb: all
        record: apiserver_request:availability30d
      - expr: |
          1 - (
            sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"LIST|GET"})
            -
            (
              # too slow
              sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le=~"1(\\.0)?"} or vector(0))
              +
              sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le=~"5(\\.0)?"} or vector(0))
              +
              sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le=~"30(\\.0)?"} or vector(0))
            )
            +
            # errors
            sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
          )
          /
          sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
        labels:
          verb: read
        record: apiserver_request:availability30d
      - expr: |
          1 - (
            (
              # too slow
              sum by (cluster) (cluster_verb_scope:apiserver_request_sli_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
              -
              sum by (cluster) (cluster_verb_scope_le:apiserver_request_sli_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le=~"1(\\.0)?"} or vector(0))
            )
            +
            # errors
            sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
          )
          /
          sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
        labels:
          verb: write
        record: apiserver_request:availability30d
      - expr: |
          sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
        labels:
          verb: read
        record: code_resource:apiserver_request_total:rate5m
      - expr: |
          sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
        labels:
          verb: write
        record: code_resource:apiserver_request_total:rate5m
      - expr: |
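          # 1h increase of requests answered with 2xx; the rules below record 3xx, 4xx and 5xx into the same series.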
          sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
        record: code_verb:apiserver_request_total:increase1h
      - expr: |
          sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
        record: code_verb:apiserver_request_total:increase1h
      - expr: |
          sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
        record: code_verb:apiserver_request_total:increase1h
      - expr: |
          sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
        record: code_verb:apiserver_request_total:increase1h
    - name: kube-apiserver-burnrate.rules
      rules:
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
              -
              (
                (
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1d]))
                  or
                  vector(0)
                )
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1d]))
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1d]))
              )
            )
            +
            # errors
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1d]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1d]))
        labels:
          verb: read
        record: apiserver_request:burnrate1d
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
              -
              (
                (
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[1h]))
                  or
                  vector(0)
                )
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[1h]))
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[1h]))
              )
            )
            +
            # errors
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[1h]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[1h]))
        labels:
          verb: read
        record: apiserver_request:burnrate1h
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
              -
              (
                (
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[2h]))
                  or
                  vector(0)
                )
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[2h]))
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[2h]))
              )
            )
            +
            # errors
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[2h]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[2h]))
        labels:
          verb: read
        record: apiserver_request:burnrate2h
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
              -
              (
                (
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[30m]))
                  or
                  vector(0)
                )
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[30m]))
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[30m]))
              )
            )
            +
            # errors
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[30m]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[30m]))
        labels:
          verb: read
        record: apiserver_request:burnrate30m
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
              -
              (
                (
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[3d]))
                  or
                  vector(0)
                )
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[3d]))
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[3d]))
              )
            )
            +
            # errors
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[3d]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[3d]))
        labels:
          verb: read
        record: apiserver_request:burnrate3d
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
              -
              (
                (
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[5m]))
                  or
                  vector(0)
                )
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[5m]))
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[5m]))
              )
            )
            +
            # errors
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[5m]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
        labels:
          verb: read
        record: apiserver_request:burnrate5m
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
              -
              (
                (
                  sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope=~"resource|",le=~"1(\\.0)?"}[6h]))
                  or
                  vector(0)
                )
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="namespace",le=~"5(\\.0)?"}[6h]))
                +
                sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward",scope="cluster",le=~"30(\\.0)?"}[6h]))
              )
            )
            +
            # errors
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET",code=~"5.."}[6h]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[6h]))
        labels:
          verb: read
        record: apiserver_request:burnrate6h
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1d]))
              -
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1d]))
            )
            +
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1d]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1d]))
        labels:
          verb: write
        record: apiserver_request:burnrate1d
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[1h]))
              -
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[1h]))
            )
            +
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[1h]))
        labels:
          verb: write
        record: apiserver_request:burnrate1h
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[2h]))
              -
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[2h]))
            )
            +
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[2h]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[2h]))
        labels:
          verb: write
        record: apiserver_request:burnrate2h
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[30m]))
              -
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[30m]))
            )
            +
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[30m]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[30m]))
        labels:
          verb: write
        record: apiserver_request:burnrate30m
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[3d]))
              -
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[3d]))
            )
            +
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[3d]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[3d]))
        labels:
          verb: write
        record: apiserver_request:burnrate3d
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))
              -
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[5m]))
            )
            +
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[5m]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
        labels:
          verb: write
        record: apiserver_request:burnrate5m
      - expr: |
          (
            (
              # too slow
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_count{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[6h]))
              -
              sum by (cluster) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward",le=~"1(\\.0)?"}[6h]))
            )
            +
            sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",code=~"5.."}[6h]))
          )
          /
          sum by (cluster) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[6h]))
        labels:
          verb: write
        record: apiserver_request:burnrate6h
    - name: kube-apiserver-histogram.rules
      rules:
      - expr: |
          histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"LIST|GET",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
        labels:
          quantile: "0.99"
          verb: read
        record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(0.99, sum by (cluster, le, resource) (rate(apiserver_request_sli_duration_seconds_bucket{job="apiserver",verb=~"POST|PUT|PATCH|DELETE",subresource!~"proxy|attach|log|exec|portforward"}[5m]))) > 0
        labels:
          quantile: "0.99"
          verb: write
        record: cluster_quantile:apiserver_request_sli_duration_seconds:histogram_quantile
    - name: k8s.rules.container_cpu_usage_seconds_total
      rules:
      - expr: |
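          # Per-container CPU usage rate, joined with kube_pod_info to attach the node label.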
          sum by (cluster, namespace, pod, container) (
            rate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m])
          ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
            1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate5m
      - expr: |
          sum by (cluster, namespace, pod, container) (
            irate(container_cpu_usage_seconds_total{job="cadvisor", image!=""}[5m])
          ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
            1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
    - name: k8s.rules.container_memory_working_set_bytes
      rules:
      - expr: |
          container_memory_working_set_bytes{job="cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_working_set_bytes
    - name: k8s.rules.container_memory_rss
      rules:
      - expr: |
          container_memory_rss{job="cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_rss
    - name: k8s.rules.container_memory_cache
      rules:
      - expr: |
          container_memory_cache{job="cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_cache
    - name: k8s.rules.container_memory_swap
      rules:
      - expr: |
          container_memory_swap{job="cadvisor", image!=""}
          * on (cluster, namespace, pod) group_left(node) topk by(cluster, namespace, pod) (1,
            max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
          )
        record: node_namespace_pod_container:container_memory_swap
    - name: k8s.rules.container_memory_requests
      rules:
      - expr: |
          kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}  * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_requests
      - expr: |
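          # Per-Namespace sum of memory requests for Pods that are Pending or Running.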
          sum by (namespace, cluster) (
              sum by (namespace, pod, cluster) (
                  max by (namespace, pod, container, cluster) (
                    kube_pod_container_resource_requests{resource="memory",job="kube-state-metrics"}
                  ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
                  )
              )
          )
        record: namespace_memory:kube_pod_container_resource_requests:sum
    - name: k8s.rules.container_cpu_requests
      rules:
      - expr: |
          kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}  * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_requests
      - expr: |
          sum by (namespace, cluster) (
              sum by (namespace, pod, cluster) (
                  max by (namespace, pod, container, cluster) (
                    kube_pod_container_resource_requests{resource="cpu",job="kube-state-metrics"}
                  ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
                  )
              )
          )
        record: namespace_cpu:kube_pod_container_resource_requests:sum
    - name: k8s.rules.container_memory_limits
      rules:
      - expr: |
          kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}  * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_memory:active:kube_pod_container_resource_limits
      - expr: |
          sum by (namespace, cluster) (
              sum by (namespace, pod, cluster) (
                  max by (namespace, pod, container, cluster) (
                    kube_pod_container_resource_limits{resource="memory",job="kube-state-metrics"}
                  ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
                  )
              )
          )
        record: namespace_memory:kube_pod_container_resource_limits:sum
    - name: k8s.rules.container_cpu_limits
      rules:
      - expr: |
          kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}  * on (namespace, pod, cluster)
          group_left() max by (namespace, pod, cluster) (
            (kube_pod_status_phase{phase=~"Pending|Running"} == 1)
          )
        record: cluster:namespace:pod_cpu:active:kube_pod_container_resource_limits
      - expr: |
          sum by (namespace, cluster) (
              sum by (namespace, pod, cluster) (
                  max by (namespace, pod, container, cluster) (
                    kube_pod_container_resource_limits{resource="cpu",job="kube-state-metrics"}
                  ) * on(namespace, pod, cluster) group_left() max by (namespace, pod, cluster) (
                    kube_pod_status_phase{phase=~"Pending|Running"} == 1
                  )
              )
          )
        record: namespace_cpu:kube_pod_container_resource_limits:sum
    - name: k8s.rules.pod_owner
      rules:
      - expr: |
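          # Map Pods owned by a ReplicaSet that itself has no owner to the ReplicaSet name as the workload.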
          max by (cluster, namespace, workload, pod) (
            label_replace(
              label_replace(
                kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
                "replicaset", "$1", "owner_name", "(.*)"
              ) * on (cluster, replicaset, namespace) group_left(owner_name) topk by(cluster, replicaset, namespace) (
                1, max by (cluster, replicaset, namespace, owner_name) (
                  kube_replicaset_owner{job="kube-state-metrics", owner_kind=""}
                )
              ),
              "workload", "$1", "replicaset", "(.*)"
            )
          )
        labels:
          workload_type: replicaset
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |
          max by (cluster, namespace, workload, pod) (
            label_replace(
              label_replace(
                kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
                "replicaset", "$1", "owner_name", "(.*)"
              ) * on(replicaset, namespace, cluster) group_left(owner_name) topk by(cluster, replicaset, namespace) (
                1, max by (cluster, replicaset, namespace, owner_name) (
                  kube_replicaset_owner{job="kube-state-metrics", owner_kind="Deployment"}
                )
              ),
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: deployment
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
              "workload", "$1", "owner_name", "(.*)"
            )
          )
        labels:
          workload_type: daemonset
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
            "workload", "$1", "owner_name", "(.*)")
          )
        labels:
          workload_type: statefulset
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |
          group by (cluster, namespace, workload, pod) (
            label_join(
              group by (cluster, namespace, job_name, pod, owner_name) (
                label_join(
                  kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}
                , "job_name", "", "owner_name")
              )
              * on (cluster, namespace, job_name) group_left()
              group by (cluster, namespace, job_name) (
                kube_job_owner{job="kube-state-metrics", owner_kind=~"Pod|"}
              )
            , "workload", "", "owner_name")
          )
        labels:
          workload_type: job
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="", owner_name=""},
            "workload", "$1", "pod", "(.+)")
          )
        labels:
          workload_type: barepod
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |
          max by (cluster, namespace, workload, pod) (
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="Node"},
            "workload", "$1", "pod", "(.+)")
          )
        labels:
          workload_type: staticpod
        record: namespace_workload_pod:kube_pod_owner:relabel
      - expr: |
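          # Catch-all workload mapping (a descriptive note, not part of the upstream rule text):
          # Jobs are attributed to their non-Pod owner (e.g. a CronJob), ReplicaSets to their
          # non-Deployment owner, and any remaining owner kinds are used directly as
          # workload/workload_type.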
          group by (cluster, namespace, workload, workload_type, pod) (
            label_join(
              label_join(
                group by (cluster, namespace, job_name, pod) (
                  label_join(
                    kube_pod_owner{job="kube-state-metrics", owner_kind="Job"}
                  , "job_name", "", "owner_name")
                )
                * on (cluster, namespace, job_name) group_left(owner_kind, owner_name)
                group by (cluster, namespace, job_name, owner_kind, owner_name) (
                  kube_job_owner{job="kube-state-metrics", owner_kind!="Pod", owner_kind!=""}
                )
              , "workload", "", "owner_name")
            , "workload_type", "", "owner_kind")

            OR

            label_replace(
              label_replace(
                label_replace(
                  kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}
                  , "replicaset", "$1", "owner_name", "(.+)"
                )
                * on(cluster, namespace, replicaset) group_left(owner_kind, owner_name)
                group by (cluster, namespace, replicaset, owner_kind, owner_name) (
                  kube_replicaset_owner{job="kube-state-metrics", owner_kind!="Deployment", owner_kind!=""}
                )
              , "workload", "$1", "owner_name", "(.+)")
              OR
              label_replace(
                group by (cluster, namespace, pod, owner_name, owner_kind) (
                  kube_pod_owner{job="kube-state-metrics", owner_kind!="ReplicaSet", owner_kind!="DaemonSet", owner_kind!="StatefulSet", owner_kind!="Job", owner_kind!="Node", owner_kind!=""}
                )
                , "workload", "$1", "owner_name", "(.+)"
              )
            , "workload_type", "$1", "owner_kind", "(.+)")
          )
        record: namespace_workload_pod:kube_pod_owner:relabel
    - name: kube-scheduler.rules
      rules:
      - expr: |
          histogram_quantile(0.99, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        labels:
          quantile: "0.99"
        record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        labels:
          quantile: "0.99"
        record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        labels:
          quantile: "0.99"
        record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(0.9, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        labels:
          quantile: "0.9"
        record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        labels:
          quantile: "0.9"
        record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        labels:
          quantile: "0.9"
        record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(0.5, sum(rate(scheduler_scheduling_attempt_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        labels:
          quantile: "0.5"
        record: cluster_quantile:scheduler_scheduling_attempt_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        labels:
          quantile: "0.5"
        record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
        labels:
          quantile: "0.5"
        record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
    - name: node.rules
      rules:
      - expr: |
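          # topk(1, ...) keeps a single kube_pod_info series per pod so later joins on
          # the node label are unambiguous.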
          topk by(cluster, namespace, pod) (1,
            max by (cluster, node, namespace, pod) (
              label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
          ))
        record: 'node_namespace_pod:kube_pod_info:'
      - expr: |
          count by (cluster, node) (
            node_cpu_seconds_total{mode="idle",job="node-exporter"}
            * on (cluster, namespace, pod) group_left(node)
            topk by(cluster, namespace, pod) (1, node_namespace_pod:kube_pod_info:)
          )
        record: node:node_num_cpu:sum
      - expr: |
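          # Fall back to Buffers + Cached + MemFree + Slab on kernels that do not
          # expose MemAvailable.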
          sum(
            node_memory_MemAvailable_bytes{job="node-exporter"} or
            (
              node_memory_Buffers_bytes{job="node-exporter"} +
              node_memory_Cached_bytes{job="node-exporter"} +
              node_memory_MemFree_bytes{job="node-exporter"} +
              node_memory_Slab_bytes{job="node-exporter"}
            )
          ) by (cluster)
        record: :node_memory_MemAvailable_bytes:sum
      - expr: |
          avg by (cluster, node) (
            sum without (mode) (
              rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
            )
          )
        record: node:node_cpu_utilization:ratio_rate5m
      - expr: |
          avg by (cluster) (
            node:node_cpu_utilization:ratio_rate5m
          )
        record: cluster:node_cpu:ratio_rate5m
    - name: kubelet.rules
      rules:
      - expr: |
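          # Join on kubelet_node_name so the resulting quantile carries the node label.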
          histogram_quantile(
            0.99,
            sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le)
            * on(cluster, instance) group_left (node)
            max by (cluster, instance, node) (kubelet_node_name{job="kubelet"})
          )
        labels:
          quantile: "0.99"
        record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(
            0.9,
            sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le)
            * on(cluster, instance) group_left (node)
            max by (cluster, instance, node) (kubelet_node_name{job="kubelet"})
          )
        labels:
          quantile: "0.9"
        record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
      - expr: |
          histogram_quantile(
            0.5,
            sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet"}[5m])) by (cluster, instance, le)
            * on(cluster, instance) group_left (node)
            max by (cluster, instance, node) (kubelet_node_name{job="kubelet"})
          )
        labels:
          quantile: "0.5"
        record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
  monitoring-kube-prometheus-stack-kube-prometheus-general.rules-d39990f2-d25f-4d90-9dcc-7818c78d8104.yaml: |
    groups:
    - name: kube-prometheus-general.rules
      rules:
      - expr: count without(instance, pod, node) (up == 1)
        record: count:up1
      - expr: count without(instance, pod, node) (up == 0)
        record: count:up0
  monitoring-kube-prometheus-stack-kube-prometheus-node-recording.rules-b44861aa-d0cd-455b-935e-36923d541bc5.yaml: |
    groups:
    - name: kube-prometheus-node-recording.rules
      rules:
      - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m]))
          BY (instance)
        record: instance:node_cpu:rate:sum
      - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
        record: instance:node_network_receive_bytes:rate:sum
      - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
        record: instance:node_network_transmit_bytes:rate:sum
      - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
          WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
          BY (instance, cpu)) BY (instance)
        record: instance:node_cpu:ratio
      - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
        record: cluster:node_cpu:sum_rate5m
      - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance,
          cpu))
        record: cluster:node_cpu:ratio
  monitoring-kube-prometheus-stack-kube-state-metrics-a0cc8152-2186-4296-a84f-44e666247896.yaml: |
    groups:
    - name: kube-state-metrics
      rules:
      - alert: KubeStateMetricsListErrors
        annotations:
          description: kube-state-metrics is experiencing errors at an elevated rate in
            list operations. This is likely preventing it from exposing metrics about
            Kubernetes objects correctly or at all.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricslisterrors
          summary: kube-state-metrics is experiencing errors in list operations.
        expr: |-
          (sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
            /
          sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
          > 0.01
        for: 15m
        labels:
          severity: critical
      - alert: KubeStateMetricsWatchErrors
        annotations:
          description: kube-state-metrics is experiencing errors at an elevated rate in
            watch operations. This is likely preventing it from exposing metrics about
            Kubernetes objects correctly or at all.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricswatcherrors
          summary: kube-state-metrics is experiencing errors in watch operations.
        expr: |-
          (sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
            /
          sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
          > 0.01
        for: 15m
        labels:
          severity: critical
      - alert: KubeStateMetricsShardingMismatch
        annotations:
          description: kube-state-metrics pods are running with different --total-shards
            configuration, so some Kubernetes objects may be exposed multiple times or
            not exposed at all.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardingmismatch
          summary: kube-state-metrics sharding is misconfigured.
        expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster)
          != 0
        for: 15m
        labels:
          severity: critical
      - alert: KubeStateMetricsShardsMissing
        annotations:
          description: kube-state-metrics shards are missing, so some Kubernetes objects
            are not being exposed.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kube-state-metrics/kubestatemetricsshardsmissing
          summary: kube-state-metrics shards are missing.
        expr: |-
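          # Treat shard ordinals as a bitmask: if every shard 0..total_shards-1 reports in,
          # the sum of 2^ordinal equals 2^total_shards - 1; any other value means a shard is missing.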
          2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
            -
          sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
          != 0
        for: 15m
        labels:
          severity: critical
  monitoring-kube-prometheus-stack-memcached-71f7cfcb-97d5-4d12-80fd-de99f790a616.yaml: |
    groups:
    - name: memcached
      rules:
      - alert: MemcachedDown
        annotations:
          description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} has
            been down for more than 15 minutes.
          summary: Memcached instance is down.
        expr: |
          memcached_up == 0
        for: 15m
        labels:
          severity: P1
      - alert: MemcachedConnectionLimitApproaching
        annotations:
          description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection
            usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes.
          summary: Memcached max connection limit is approaching.
        expr: |
          (memcached_current_connections / memcached_max_connections * 100) > 80
        for: 15m
        labels:
          severity: P3
      - alert: MemcachedConnectionLimitApproaching
        annotations:
          description: Memcached instance {{ $labels.job }} / {{ $labels.instance }} connection
            usage is at {{ printf "%0.0f" $value }}% for at least 15 minutes.
          summary: Memcached connections at critical level.
        expr: |
          (memcached_current_connections / memcached_max_connections * 100) > 95
        for: 15m
        labels:
          severity: P1
      - alert: MemcachedOutOfMemoryErrors
        annotations:
          description: Memcached instance {{ $labels.job }} / {{ $labels.instance }}
            has had OutOfMemory errors for at least 15 minutes; the current rate is
            {{ printf "%0.0f" $value }}.
          summary: Memcached has OutOfMemory errors.
        expr: |
          sum without (slab) (rate(memcached_slab_items_outofmemory_total[5m])) > 0
        for: 15m
        labels:
          severity: P3
  monitoring-kube-prometheus-stack-mysqld-1576adf3-10b0-4875-9200-2f8169310dd1.yaml: |
    groups:
    - name: MySQLdAlerts
    - name: GaleraAlerts
      rules:
      - alert: MySQLGaleraNotReady
        annotations:
          description: '{{$labels.job}} on {{$labels.instance}} is not ready.'
          summary: Galera cluster node not ready.
        expr: mysql_global_status_wsrep_ready != 1
        for: 5m
        labels:
          severity: P3
      - alert: MySQLGaleraOutOfSync
        annotations:
          description: The Galera node {{ $labels.instance }} has wsrep_local_state={{
            $value }}, which is not the expected value of 4 (Synced). The node is not
            in Donor state (2) and wsrep_desync is not enabled, indicating an unexpected
            loss of cluster sync. Normal behavior is wsrep_local_state=4 for all nodes
            not actively serving as SST donors.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#mysqlgaleraoutofsync
          summary: 'Percona XtraDB Cluster: Galera node not in sync with cluster'
        expr: |
          (mysql_global_status_wsrep_local_state != 4 and mysql_global_status_wsrep_local_state != 2 and mysql_global_variables_wsrep_desync == 0)
        for: 15m
        labels:
          severity: P3
      - alert: MySQLGaleraDonorFallingBehind
        annotations:
          description: '{{$labels.job}} on {{$labels.instance}} is a donor (hotbackup)
            and is falling behind (queue size {{$value}}).'
          summary: XtraDB cluster donor node falling behind.
        expr: (mysql_global_status_wsrep_local_state == 2 and mysql_global_status_wsrep_local_recv_queue
          > 100)
        for: 5m
        labels:
          severity: P3
      - alert: MySQLReplicationNotRunning
        annotations:
          description: Replication on {{$labels.instance}} (IO or SQL) has been down for
            more than 2 minutes.
          summary: Replication is not running.
        expr: mysql_slave_status_slave_io_running == 0 or mysql_slave_status_slave_sql_running
          == 0
        for: 2m
        labels:
          severity: P1
      - alert: MySQLReplicationLag
        annotations:
          description: Replication on {{$labels.instance}} has fallen behind and is not
            recovering.
          summary: MySQL slave replication is lagging.
        expr: (instance:mysql_slave_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_slave_lag_seconds[5m],
          60 * 2) > 0)
        for: 1m
        labels:
          severity: P1
      - alert: MySQLHeartbeatLag
        annotations:
          description: The heartbeat is lagging on {{$labels.instance}} and is not recovering.
          summary: MySQL heartbeat is lagging.
        expr: (instance:mysql_heartbeat_lag_seconds > 30) and on(instance) (predict_linear(instance:mysql_heartbeat_lag_seconds[5m],
          60 * 2) > 0)
        for: 1m
        labels:
          severity: P1
      - alert: MySQLInnoDBLogWaits
        annotations:
          description: The InnoDB logs are waiting for disk at a rate of {{$value}} per
            second.
          summary: MySQL innodb log writes stalling.
        expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
        labels:
          severity: P3
    - name: mysqld-extras
      rules:
      - alert: MysqlTooManyConnections
        expr: |
          max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
        for: 1m
        labels:
          severity: P3
      - alert: MysqlHighThreadsRunning
        expr: |
          max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
        for: 1m
        labels:
          severity: P3
      - alert: MysqlSlowQueries
        expr: |
          increase(mysql_global_status_slow_queries[1m]) > 0
        for: 2m
        labels:
          severity: P3
      - alert: MysqlClusterDown
        annotations:
          description: '{{ $labels.instance }} replica is down.'
          summary: Percona XtraDB Cluster replica is down
        expr: mysql_up == 0
        for: 5m
        labels:
          severity: P5
      - alert: MysqlClusterDown
        annotations:
          description: '{{ $value }}% of replicas are online.'
          summary: Percona XtraDB Cluster replicas are down
        expr: round(count(mysql_up==1) / count(mysql_up) * 100) <= 50
        for: 5m
        labels:
          severity: P3
      - alert: MysqlClusterDown
        annotations:
          description: All replicas are down.
          summary: Percona XtraDB Cluster is down
        expr: count(mysql_up==0) == count(mysql_up)
        for: 1m
        labels:
          severity: P1
    - name: mysqld_rules
      rules:
      - expr: mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay
        record: instance:mysql_slave_lag_seconds
      - expr: mysql_heartbeat_now_timestamp_seconds - mysql_heartbeat_stored_timestamp_seconds
        record: instance:mysql_heartbeat_lag_seconds
      - expr: sum without (command) (rate(mysql_global_status_commands_total{command=~"(commit|rollback)"}[5m]))
        record: job:mysql_transactions:rate5m
  monitoring-kube-prometheus-stack-nginx-0ba63222-b409-41a4-8bdd-dc6b75a1bbd8.yaml: |
    groups:
    - name: nginx-ingress
      rules:
      - alert: NginxIngressCriticalErrorBudgetBurn
        annotations:
          description: The service {{ $labels.service }} error rate is {{ $value | humanizePercentage
            }} over the last hour, which exceeds the 1.44% burn-rate threshold (14.4x
            against 99.9% SLO). At this rate, the 30-day error budget exhausts in under
            2.1 days.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nginxingresscriticalerrorbudgetburn
          summary: 'NGINX Ingress: elevated 5xx errors rapidly consuming error budget'
        expr: |
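          # Multi-window burn-rate check against a 99.9% SLO: both the 1h and 5m error
          # ratios must exceed 14.4x the 0.1% error budget (0.0144), and the service must
          # be receiving meaningful traffic (> 1 req/s).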
          (
          sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[1h]))
          /
          sum by (service) (rate(nginx_ingress_controller_requests[1h]))
          ) > 0.0144
          and
          (
          sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[5m]))
          /
          sum by (service) (rate(nginx_ingress_controller_requests[5m]))
          ) > 0.0144
          and
          sum by (service) (rate(nginx_ingress_controller_requests[5m])) > 1
        for: 2m
        labels:
          severity: P2
      - alert: NginxIngressHighErrorBudgetBurn
        annotations:
          description: The service {{ $labels.service }} error rate is {{ $value | humanizePercentage
            }} over the last 6 hours, which exceeds the 0.6% burn-rate threshold (6x against
            99.9% SLO). At this rate, the 30-day error budget exhausts in under 5 days.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nginxingresshigherrorbudgetburn
          summary: 'NGINX Ingress: sustained 5xx errors depleting error budget'
        expr: |
          (
          sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[6h]))
          /
          sum by (service) (rate(nginx_ingress_controller_requests[6h]))
          ) > 0.006
          and
          (
          sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[30m]))
          /
          sum by (service) (rate(nginx_ingress_controller_requests[30m]))
          ) > 0.006
          and
          sum by (service) (rate(nginx_ingress_controller_requests[30m])) > 1
        for: 5m
        labels:
          severity: P2
      - alert: NginxIngressModerateErrorBudgetBurn
        annotations:
          description: The service {{ $labels.service }} error rate is {{ $value | humanizePercentage
            }} over the last day, which exceeds the 0.3% burn-rate threshold (3x against
            99.9% SLO). At this rate, the 30-day error budget exhausts in under 10 days.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nginxingressmoderateerrorbudgetburn
          summary: 'NGINX Ingress: ongoing 5xx errors steadily consuming error budget'
        expr: |
          (
          sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[1d]))
          /
          sum by (service) (rate(nginx_ingress_controller_requests[1d]))
          ) > 0.003
          and
          (
          sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[2h]))
          /
          sum by (service) (rate(nginx_ingress_controller_requests[2h]))
          ) > 0.003
          and
          sum by (service) (rate(nginx_ingress_controller_requests[2h])) > 1
        for: 15m
        labels:
          severity: P3
      - alert: NginxIngressLowErrorBudgetBurn
        annotations:
          description: The service {{ $labels.service }} error rate is {{ $value | humanizePercentage
            }} over the last 3 days, which exceeds the 0.1% burn-rate threshold (1x against
            99.9% SLO). At this rate, the 30-day error budget exhausts before the window
            resets.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nginxingresslowerrorbudgetburn
          summary: 'NGINX Ingress: low-level 5xx errors eroding error budget'
        expr: |
          (
          sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[3d]))
          /
          sum by (service) (rate(nginx_ingress_controller_requests[3d]))
          ) > 0.001
          and
          (
          sum by (service) (rate(nginx_ingress_controller_requests{status=~"5[0-9]{2}"}[6h]))
          /
          sum by (service) (rate(nginx_ingress_controller_requests[6h]))
          ) > 0.001
          and
          sum by (service) (rate(nginx_ingress_controller_requests[6h])) > 1
        for: 1h
        labels:
          severity: P4
  monitoring-kube-prometheus-stack-node-4e09c8ed-f557-4cc3-9eb7-6c21df020fd5.yaml: |
    groups:
    - name: node-exporter
      rules:
      - alert: NodeFilesystemSpaceFillingUp
        annotations:
          description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
            }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
            space left and is filling up.
          summary: Filesystem is predicted to run out of space within the next 24 hours.
        expr: |
          (
            node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
          and
            predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
          and
            node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
          )
        for: 1h
        labels:
          severity: P3
      - alert: NodeFilesystemSpaceFillingUp
        annotations:
          description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
            }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
            space left and is filling up fast.
          summary: Filesystem is predicted to run out of space within the next 4 hours.
        expr: |
          (
            node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
          and
            predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
          and
            node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
          )
        for: 1h
        labels:
          severity: P1
      - alert: NodeFilesystemAlmostOutOfSpace
        annotations:
          description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
            }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
            space left.
          summary: Filesystem has less than 5% space left.
        expr: |
          (
            node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
          and
            node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
          )
        for: 30m
        labels:
          severity: P3
      - alert: NodeFilesystemAlmostOutOfSpace
        annotations:
          description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
            }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
            space left.
          summary: Filesystem has less than 3% space left.
        expr: |
          (
            node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
          and
            node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
          )
        for: 30m
        labels:
          severity: P1
      - alert: NodeFilesystemFilesFillingUp
        annotations:
          description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
            }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
            inodes left and is filling up.
          summary: Filesystem is predicted to run out of inodes within the next 24 hours.
        expr: |
          (
            node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
          and
            predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
          and
            node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
          )
        for: 1h
        labels:
          severity: P3
      - alert: NodeFilesystemFilesFillingUp
        annotations:
          description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
            }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
            inodes left and is filling up fast.
          summary: Filesystem is predicted to run out of inodes within the next 4 hours.
        expr: |
          (
            node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
          and
            predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
          and
            node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
          )
        for: 1h
        labels:
          severity: P1
      - alert: NodeFilesystemAlmostOutOfFiles
        annotations:
          description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
            }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
            inodes left.
          summary: Filesystem has less than 5% inodes left.
        expr: |
          (
            node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
          and
            node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
          )
        for: 1h
        labels:
          severity: P3
      - alert: NodeFilesystemAlmostOutOfFiles
        annotations:
          description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
            }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
            inodes left.
          summary: Filesystem has less than 3% inodes left.
        expr: |
          (
            node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
          and
            node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
          )
        for: 1h
        labels:
          severity: P1
      - alert: NodeNetworkReceiveErrs
        annotations:
          description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
            {{ printf "%.0f" $value }} receive errors in the last two minutes.'
          summary: Network interface is reporting many receive errors.
        expr: |
          rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) > 0.01
        for: 1h
        labels:
          severity: P3
      - alert: NodeNetworkTransmitErrs
        annotations:
          description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
            {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
          summary: Network interface is reporting many transmit errors.
        expr: |
          rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
        for: 1h
        labels:
          severity: P3
      - alert: NodeHighNumberConntrackEntriesUsed
        annotations:
          description: '{{ $labels.instance }} {{ $value | humanizePercentage }} of conntrack
            entries are used.'
          summary: The number of conntrack entries is getting close to the limit.
        expr: |
          (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) > 0.75
        labels:
          severity: P3
      - alert: NodeTextFileCollectorScrapeError
        annotations:
          description: Node Exporter text file collector on {{ $labels.instance }} failed
            to scrape.
          summary: Node Exporter text file collector failed to scrape.
        expr: |
          node_textfile_scrape_error{job="node-exporter"} == 1
        labels:
          severity: P3
      - alert: NodeClockSkewDetected
        annotations:
          description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
            Ensure NTP is configured correctly on this host.
          summary: Clock skew detected.
        expr: |
          (
            node_timex_offset_seconds{job="node-exporter"} > 0.05
          and
            deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
          )
          or
          (
            node_timex_offset_seconds{job="node-exporter"} < -0.05
          and
            deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
          )
        for: 10m
        labels:
          severity: P3
      - alert: NodeClockNotSynchronising
        annotations:
          description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
            is configured on this host.
          summary: Clock not synchronising.
        expr: |
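          # sync_status at 0 for 5m combined with maxerror at or above its 16s saturation
          # value indicates the clock is free-running rather than briefly out of sync.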
          min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
          and
          node_timex_maxerror_seconds{job="node-exporter"} >= 16
        for: 10m
        labels:
          severity: P3
      - alert: NodeRAIDDegraded
        annotations:
          description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
            in a degraded state due to one or more disk failures. The number of spare
            drives is insufficient to fix the issue automatically.
          summary: RAID Array is degraded.
        expr: |
          node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
        for: 15m
        labels:
          severity: P1
      - alert: NodeRAIDDiskFailure
        annotations:
          description: At least one device in RAID array at {{ $labels.instance }} failed.
            Array '{{ $labels.device }}' needs attention and possibly a disk swap.
          summary: Failed device in RAID array.
        expr: |
          node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
        labels:
          severity: P3
      - alert: NodeFileDescriptorLimit
        annotations:
          description: File descriptors limit at {{ $labels.instance }} is currently at
            {{ printf "%.2f" $value }}%.
          summary: Kernel is predicted to exhaust file descriptors limit soon.
        expr: |
          (
            node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
          )
        for: 15m
        labels:
          severity: P3
      - alert: NodeFileDescriptorLimit
        annotations:
          description: File descriptors limit at {{ $labels.instance }} is currently at
            {{ printf "%.2f" $value }}%.
          summary: Kernel is predicted to exhaust file descriptors limit soon.
        expr: |
          (
            node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
          )
        for: 15m
        labels:
          severity: P1
      - alert: NodeCPUHighUsage
        annotations:
          description: |
            CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes and is currently at {{ printf "%.2f" $value }}%.
          summary: High CPU usage.
        expr: |
          sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", mode!~"idle|iowait"}[2m]))) * 100 > 90
        for: 15m
        labels:
          severity: P5
      - alert: NodeSystemSaturation
        annotations:
          description: |
            System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes and is currently at {{ printf "%.2f" $value }}.
            This might indicate resource saturation on this instance and can cause it to become unresponsive.
          summary: System saturated, load per core is very high.
        expr: |
          node_load1{job="node-exporter"}
          / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
        for: 15m
        labels:
          severity: P3
      - alert: NodeMemoryMajorPagesFaults
        annotations:
          description: |
            Major page faults have been occurring at a very high rate at {{ $labels.instance }}, above 500 per second for the last 15 minutes, and are currently at {{ printf "%.2f" $value }}.
            Please check that there is enough memory available on this instance.
          summary: Memory major page faults are occurring at very high rate.
        expr: |
          rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
        for: 15m
        labels:
          severity: P3
      - alert: NodeMemoryHighUtilization
        annotations:
          description: |
            Memory utilization at {{ $labels.instance }} has been above 90% for the last 15 minutes and is currently at {{ printf "%.2f" $value }}%.
          summary: Host is running out of memory.
        expr: |
          100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} * 100) > 90
        for: 15m
        labels:
          severity: P3
      - alert: NodeSystemdServiceFailed
        annotations:
          description: Systemd service {{ $labels.name }} has entered the failed state
            at {{ $labels.instance }}.
          summary: Systemd service has entered failed state.
        expr: |
          node_systemd_unit_state{job="node-exporter", state="failed"} == 1
        for: 5m
        labels:
          severity: P3
      - alert: NodeSystemdServiceCrashlooping
        annotations:
          description: Systemd service {{ $labels.name }} has been restarted too many
            times at {{ $labels.instance }} over the last 15 minutes. Please check whether
            the service is crash looping.
          summary: Systemd service keeps restarting, possibly crash looping.
        expr: |
          increase(node_systemd_service_restart_total{job="node-exporter"}[5m]) > 2
        for: 15m
        labels:
          severity: P3
      - alert: NodeBondingDegraded
        annotations:
          description: Bonding interface {{ $labels.master }} on {{ $labels.instance }}
            is in a degraded state due to one or more slave failures.
          summary: Bonding interface is degraded.
        expr: |
          (node_bonding_slaves{job="node-exporter"} - node_bonding_active{job="node-exporter"}) != 0
        for: 5m
        labels:
          severity: P3
    - name: node-exporter-extras
      rules:
      - alert: NodeTimeSkewDetected
        annotations:
          description: Node {{ $labels.instance }} has a time difference of {{ $value }}
            seconds between its clock and the Prometheus scrape timestamp.
          summary: Node {{ $labels.instance }} has a time difference.
        expr: |
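          # Compare the node's reported wall-clock time against the Prometheus scrape
          # timestamp; a difference above 1s suggests clock skew.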
          abs(timestamp(node_time_seconds{job="node-exporter"}) - node_time_seconds{job="node-exporter"}) > 1
        for: 5m
        labels:
          severity: P3
      - alert: NodeDiskHighLatency
        annotations:
          description: Average IO latency on {{ $labels.device }} at {{ $labels.instance
            }} is {{ $value | humanizeDuration }} over the last 5 minutes, which exceeds
            the threshold of 20ms. Normal SSD latency is below 1ms and normal HDD latency
            is below 15ms.
          runbook_url: https://vexxhost.github.io/atmosphere/admin/monitoring.html#nodediskhighlatency
          summary: 'Node disk: high IO latency affecting workloads'
        expr: |
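          # Average IO latency = (read + write time spent) / (read + write operations);
          # the second clause ensures the device actually has traffic so 0/0 cannot fire.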
          (
            (
              rate(node_disk_read_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
              +
              rate(node_disk_write_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
            )
            /
            (
              rate(node_disk_reads_completed_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
              +
              rate(node_disk_writes_completed_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
            )
          ) > 0.02
          and
          (
            rate(node_disk_reads_completed_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
            +
            rate(node_disk_writes_completed_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
          ) > 0
        for: 1h
        labels:
          severity: P4
    - name: node-exporter.rules
      rules:
      - expr: |
          count without (cpu, mode) (
            node_cpu_seconds_total{job="node-exporter",mode="idle"}
          )
        record: instance:node_num_cpu:sum
      - expr: |
          1 - avg without (cpu) (
            sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
          )
        record: instance:node_cpu_utilisation:rate5m
      - expr: |
          (
            node_load1{job="node-exporter"}
          /
            instance:node_num_cpu:sum{job="node-exporter"}
          )
        record: instance:node_load1_per_cpu:ratio
      - expr: |
          1 - (
            (
              node_memory_MemAvailable_bytes{job="node-exporter"}
              or
              (
                node_memory_Buffers_bytes{job="node-exporter"}
                +
                node_memory_Cached_bytes{job="node-exporter"}
                +
                node_memory_MemFree_bytes{job="node-exporter"}
                +
                node_memory_Slab_bytes{job="node-exporter"}
              )
            )
          /
            node_memory_MemTotal_bytes{job="node-exporter"}
          )
        record: instance:node_memory_utilisation:ratio
      - expr: |
          rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
        record: instance:node_vmstat_pgmajfault:rate5m
      - expr: |
          rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
        record: instance_device:node_disk_io_time_seconds:rate5m
      - expr: |
          rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
        record: instance_device:node_disk_io_time_weighted_seconds:rate5m
      - expr: |
          sum without (device) (
            rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
          )
        record: instance:node_network_receive_bytes_excluding_lo:rate5m
      - expr: |
          sum without (device) (
            rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
          )
        record: instance:node_network_transmit_bytes_excluding_lo:rate5m
      - expr: |
          sum without (device) (
            rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
          )
        record: instance:node_network_receive_drop_excluding_lo:rate5m
      - expr: |
          sum without (device) (
            rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
          )
        record: instance:node_network_transmit_drop_excluding_lo:rate5m
  monitoring-kube-prometheus-stack-node-exporter-local-ba176107-189f-4742-8f92-0654ee25e7aa.yaml: |
    groups:
    - name: node
      rules:
      - alert: NodeHighLoadAverage
        expr: node_load5 / count(node_cpu_seconds_total{mode="system"}) without (cpu,
          mode) > 1.5
        for: 30m
        labels:
          severity: P3
      - alert: NodeHighCpuUsage
        expr: sum by(instance)(irate(node_cpu_seconds_total{mode='idle'}[5m])) < 1
        for: 2m
        labels:
          severity: P3
      - alert: NodeLowEntropy
        expr: node_entropy_available_bits / node_entropy_pool_size_bits < 0.20
        for: 5m
        labels:
          severity: P5
    - name: network
      rules:
      - alert: NodeNetworkMulticast
        annotations:
          description: This can result in high software interrupt load on the node, which
            can degrade network performance.
          runbook_url: https://github.com/vexxhost/atmosphere/tree/main/roles/kube_prometheus_stack#NodeNetworkMulticast
          summary: 'High multicast traffic on node {{ $labels.instance }}: {{ $value }}
            packets/sec'
        expr: rate(node_network_receive_multicast_total[1m]) > 1000
        for: 5m
        labels:
          severity: P1
    - name: softnet
      rules:
      - alert: SingleNodeSoftNetBacklog
        expr: count(node:softnet:backlog:1m > 5000) > (count(node:softnet:backlog:1m)
          * 0)
        for: 1m
        labels:
          severity: P3
      - alert: MultipleNodesSoftNetBacklog
        expr: count(node:softnet:backlog:1m > 5000) > (count(node:softnet:backlog:1m)
          * 0.5)
        for: 1m
        labels:
          severity: P2
      - alert: MajorityNodesSoftNetBacklog
        expr: count(node:softnet:backlog:1m > 5000) > (count(node:softnet:backlog:1m)
          * 0.75)
        for: 1m
        labels:
          severity: P1
      - alert: SingleNodeSoftNetDropped
        expr: count(node:softnet:dropped:1m > 0) > (count(node:softnet:dropped:1m) * 0)
        for: 1m
        labels:
          severity: P3
      - alert: MultipleNodesSoftNetDropped
        expr: count(node:softnet:dropped:1m > 0) > (count(node:softnet:dropped:1m) * 0.5)
        for: 1m
        labels:
          severity: P2
      - alert: MajorityNodesSoftNetDropped
        expr: count(node:softnet:dropped:1m > 0) > (count(node:softnet:dropped:1m) * 0.75)
        for: 1m
        labels:
          severity: P1
    - name: softnet.rules
      rules:
      - expr: sum(node_softnet_backlog_len) by (instance)
        record: node:softnet:backlog:1m
      - expr: sum(rate(node_softnet_dropped_total[1m])) by (instance)
        record: node:softnet:dropped:1m
  monitoring-kube-prometheus-stack-node-network-affabc94-6941-4bc4-ad5a-e049849cca9a.yaml: |
    groups:
    - name: node-network
      rules:
      - alert: NodeNetworkInterfaceFlapping
        annotations:
          description: Network interface "{{ $labels.device }}" is changing its up status
            often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/nodenetworkinterfaceflapping
          summary: Network interface is often changing its status
        expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
        for: 2m
        labels:
          severity: warning
  monitoring-kube-prometheus-stack-openstack-ef280df9-69db-4981-b8b9-f5abff3c11aa.yaml: |
    groups:
    - name: cinder
      rules:
      - alert: CinderAgentDisabled
        annotations:
          description: A Cinder agent has been administratively disabled for more than
            24 hours.
          summary: Cinder agent disabled
        expr: openstack_cinder_agent_state{adminState!="enabled"} > 0
        for: 24h
        labels:
          severity: P5
      - alert: CinderAgentDown
        annotations:
          description: A Cinder agent has been down for more than 15 minutes.
          summary: Cinder agent down
        expr: openstack_cinder_agent_state != 1
        for: 15m
        labels:
          severity: P3
      - alert: CinderAgentGroupDown
        annotations:
          description: All instances of a specific Cinder agent have been down for more
            than 5 minutes.
          summary: Cinder agent group down
        expr: min by (exported_service) (openstack_cinder_agent_state) == 0
        for: 5m
        labels:
          severity: P2
      - alert: CinderVolumeError
        annotations:
          description: A Cinder volume is in an error state.
          summary: Cinder volume error
        expr: openstack_cinder_volume_status{status=~"error.*"} > 0
        for: 24h
        labels:
          severity: P4
    - name: neutron
      rules:
      - alert: NeutronAgentDisabled
        annotations:
          description: A Neutron agent has been administratively disabled for more than
            24 hours.
          summary: Neutron agent disabled
        expr: openstack_neutron_agent_state{adminState!="up"} > 0
        for: 24h
        labels:
          severity: P5
      - alert: NeutronAgentDown
        annotations:
          description: A Neutron agent has been down for more than 15 minutes.
          summary: Neutron agent down
        expr: openstack_neutron_agent_state != 1
        for: 15m
        labels:
          severity: P3
      - alert: NeutronNetworkOutOfIPs
        annotations:
          description: The network {{ $labels.network_id }} is currently at {{ $value
            }}% utilization. If the IP addresses run out, it will impact the provisioning
            of new ports.
          summary: '[{{ $labels.network_id }}] Network running out of IPs'
        expr: (sum by (network_id) (openstack_neutron_network_ip_availabilities_used{project_id!=""})
          and on (network_id) label_replace(openstack_neutron_network{is_external="true",
          is_shared="true"}, "network_id", "$1", "id", "(.*)")) / (sum by (network_id)
          (openstack_neutron_network_ip_availabilities_total{project_id!=""}) and on (network_id)
          label_replace(openstack_neutron_network{is_external="true", is_shared="true"},
          "network_id", "$1", "id", "(.*)")) * 100 > 80
        for: 6h
        labels:
          severity: P3
      - alert: NeutronRouterMultipleActiveL3Agents
        annotations:
          description: The router with ID {{ $labels.router_id }} has {{ $value }} L3
            agents in active state which can cause network resets and traffic drops.
          summary: Neutron HA router has multiple active L3 agents
        expr: sum by (router_id) (openstack_neutron_l3_agent_of_router{ha_state="active"})
          > 1
        for: 5m
        labels:
          severity: P3
    - name: neutron-port-bindings
      rules:
      - alert: NeutronPortBindingFailed
        annotations:
          description: At least one Neutron port has failed to bind.
          summary: Neutron Port Binding Failed
        expr: count(neutron_port{binding_vif_type="binding_failed"}) > 0
        for: 5m
        labels:
          severity: P4
      - alert: NeutronPortBindingFailed
        annotations:
          description: More than 5% of Neutron ports have failed to bind.
          summary: Neutron Port Binding Failed
        expr: (count(neutron_port{binding_vif_type="binding_failed"}) / count(neutron_port))
          > 0.05
        for: 5m
        labels:
          severity: P3
      - alert: NeutronPortBindingFailed
        annotations:
          description: More than 50% of Neutron ports have failed to bind.
          summary: Neutron Port Binding Failed
        expr: (count(neutron_port{binding_vif_type="binding_failed"}) / count(neutron_port))
          > 0.5
        for: 5m
        labels:
          severity: P2
    - name: nova
      rules:
      - alert: NovaServiceDisabled
        annotations:
          description: A Nova service has been administratively disabled for more than
            24 hours.
          summary: Nova service disabled
        expr: openstack_nova_agent_state{adminState!="enabled"} > 0
        for: 24h
        labels:
          severity: P5
      - alert: NovaServiceDown
        annotations:
          description: A Nova service has been down for more than 15 minutes.
          summary: Nova service down
        expr: openstack_nova_agent_state != 1
        for: 15m
        labels:
          severity: P3
      - alert: NovaServiceGroupDown
        annotations:
          description: All instances of a specific Nova service have been down for more
            than 5 minutes.
          summary: Nova service group down
        expr: sum by (exported_service) (openstack_nova_agent_state) == 0
        for: 5m
        labels:
          severity: P2
      - alert: NovaServerTaskStateStuck
        annotations:
          description: Nova server with ID {{ $labels.id }} has been stuck in the {{
            $labels.task_state }} state for more than 1 hour.
          summary: Nova server stuck in task state
        expr: openstack_nova_server_task_state > 0
        for: 1h
        labels:
          severity: P3
      - alert: NovaInstanceError
        annotations:
          description: A Nova server is in an error state.
          summary: Nova server error
        expr: openstack_nova_server_status{status="ERROR"} > 0
        for: 24h
        labels:
          severity: P4
      - alert: NovaFailureRisk
        annotations:
          description: The cloud capacity will be at {{ $value }} in the event of the
            failure of a single hypervisor, which puts the cloud at risk of not being
            able to recover should any hypervisor failures occur. Please ensure that an
            adequate amount of infrastructure is assigned to this deployment to prevent
            this.
          summary: '[nova] Failure risk'
        expr: |-
          (
            sum(openstack_nova_memory_available_bytes - openstack_nova_memory_used_bytes)
            - max(openstack_nova_memory_used_bytes)
          )
          /
          sum(openstack_nova_memory_available_bytes - openstack_nova_memory_used_bytes)
          * 100 < 0.25
        for: 6h
        labels:
          severity: P3
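      # The NovaCapacity expression divides memory used by 90% of memory available, counting
      # only hypervisors whose nova-compute service is administratively enabled: the
      # "+ on(hostname) group_left(adminState) (0 * openstack_nova_agent_state{...})" join
      # adds zero but drops hosts without an enabled nova-compute series.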
      - alert: NovaCapacity
        annotations:
          description: The cloud capacity is currently at {{ $value }}%, which means there
            is a risk of running out of capacity given the lead time required to add new
            nodes. Please ensure that an adequate amount of infrastructure is assigned
            to this deployment to prevent this.
          summary: '[nova] Capacity risk'
        expr: |-
          sum(
              openstack_nova_memory_used_bytes
            + on(hostname) group_left(adminState)
              (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
          )
          /
          sum(
              openstack_nova_memory_available_bytes * 0.90
            + on(hostname) group_left(adminState)
              (0 * openstack_nova_agent_state{exported_service="nova-compute",adminState="enabled"})
          )
          * 100 > 75
        for: 6h
        labels:
          severity: P3
    - name: nova-build-requests
      rules:
      - alert: NovaStuckBuildRequest
        annotations:
          description: 'Instance ID {{ $labels.instance_uuid }} (project: {{ $labels.project_id
            }}) has been stuck in build request state for more than 1 hour.'
          summary: Nova build request stuck in queue for more than 1 hour
        expr: openstack_nova_api_build_request > 0
        for: 1h
        labels:
          severity: P4
      - alert: NovaStuckBuildRequestIncreasing
        annotations:
          description: The number of queued Nova build requests is increasing across the
            cluster.
          summary: Nova build request count is increasing
        expr: rate(nova:build_requests:sum[5m]) > 0
        for: 15m
        labels:
          severity: P3
    - name: octavia
      rules:
      - alert: OctaviaLoadBalancerMultipleMaster
        annotations:
          description: Load balancer with ID {{ $labels.loadbalancer_id }} has had multiple
            MASTER Amphorae for more than 15 minutes.
          summary: Octavia load balancer has multiple MASTER Amphorae
        expr: count by(loadbalancer_id) (openstack_loadbalancer_amphora_status{role="MASTER"})
          > 1
        for: 15m
        labels:
          severity: P3
      - alert: OctaviaLoadBalancerNotActive
        annotations:
          description: Load balancer with ID {{ $labels.id }} has been stuck in a non-active
            state for more than 15 minutes.
          summary: Octavia load balancer not active
        expr: count by (id,name) (openstack_loadbalancer_loadbalancer_status{provisioning_status!="ACTIVE"})
          > 0
        for: 15m
        labels:
          severity: P3
      - alert: OctaviaAmphoraError
        annotations:
          description: Amphora with ID {{ $labels.id }} has been stuck in an error state
            for more than 15 minutes.
          summary: Octavia Amphora in error state
        expr: count by (id,name) (openstack_loadbalancer_amphora_status{status="ERROR"})
          > 0
        for: 15m
        labels:
          severity: P3
      - alert: OctaviaAmphoraNotOperational
        annotations:
          description: Amphora with ID {{ $labels.id }} has been stuck in a non-operational
            state for more than 1 hour.
          summary: Octavia Amphora not operational
        expr: count by (id,name) (openstack_loadbalancer_amphora_status{status!~"READY|ALLOCATED|DELETED"})
          > 0
        for: 1h
        labels:
          severity: P3
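    # nova:build_requests:sum, defined in the recording group below, pre-aggregates
    # openstack_nova_api_build_request into a single series; NovaStuckBuildRequestIncreasing
    # above takes a rate over it to detect a growing build-request backlog.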
    - name: recording
      rules:
      - expr: sum(openstack_nova_api_build_request)
        record: nova:build_requests:sum
  monitoring-kube-prometheus-stack-prometheus-e8714da1-4dd3-48ce-bff7-fe5126dc6730.yaml: |
    groups:
    - name: prometheus
      rules:
      - alert: PrometheusBadConfig
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
            reload its configuration.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusbadconfig
          summary: Failed Prometheus configuration reload.
        expr: |-
          # Without max_over_time, failed scrapes could create false negatives, see
          # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
          max_over_time(prometheus_config_last_reload_successful{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) == 0
        for: 10m
        labels:
          severity: critical
      - alert: PrometheusSDRefreshFailure
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
            refresh SD with mechanism {{$labels.mechanism}}.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheussdrefreshfailure
          summary: Failed Prometheus SD refresh.
        expr: increase(prometheus_sd_refresh_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[10m])
          > 0
        for: 20m
        labels:
          severity: warning
      - alert: PrometheusNotificationQueueRunningFull
        annotations:
          description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}}
            is running full.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotificationqueuerunningfull
          summary: Prometheus alert notification queue predicted to run full in less than
            30m.
        expr: |-
          # Without min_over_time, failed scrapes could create false negatives, see
          # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
          (
            predict_linear(prometheus_notifications_queue_length{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m], 60 * 30)
          >
            min_over_time(prometheus_notifications_queue_capacity{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          )
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
        annotations:
          description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus
            {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstosomealertmanagers
          summary: Prometheus has encountered more than 1% errors sending alerts to a
            specific Alertmanager.
        expr: |-
          (
            rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          /
            rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          )
          * 100
          > 1
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusNotConnectedToAlertmanagers
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected
            to any Alertmanagers.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotconnectedtoalertmanagers
          summary: Prometheus is not connected to any Alertmanagers.
        expr: |-
          # Without max_over_time, failed scrapes could create false negatives, see
          # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
          max_over_time(prometheus_notifications_alertmanagers_discovered{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) < 1
        for: 10m
        labels:
          severity: warning
      - alert: PrometheusTSDBReloadsFailing
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
            | humanize}} reload failures over the last 3h.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbreloadsfailing
          summary: Prometheus has issues reloading blocks from disk.
        expr: increase(prometheus_tsdb_reloads_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h])
          > 0
        for: 4h
        labels:
          severity: warning
      - alert: PrometheusTSDBCompactionsFailing
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value
            | humanize}} compaction failures over the last 3h.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustsdbcompactionsfailing
          summary: Prometheus has issues compacting blocks.
        expr: increase(prometheus_tsdb_compactions_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[3h])
          > 0
        for: 4h
        labels:
          severity: warning
      - alert: PrometheusNotIngestingSamples
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
            samples.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusnotingestingsamples
          summary: Prometheus is not ingesting samples.
        expr: |-
          (
            sum without(type) (rate(prometheus_tsdb_head_samples_appended_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])) <= 0
          and
            (
              sum without(scrape_job) (prometheus_target_metadata_cache_entries{job="kube-prometheus-stack-prometheus",namespace="monitoring"}) > 0
            or
              sum without(rule_group) (prometheus_rule_group_rules{job="kube-prometheus-stack-prometheus",namespace="monitoring"}) > 0
            )
          )
        for: 10m
        labels:
          severity: warning
      - alert: PrometheusDuplicateTimestamps
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{
            printf "%.4g" $value  }} samples/s with different values but duplicated timestamp.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusduplicatetimestamps
          summary: Prometheus is dropping samples with duplicate timestamps.
        expr: rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          > 0
        for: 10m
        labels:
          severity: warning
      - alert: PrometheusOutOfOrderTimestamps
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{
            printf "%.4g" $value  }} samples/s with timestamps arriving out of order.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusoutofordertimestamps
          summary: Prometheus drops samples with out-of-order timestamps.
        expr: rate(prometheus_target_scrapes_sample_out_of_order_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          > 0
        for: 10m
        labels:
          severity: warning
      - alert: PrometheusRemoteStorageFailures
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send
            {{ printf "%.1f" $value }}% of the samples to {{ $labels.remote_name}}:{{
            $labels.url }}
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotestoragefailures
          summary: Prometheus fails to send samples to remote storage.
        expr: |-
          (
            (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
          /
            (
              (rate(prometheus_remote_storage_failed_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
            +
              (rate(prometheus_remote_storage_succeeded_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]) or rate(prometheus_remote_storage_samples_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m]))
            )
          )
          * 100
          > 1
        for: 15m
        labels:
          severity: critical
      - alert: PrometheusRemoteWriteBehind
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is
            {{ printf "%.1f" $value }}s behind for {{ $labels.remote_name}}:{{ $labels.url
            }}.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritebehind
          summary: Prometheus remote write is behind.
        expr: |-
          # Without max_over_time, failed scrapes could create false negatives, see
          # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
          (
            max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          - ignoring(remote_name, url) group_right
            max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          )
          > 120
        for: 15m
        labels:
          severity: critical
      - alert: PrometheusRemoteWriteDesiredShards
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write desired
            shards calculation wants to run {{ $value }} shards for queue {{ $labels.remote_name}}:{{
            $labels.url }}, which is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="kube-prometheus-stack-prometheus",namespace="monitoring"}`
            $labels.instance | query | first | value }}.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusremotewritedesiredshards
          summary: Prometheus remote write desired shards calculation wants to run more
            than configured max shards.
        expr: |-
          # Without max_over_time, failed scrapes could create false negatives, see
          # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
          (
            max_over_time(prometheus_remote_storage_shards_desired{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          >
            max_over_time(prometheus_remote_storage_shards_max{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          )
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusRuleFailures
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
            evaluate {{ printf "%.0f" $value }} rules in the last 5m.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusrulefailures
          summary: Prometheus is failing rule evaluations.
        expr: increase(prometheus_rule_evaluation_failures_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          > 0
        for: 15m
        labels:
          severity: critical
      - alert: PrometheusMissingRuleEvaluations
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{
            printf "%.0f" $value }} rule group evaluations in the last 5m.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusmissingruleevaluations
          summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
        expr: increase(prometheus_rule_group_iterations_missed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          > 0
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusTargetLimitHit
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{
            printf "%.0f" $value }} targets because the number of targets exceeded the
            configured target_limit.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetlimithit
          summary: Prometheus has dropped targets because some scrape configs have exceeded
            the targets limit.
        expr: increase(prometheus_target_scrape_pool_exceeded_target_limit_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          > 0
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusLabelLimitHit
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has dropped {{
            printf "%.0f" $value }} targets because some samples exceeded the configured
            label_limit, label_name_length_limit or label_value_length_limit.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuslabellimithit
          summary: Prometheus has dropped targets because some scrape configs have exceeded
            the labels limit.
        expr: increase(prometheus_target_scrape_pool_exceeded_label_limits_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          > 0
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusScrapeBodySizeLimitHit
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{
            printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded
            the configured body_size_limit.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapebodysizelimithit
          summary: Prometheus has dropped some targets that exceeded body size limit.
        expr: increase(prometheus_target_scrapes_exceeded_body_size_limit_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          > 0
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusScrapeSampleLimitHit
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed {{
            printf "%.0f" $value }} scrapes in the last 5m because some targets exceeded
            the configured sample_limit.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheusscrapesamplelimithit
          summary: Prometheus has failed scrapes that have exceeded the configured sample
            limit.
        expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          > 0
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusTargetSyncFailure
        annotations:
          description: '{{ printf "%.0f" $value }} targets in Prometheus {{$labels.namespace}}/{{$labels.pod}}
            have failed to sync because invalid configuration was supplied.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheustargetsyncfailure
          summary: Prometheus has failed to sync targets.
        expr: increase(prometheus_target_sync_failed_total{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[30m])
          > 0
        for: 5m
        labels:
          severity: critical
      - alert: PrometheusHighQueryLoad
        annotations:
          description: Prometheus {{$labels.namespace}}/{{$labels.pod}} query API has
            less than 20% available capacity in its query engine for the last 15 minutes.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheushighqueryload
          summary: Prometheus is reaching its maximum capacity serving concurrent requests.
        expr: avg_over_time(prometheus_engine_queries{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          / max_over_time(prometheus_engine_queries_concurrent_max{job="kube-prometheus-stack-prometheus",namespace="monitoring"}[5m])
          > 0.8
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
        annotations:
          description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts
            from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus/prometheuserrorsendingalertstoanyalertmanager
          summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
        expr: |-
          min without (alertmanager) (
            rate(prometheus_notifications_errors_total{job="kube-prometheus-stack-prometheus",namespace="monitoring",alertmanager!~``}[5m])
          /
            rate(prometheus_notifications_sent_total{job="kube-prometheus-stack-prometheus",namespace="monitoring",alertmanager!~``}[5m])
          )
          * 100
          > 3
        for: 15m
        labels:
          severity: critical
  monitoring-kube-prometheus-stack-prometheus-operator-28fa568f-b7a6-48b5-a241-91c31e5fc788.yaml: |
    groups:
    - name: prometheus-operator
      rules:
      - alert: PrometheusOperatorListErrors
        annotations:
          description: Errors while performing List operations in controller {{$labels.controller}}
            in {{$labels.namespace}} namespace.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorlisterrors
          summary: Errors while performing list operations in controller.
        expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m]))
          / sum by (cluster,controller,namespace) (rate(prometheus_operator_list_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[10m])))
          > 0.4
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusOperatorWatchErrors
        annotations:
          description: Errors while performing watch operations in controller {{$labels.controller}}
            in {{$labels.namespace}} namespace.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorwatcherrors
          summary: Errors while performing watch operations in controller.
        expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_failed_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m]))
          / sum by (cluster,controller,namespace) (rate(prometheus_operator_watch_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
          > 0.4
        for: 15m
        labels:
          severity: warning
      - alert: PrometheusOperatorSyncFailed
        annotations:
          description: Controller {{ $labels.controller }} in {{ $labels.namespace }}
            namespace fails to reconcile {{ $value }} objects.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorsyncfailed
          summary: Last controller reconciliation failed
        expr: min_over_time(prometheus_operator_syncs{status="failed",job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])
          > 0
        for: 10m
        labels:
          severity: warning
      - alert: PrometheusOperatorReconcileErrors
        annotations:
          description: '{{ $value | humanizePercentage }} of reconciling operations failed
            for {{ $labels.controller }} controller in {{ $labels.namespace }} namespace.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorreconcileerrors
          summary: Errors while reconciling objects.
        expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
          / (sum by (cluster,controller,namespace) (rate(prometheus_operator_reconcile_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
          > 0.1
        for: 10m
        labels:
          severity: warning
      - alert: PrometheusOperatorStatusUpdateErrors
        annotations:
          description: '{{ $value | humanizePercentage }} of status update operations
            failed for {{ $labels.controller }} controller in {{ $labels.namespace }}
            namespace.'
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorstatusupdateerrors
          summary: Errors while updating objects status.
        expr: (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
          / (sum by (cluster,controller,namespace) (rate(prometheus_operator_status_update_operations_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])))
          > 0.1
        for: 10m
        labels:
          severity: warning
      - alert: PrometheusOperatorNodeLookupErrors
        annotations:
          description: Errors while reconciling Prometheus in {{ $labels.namespace }}
            namespace.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornodelookuperrors
          summary: Errors while reconciling Prometheus.
        expr: rate(prometheus_operator_node_address_lookup_errors_total{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])
          > 0.1
        for: 10m
        labels:
          severity: warning
      - alert: PrometheusOperatorNotReady
        annotations:
          description: Prometheus operator in {{ $labels.namespace }} namespace isn't
            ready to reconcile {{ $labels.controller }} resources.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatornotready
          summary: Prometheus operator not ready
        expr: min by (cluster,controller,namespace) (max_over_time(prometheus_operator_ready{job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])
          == 0)
        for: 5m
        labels:
          severity: warning
      - alert: PrometheusOperatorRejectedResources
        annotations:
          description: Prometheus operator in {{ $labels.namespace }} namespace rejected
            {{ printf "%0.0f" $value }} {{ $labels.controller }}/{{ $labels.resource }}
            resources.
          runbook_url: https://runbooks.prometheus-operator.dev/runbooks/prometheus-operator/prometheusoperatorrejectedresources
          summary: Resources rejected by Prometheus operator
        expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="kube-prometheus-stack-operator",namespace="monitoring"}[5m])
          > 0
        for: 5m
        labels:
          severity: warning
  monitoring-kube-prometheus-stack-rabbitmq-8e29ae07-2b16-437a-aba5-fbaad959890d.yaml: |
    groups:
    - name: alarms
      rules:
      - alert: RabbitmqAlarmFreeDiskSpace
        expr: rabbitmq_alarms_free_disk_space_watermark == 1
        labels:
          severity: P1
      - alert: RabbitmqAlarmMemoryUsedWatermark
        expr: rabbitmq_alarms_memory_used_watermark == 1
        labels:
          severity: P1
      - alert: RabbitmqAlarmFileDescriptorLimit
        expr: rabbitmq_alarms_file_descriptor_limit == 1
        labels:
          severity: P1
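    # Each resource limit below is alerted at two tiers: above 80% of the limit (P3) and
    # above 95% of the limit (P1).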
    - name: limits
      rules:
      - alert: RabbitmqMemoryHigh
        expr: rabbitmq:usage:memory > 0.80
        labels:
          severity: P3
      - alert: RabbitmqMemoryHigh
        expr: rabbitmq:usage:memory > 0.95
        labels:
          severity: P1
      - alert: RabbitmqFileDescriptorsUsage
        expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.80
        labels:
          severity: P3
      - alert: RabbitmqFileDescriptorsUsage
        expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds > 0.95
        labels:
          severity: P1
      - alert: RabbitmqTcpSocketsUsage
        expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.80
        labels:
          severity: P3
      - alert: RabbitmqTcpSocketsUsage
        expr: rabbitmq_process_open_tcp_sockets / rabbitmq_process_max_tcp_sockets > 0.95
        labels:
          severity: P1
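    # Backlogs of more than 1000 unacknowledged messages in a queue page at P3 after 5
    # minutes and escalate to P1 if they persist for an hour.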
    - name: msgs
      rules:
      - alert: RabbitmqUnackedMessages
        expr: sum(rabbitmq_queue_messages_unacked) by (queue) > 1000
        for: 5m
        labels:
          severity: P3
      - alert: RabbitmqUnackedMessages
        expr: sum(rabbitmq_queue_messages_unacked) by (queue) > 1000
        for: 1h
        labels:
          severity: P1
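    # rabbitmq:usage:memory, defined below and used by the RabbitmqMemoryHigh alerts above,
    # is the ratio of RabbitMQ resident memory to the pod memory limit; label_replace copies
    # the pod name into the instance label so the two sides join on instance.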
    - name: recording
      rules:
      - expr: |-
          sum without (job) (
            rabbitmq_process_resident_memory_bytes
          )
          /
          sum without (container, pod, job, namespace, node, resource, uid, unit) (
            label_replace(
              cluster:namespace:pod_memory:active:kube_pod_container_resource_limits,
              "instance", "$1", "pod", "(.*)"
            )
          )
        labels:
          job: rabbitmq
        record: rabbitmq:usage:memory
kind: ConfigMap
metadata:
  creationTimestamp: "2026-04-27T03:01:29Z"
  labels:
    managed-by: prometheus-operator
    prometheus-name: kube-prometheus-stack-prometheus
  name: prometheus-kube-prometheus-stack-prometheus-rulefiles-0
  namespace: monitoring
  ownerReferences:
  - apiVersion: monitoring.coreos.com/v1
    blockOwnerDeletion: true
    controller: true
    kind: Prometheus
    name: kube-prometheus-stack-prometheus
    uid: 1033b3a0-c409-42a6-b1b1-d1516e4a375a
  resourceVersion: "10356"
  uid: e9678036-672c-40d3-ba77-6e220169a201
