{{- /* Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml Do not change in-place! In order to change this file first read following link: https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack */ -}} {{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} {{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }} {{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }} {{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} {{- if and .Values.alertmanager.enabled .Values.alertmanager.serviceMonitor.selfMonitor }} apiVersion: monitoring.coreos.com/v1 kind: PrometheusRule metadata: name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }} namespace: {{ template "kube-prometheus-stack.namespace" . }} labels: app: {{ template "kube-prometheus-stack.name" . }} {{ include "kube-prometheus-stack.labels" . | indent 4 }} {{- if .Values.defaultRules.labels }} {{ toYaml .Values.defaultRules.labels | indent 4 }} {{- end }} {{- if .Values.defaultRules.annotations }} annotations: {{ toYaml .Values.defaultRules.annotations | indent 4 }} {{- end }} spec: groups: - name: alertmanager.rules rules: - alert: AlertmanagerFailedReload annotations: description: Configuration has failed to load for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedreload summary: Reloading an Alertmanager configuration has failed. expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) == 0 for: 10m labels: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - alert: AlertmanagerMembersInconsistent annotations: description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} has only found {{`{{`}} $value {{`}}`}} members of the {{`{{`}}$labels.job{{`}}`}} cluster. runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagermembersinconsistent summary: A member of an Alertmanager cluster has not found all other cluster members. expr: |- # Without max_over_time, failed scrapes could create false negatives, see # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details. max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < on (namespace,service) group_left count by (namespace,service) (max_over_time(alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m])) for: 15m labels: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - alert: AlertmanagerFailedToSendAlerts annotations: description: Alertmanager {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} failed to send {{`{{`}} $value | humanizePercentage {{`}}`}} of notifications to {{`{{`}} $labels.integration {{`}}`}}. runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerfailedtosendalerts summary: An Alertmanager instance failed to send notifications. expr: |- ( rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) / rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) ) > 0.01 for: 5m labels: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - alert: AlertmanagerClusterFailedToSendAlerts annotations: description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts summary: All Alertmanager instances in a cluster failed to send notifications to a critical integration. expr: |- min by (namespace,service, integration) ( rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) / rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration=~`.*`}[5m]) ) > 0.01 for: 5m labels: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - alert: AlertmanagerClusterFailedToSendAlerts annotations: description: The minimum notification failure rate to {{`{{`}} $labels.integration {{`}}`}} sent from any instance in the {{`{{`}}$labels.job{{`}}`}} cluster is {{`{{`}} $value | humanizePercentage {{`}}`}}. runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterfailedtosendalerts summary: All Alertmanager instances in a cluster failed to send notifications to a non-critical integration. expr: |- min by (namespace,service, integration) ( rate(alertmanager_notifications_failed_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) / rate(alertmanager_notifications_total{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}", integration!~`.*`}[5m]) ) > 0.01 for: 5m labels: severity: warning {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - alert: AlertmanagerConfigInconsistent annotations: description: Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have different configurations. runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerconfiginconsistent summary: Alertmanager instances within the same cluster have different configurations. expr: |- count by (namespace,service) ( count_values by (namespace,service) ("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) ) != 1 for: 20m labels: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - alert: AlertmanagerClusterDown annotations: description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have been up for less than half of the last 5m.' runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclusterdown summary: Half or more of the Alertmanager instances within the same cluster are down. expr: |- ( count by (namespace,service) ( avg_over_time(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[5m]) < 0.5 ) / count by (namespace,service) ( up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} ) ) >= 0.5 for: 5m labels: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} - alert: AlertmanagerClusterCrashlooping annotations: description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of Alertmanager instances within the {{`{{`}}$labels.job{{`}}`}} cluster have restarted at least 5 times in the last 10m.' runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerclustercrashlooping summary: Half or more of the Alertmanager instances within the same cluster are crashlooping. expr: |- ( count by (namespace,service) ( changes(process_start_time_seconds{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}[10m]) > 4 ) / count by (namespace,service) ( up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} ) ) >= 0.5 for: 5m labels: severity: critical {{- if .Values.defaultRules.additionalRuleLabels }} {{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} {{- end }} {{- end }} {{- end }}