mirror of https://git.rancher.io/charts
298 lines
15 KiB
YAML
298 lines
15 KiB
YAML
{{- /*
|
|
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml
|
|
Do not change in-place! In order to change this file first read following link:
|
|
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
|
*/ -}}
|
|
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
|
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }}
|
|
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
|
|
apiVersion: monitoring.coreos.com/v1
|
|
kind: PrometheusRule
|
|
metadata:
|
|
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
|
|
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
|
labels:
|
|
app: {{ template "kube-prometheus-stack.name" . }}
|
|
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
|
{{- if .Values.defaultRules.labels }}
|
|
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
|
{{- end }}
|
|
{{- if .Values.defaultRules.annotations }}
|
|
annotations:
|
|
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
|
{{- end }}
|
|
spec:
|
|
groups:
|
|
- name: kubernetes-apps
|
|
rules:
|
|
- alert: KubePodCrashLooping
|
|
annotations:
|
|
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 10 minutes.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping
|
|
summary: Pod is crash looping.
|
|
expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m]) * 60 * 5 > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubePodNotReady
|
|
annotations:
|
|
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready
|
|
summary: Pod has been in a non-ready state for more than 15 minutes.
|
|
expr: |-
|
|
sum by (namespace, pod) (
|
|
max by(namespace, pod) (
|
|
kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown"}
|
|
) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (
|
|
1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})
|
|
)
|
|
) > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeDeploymentGenerationMismatch
|
|
annotations:
|
|
description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch
|
|
summary: Deployment generation mismatch due to possible roll-back
|
|
expr: |-
|
|
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeDeploymentReplicasMismatch
|
|
annotations:
|
|
description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch
|
|
summary: Deployment has not matched the expected number of replicas.
|
|
expr: |-
|
|
(
|
|
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
) and (
|
|
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
|
==
|
|
0
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeStatefulSetReplicasMismatch
|
|
annotations:
|
|
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch
|
|
summary: Deployment has not matched the expected number of replicas.
|
|
expr: |-
|
|
(
|
|
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
) and (
|
|
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
|
==
|
|
0
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeStatefulSetGenerationMismatch
|
|
annotations:
|
|
description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch
|
|
summary: StatefulSet generation mismatch due to possible roll-back
|
|
expr: |-
|
|
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeStatefulSetUpdateNotRolledOut
|
|
annotations:
|
|
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout
|
|
summary: StatefulSet update has not been rolled out.
|
|
expr: |-
|
|
(
|
|
max without (revision) (
|
|
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
unless
|
|
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
)
|
|
*
|
|
(
|
|
kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
)
|
|
) and (
|
|
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
|
==
|
|
0
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeDaemonSetRolloutStuck
|
|
annotations:
|
|
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck
|
|
summary: DaemonSet rollout is stuck.
|
|
expr: |-
|
|
(
|
|
(
|
|
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
) or (
|
|
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
0
|
|
) or (
|
|
kube_daemonset_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
) or (
|
|
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
)
|
|
) and (
|
|
changes(kube_daemonset_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
|
==
|
|
0
|
|
)
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeContainerWaiting
|
|
annotations:
|
|
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontainerwaiting
|
|
summary: Pod container waiting longer than 1 hour
|
|
expr: sum by (namespace, pod, container) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0
|
|
for: 1h
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeDaemonSetNotScheduled
|
|
annotations:
|
|
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled
|
|
summary: DaemonSet pods are not scheduled.
|
|
expr: |-
|
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
-
|
|
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeDaemonSetMisScheduled
|
|
annotations:
|
|
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled
|
|
summary: DaemonSet pods are misscheduled.
|
|
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeJobCompletion
|
|
annotations:
|
|
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than 12 hours to complete.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion
|
|
summary: Job did not complete in time
|
|
expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
|
for: 12h
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeJobFailed
|
|
annotations:
|
|
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed
|
|
summary: Job failed to complete.
|
|
expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeHpaReplicasMismatch
|
|
annotations:
|
|
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpareplicasmismatch
|
|
summary: HPA has not matched descired number of replicas.
|
|
expr: |-
|
|
(kube_hpa_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
!=
|
|
kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
|
and
|
|
(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
>
|
|
kube_hpa_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
|
and
|
|
(kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
<
|
|
kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
|
and
|
|
changes(kube_hpa_status_current_replicas[15m]) == 0
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
- alert: KubeHpaMaxedOut
|
|
annotations:
|
|
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.hpa {{`}}`}} has been running at max replicas for longer than 15 minutes.
|
|
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubehpamaxedout
|
|
summary: HPA is running at max replicas
|
|
expr: |-
|
|
kube_hpa_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
==
|
|
kube_hpa_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
|
for: 15m
|
|
labels:
|
|
severity: warning
|
|
{{- if .Values.defaultRules.additionalRuleLabels }}
|
|
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
|
{{- end }}
|
|
{{- end }} |