mirror of https://git.rancher.io/charts
parent
944bc7f588
commit
318e04ac77
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -7,4 +7,4 @@ apiVersion: v2
|
|||
description: Installs the CRDs for rancher-monitoring.
|
||||
name: rancher-monitoring-crd
|
||||
type: application
|
||||
version: 104.0.0-rc1+up45.31.1
|
||||
version: 104.0.0-rc1+up57.0.3
|
Binary file not shown.
|
@ -28,23 +28,3 @@ kubernetes.io/os: linux
|
|||
{{- end -}}
|
||||
{{- end -}}
|
||||
|
||||
# CRD Installation
|
||||
|
||||
{{- define "crd.established" -}}
|
||||
{{- if not (regexMatch "^([a-zA-Z]+[.][a-zA-Z]*)+$" .) -}}
|
||||
{{ required (printf "%s is not a valid CRD" .) "" }}
|
||||
{{- else -}}
|
||||
echo "beginning wait for {{ . }} to be established...";
|
||||
num_tries=1;
|
||||
until kubectl get crd {{ . }} -o=jsonpath='{range .status.conditions[*]}{.type}={.status} {end}' | grep -qE 'Established=True'; do
|
||||
if (( num_tries == 30 )); then
|
||||
echo "timed out waiting for {{ . }}";
|
||||
exit 1;
|
||||
fi;
|
||||
num_tries=$(( num_tries + 1 ));
|
||||
echo "{{ . }} is not established. Sleeping for 2 seconds and trying again...";
|
||||
sleep 2;
|
||||
done;
|
||||
echo "successfully established {{ . }}";
|
||||
{{- end -}}
|
||||
{{- end -}}
|
|
@ -19,27 +19,6 @@ spec:
|
|||
securityContext:
|
||||
runAsNonRoot: false
|
||||
runAsUser: 0
|
||||
initContainers:
|
||||
- name: set-preserve-unknown-fields-false
|
||||
image: {{ template "system_default_registry" . }}{{ .Values.image.repository }}:{{ .Values.image.tag }}
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- >
|
||||
{{- range $path, $_ := (.Files.Glob "crd-manifest/**.yaml") }}
|
||||
{{- $crd := get (get ($.Files.Get $path | fromYaml) "metadata") "name" }}
|
||||
if [[ -n "$(kubectl get crd {{ $crd }} -o jsonpath='{.spec.preserveUnknownFields}')" ]]; then
|
||||
patch='{"spec": {"preserveUnknownFields": false}}';
|
||||
if [[ -z "$(kubectl get crd {{ $crd }} -o jsonpath='{.spec.versions[0].schema}')" ]]; then
|
||||
patch='{"spec": {"preserveUnknownFields": false, "versions": [{"name": "v1", "served": false, "storage": true, "schema": {"openAPIV3Schema": {"description": "placeholder", "type": "object"}}}]}}';
|
||||
fi
|
||||
echo "Applying patch to {{ $crd }}: ${patch}"
|
||||
if kubectl patch crd {{ $crd }} -p "${patch}" --type="merge"; then
|
||||
{{- include "crd.established" $crd | nindent 18 }}
|
||||
fi;
|
||||
fi;
|
||||
{{- end }}
|
||||
containers:
|
||||
- name: create-crds
|
||||
image: {{ template "system_default_registry" . }}{{ .Values.image.repository }}:{{ .Values.image.tag }}
|
||||
|
@ -52,18 +31,7 @@ spec:
|
|||
mkdir -p /etc/crd;
|
||||
base64 -d /etc/config/crd-manifest.tgz.b64 | tar -xzv -C /etc/crd;
|
||||
kubectl replace -Rf /etc/crd || kubectl create -Rf /etc/crd;
|
||||
|
||||
echo "Waiting for CRDs to be recognized before finishing installation...";
|
||||
|
||||
{{- range $path, $_ := (.Files.Glob "crd-manifest/**.yaml") }}
|
||||
{{- $apiGroup := get (get ($.Files.Get $path | fromYaml) "spec") "group" }}
|
||||
rm -rf $HOME/.kube/cache/discovery/*/{{ $apiGroup }};
|
||||
{{- end }}
|
||||
|
||||
{{- range $path, $_ := (.Files.Glob "crd-manifest/**.yaml") }}
|
||||
{{- $crd := get (get ($.Files.Get $path | fromYaml) "metadata") "name" }}
|
||||
{{- include "crd.established" $crd | nindent 12 }}
|
||||
{{- end }}
|
||||
echo "Done!"
|
||||
volumeMounts:
|
||||
- name: crd-manifest
|
||||
readOnly: true
|
||||
|
@ -103,24 +71,6 @@ spec:
|
|||
securityContext:
|
||||
runAsNonRoot: false
|
||||
runAsUser: 0
|
||||
initContainers:
|
||||
- name: remove-finalizers
|
||||
image: {{ template "system_default_registry" . }}{{ .Values.image.repository }}:{{ .Values.image.tag }}
|
||||
imagePullPolicy: IfNotPresent
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- >
|
||||
{{- range $path, $_ := (.Files.Glob "crd-manifest/**.yaml") }}
|
||||
{{- $crd := get (get ($.Files.Get $path | fromYaml) "metadata") "name" }}
|
||||
if kubectl patch crd {{ $crd }} -p '{"metadata": {"finalizers": []}}'; then
|
||||
{{- include "crd.established" $crd | nindent 14 }}
|
||||
fi;
|
||||
{{- end }}
|
||||
volumeMounts:
|
||||
- name: crd-manifest
|
||||
readOnly: true
|
||||
mountPath: /etc/config
|
||||
containers:
|
||||
- name: delete-crds
|
||||
image: {{ template "system_default_registry" . }}{{ .Values.image.repository }}:{{ .Values.image.tag }}
|
|
@ -1,60 +0,0 @@
|
|||
{{- if and .Values.alertmanager.enabled .Values.alertmanager.serviceMonitor.selfMonitor }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: {{ template "kube-prometheus-stack.fullname" . }}-alertmanager
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}-alertmanager
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- with .Values.alertmanager.serviceMonitor.additionalLabels }}
|
||||
{{- toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
{{- include "servicemonitor.scrapeLimits" .Values.alertmanager.serviceMonitor | nindent 2 }}
|
||||
selector:
|
||||
matchLabels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}-alertmanager
|
||||
release: {{ $.Release.Name | quote }}
|
||||
self-monitor: "true"
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- {{ printf "%s" (include "kube-prometheus-stack.namespace" .) | quote }}
|
||||
endpoints:
|
||||
- port: {{ .Values.alertmanager.alertmanagerSpec.portName }}
|
||||
enableHttp2: {{ .Values.alertmanager.serviceMonitor.enableHttp2 }}
|
||||
{{- if .Values.alertmanager.serviceMonitor.interval }}
|
||||
interval: {{ .Values.alertmanager.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
{{- if .Values.alertmanager.serviceMonitor.proxyUrl }}
|
||||
proxyUrl: {{ .Values.alertmanager.serviceMonitor.proxyUrl}}
|
||||
{{- end }}
|
||||
{{- if .Values.alertmanager.serviceMonitor.scheme }}
|
||||
scheme: {{ .Values.alertmanager.serviceMonitor.scheme }}
|
||||
{{- end }}
|
||||
{{- if .Values.alertmanager.serviceMonitor.bearerTokenFile }}
|
||||
bearerTokenFile: {{ .Values.alertmanager.serviceMonitor.bearerTokenFile }}
|
||||
{{- end }}
|
||||
{{- if .Values.alertmanager.serviceMonitor.tlsConfig }}
|
||||
tlsConfig: {{ toYaml .Values.alertmanager.serviceMonitor.tlsConfig | nindent 6 }}
|
||||
{{- end }}
|
||||
path: "{{ trimSuffix "/" .Values.alertmanager.alertmanagerSpec.routePrefix }}/metrics"
|
||||
metricRelabelings:
|
||||
{{- if .Values.alertmanager.serviceMonitor.metricRelabelings }}
|
||||
{{ tpl (toYaml .Values.alertmanager.serviceMonitor.metricRelabelings | indent 6) . }}
|
||||
{{- end }}
|
||||
{{ if .Values.global.cattle.clusterId }}
|
||||
- sourceLabels: [__address__]
|
||||
targetLabel: cluster_id
|
||||
replacement: {{ .Values.global.cattle.clusterId }}
|
||||
{{- end }}
|
||||
{{ if .Values.global.cattle.clusterName }}
|
||||
- sourceLabels: [__address__]
|
||||
targetLabel: cluster_name
|
||||
replacement: {{ .Values.global.cattle.clusterName }}
|
||||
{{- end }}
|
||||
{{- if .Values.alertmanager.serviceMonitor.relabelings }}
|
||||
relabelings:
|
||||
{{ toYaml .Values.alertmanager.serviceMonitor.relabelings | indent 6 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,296 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/etcd/main/contrib/mixin/mixin.libsonnet
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.etcd }}
|
||||
{{- if (include "exporter.kubeEtcd.enabled" .)}}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: etcd
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.etcdMembersDown | default false) }}
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": members are down ({{`{{`}} $value {{`}}`}}).'
|
||||
summary: etcd cluster members are down.
|
||||
expr: |-
|
||||
max without (endpoint) (
|
||||
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count without (To) (
|
||||
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdInsufficientMembers | default false) }}
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
|
||||
summary: etcd cluster has insufficient number of members.
|
||||
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdNoLeader | default false) }}
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
|
||||
summary: etcd cluster has no leader.
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfLeaderChanges | default false) }}
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} leader changes within the last 15 minutes. Frequent elections may be a sign of insufficient resources, high network latency, or disruptions by other components and should be investigated.'
|
||||
summary: etcd cluster has high number of leader changes.
|
||||
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) >= 4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedGRPCRequests | default false) }}
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedGRPCRequests | default false) }}
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdGRPCRequestsSlow | default false) }}
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile of gRPC requests is {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}} for {{`{{`}} $labels.grpc_method {{`}}`}} method.'
|
||||
summary: etcd grpc requests are slow
|
||||
expr: |-
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdMemberCommunicationSlow | default false) }}
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster member communication is slow.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighNumberOfFailedProposals | default false) }}
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last 30 minutes on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster has high number of proposal failures.
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighFsyncDurations | default false) }}
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighFsyncDurations | default false) }}
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdHighCommitDurations | default false) }}
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
summary: etcd cluster 99th percentile commit durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdDatabaseQuotaLowSpace | default false) }}
|
||||
- alert: etcdDatabaseQuotaLowSpace
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size exceeds the defined quota on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please defrag or increase the quota as the writes to etcd will be disabled when it is full.'
|
||||
summary: etcd cluster database is running full.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdExcessiveDatabaseGrowth | default false) }}
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": Predicting running out of disk space in the next four hours, based on write observations within the past four hours on etcd instance {{`{{`}} $labels.instance {{`}}`}}, please check as it might be disruptive.'
|
||||
summary: etcd cluster database growing very fast.
|
||||
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes[4h], 4*60*60) > etcd_server_quota_backend_bytes
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.etcdDatabaseHighFragmentationRatio | default false) }}
|
||||
- alert: etcdDatabaseHighFragmentationRatio
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": database size in use on instance {{`{{`}} $labels.instance {{`}}`}} is {{`{{`}} $value | humanizePercentage {{`}}`}} of the actual allocated disk space, please run defragmentation (e.g. etcdctl defrag) to retrieve the unused fragmented disk space.'
|
||||
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
||||
summary: etcd database size in use is less than 50% of the actual allocated storage.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5 and etcd_mvcc_db_total_size_in_use_in_bytes > 104857600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,193 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kube-apiserver-availability.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserverAvailability }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver-availability.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- interval: 3m
|
||||
name: kube-apiserver-availability.rules
|
||||
rules:
|
||||
- expr: avg_over_time(code_verb:apiserver_request_total:increase1h[30d]) * 24 * 30
|
||||
record: code_verb:apiserver_request_total:increase30d
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"LIST|GET"})
|
||||
labels:
|
||||
verb: read
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: sum by (cluster, code) (code_verb:apiserver_request_total:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
labels:
|
||||
verb: write
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: code:apiserver_request_total:increase30d
|
||||
- expr: sum by (cluster, verb, scope) (increase(apiserver_request_slo_duration_seconds_count[1h]))
|
||||
record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum by (cluster, verb, scope) (avg_over_time(cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase1h[30d]) * 24 * 30)
|
||||
record: cluster_verb_scope:apiserver_request_slo_duration_seconds_count:increase30d
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum by (cluster, verb, scope, le) (increase(apiserver_request_slo_duration_seconds_bucket[1h]))
|
||||
record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum by (cluster, verb, scope, le) (avg_over_time(cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase1h[30d]) * 24 * 30)
|
||||
record: cluster_verb_scope_le:apiserver_request_slo_duration_seconds_bucket:increase30d
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
1 - (
|
||||
(
|
||||
# write too slow
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
-
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
|
||||
) +
|
||||
(
|
||||
# read too slow
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_count:increase30d{verb=~"LIST|GET"})
|
||||
-
|
||||
(
|
||||
(
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
|
||||
)
|
||||
) +
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d)
|
||||
labels:
|
||||
verb: all
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |-
|
||||
1 - (
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_count:increase30d{verb=~"LIST|GET"})
|
||||
-
|
||||
(
|
||||
# too slow
|
||||
(
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope=~"resource|",le="1"})
|
||||
or
|
||||
vector(0)
|
||||
)
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="namespace",le="5"})
|
||||
+
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_bucket:increase30d{verb=~"LIST|GET",scope="cluster",le="30"})
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="read"})
|
||||
labels:
|
||||
verb: read
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: apiserver_request:availability30d
|
||||
- expr: |-
|
||||
1 - (
|
||||
(
|
||||
# too slow
|
||||
sum by (cluster) (cluster_verb_scope:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_count:increase30d{verb=~"POST|PUT|PATCH|DELETE"})
|
||||
-
|
||||
sum by (cluster) (cluster_verb_scope_le:apiserver_request{{ if (semverCompare ">=1.23.0-0" $kubeTargetVersion) }}_slo{{ end }}_duration_seconds_bucket:increase30d{verb=~"POST|PUT|PATCH|DELETE",le="1"})
|
||||
)
|
||||
+
|
||||
# errors
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write",code=~"5.."} or vector(0))
|
||||
)
|
||||
/
|
||||
sum by (cluster) (code:apiserver_request_total:increase30d{verb="write"})
|
||||
labels:
|
||||
verb: write
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: apiserver_request:availability30d
|
||||
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"LIST|GET"}[5m]))
|
||||
labels:
|
||||
verb: read
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: sum by (cluster,code,resource) (rate(apiserver_request_total{job="apiserver",verb=~"POST|PUT|PATCH|DELETE"}[5m]))
|
||||
labels:
|
||||
verb: write
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: code_resource:apiserver_request_total:rate5m
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"2.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"3.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"4.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum by (cluster, code, verb) (increase(apiserver_request_total{job="apiserver",verb=~"LIST|GET|POST|PUT|PATCH|DELETE",code=~"5.."}[1h]))
|
||||
record: code_verb:apiserver_request_total:increase1h
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,63 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubePrometheus-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-prometheus-node-recording.rules
|
||||
rules:
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[3m])) BY (instance)
|
||||
record: instance:node_cpu:rate:sum
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_receive_bytes:rate:sum
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_transmit_bytes:rate:sum
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
|
||||
record: instance:node_cpu:ratio
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal"}[5m]))
|
||||
record: cluster:node_cpu:sum_rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
||||
record: cluster:node_cpu:ratio
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,107 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kube-state-metrics' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubeStateMetrics-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubeStateMetrics }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-state-metrics" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-state-metrics
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsListErrors | default false) }}
|
||||
- alert: KubeStateMetricsListErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in list operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricslisterrors
|
||||
summary: kube-state-metrics is experiencing errors in list operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
|
||||
/
|
||||
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])) by (cluster))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsWatchErrors | default false) }}
|
||||
- alert: KubeStateMetricsWatchErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: kube-state-metrics is experiencing errors at an elevated rate in watch operations. This is likely causing it to not be able to expose metrics about Kubernetes objects correctly or at all.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricswatcherrors
|
||||
summary: kube-state-metrics is experiencing errors in watch operations.
|
||||
expr: |-
|
||||
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m])) by (cluster)
|
||||
/
|
||||
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])) by (cluster))
|
||||
> 0.01
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsShardingMismatch | default false) }}
|
||||
- alert: KubeStateMetricsShardingMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: kube-state-metrics pods are running with different --total-shards configuration, some Kubernetes objects may be exposed multiple times or not exposed at all.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardingmismatch
|
||||
summary: kube-state-metrics sharding is misconfigured.
|
||||
expr: stdvar (kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) != 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStateMetricsShardsMissing | default false) }}
|
||||
- alert: KubeStateMetricsShardsMissing
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: kube-state-metrics shards are missing, some Kubernetes objects are not being exposed.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kube-state-metrics/kubestatemetricsshardsmissing
|
||||
summary: kube-state-metrics shards are missing.
|
||||
expr: |-
|
||||
2^max(kube_state_metrics_total_shards{job="kube-state-metrics"}) by (cluster) - 1
|
||||
-
|
||||
sum( 2 ^ max by (cluster, shard_ordinal) (kube_state_metrics_shard_ordinal{job="kube-state-metrics"}) ) by (cluster)
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,50 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kubelet.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubelet }}
|
||||
{{- if (include "exporter.kubelet.enabled" .)}}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubelet.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubelet.rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (cluster, instance, le) * on(cluster, instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,375 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }}
|
||||
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-apps
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubePodCrashLooping | default false) }}
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: 'Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodcrashlooping
|
||||
summary: Pod is crash looping.
|
||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m]) >= 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubePodNotReady | default false) }}
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubepodnotready
|
||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||
expr: |-
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by(namespace, pod, cluster) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown|Failed"}
|
||||
) * on(namespace, pod, cluster) group_left(owner_kind) topk by(namespace, pod, cluster) (
|
||||
1, max by(namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDeploymentGenerationMismatch | default false) }}
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentgenerationmismatch
|
||||
summary: Deployment generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDeploymentReplicasMismatch | default false) }}
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedeploymentreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
>
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStatefulSetReplicasMismatch | default false) }}
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStatefulSetGenerationMismatch | default false) }}
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetgenerationmismatch
|
||||
summary: StatefulSet generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeStatefulSetUpdateNotRolledOut | default false) }}
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubestatefulsetupdatenotrolledout
|
||||
summary: StatefulSet update has not been rolled out.
|
||||
expr: |-
|
||||
(
|
||||
max without (revision) (
|
||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
unless
|
||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
)
|
||||
*
|
||||
(
|
||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDaemonSetRolloutStuck | default false) }}
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} has not finished or progressed for at least 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetrolloutstuck
|
||||
summary: DaemonSet rollout is stuck.
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) or (
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeContainerWaiting | default false) }}
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: pod/{{`{{`}} $labels.pod {{`}}`}} in namespace {{`{{`}} $labels.namespace {{`}}`}} on container {{`{{`}} $labels.container{{`}}`}} has been in waiting state for longer than 1 hour.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecontainerwaiting
|
||||
summary: Pod container waiting longer than 1 hour
|
||||
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDaemonSetNotScheduled | default false) }}
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetnotscheduled
|
||||
summary: DaemonSet pods are not scheduled.
|
||||
expr: |-
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
-
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeDaemonSetMisScheduled | default false) }}
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubedaemonsetmisscheduled
|
||||
summary: DaemonSet pods are misscheduled.
|
||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeJobNotCompleted | default false) }}
|
||||
- alert: KubeJobNotCompleted
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than {{`{{`}} "43200" | humanizeDuration {{`}}`}} to complete.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobnotcompleted
|
||||
summary: Job did not complete in time
|
||||
expr: |-
|
||||
time() - max by(namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
and
|
||||
kube_job_status_active{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0) > 43200
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeJobFailed | default false) }}
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete. Removing failed job after investigation should clear this alert.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubejobfailed
|
||||
summary: Job failed to complete.
|
||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeHpaReplicasMismatch | default false) }}
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has not matched the desired number of replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpareplicasmismatch
|
||||
summary: HPA has not matched desired number of replicas.
|
||||
expr: |-
|
||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
!=
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
>
|
||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
<
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"})
|
||||
and
|
||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeHpaMaxedOut | default false) }}
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: HPA {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.horizontalpodautoscaler {{`}}`}} has been running at max replicas for longer than 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubehpamaxedout
|
||||
summary: HPA is running at max replicas
|
||||
expr: |-
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
==
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,193 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-resources
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeCPUOvercommit | default false) }}
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Cluster has overcommitted CPU resource requests for Pods by {{`{{`}} $value {{`}}`}} CPU shares and cannot tolerate node failure.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeMemoryOvercommit | default false) }}
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Cluster has overcommitted memory resource requests for Pods by {{`{{`}} $value | humanize {{`}}`}} bytes and cannot tolerate node failure.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeCPUQuotaOvercommit | default false) }}
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Cluster has overcommitted CPU resource requests for Namespaces.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubecpuquotaovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"}))
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"})
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeMemoryQuotaOvercommit | default false) }}
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Cluster has overcommitted memory resource requests for Namespaces.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubememoryquotaovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"}))
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"})
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeQuotaAlmostFull | default false) }}
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaalmostfull
|
||||
summary: Namespace quota is going to be full.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 0.9 < 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeQuotaFullyUsed | default false) }}
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotafullyused
|
||||
summary: Namespace quota is fully used.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
== 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeQuotaExceeded | default false) }}
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} $value | humanizePercentage {{`}}`}} of its {{`{{`}} $labels.resource {{`}}`}} quota.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubequotaexceeded
|
||||
summary: Namespace quota has exceeded the limits.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.CPUThrottlingHigh | default false) }}
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container {{`}}`}} in pod {{`{{`}} $labels.pod {{`}}`}}.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/cputhrottlinghigh
|
||||
summary: Processes experience elevated CPU throttling.
|
||||
expr: |-
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
|
||||
/
|
||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
|
||||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,130 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kubernetes-system-apiserver' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-apiserver" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-apiserver
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeClientCertificateExpiration | default false) }}
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 7.0 days.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeClientCertificateExpiration | default false) }}
|
||||
- alert: KubeClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: A client certificate used to authenticate to kubernetes apiserver is expiring in less than 24.0 hours.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeclientcertificateexpiration
|
||||
summary: Client certificate is about to expire.
|
||||
expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAggregatedAPIErrors | default false) }}
|
||||
- alert: KubeAggregatedAPIErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has reported errors. It has appeared unavailable {{`{{`}} $value | humanize {{`}}`}} times averaged over the past 10m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapierrors
|
||||
summary: Kubernetes aggregated API has reported errors.
|
||||
expr: sum by(name, namespace, cluster)(increase(aggregator_unavailable_apiservice_total[10m])) > 4
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAggregatedAPIDown | default false) }}
|
||||
- alert: KubeAggregatedAPIDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubernetes aggregated API {{`{{`}} $labels.name {{`}}`}}/{{`{{`}} $labels.namespace {{`}}`}} has been only {{`{{`}} $value | humanize {{`}}`}}% available over the last 10m.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeaggregatedapidown
|
||||
summary: Kubernetes aggregated API is down.
|
||||
expr: (1 - max by(name, namespace, cluster)(avg_over_time(aggregator_unavailable_apiservice[10m]))) * 100 < 85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeApiServer.enabled }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAPIDown | default false) }}
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: KubeAPI has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapidown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="apiserver"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeAPITerminatedRequests | default false) }}
|
||||
- alert: KubeAPITerminatedRequests
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeapiterminatedrequests
|
||||
summary: The kubernetes apiserver has terminated {{`{{`}} $value | humanizePercentage {{`}}`}} of its incoming requests.
|
||||
expr: sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) / ( sum(rate(apiserver_request_total{job="apiserver"}[10m])) + sum(rate(apiserver_request_terminations_total{job="apiserver"}[10m])) ) > 0.20
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,253 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kubernetes-system-kubelet' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system-kubelet" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-system-kubelet
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.KubeNodeNotReady | default false) }}
|
||||
- alert: KubeNodeNotReady
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than 15 minutes.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodenotready
|
||||
summary: Node is not ready.
|
||||
expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeNodeUnreachable | default false) }}
|
||||
- alert: KubeNodeUnreachable
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $labels.node {{`}}`}} is unreachable and some workloads may be rescheduled.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodeunreachable
|
||||
summary: Node is unreachable.
|
||||
expr: (kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} unless ignoring(key,value) kube_node_spec_taint{job="kube-state-metrics",key=~"ToBeDeletedByClusterAutoscaler|cloud.google.com/impending-node-termination|aws-node-termination-handler/spot-itn"}) == 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletTooManyPods | default false) }}
|
||||
- alert: KubeletTooManyPods
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet '{{`{{`}} $labels.node {{`}}`}}' is running at {{`{{`}} $value | humanizePercentage {{`}}`}} of its Pod capacity.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubelettoomanypods
|
||||
summary: Kubelet is running at capacity.
|
||||
expr: |-
|
||||
count by(cluster, node) (
|
||||
(kube_pod_status_phase{job="kube-state-metrics",phase="Running"} == 1) * on(instance,pod,namespace,cluster) group_left(node) topk by(instance,pod,namespace,cluster) (1, kube_pod_info{job="kube-state-metrics"})
|
||||
)
|
||||
/
|
||||
max by(cluster, node) (
|
||||
kube_node_status_capacity{job="kube-state-metrics",resource="pods"} != 1
|
||||
) > 0.95
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeNodeReadinessFlapping | default false) }}
|
||||
- alert: KubeNodeReadinessFlapping
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The readiness status of node {{`{{`}} $labels.node {{`}}`}} has changed {{`{{`}} $value {{`}}`}} times in the last 15 minutes.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubenodereadinessflapping
|
||||
summary: Node readiness status is flapping.
|
||||
expr: sum(changes(kube_node_status_condition{job="kube-state-metrics",status="true",condition="Ready"}[15m])) by (cluster, node) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletPlegDurationHigh | default false) }}
|
||||
- alert: KubeletPlegDurationHigh
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration of {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletplegdurationhigh
|
||||
summary: Kubelet Pod Lifecycle Event Generator is taking too long to relist.
|
||||
expr: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletPodStartUpLatencyHigh | default false) }}
|
||||
- alert: KubeletPodStartUpLatencyHigh
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet Pod startup 99th percentile latency is {{`{{`}} $value {{`}}`}} seconds on node {{`{{`}} $labels.node {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletpodstartuplatencyhigh
|
||||
summary: Kubelet Pod startup latency is too high.
|
||||
expr: histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="{{ include "exporter.kubelet.jobName" . }}", metrics_path="/metrics"}[5m])) by (cluster, instance, le)) * on(cluster, instance) group_left(node) kubelet_node_name{job="{{ include "exporter.kubelet.jobName" . }}", metrics_path="/metrics"} > 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
|
||||
summary: Kubelet client certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateExpiration | default false) }}
|
||||
- alert: KubeletClientCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Client certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificateexpiration
|
||||
summary: Kubelet client certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_client_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
|
||||
summary: Kubelet server certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 604800
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateExpiration | default false) }}
|
||||
- alert: KubeletServerCertificateExpiration
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Server certificate for Kubelet on node {{`{{`}} $labels.node {{`}}`}} expires in {{`{{`}} $value | humanizeDuration {{`}}`}}.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificateexpiration
|
||||
summary: Kubelet server certificate is about to expire.
|
||||
expr: kubelet_certificate_manager_server_ttl_seconds < 86400
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletClientCertificateRenewalErrors | default false) }}
|
||||
- alert: KubeletClientCertificateRenewalErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its client certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletclientcertificaterenewalerrors
|
||||
summary: Kubelet has failed to renew its client certificate.
|
||||
expr: increase(kubelet_certificate_manager_client_expiration_renew_errors[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletServerCertificateRenewalErrors | default false) }}
|
||||
- alert: KubeletServerCertificateRenewalErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet on node {{`{{`}} $labels.node {{`}}`}} has failed to renew its server certificate ({{`{{`}} $value | humanize {{`}}`}} errors in the last 5 minutes).
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletservercertificaterenewalerrors
|
||||
summary: Kubelet has failed to renew its server certificate.
|
||||
expr: increase(kubelet_server_expiration_renew_errors[5m]) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if (include "exporter.kubelet.enabled" .)}}
|
||||
{{- if not (.Values.defaultRules.disabled.KubeletDown | default false) }}
|
||||
- alert: KubeletDown
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Kubelet has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/kubernetes/kubeletdown
|
||||
summary: Target disappeared from Prometheus target discovery.
|
||||
expr: absent(up{job="{{ include "exporter.kubelet.jobName" . }}", metrics_path="/metrics"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,133 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'node-exporter.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/nodeExporter-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.nodeExporterRecording }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-exporter.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
count without (cpu, mode) (
|
||||
node_cpu_seconds_total{job="node-exporter",mode="idle"}
|
||||
)
|
||||
record: instance:node_num_cpu:sum
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
1 - avg without (cpu) (
|
||||
sum without (mode) (rate(node_cpu_seconds_total{job="node-exporter", mode=~"idle|iowait|steal"}[5m]))
|
||||
)
|
||||
record: instance:node_cpu_utilisation:rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
(
|
||||
node_load1{job="node-exporter"}
|
||||
/
|
||||
instance:node_num_cpu:sum{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_load1_per_cpu:ratio
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
1 - (
|
||||
(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"}
|
||||
or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Cached_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_MemFree_bytes{job="node-exporter"}
|
||||
+
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
)
|
||||
/
|
||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
)
|
||||
record: instance:node_memory_utilisation:ratio
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m])
|
||||
record: instance:node_vmstat_pgmajfault:rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
record: instance_device:node_disk_io_time_seconds:rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
record: instance_device:node_disk_io_time_weighted_seconds:rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_bytes_excluding_lo:rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_bytes_excluding_lo:rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_receive_drop_excluding_lo:rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
sum without (device) (
|
||||
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[5m])
|
||||
)
|
||||
record: instance:node_network_transmit_drop_excluding_lo:rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,398 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'node-exporter' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/nodeExporter-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.nodeExporterAlerting }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-exporter" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-exporter
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemSpaceFillingUp | default false) }}
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemSpaceFillingUp | default false) }}
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left and is filling up fast.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfSpace | default false) }}
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfSpace | default false) }}
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available space left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemFilesFillingUp | default false) }}
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemFilesFillingUp | default false) }}
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left and is filling up fast.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfFiles | default false) }}
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFilesystemAlmostOutOfFiles | default false) }}
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Filesystem on {{`{{`}} $labels.device {{`}}`}} at {{`{{`}} $labels.instance {{`}}`}} has only {{`{{`}} printf "%.2f" $value {{`}}`}}% available inodes left.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeNetworkReceiveErrs | default false) }}
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} receive errors in the last two minutes.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeNetworkTransmitErrs | default false) }}
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $labels.instance {{`}}`}} interface {{`{{`}} $labels.device {{`}}`}} has encountered {{`{{`}} printf "%.0f" $value {{`}}`}} transmit errors in the last two minutes.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeHighNumberConntrackEntriesUsed | default false) }}
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of conntrack entries are used.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack are getting close to the limit.
|
||||
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit) > 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeTextFileCollectorScrapeError | default false) }}
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Node Exporter text file collector failed to scrape.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: node_textfile_scrape_error{job="node-exporter"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeClockSkewDetected | default false) }}
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is out of sync by more than 0.05s. Ensure NTP is configured correctly on this host.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: |-
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeClockNotSynchronising | default false) }}
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Clock on {{`{{`}} $labels.instance {{`}}`}} is not synchronising. Ensure NTP is configured on this host.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: |-
|
||||
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
||||
and
|
||||
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeRAIDDegraded | default false) }}
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: RAID array '{{`{{`}} $labels.device {{`}}`}}' on {{`{{`}} $labels.instance {{`}}`}} is in degraded state due to one or more disks failures. Number of spare drives is insufficient to fix issue automatically.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddegraded
|
||||
summary: RAID Array is degraded
|
||||
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeRAIDDiskFailure | default false) }}
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: At least one device in RAID array on {{`{{`}} $labels.instance {{`}}`}} failed. Array '{{`{{`}} $labels.device {{`}}`}}' needs attention and possibly a disk swap.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/noderaiddiskfailure
|
||||
summary: Failed device in RAID array
|
||||
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} > 0
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFileDescriptorLimit | default false) }}
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.NodeFileDescriptorLimit | default false) }}
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: File descriptors limit at {{`{{`}} $labels.instance {{`}}`}} is currently at {{`{{`}} printf "%.2f" $value {{`}}`}}%.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,83 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetesControlPlane-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node.rules
|
||||
rules:
|
||||
- expr: |-
|
||||
topk by(cluster, namespace, pod) (1,
|
||||
max by (cluster, node, namespace, pod) (
|
||||
label_replace(kube_pod_info{job="kube-state-metrics",node!=""}, "pod", "$1", "pod", "(.*)")
|
||||
))
|
||||
record: 'node_namespace_pod:kube_pod_info:'
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
count by (cluster, node) (
|
||||
node_cpu_seconds_total{mode="idle",job="node-exporter"}
|
||||
* on (namespace, pod) group_left(node)
|
||||
topk by(namespace, pod) (1, node_namespace_pod:kube_pod_info:)
|
||||
)
|
||||
record: node:node_num_cpu:sum
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
sum(
|
||||
node_memory_MemAvailable_bytes{job="node-exporter"} or
|
||||
(
|
||||
node_memory_Buffers_bytes{job="node-exporter"} +
|
||||
node_memory_Cached_bytes{job="node-exporter"} +
|
||||
node_memory_MemFree_bytes{job="node-exporter"} +
|
||||
node_memory_Slab_bytes{job="node-exporter"}
|
||||
)
|
||||
) by (cluster)
|
||||
record: :node_memory_MemAvailable_bytes:sum
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
avg by (cluster, node) (
|
||||
sum without (mode) (
|
||||
rate(node_cpu_seconds_total{mode!="idle",mode!="iowait",mode!="steal",job="node-exporter"}[5m])
|
||||
)
|
||||
)
|
||||
record: node:node_cpu_utilization:ratio_rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
- expr: |-
|
||||
avg by (cluster) (
|
||||
node:node_cpu_utilization:ratio_rate5m
|
||||
)
|
||||
record: cluster:node_cpu:ratio_rate5m
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
labels:
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,148 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheusOperator-prometheusRule.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }}
|
||||
{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: prometheus-operator
|
||||
rules:
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorListErrors | default false) }}
|
||||
- alert: PrometheusOperatorListErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Errors while performing List operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorlisterrors
|
||||
summary: Errors while performing list operations in controller.
|
||||
expr: (sum by (controller,namespace,cluster) (rate(prometheus_operator_list_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m])) / sum by (controller,namespace,cluster) (rate(prometheus_operator_list_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[10m]))) > 0.4
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorWatchErrors | default false) }}
|
||||
- alert: PrometheusOperatorWatchErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Errors while performing watch operations in controller {{`{{`}}$labels.controller{{`}}`}} in {{`{{`}}$labels.namespace{{`}}`}} namespace.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorwatcherrors
|
||||
summary: Errors while performing watch operations in controller.
|
||||
expr: (sum by (controller,namespace,cluster) (rate(prometheus_operator_watch_operations_failed_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m])) / sum by (controller,namespace,cluster) (rate(prometheus_operator_watch_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.4
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorSyncFailed | default false) }}
|
||||
- alert: PrometheusOperatorSyncFailed
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Controller {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} namespace fails to reconcile {{`{{`}} $value {{`}}`}} objects.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorsyncfailed
|
||||
summary: Last controller reconciliation failed
|
||||
expr: min_over_time(prometheus_operator_syncs{status="failed",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorReconcileErrors | default false) }}
|
||||
- alert: PrometheusOperatorReconcileErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: '{{`{{`}} $value | humanizePercentage {{`}}`}} of reconciling operations failed for {{`{{`}} $labels.controller {{`}}`}} controller in {{`{{`}} $labels.namespace {{`}}`}} namespace.'
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorreconcileerrors
|
||||
summary: Errors while reconciling controller.
|
||||
expr: (sum by (controller,namespace,cluster) (rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) / (sum by (controller,namespace,cluster) (rate(prometheus_operator_reconcile_operations_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]))) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorNodeLookupErrors | default false) }}
|
||||
- alert: PrometheusOperatorNodeLookupErrors
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornodelookuperrors
|
||||
summary: Errors while reconciling Prometheus.
|
||||
expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorNotReady | default false) }}
|
||||
- alert: PrometheusOperatorNotReady
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace isn't ready to reconcile {{`{{`}} $labels.controller {{`}}`}} resources.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatornotready
|
||||
summary: Prometheus operator not ready
|
||||
expr: min by (controller,namespace,cluster) (max_over_time(prometheus_operator_ready{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) == 0)
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if not (.Values.defaultRules.disabled.PrometheusOperatorRejectedResources | default false) }}
|
||||
- alert: PrometheusOperatorRejectedResources
|
||||
annotations:
|
||||
{{- if .Values.defaultRules.additionalRuleAnnotations }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleAnnotations | indent 8 }}
|
||||
{{- end }}
|
||||
description: Prometheus operator in {{`{{`}} $labels.namespace {{`}}`}} namespace rejected {{`{{`}} printf "%0.0f" $value {{`}}`}} {{`{{`}} $labels.controller {{`}}`}}/{{`{{`}} $labels.resource {{`}}`}} resources.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}/prometheus-operator/prometheusoperatorrejectedresources
|
||||
summary: Resources rejected by Prometheus operator
|
||||
expr: min_over_time(prometheus_operator_managed_resources{state="rejected",job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | nindent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,56 +0,0 @@
|
|||
{{- if and .Values.prometheus.enabled .Values.prometheus.serviceMonitor.selfMonitor }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
||||
name: {{ template "kube-prometheus-stack.fullname" . }}-prometheus
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}-prometheus
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- with .Values.prometheus.serviceMonitor.additionalLabels }}
|
||||
{{- toYaml . | nindent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
{{- include "servicemonitor.scrapeLimits" .Values.prometheus.serviceMonitor | nindent 2 }}
|
||||
selector:
|
||||
matchLabels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}-prometheus
|
||||
release: {{ $.Release.Name | quote }}
|
||||
self-monitor: "true"
|
||||
namespaceSelector:
|
||||
matchNames:
|
||||
- {{ printf "%s" (include "kube-prometheus-stack.namespace" .) | quote }}
|
||||
endpoints:
|
||||
- port: {{ .Values.prometheus.prometheusSpec.portName }}
|
||||
{{- if .Values.prometheus.serviceMonitor.interval }}
|
||||
interval: {{ .Values.prometheus.serviceMonitor.interval }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.serviceMonitor.scheme }}
|
||||
scheme: {{ .Values.prometheus.serviceMonitor.scheme }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.serviceMonitor.tlsConfig }}
|
||||
tlsConfig: {{ toYaml .Values.prometheus.serviceMonitor.tlsConfig | nindent 6 }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.serviceMonitor.bearerTokenFile }}
|
||||
bearerTokenFile: {{ .Values.prometheus.serviceMonitor.bearerTokenFile }}
|
||||
{{- end }}
|
||||
path: "{{ trimSuffix "/" .Values.prometheus.prometheusSpec.routePrefix }}/metrics"
|
||||
metricRelabelings:
|
||||
{{- if .Values.prometheus.serviceMonitor.metricRelabelings }}
|
||||
{{ tpl (toYaml .Values.prometheus.serviceMonitor.metricRelabelings | indent 6) . }}
|
||||
{{- end }}
|
||||
{{ if .Values.global.cattle.clusterId }}
|
||||
- sourceLabels: [__address__]
|
||||
targetLabel: cluster_id
|
||||
replacement: {{ .Values.global.cattle.clusterId }}
|
||||
{{- end }}
|
||||
{{ if .Values.global.cattle.clusterName}}
|
||||
- sourceLabels: [__address__]
|
||||
targetLabel: cluster_name
|
||||
replacement: {{ .Values.global.cattle.clusterName }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.serviceMonitor.relabelings }}
|
||||
relabelings:
|
||||
{{ toYaml .Values.prometheus.serviceMonitor.relabelings | indent 6 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -0,0 +1,5 @@
|
|||
root = true
|
||||
|
||||
[files/dashboards/*.json]
|
||||
indent_size = 2
|
||||
indent_style = space
|
|
@ -26,3 +26,4 @@ ci/
|
|||
kube-prometheus-*.tgz
|
||||
|
||||
unittests/
|
||||
files/dashboards/
|
|
@ -20,98 +20,76 @@ annotations:
|
|||
catalog.cattle.io/requests-memory: 4000Mi
|
||||
catalog.cattle.io/type: cluster-tool
|
||||
catalog.cattle.io/ui-component: monitoring
|
||||
catalog.cattle.io/upstream-version: 45.31.1
|
||||
catalog.cattle.io/upstream-version: 57.0.3
|
||||
apiVersion: v2
|
||||
appVersion: v0.65.1
|
||||
appVersion: v0.72.0
|
||||
dependencies:
|
||||
- condition: grafana.enabled
|
||||
name: grafana
|
||||
repository: file://./charts/grafana
|
||||
version: '*'
|
||||
- condition: hardenedKubelet.enabled
|
||||
name: hardenedKubelet
|
||||
repository: file://./charts/hardenedKubelet
|
||||
version: '*'
|
||||
- condition: hardenedNodeExporter.enabled
|
||||
name: hardenedNodeExporter
|
||||
repository: file://./charts/hardenedNodeExporter
|
||||
version: '*'
|
||||
- condition: k3sServer.enabled
|
||||
name: k3sServer
|
||||
repository: file://./charts/k3sServer
|
||||
version: '*'
|
||||
- condition: kubeStateMetrics.enabled
|
||||
name: kube-state-metrics
|
||||
repository: file://./charts/kube-state-metrics
|
||||
version: '*'
|
||||
- condition: kubeAdmControllerManager.enabled
|
||||
name: kubeAdmControllerManager
|
||||
repository: file://./charts/kubeAdmControllerManager
|
||||
version: '*'
|
||||
- condition: kubeAdmEtcd.enabled
|
||||
name: kubeAdmEtcd
|
||||
repository: file://./charts/kubeAdmEtcd
|
||||
version: '*'
|
||||
- condition: kubeAdmProxy.enabled
|
||||
name: kubeAdmProxy
|
||||
repository: file://./charts/kubeAdmProxy
|
||||
version: '*'
|
||||
- condition: kubeAdmScheduler.enabled
|
||||
name: kubeAdmScheduler
|
||||
repository: file://./charts/kubeAdmScheduler
|
||||
version: '*'
|
||||
- condition: prometheus-adapter.enabled
|
||||
name: prometheus-adapter
|
||||
repository: file://./charts/prometheus-adapter
|
||||
version: '*'
|
||||
- condition: nodeExporter.enabled
|
||||
name: prometheus-node-exporter
|
||||
repository: file://./charts/prometheus-node-exporter
|
||||
version: '*'
|
||||
- condition: rke2ControllerManager.enabled
|
||||
name: rke2ControllerManager
|
||||
repository: file://./charts/rke2ControllerManager
|
||||
version: '*'
|
||||
- condition: rke2Etcd.enabled
|
||||
name: rke2Etcd
|
||||
repository: file://./charts/rke2Etcd
|
||||
version: '*'
|
||||
- condition: rke2IngressNginx.enabled
|
||||
name: rke2IngressNginx
|
||||
repository: file://./charts/rke2IngressNginx
|
||||
version: '*'
|
||||
- condition: rke2Proxy.enabled
|
||||
name: rke2Proxy
|
||||
repository: file://./charts/rke2Proxy
|
||||
version: '*'
|
||||
- condition: rke2Scheduler.enabled
|
||||
name: rke2Scheduler
|
||||
repository: file://./charts/rke2Scheduler
|
||||
version: '*'
|
||||
- condition: rkeControllerManager.enabled
|
||||
name: rkeControllerManager
|
||||
repository: file://./charts/rkeControllerManager
|
||||
version: '*'
|
||||
- condition: rkeEtcd.enabled
|
||||
name: rkeEtcd
|
||||
repository: file://./charts/rkeEtcd
|
||||
version: '*'
|
||||
- condition: rkeIngressNginx.enabled
|
||||
name: rkeIngressNginx
|
||||
repository: file://./charts/rkeIngressNginx
|
||||
version: '*'
|
||||
- condition: rkeProxy.enabled
|
||||
name: rkeProxy
|
||||
repository: file://./charts/rkeProxy
|
||||
version: '*'
|
||||
- condition: rkeScheduler.enabled
|
||||
name: rkeScheduler
|
||||
repository: file://./charts/rkeScheduler
|
||||
version: '*'
|
||||
- condition: global.cattle.windows.enabled
|
||||
- condition: windowsExporter.enabled
|
||||
name: windowsExporter
|
||||
repository: file://./charts/windowsExporter
|
||||
version: '*'
|
||||
description: kube-prometheus-stack collects Kubernetes manifests, Grafana dashboards,
|
||||
and Prometheus rules combined with documentation and scripts to provide easy to
|
||||
operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus
|
||||
|
@ -122,7 +100,7 @@ keywords:
|
|||
- operator
|
||||
- prometheus
|
||||
- kube-prometheus
|
||||
kubeVersion: '>=1.26.0-0'
|
||||
kubeVersion: '>=1.19.0-0'
|
||||
maintainers:
|
||||
- email: andrew@quadcorps.co.uk
|
||||
name: andrewgkew
|
||||
|
@ -132,6 +110,8 @@ maintainers:
|
|||
name: gkarthiks
|
||||
- email: kube-prometheus-stack@sisti.pt
|
||||
name: GMartinez-Sisti
|
||||
- email: github@jkroepke.de
|
||||
name: jkroepke
|
||||
- email: scott@r6by.com
|
||||
name: scottrigby
|
||||
- email: miroslav.hadzhiev@gmail.com
|
||||
|
@ -143,4 +123,4 @@ sources:
|
|||
- https://github.com/prometheus-community/helm-charts
|
||||
- https://github.com/prometheus-operator/kube-prometheus
|
||||
type: application
|
||||
version: 104.0.0-rc1+up45.31.1
|
||||
version: 104.0.0-rc1+up57.0.3
|
|
@ -8,7 +8,7 @@ _Note: This chart was formerly named `prometheus-operator` chart, now renamed to
|
|||
|
||||
## Prerequisites
|
||||
|
||||
- Kubernetes 1.16+
|
||||
- Kubernetes 1.19+
|
||||
- Helm 3+
|
||||
|
||||
## Get Helm Repository Info
|
||||
|
@ -59,8 +59,10 @@ kubectl delete crd alertmanagerconfigs.monitoring.coreos.com
|
|||
kubectl delete crd alertmanagers.monitoring.coreos.com
|
||||
kubectl delete crd podmonitors.monitoring.coreos.com
|
||||
kubectl delete crd probes.monitoring.coreos.com
|
||||
kubectl delete crd prometheusagents.monitoring.coreos.com
|
||||
kubectl delete crd prometheuses.monitoring.coreos.com
|
||||
kubectl delete crd prometheusrules.monitoring.coreos.com
|
||||
kubectl delete crd scrapeconfigs.monitoring.coreos.com
|
||||
kubectl delete crd servicemonitors.monitoring.coreos.com
|
||||
kubectl delete crd thanosrulers.monitoring.coreos.com
|
||||
```
|
||||
|
@ -80,9 +82,249 @@ _See [helm upgrade](https://helm.sh/docs/helm/helm_upgrade/) for command documen
|
|||
|
||||
A major chart version change (like v1.2.3 -> v2.0.0) indicates that there is an incompatible breaking change needing manual actions.
|
||||
|
||||
### From 56.x to 57.x
|
||||
|
||||
This version upgrades Prometheus-Operator to v0.72.0
|
||||
|
||||
Run these commands to update the CRDs before applying the upgrade.
|
||||
|
||||
```console
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusagents.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.72.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
|
||||
```
|
||||
|
||||
### From 55.x to 56.x
|
||||
|
||||
This version upgrades Prometheus-Operator to v0.71.0, Prometheus to 2.49.1
|
||||
|
||||
Run these commands to update the CRDs before applying the upgrade.
|
||||
|
||||
```console
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusagents.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.71.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
|
||||
```
|
||||
|
||||
### From 54.x to 55.x
|
||||
|
||||
This version upgrades Prometheus-Operator to v0.70.0
|
||||
|
||||
Run these commands to update the CRDs before applying the upgrade.
|
||||
|
||||
```console
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusagents.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.70.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
|
||||
```
|
||||
|
||||
### From 53.x to 54.x
|
||||
|
||||
Grafana Helm Chart has bumped to version 7
|
||||
|
||||
Please note Grafana Helm Chart [changelog](https://github.com/grafana/helm-charts/tree/main/charts/grafana#to-700).
|
||||
|
||||
### From 52.x to 53.x
|
||||
|
||||
This version upgrades Prometheus-Operator to v0.69.1, Prometheus to 2.47.2
|
||||
|
||||
Run these commands to update the CRDs before applying the upgrade.
|
||||
|
||||
```console
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusagents.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.69.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
|
||||
```
|
||||
|
||||
### From 51.x to 52.x
|
||||
|
||||
This includes the ability to select between using existing secrets or create new secret objects for various thanos config. The defaults have not changed but if you were setting:
|
||||
|
||||
- `thanosRuler.thanosRulerSpec.alertmanagersConfig` or
|
||||
- `thanosRuler.thanosRulerSpec.objectStorageConfig` or
|
||||
- `thanosRuler.thanosRulerSpec.queryConfig` or
|
||||
- `prometheus.prometheusSpec.thanos.objectStorageConfig`
|
||||
|
||||
you will have to need to set `existingSecret` or `secret` based on your requirement
|
||||
|
||||
For instance, the `thanosRuler.thanosRulerSpec.alertmanagersConfig` used to be configured as follow:
|
||||
|
||||
```yaml
|
||||
thanosRuler:
|
||||
thanosRulerSpec:
|
||||
alertmanagersConfig:
|
||||
alertmanagers:
|
||||
- api_version: v2
|
||||
http_config:
|
||||
basic_auth:
|
||||
username: some_user
|
||||
password: some_pass
|
||||
static_configs:
|
||||
- alertmanager.thanos.io
|
||||
scheme: http
|
||||
timeout: 10s
|
||||
```
|
||||
|
||||
But it now moved to:
|
||||
|
||||
```yaml
|
||||
thanosRuler:
|
||||
thanosRulerSpec:
|
||||
alertmanagersConfig:
|
||||
secret:
|
||||
alertmanagers:
|
||||
- api_version: v2
|
||||
http_config:
|
||||
basic_auth:
|
||||
username: some_user
|
||||
password: some_pass
|
||||
static_configs:
|
||||
- alertmanager.thanos.io
|
||||
scheme: http
|
||||
timeout: 10s
|
||||
```
|
||||
|
||||
or the `thanosRuler.thanosRulerSpec.objectStorageConfig` used to be configured as follow:
|
||||
|
||||
```yaml
|
||||
thanosRuler:
|
||||
thanosRulerSpec:
|
||||
objectStorageConfig:
|
||||
name: existing-secret-not-created-by-this-chart
|
||||
key: object-storage-configs.yaml
|
||||
```
|
||||
|
||||
But it now moved to:
|
||||
|
||||
```yaml
|
||||
thanosRuler:
|
||||
thanosRulerSpec:
|
||||
objectStorageConfig:
|
||||
existingSecret:
|
||||
name: existing-secret-not-created-by-this-chart
|
||||
key: object-storage-configs.yaml
|
||||
```
|
||||
|
||||
### From 50.x to 51.x
|
||||
|
||||
This version upgrades Prometheus-Operator to v0.68.0, Prometheus to 2.47.0 and Thanos to v0.32.2
|
||||
|
||||
Run these commands to update the CRDs before applying the upgrade.
|
||||
|
||||
```console
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusagents.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.68.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
|
||||
```
|
||||
|
||||
### From 49.x to 50.x
|
||||
|
||||
This version requires Kubernetes 1.19+.
|
||||
|
||||
We do not expect any breaking changes in this version.
|
||||
|
||||
### From 48.x to 49.x
|
||||
|
||||
This version upgrades Prometheus-Operator to v0.67.1, 0, Alertmanager to v0.26.0, Prometheus to 2.46.0 and Thanos to v0.32.0
|
||||
|
||||
Run these commands to update the CRDs before applying the upgrade.
|
||||
|
||||
```console
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusagents.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.67.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
|
||||
```
|
||||
|
||||
### From 47.x to 48.x
|
||||
|
||||
This version moved all CRDs into a dedicated sub-chart. No new CRDs are introduced in this version.
|
||||
See [#3548](https://github.com/prometheus-community/helm-charts/issues/3548) for more context.
|
||||
|
||||
We do not expect any breaking changes in this version.
|
||||
|
||||
### From 46.x to 47.x
|
||||
|
||||
This version upgrades Prometheus-Operator to v0.66.0 with new CRDs (PrometheusAgent and ScrapeConfig).
|
||||
|
||||
Run these commands to update the CRDs before applying the upgrade.
|
||||
|
||||
```console
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusagents.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.66.0/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
|
||||
```
|
||||
|
||||
### From 45.x to 46.x
|
||||
|
||||
This version upgrades Prometheus-Operator to v0.65.1 with new CRDs (PrometheusAgent and ScrapeConfig), Prometheus to v2.44.0 and Thanos to v0.31.0.
|
||||
|
||||
Run these commands to update the CRDs before applying the upgrade.
|
||||
|
||||
```console
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagerconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_alertmanagers.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_podmonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_probes.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusagents.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheuses.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_scrapeconfigs.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_servicemonitors.yaml
|
||||
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.65.1/example/prometheus-operator-crd/monitoring.coreos.com_thanosrulers.yaml
|
||||
```
|
||||
|
||||
### From 44.x to 45.x
|
||||
|
||||
This version upgrades Prometheus-Operator to v0.63.0, Prometheus to v2.43.0 and Thanos to v0.30.2.
|
||||
This version upgrades Prometheus-Operator to v0.63.0, Prometheus to v2.42.0 and Thanos to v0.30.2.
|
||||
|
||||
Run these commands to update the CRDs before applying the upgrade.
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue