mirror of https://git.rancher.io/charts
(dev-v2.6-archive) [rancher-monitoring-crd] code refactor
(partially cherry picked from commit cfc2433953
)
pull/1680/head
parent
7f2f418b2c
commit
5f1dde5a21
|
@ -1,2 +1,2 @@
|
|||
url: https://github.com/rancher/backup-restore-operator/releases/download/v2.1.0-rc1/rancher-backup-crd-2.1.0-rc1.tgz
|
||||
version: 2.1.0
|
||||
url: https://github.com/rancher/backup-restore-operator/releases/download/v2.0.1/rancher-backup-crd-2.0.1.tgz
|
||||
version: 2.0.1
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
url: https://github.com/rancher/backup-restore-operator/releases/download/v2.1.0-rc1/rancher-backup-2.1.0-rc1.tgz
|
||||
version: 2.1.0
|
||||
url: https://github.com/rancher/backup-restore-operator/releases/download/v2.0.1/rancher-backup-2.0.1.tgz
|
||||
version: 2.0.1
|
||||
|
|
|
@ -8,7 +8,7 @@ image:
|
|||
tag: v1.0.6
|
||||
securityScan:
|
||||
repository: rancher/security-scan
|
||||
tag: v0.2.5-rc2
|
||||
tag: v0.2.5-rc1
|
||||
sonobuoy:
|
||||
repository: rancher/mirrored-sonobuoy-sonobuoy
|
||||
tag: v0.53.2
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
--- charts-original/Chart.yaml
|
||||
+++ charts/Chart.yaml
|
||||
@@ -4,12 +4,12 @@
|
||||
catalog.cattle.io/kube-version: < 1.22.0
|
||||
@@ -3,12 +3,12 @@
|
||||
catalog.cattle.io/display-name: External IP Webhook
|
||||
catalog.cattle.io/namespace: cattle-externalip-system
|
||||
catalog.cattle.io/os: linux
|
||||
- catalog.cattle.io/release-name: rancher-externalip-webhook
|
||||
|
@ -9,14 +9,14 @@
|
|||
+ catalog.cattle.io/release-name: rancher-external-ip-webhook
|
||||
+ catalog.cattle.io/ui-component: rancher-external-ip-webhook
|
||||
apiVersion: v1
|
||||
appVersion: v1.0.1-rc2
|
||||
appVersion: v1.0.1-rc1
|
||||
description: |
|
||||
- Deploy the externalip-webhook to mitigate k8s CVE-2020-8554
|
||||
+ Deploy the external-ip-webhook to mitigate k8s CVE-2020-8554
|
||||
home: https://github.com/rancher/externalip-webhook
|
||||
keywords:
|
||||
- cve
|
||||
@@ -20,7 +20,7 @@
|
||||
@@ -19,7 +19,7 @@
|
||||
maintainers:
|
||||
- email: raul@rancher.com
|
||||
name: rawmind0
|
||||
|
@ -24,4 +24,4 @@
|
|||
+name: rancher-external-ip-webhook
|
||||
sources:
|
||||
- https://github.com/rancher/externalip-webhook
|
||||
version: 1.0.1-rc2
|
||||
version: 1.0.1-rc1
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
url: https://github.com/rancher/externalip-webhook/releases/download/v1.0.1-rc2/rancher-externalip-webhook-1.0.1-rc2.tgz
|
||||
url: https://github.com/rancher/externalip-webhook/releases/download/v1.0.1-rc1/rancher-externalip-webhook-1.0.1-rc1.tgz
|
||||
version: 100.0.1
|
||||
|
|
|
@ -10,9 +10,9 @@
|
|||
apiVersion: v2
|
||||
-name: grafana
|
||||
+name: rancher-grafana
|
||||
version: 6.11.0
|
||||
-appVersion: 7.5.5
|
||||
+appVersion: 7.5.8
|
||||
version: 6.16.14
|
||||
-appVersion: 8.2.1
|
||||
+appVersion: 7.5.11
|
||||
kubeVersion: '^1.8.0-0'
|
||||
description: The leading tool for querying and visualizing time series and metrics.
|
||||
home: https://grafana.net
|
||||
|
|
|
@ -0,0 +1,24 @@
|
|||
--- charts-original/README.md
|
||||
+++ charts/README.md
|
||||
@@ -59,8 +59,8 @@
|
||||
| `securityContext` | Deployment securityContext | `{"runAsUser": 472, "runAsGroup": 472, "fsGroup": 472}` |
|
||||
| `priorityClassName` | Name of Priority Class to assign pods | `nil` |
|
||||
| `image.repository` | Image repository | `grafana/grafana` |
|
||||
-| `image.tag` | Image tag (`Must be >= 5.0.0`) | `8.0.3` |
|
||||
-| `image.sha` | Image sha (optional) | `80c6d6ac633ba5ab3f722976fb1d9a138f87ca6a9934fcd26a5fc28cbde7dbfa` |
|
||||
+| `image.tag` | Image tag (`Must be >= 5.0.0`) | `7.5.11` |
|
||||
+| `image.sha` | Image sha (optional) | `` |
|
||||
| `image.pullPolicy` | Image pull policy | `IfNotPresent` |
|
||||
| `image.pullSecrets` | Image pull secrets | `{}` |
|
||||
| `service.enabled` | Enable grafana service | `true` |
|
||||
@@ -188,8 +188,8 @@
|
||||
| `rbac.extraRoleRules` | Additional rules to add to the Role | [] |
|
||||
| `rbac.extraClusterRoleRules` | Additional rules to add to the ClusterRole | [] |
|
||||
| `command` | Define command to be executed by grafana container at startup | `nil` |
|
||||
-| `testFramework.enabled` | Whether to create test-related resources | `true` |
|
||||
-| `testFramework.image` | `test-framework` image repository. | `bats/bats` |
|
||||
+| `testFramework.enabled` | Whether to create test-related resources | `false` |
|
||||
+| `testFramework.image` | `test-framework` image repository. | `rancher/mirrored-bats-bats` |
|
||||
| `testFramework.tag` | `test-framework` image tag. | `v1.1.0` |
|
||||
| `testFramework.imagePullPolicy` | `test-framework` image pull policy. | `IfNotPresent` |
|
||||
| `testFramework.securityContext` | `test-framework` securityContext | `{}` |
|
|
@ -5,7 +5,7 @@
|
|||
{{- define "grafana.pod" -}}
|
||||
{{- if .Values.schedulerName }}
|
||||
schedulerName: "{{ .Values.schedulerName }}"
|
||||
@@ -21,9 +20,9 @@
|
||||
@@ -22,9 +21,9 @@
|
||||
{{- if ( and .Values.persistence.enabled .Values.initChownData.enabled ) }}
|
||||
- name: init-chown-data
|
||||
{{- if .Values.initChownData.image.sha }}
|
||||
|
@ -17,7 +17,7 @@
|
|||
{{- end }}
|
||||
imagePullPolicy: {{ .Values.initChownData.image.pullPolicy }}
|
||||
securityContext:
|
||||
@@ -42,9 +41,9 @@
|
||||
@@ -43,9 +42,9 @@
|
||||
{{- if .Values.dashboards }}
|
||||
- name: download-dashboards
|
||||
{{- if .Values.downloadDashboardsImage.sha }}
|
||||
|
@ -29,7 +29,7 @@
|
|||
{{- end }}
|
||||
imagePullPolicy: {{ .Values.downloadDashboardsImage.pullPolicy }}
|
||||
command: ["/bin/sh"]
|
||||
@@ -79,11 +78,16 @@
|
||||
@@ -80,11 +79,16 @@
|
||||
{{- if .Values.sidecar.datasources.enabled }}
|
||||
- name: {{ template "grafana.name" . }}-sc-datasources
|
||||
{{- if .Values.sidecar.image.sha }}
|
||||
|
@ -48,7 +48,7 @@
|
|||
env:
|
||||
- name: METHOD
|
||||
value: LIST
|
||||
@@ -118,9 +122,9 @@
|
||||
@@ -119,9 +123,9 @@
|
||||
{{- if .Values.sidecar.notifiers.enabled }}
|
||||
- name: {{ template "grafana.name" . }}-sc-notifiers
|
||||
{{- if .Values.sidecar.image.sha }}
|
||||
|
@ -60,7 +60,7 @@
|
|||
{{- end }}
|
||||
imagePullPolicy: {{ .Values.sidecar.imagePullPolicy }}
|
||||
env:
|
||||
@@ -163,9 +167,9 @@
|
||||
@@ -165,9 +169,9 @@
|
||||
{{- if .Values.sidecar.dashboards.enabled }}
|
||||
- name: {{ template "grafana.name" . }}-sc-dashboard
|
||||
{{- if .Values.sidecar.image.sha }}
|
||||
|
@ -72,7 +72,7 @@
|
|||
{{- end }}
|
||||
imagePullPolicy: {{ .Values.sidecar.imagePullPolicy }}
|
||||
env:
|
||||
@@ -205,9 +209,9 @@
|
||||
@@ -207,9 +211,9 @@
|
||||
{{- end}}
|
||||
- name: {{ .Chart.Name }}
|
||||
{{- if .Values.image.sha }}
|
||||
|
@ -84,7 +84,7 @@
|
|||
{{- end }}
|
||||
imagePullPolicy: {{ .Values.image.pullPolicy }}
|
||||
{{- if .Values.command }}
|
||||
@@ -307,7 +311,7 @@
|
||||
@@ -315,7 +319,7 @@
|
||||
{{- end }}
|
||||
ports:
|
||||
- name: {{ .Values.service.portName }}
|
||||
|
@ -93,7 +93,7 @@
|
|||
protocol: TCP
|
||||
- name: {{ .Values.podPortName }}
|
||||
containerPort: 3000
|
||||
@@ -388,17 +392,17 @@
|
||||
@@ -402,17 +406,17 @@
|
||||
{{- with .Values.extraContainers }}
|
||||
{{ tpl . $ | indent 2 }}
|
||||
{{- end }}
|
||||
|
|
|
@ -21,7 +21,7 @@
|
|||
{{- end }}
|
||||
imagePullPolicy: {{ .Values.imageRenderer.image.pullPolicy }}
|
||||
{{- if .Values.imageRenderer.command }}
|
||||
@@ -97,16 +99,16 @@
|
||||
@@ -101,16 +103,16 @@
|
||||
resources:
|
||||
{{ toYaml . | indent 12 }}
|
||||
{{- end }}
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
--- charts-original/templates/podsecuritypolicy.yaml
|
||||
+++ charts/templates/podsecuritypolicy.yaml
|
||||
@@ -6,13 +6,9 @@
|
||||
namespace: {{ template "grafana.namespace" . }}
|
||||
@@ -5,13 +5,9 @@
|
||||
name: {{ template "grafana.fullname" . }}
|
||||
labels:
|
||||
{{- include "grafana.labels" . | nindent 4 }}
|
||||
- annotations:
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
--- charts-original/templates/tests/test.yaml
|
||||
+++ charts/templates/tests/test.yaml
|
||||
@@ -33,7 +33,7 @@
|
||||
{{- end }}
|
||||
containers:
|
||||
- name: {{ .Release.Name }}-test
|
||||
- image: "{{ .Values.testFramework.image}}:{{ .Values.testFramework.tag }}"
|
||||
+ image: "{{ template "system_default_registry" . }}{{ .Values.testFramework.image}}:{{ .Values.testFramework.tag }}"
|
||||
imagePullPolicy: "{{ .Values.testFramework.imagePullPolicy}}"
|
||||
command: ["/opt/bats/bin/bats", "-t", "/tests/run.sh"]
|
||||
volumeMounts:
|
|
@ -1,12 +1,10 @@
|
|||
--- charts-original/values.yaml
|
||||
+++ charts/values.yaml
|
||||
@@ -1,9 +1,25 @@
|
||||
@@ -1,9 +1,23 @@
|
||||
+global:
|
||||
+ cattle:
|
||||
+ systemDefaultRegistry: ""
|
||||
+
|
||||
+autoscaling:
|
||||
+ enabled: false
|
||||
rbac:
|
||||
create: true
|
||||
## Use an existing ClusterRole/Role (depending on rbac.namespaced false/true)
|
||||
|
@ -27,22 +25,24 @@
|
|||
namespaced: false
|
||||
extraRoleRules: []
|
||||
# - apiGroups: []
|
||||
@@ -68,8 +84,8 @@
|
||||
@@ -69,8 +83,8 @@
|
||||
# schedulerName: "default-scheduler"
|
||||
|
||||
image:
|
||||
- repository: grafana/grafana
|
||||
- tag: 7.5.5
|
||||
- tag: 8.2.1
|
||||
+ repository: rancher/mirrored-grafana-grafana
|
||||
+ tag: 7.5.8
|
||||
+ tag: 7.5.11
|
||||
sha: ""
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
@@ -82,12 +98,15 @@
|
||||
@@ -82,13 +96,16 @@
|
||||
# - myRegistrKeySecretName
|
||||
|
||||
testFramework:
|
||||
enabled: true
|
||||
- enabled: true
|
||||
- image: "bats/bats"
|
||||
+ enabled: false
|
||||
+ image: "rancher/mirrored-bats-bats"
|
||||
tag: "v1.1.0"
|
||||
imagePullPolicy: IfNotPresent
|
||||
|
@ -56,7 +56,7 @@
|
|||
runAsUser: 472
|
||||
runAsGroup: 472
|
||||
fsGroup: 472
|
||||
@@ -115,8 +134,8 @@
|
||||
@@ -116,8 +133,8 @@
|
||||
# priorityClassName:
|
||||
|
||||
downloadDashboardsImage:
|
||||
|
@ -67,7 +67,16 @@
|
|||
sha: ""
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
@@ -301,7 +320,7 @@
|
||||
@@ -189,7 +206,7 @@
|
||||
labels: {}
|
||||
path: /
|
||||
|
||||
- # pathType is only for k8s >= 1.1=
|
||||
+ # pathType is only for k8s >= 1.18
|
||||
pathType: Prefix
|
||||
|
||||
hosts:
|
||||
@@ -303,7 +320,7 @@
|
||||
## initChownData container image
|
||||
##
|
||||
image:
|
||||
|
@ -76,18 +85,16 @@
|
|||
tag: "1.31.1"
|
||||
sha: ""
|
||||
pullPolicy: IfNotPresent
|
||||
@@ -602,8 +621,8 @@
|
||||
@@ -614,7 +631,7 @@
|
||||
## Requires at least Grafana 5 to work and can't be used together with parameters dashboardProviders, datasources and dashboards
|
||||
sidecar:
|
||||
image:
|
||||
- repository: quay.io/kiwigrid/k8s-sidecar
|
||||
- tag: 1.10.7
|
||||
+ repository: rancher/mirrored-kiwigrid-k8s-sidecar
|
||||
+ tag: 1.12.2
|
||||
tag: 1.12.3
|
||||
sha: ""
|
||||
imagePullPolicy: IfNotPresent
|
||||
resources: {}
|
||||
@@ -690,9 +709,9 @@
|
||||
@@ -702,9 +719,9 @@
|
||||
replicas: 1
|
||||
image:
|
||||
# image-renderer Image repository
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
url: https://github.com/grafana/helm-charts.git
|
||||
subdirectory: charts/grafana
|
||||
commit: bf7e1110a5ee9258190d0377fea319bb8e764e62
|
||||
commit: ad033c2a2b1f81284a924fb8627ca00700952fc5
|
||||
version: 100.0.0
|
||||
doNotRelease: true
|
||||
|
|
|
@ -8,27 +8,27 @@
|
|||
{{- end }}
|
||||
{{- if .Values.priorityClassName }}
|
||||
priorityClassName: {{ .Values.priorityClassName }}
|
||||
@@ -173,7 +174,7 @@
|
||||
@@ -107,7 +108,7 @@
|
||||
readOnly: true
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
imagePullPolicy: {{ .Values.image.pullPolicy }}
|
||||
- image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
|
||||
+ image: "{{ template "system_default_registry" . }}{{ .Values.image.repository }}:{{ .Values.image.tag }}"
|
||||
ports:
|
||||
- containerPort: 8080
|
||||
{{- if .Values.selfMonitor.enabled }}
|
||||
@@ -207,12 +208,12 @@
|
||||
- containerPort: {{ .Values.service.port | default 8080}}
|
||||
{{- if .Values.selfMonitor.enabled }}
|
||||
@@ -141,12 +142,12 @@
|
||||
affinity:
|
||||
{{ toYaml .Values.affinity | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
+ nodeSelector: {{ include "linux-node-selector" . | nindent 8 }}
|
||||
{{- if .Values.nodeSelector }}
|
||||
{{- if .Values.nodeSelector }}
|
||||
- nodeSelector:
|
||||
{{ toYaml .Values.nodeSelector | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
+ tolerations: {{ include "linux-node-tolerations" . | nindent 8 }}
|
||||
{{- if .Values.tolerations }}
|
||||
{{- if .Values.tolerations }}
|
||||
- tolerations:
|
||||
{{ toYaml .Values.tolerations | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeconfig.enabled}}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeconfig.enabled}}
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
--- charts-original/templates/role.yaml
|
||||
+++ charts/templates/role.yaml
|
||||
@@ -1,3 +1,6 @@
|
||||
+{{- if not (kindIs "slice" .Values.collectors) }}
|
||||
+{{- fail "Collectors need to be a List since kube-state-metrics chart 3.2.2. Please check README for more information."}}
|
||||
+{{- end }}
|
||||
{{- if and (eq .Values.rbac.create true) (not .Values.rbac.useExistingRole) -}}
|
||||
{{- range (split "," .Values.namespaces) }}
|
||||
---
|
|
@ -10,10 +10,10 @@
|
|||
image:
|
||||
- repository: k8s.gcr.io/kube-state-metrics/kube-state-metrics
|
||||
+ repository: rancher/mirrored-kube-state-metrics-kube-state-metrics
|
||||
tag: v2.0.0
|
||||
tag: v2.2.0
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
@@ -84,6 +88,7 @@
|
||||
@@ -86,6 +90,7 @@
|
||||
|
||||
securityContext:
|
||||
enabled: true
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
url: https://github.com/prometheus-community/helm-charts.git
|
||||
subdirectory: charts/kube-state-metrics
|
||||
commit: 086f1f7f0870e110abf30aa6bfe7c141e83cc950
|
||||
commit: 3f371027f2c384cb2e58b46b2249b6bfa200b1e7
|
||||
version: 100.0.0
|
||||
doNotRelease: true
|
||||
|
|
|
@ -1,12 +1,12 @@
|
|||
dependencies:
|
||||
- name: kube-state-metrics
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
version: 3.1.1
|
||||
version: 3.5.2
|
||||
- name: prometheus-node-exporter
|
||||
repository: https://prometheus-community.github.io/helm-charts
|
||||
version: 1.18.1
|
||||
version: 2.0.4
|
||||
- name: grafana
|
||||
repository: https://grafana.github.io/helm-charts
|
||||
version: 6.12.0
|
||||
digest: sha256:11886645ff1ade77d0fefdca90afba4a92f2b535997280074a59828e8d1dab4e
|
||||
generated: "2021-06-09T16:56:40.364303181+02:00"
|
||||
version: 6.16.10
|
||||
digest: sha256:94dad976ca1630e9e3cd006fadb255783387b53bd9d0d19e105bd39d8e8e34be
|
||||
generated: "2021-09-28T10:26:46.319411+07:00"
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Fetch dashboards from provided urls into this chart."""
|
||||
import json
|
||||
import re
|
||||
import textwrap
|
||||
from os import makedirs, path
|
||||
|
||||
|
@ -26,16 +27,18 @@ def change_style(style, representer):
|
|||
# Source files list
|
||||
charts = [
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/grafana-dashboardDefinitions.yaml',
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/grafana-dashboardDefinitions.yaml',
|
||||
'destination': '../templates/grafana/dashboards-1.14',
|
||||
'type': 'yaml',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
'min_kubernetes': '1.14.0-0',
|
||||
'multicluster_key': '.Values.grafana.sidecar.dashboards.multicluster.global.enabled',
|
||||
},
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/grafana.json',
|
||||
'destination': '../templates/grafana/dashboards-1.14',
|
||||
'type': 'json',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
'min_kubernetes': '1.14.0-0',
|
||||
'multicluster_key': '(or .Values.grafana.sidecar.dashboards.multicluster.global.enabled .Values.grafana.sidecar.dashboards.multicluster.etcd.enabled)'
|
||||
},
|
||||
]
|
||||
|
||||
|
@ -64,7 +67,7 @@ https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-promet
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
name: {{ printf "%%s-%%s" (include "kube-prometheus-stack.fullname" $) "%(name)s" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
|
@ -103,20 +106,29 @@ def yaml_str_repr(struct, indent=2):
|
|||
text = textwrap.indent(text, ' ' * indent)
|
||||
return text
|
||||
|
||||
|
||||
def patch_json_for_multicluster_configuration(content):
|
||||
def patch_dashboards_json(content, multicluster_key):
|
||||
try:
|
||||
content_struct = json.loads(content)
|
||||
|
||||
# multicluster
|
||||
overwrite_list = []
|
||||
for variable in content_struct['templating']['list']:
|
||||
if variable['name'] == 'cluster':
|
||||
variable['hide'] = ':multicluster:'
|
||||
overwrite_list.append(variable)
|
||||
content_struct['templating']['list'] = overwrite_list
|
||||
|
||||
# fix drilldown links. See https://github.com/kubernetes-monitoring/kubernetes-mixin/issues/659
|
||||
for row in content_struct['rows']:
|
||||
for panel in row['panels']:
|
||||
for style in panel.get('styles', []):
|
||||
if 'linkUrl' in style and style['linkUrl'].startswith('./d'):
|
||||
style['linkUrl'] = style['linkUrl'].replace('./d', '/d')
|
||||
|
||||
content_array = []
|
||||
original_content_lines = content.split('\n')
|
||||
for i, line in enumerate(json.dumps(content_struct, indent=4).split('\n')):
|
||||
if ('[]' not in line and '{}' not in line) or line == original_content_lines[i]:
|
||||
if (' []' not in line and ' {}' not in line) or line == original_content_lines[i]:
|
||||
content_array.append(line)
|
||||
continue
|
||||
|
||||
|
@ -136,7 +148,7 @@ def patch_json_for_multicluster_configuration(content):
|
|||
if multicluster != -1:
|
||||
content = ''.join((
|
||||
content[:multicluster-1],
|
||||
'\{\{ if .Values.grafana.sidecar.dashboards.multicluster \}\}0\{\{ else \}\}2\{\{ end \}\}',
|
||||
'\{\{ if %s \}\}0\{\{ else \}\}2\{\{ end \}\}' % multicluster_key,
|
||||
content[multicluster + 15:]
|
||||
))
|
||||
except (ValueError, KeyError):
|
||||
|
@ -145,7 +157,12 @@ def patch_json_for_multicluster_configuration(content):
|
|||
return content
|
||||
|
||||
|
||||
def write_group_to_file(resource_name, content, url, destination, min_kubernetes, max_kubernetes):
|
||||
def patch_json_set_timezone_as_variable(content):
|
||||
# content is no more in json format, so we have to replace using regex
|
||||
return re.sub(r'"timezone"\s*:\s*"(?:\\.|[^\"])*"', '"timezone": "\{\{ .Values.grafana.defaultDashboardsTimezone \}\}"', content, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def write_group_to_file(resource_name, content, url, destination, min_kubernetes, max_kubernetes, multicluster_key):
|
||||
# initialize header
|
||||
lines = header % {
|
||||
'name': resource_name,
|
||||
|
@ -155,7 +172,8 @@ def write_group_to_file(resource_name, content, url, destination, min_kubernetes
|
|||
'max_kubernetes': max_kubernetes
|
||||
}
|
||||
|
||||
content = patch_json_for_multicluster_configuration(content)
|
||||
content = patch_dashboards_json(content, multicluster_key)
|
||||
content = patch_json_set_timezone_as_variable(content)
|
||||
|
||||
filename_struct = {resource_name + '.json': (LiteralStr(content))}
|
||||
# rules themselves
|
||||
|
@ -196,17 +214,17 @@ def main():
|
|||
groups = yaml_text['items']
|
||||
for group in groups:
|
||||
for resource, content in group['data'].items():
|
||||
write_group_to_file(resource.replace('.json', ''), content, chart['source'], chart['destination'], chart['min_kubernetes'], chart['max_kubernetes'])
|
||||
write_group_to_file(resource.replace('.json', ''), content, chart['source'], chart['destination'], chart['min_kubernetes'], chart['max_kubernetes'], chart['multicluster_key'])
|
||||
elif chart['type'] == 'json':
|
||||
json_text = json.loads(raw_text)
|
||||
# is it already a dashboard structure or is it nested (etcd case)?
|
||||
flat_structure = bool(json_text.get('annotations'))
|
||||
if flat_structure:
|
||||
resource = path.basename(chart['source']).replace('.json', '')
|
||||
write_group_to_file(resource, json.dumps(json_text, indent=4), chart['source'], chart['destination'], chart['min_kubernetes'], chart['max_kubernetes'])
|
||||
write_group_to_file(resource, json.dumps(json_text, indent=4), chart['source'], chart['destination'], chart['min_kubernetes'], chart['max_kubernetes'], chart['multicluster_key'])
|
||||
else:
|
||||
for resource, content in json_text.items():
|
||||
write_group_to_file(resource.replace('.json', ''), json.dumps(content, indent=4), chart['source'], chart['destination'], chart['min_kubernetes'], chart['max_kubernetes'])
|
||||
write_group_to_file(resource.replace('.json', ''), json.dumps(content, indent=4), chart['source'], chart['destination'], chart['min_kubernetes'], chart['max_kubernetes'], chart['multicluster_key'])
|
||||
print("Finished")
|
||||
|
||||
|
||||
|
|
|
@ -25,7 +25,37 @@ def change_style(style, representer):
|
|||
# Source files list
|
||||
charts = [
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/master/manifests/kubernetes-prometheusRule.yaml',
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/alertmanager-prometheusRule.yaml',
|
||||
'destination': '../templates/prometheus/rules-1.14',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
},
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-prometheus-prometheusRule.yaml',
|
||||
'destination': '../templates/prometheus/rules-1.14',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
},
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kubernetes-prometheusRule.yaml',
|
||||
'destination': '../templates/prometheus/rules-1.14',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
},
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/kube-state-metrics-prometheusRule.yaml',
|
||||
'destination': '../templates/prometheus/rules-1.14',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
},
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/node-exporter-prometheusRule.yaml',
|
||||
'destination': '../templates/prometheus/rules-1.14',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
},
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-prometheusRule.yaml',
|
||||
'destination': '../templates/prometheus/rules-1.14',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
},
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/main/manifests/prometheus-operator-prometheusRule.yaml',
|
||||
'destination': '../templates/prometheus/rules-1.14',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
},
|
||||
|
@ -34,18 +64,6 @@ charts = [
|
|||
'destination': '../templates/prometheus/rules-1.14',
|
||||
'min_kubernetes': '1.14.0-0'
|
||||
},
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml',
|
||||
'destination': '../templates/prometheus/rules',
|
||||
'min_kubernetes': '1.10.0-0',
|
||||
'max_kubernetes': '1.14.0-0'
|
||||
},
|
||||
{
|
||||
'source': 'https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/etcd3_alert.rules.yml',
|
||||
'destination': '../templates/prometheus/rules',
|
||||
'min_kubernetes': '1.10.0-0',
|
||||
'max_kubernetes': '1.14.0-0'
|
||||
},
|
||||
]
|
||||
|
||||
# Additional conditions map
|
||||
|
@ -93,6 +111,7 @@ alert_condition_map = {
|
|||
'NodeExporterDown': '.Values.nodeExporter.enabled',
|
||||
'CoreDNSDown': '.Values.kubeDns.enabled',
|
||||
'AlertmanagerDown': '.Values.alertmanager.enabled',
|
||||
'AggregatedAPIDown': 'semverCompare ">=1.18.0-0" $kubeTargetVersion',
|
||||
}
|
||||
|
||||
replacement_map = {
|
||||
|
|
|
@ -1,63 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
|
||||
{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }}
|
||||
{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: alertmanager.rules
|
||||
rules:
|
||||
- alert: AlertmanagerConfigInconsistent
|
||||
annotations:
|
||||
message: The configuration of the instances of the Alertmanager cluster `{{`{{`}}$labels.service{{`}}`}}` are out of sync.
|
||||
expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}) by (name, job, namespace, controller), "service", "$1", "name", "(.*)") != 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AlertmanagerFailedReload
|
||||
annotations:
|
||||
message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}.
|
||||
expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: AlertmanagerMembersInconsistent
|
||||
annotations:
|
||||
message: Alertmanager has not found all other members of the cluster.
|
||||
expr: |-
|
||||
alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}
|
||||
!= on (service) GROUP_LEFT()
|
||||
count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"})
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,179 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/etcd3_alert.rules.yml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: etcd
|
||||
rules:
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).'
|
||||
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.'
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.'
|
||||
expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}'
|
||||
expr: |-
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHighNumberOfFailedHTTPRequests
|
||||
annotations:
|
||||
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.'
|
||||
expr: |-
|
||||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m]))
|
||||
BY (method) > 0.05
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: etcdHTTPRequestsSlow
|
||||
annotations:
|
||||
message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,56 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'general.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.general }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "general.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: general.rules
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
message: '{{`{{`}} $value {{`}}`}}% of the {{`{{`}} $labels.job {{`}}`}} targets are down.'
|
||||
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
message: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
|
||||
'
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: none
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,83 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'k8s.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.k8s }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "k8s.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: k8s.rules
|
||||
rules:
|
||||
- expr: sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace)
|
||||
record: namespace:container_cpu_usage_seconds_total:sum_rate
|
||||
- expr: sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace)
|
||||
record: namespace:container_memory_usage_bytes:sum
|
||||
- expr: |-
|
||||
sum by (namespace, pod_name, container_name) (
|
||||
rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])
|
||||
)
|
||||
record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate
|
||||
- expr: |-
|
||||
sum by(namespace) (
|
||||
kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
|
||||
* on (endpoint, instance, job, namespace, pod, service)
|
||||
group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
|
||||
)
|
||||
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
|
||||
- expr: |-
|
||||
sum by (namespace) (
|
||||
kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
|
||||
* on (endpoint, instance, job, namespace, pod, service)
|
||||
group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)
|
||||
)
|
||||
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
|
||||
- expr: |-
|
||||
sum(
|
||||
label_replace(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
|
||||
"replicaset", "$1", "owner_name", "(.*)"
|
||||
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (namespace, workload, pod)
|
||||
labels:
|
||||
workload_type: deployment
|
||||
record: mixin_pod_workload
|
||||
- expr: |-
|
||||
sum(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (namespace, workload, pod)
|
||||
labels:
|
||||
workload_type: daemonset
|
||||
record: mixin_pod_workload
|
||||
- expr: |-
|
||||
sum(
|
||||
label_replace(
|
||||
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
|
||||
"workload", "$1", "owner_name", "(.*)"
|
||||
)
|
||||
) by (namespace, workload, pod)
|
||||
labels:
|
||||
workload_type: statefulset
|
||||
record: mixin_pod_workload
|
||||
{{- end }}
|
|
@ -1,39 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kube-apiserver.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeApiServer.enabled .Values.defaultRules.rules.kubeApiserver }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-apiserver.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-apiserver.rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:apiserver_request_latencies:histogram_quantile
|
||||
{{- end }}
|
|
@ -1,47 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kube-prometheus-node-alerting.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeAlerting }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-alerting.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-prometheus-node-alerting.rules
|
||||
rules:
|
||||
- alert: NodeDiskRunningFull
|
||||
annotations:
|
||||
message: Device {{`{{`}} $labels.device {{`}}`}} of node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} will be full within the next 24 hours.
|
||||
expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)'
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeDiskRunningFull
|
||||
annotations:
|
||||
message: Device {{`{{`}} $labels.device {{`}}`}} of node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} will be full within the next 2 hours.
|
||||
expr: '(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)'
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,41 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kube-prometheus-node-recording.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubePrometheusNodeRecording }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-prometheus-node-recording.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-prometheus-node-recording.rules
|
||||
rules:
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
|
||||
record: instance:node_cpu:rate:sum
|
||||
- expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)
|
||||
record: instance:node_filesystem_usage:sum
|
||||
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_receive_bytes:rate:sum
|
||||
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
||||
record: instance:node_network_transmit_bytes:rate:sum
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
|
||||
record: instance:node_cpu:ratio
|
||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
|
||||
record: cluster:node_cpu:sum_rate5m
|
||||
- expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
|
||||
record: cluster:node_cpu:ratio
|
||||
{{- end }}
|
|
@ -1,63 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kube-scheduler.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeScheduler.enabled .Values.defaultRules.rules.kubeScheduler }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kube-scheduler.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kube-scheduler.rules
|
||||
rules:
|
||||
- expr: histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
||||
- expr: histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.99'
|
||||
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
||||
- expr: histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.9'
|
||||
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile
|
||||
- expr: histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06
|
||||
labels:
|
||||
quantile: '0.5'
|
||||
record: cluster_quantile:scheduler_binding_latency:histogram_quantile
|
||||
{{- end }}
|
|
@ -1,159 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'kubernetes-absent' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesAbsent }}
|
||||
{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }}
|
||||
{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }}
|
||||
{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-absent" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: kubernetes-absent
|
||||
rules:
|
||||
{{- if .Values.alertmanager.enabled }}
|
||||
- alert: AlertmanagerDown
|
||||
annotations:
|
||||
message: Alertmanager has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-alertmanagerdown
|
||||
expr: absent(up{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeDns.enabled }}
|
||||
- alert: CoreDNSDown
|
||||
annotations:
|
||||
message: CoreDNS has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-corednsdown
|
||||
expr: absent(up{job="kube-dns"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeApiServer.enabled }}
|
||||
- alert: KubeAPIDown
|
||||
annotations:
|
||||
message: KubeAPI has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapidown
|
||||
expr: absent(up{job="apiserver"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeControllerManager.enabled }}
|
||||
- alert: KubeControllerManagerDown
|
||||
annotations:
|
||||
message: KubeControllerManager has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecontrollermanagerdown
|
||||
expr: absent(up{job="kube-controller-manager"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeScheduler.enabled }}
|
||||
- alert: KubeSchedulerDown
|
||||
annotations:
|
||||
message: KubeScheduler has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeschedulerdown
|
||||
expr: absent(up{job="kube-scheduler"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.kubeStateMetrics.enabled }}
|
||||
- alert: KubeStateMetricsDown
|
||||
annotations:
|
||||
message: KubeStateMetrics has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatemetricsdown
|
||||
expr: absent(up{job="kube-state-metrics"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheusOperator.kubeletService.enabled }}
|
||||
- alert: KubeletDown
|
||||
annotations:
|
||||
message: Kubelet has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeletdown
|
||||
expr: absent(up{job="kubelet"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- if .Values.nodeExporter.enabled }}
|
||||
- alert: NodeExporterDown
|
||||
annotations:
|
||||
message: NodeExporter has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-nodeexporterdown
|
||||
expr: absent(up{job="node-exporter"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
- alert: PrometheusDown
|
||||
annotations:
|
||||
message: Prometheus has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusdown
|
||||
expr: absent(up{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheusOperator.enabled }}
|
||||
- alert: PrometheusOperatorDown
|
||||
annotations:
|
||||
message: PrometheusOperator has disappeared from Prometheus target discovery.
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-prometheusoperatordown
|
||||
expr: absent(up{job="{{ $operatorJob }}",namespace="{{ $namespace }}"} == 1)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,200 +0,0 @@
|
|||
{{- /*
Generated from 'kubernetes-apps' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- /*
PrometheusRule carrying the 'kubernetes-apps' alert group (pods, deployments,
statefulsets, daemonsets, cronjobs and jobs as reported by kube-state-metrics).

The resource is rendered only when all of the following hold:
  - the target Kubernetes version is >=1.10.0-0 and <1.14.0-0
    (kubeTargetVersionOverride is used when set, otherwise the cluster GitVersion),
  - defaultRules.create is set,
  - defaultRules.rules.kubernetesApps is set.

Every selector below is restricted to namespaces matching
defaultRules.appNamespacesTarget (used as a regex via namespace=~"...").
Alert-time placeholders are written as {{`{{`}} ... {{`}}`}} so that Helm emits
literal double braces for Prometheus to expand.
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesApps }}
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  {{- /* Name is cut to 63 characters and any trailing '-' left by the cut is removed. */}}
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-apps" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
    {{- include "kube-prometheus-stack.labels" . | nindent 4 }}
    {{- with .Values.defaultRules.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- with .Values.defaultRules.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: kubernetes-apps
    rules:
    {{- /* The 15m restart rate is scaled by 60 * 5 to express restarts per 5 minutes. */}}
    - alert: KubePodCrashLooping
      annotations:
        message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} ({{`{{`}} $labels.container {{`}}`}}) is restarting {{`{{`}} printf "%.2f" $value {{`}}`}} times / 5 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodcrashlooping
      expr: rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}[15m]) * 60 * 5 > 0
      for: 1h
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Counts pods whose phase is Pending or Unknown, grouped per namespace/pod. */}}
    - alert: KubePodNotReady
      annotations:
        message: Pod {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}} has been in a non-ready state for longer than an hour.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepodnotready
      expr: sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}", phase=~"Pending|Unknown"}) > 0
      for: 1h
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    - alert: KubeDeploymentGenerationMismatch
      annotations:
        message: Deployment generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} does not match, this indicates that the Deployment has failed but has not been rolled back.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentgenerationmismatch
      expr: |-
        kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
          !=
        kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
      for: 15m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    - alert: KubeDeploymentReplicasMismatch
      annotations:
        message: Deployment {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.deployment {{`}}`}} has not matched the expected number of replicas for longer than an hour.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedeploymentreplicasmismatch
      expr: |-
        kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
          !=
        kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
      for: 1h
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    - alert: KubeStatefulSetReplicasMismatch
      annotations:
        message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} has not matched the expected number of replicas for longer than 15 minutes.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetreplicasmismatch
      expr: |-
        kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
          !=
        kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
      for: 15m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    - alert: KubeStatefulSetGenerationMismatch
      annotations:
        message: StatefulSet generation for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} does not match, this indicates that the StatefulSet has failed but has not been rolled back.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetgenerationmismatch
      expr: |-
        kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
          !=
        kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
      for: 15m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Combines "current revision series has no matching update revision series" with "replicas != updated replicas". */}}
    - alert: KubeStatefulSetUpdateNotRolledOut
      annotations:
        message: StatefulSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.statefulset {{`}}`}} update has not been rolled out.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubestatefulsetupdatenotrolledout
      expr: |-
        max without (revision) (
          kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
            unless
          kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
        )
          *
        (
          kube_statefulset_replicas{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
            !=
          kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
        )
      for: 15m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Ready pods as a percentage of desired scheduled pods; fires while below 100. */}}
    - alert: KubeDaemonSetRolloutStuck
      annotations:
        message: Only {{`{{`}} $value {{`}}`}}% of the desired Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are scheduled and ready.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetrolloutstuck
      expr: |-
        kube_daemonset_status_number_ready{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
          /
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} * 100 < 100
      for: 15m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    - alert: KubeDaemonSetNotScheduled
      annotations:
        message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are not scheduled.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetnotscheduled
      expr: |-
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"}
          -
        kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
      for: 10m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    - alert: KubeDaemonSetMisScheduled
      annotations:
        message: '{{`{{`}} $value {{`}}`}} Pods of DaemonSet {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.daemonset {{`}}`}} are running where they are not supposed to run.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubedaemonsetmisscheduled
      expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
      for: 10m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Compares now against the cronjob's next schedule time; 3600 seconds is the one-hour threshold. */}}
    - alert: KubeCronJobRunning
      annotations:
        message: CronJob {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.cronjob {{`}}`}} is taking more than 1h to complete.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecronjobrunning
      expr: time() - kube_cronjob_next_schedule_time{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 3600
      for: 1h
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    - alert: KubeJobCompletion
      annotations:
        message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} is taking more than one hour to complete.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobcompletion
      expr: kube_job_spec_completions{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} - kube_job_status_succeeded{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
      for: 1h
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    - alert: KubeJobFailed
      annotations:
        message: Job {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.job_name {{`}}`}} failed to complete.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubejobfailed
      expr: kube_job_status_failed{job="kube-state-metrics", namespace=~"{{ $targetNamespace }}"} > 0
      for: 1h
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
{{- end }}
|
|
@ -1,121 +0,0 @@
|
|||
{{- /*
Generated from 'kubernetes-resources' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- /*
PrometheusRule carrying the 'kubernetes-resources' alert group (CPU/memory
overcommit, resource quota usage and CPU throttling).

The resource is rendered only when all of the following hold:
  - the target Kubernetes version is >=1.10.0-0 and <1.14.0-0
    (kubeTargetVersionOverride is used when set, otherwise the cluster GitVersion),
  - defaultRules.create is set,
  - defaultRules.rules.kubernetesResources is set.

KubeCPUOvercommit and KubeMemOvercommit each appear twice: once comparing Pod
requests with node capacity and once comparing namespace quotas with node capacity.
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesResources }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  {{- /* Name is cut to 63 characters and any trailing '-' left by the cut is removed. */}}
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-resources" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
    {{- include "kube-prometheus-stack.labels" . | nindent 4 }}
    {{- with .Values.defaultRules.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- with .Values.defaultRules.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: kubernetes-resources
    rules:
    {{- /* Requested CPU as a fraction of total CPUs, compared with (nodes - 1) / nodes. */}}
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit
      expr: |-
        sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum)
          /
        sum(node:node_num_cpu:sum)
          >
        (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Requested memory as a fraction of total memory, compared with (nodes - 1) / nodes. */}}
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememovercommit
      expr: |-
        sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum)
          /
        sum(node_memory_MemTotal_bytes)
          >
        (count(node:node_num_cpu:sum)-1)
          /
        count(node:node_num_cpu:sum)
      for: 5m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Sum of hard CPU quotas relative to total CPUs; fires above a ratio of 1.5. */}}
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Namespaces.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubecpuovercommit
      expr: |-
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
          /
        sum(node:node_num_cpu:sum)
          > 1.5
      for: 5m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Sum of hard memory quotas relative to total node memory; fires above a ratio of 1.5. */}}
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Namespaces.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubememovercommit
      expr: |-
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
          /
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
          > 1.5
      for: 5m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Used quota as a percentage of hard quota; the "> 0" filter drops zero-valued hard quotas from the divisor. */}}
    - alert: KubeQuotaExceeded
      annotations:
        message: Namespace {{`{{`}} $labels.namespace {{`}}`}} is using {{`{{`}} printf "%0.0f" $value {{`}}`}}% of its {{`{{`}} $labels.resource {{`}}`}} quota.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubequotaexceeded
      expr: |-
        100 * kube_resourcequota{job="kube-state-metrics", type="used"}
          / ignoring(instance, job, type)
        (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
          > 90
      for: 15m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Throttled CFS periods as a percentage of all CFS periods over 5m, per container/pod/namespace. */}}
    - alert: CPUThrottlingHigh
      annotations:
        message: '{{`{{`}} printf "%0.0f" $value {{`}}`}}% throttling of CPU in namespace {{`{{`}} $labels.namespace {{`}}`}} for container {{`{{`}} $labels.container_name {{`}}`}} in pod {{`{{`}} $labels.pod_name {{`}}`}}.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-cputhrottlinghigh
      expr: |-
        100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!="", }[5m])) by (container_name, pod_name, namespace)
          /
        sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)
          > 25
      for: 15m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
{{- end }}
|
|
@ -1,72 +0,0 @@
|
|||
{{- /*
Generated from 'kubernetes-storage' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- /*
PrometheusRule carrying the 'kubernetes-storage' alert group (persistent
volume free space and persistent volume phase).

The resource is rendered only when all of the following hold:
  - the target Kubernetes version is >=1.10.0-0 and <1.14.0-0
    (kubeTargetVersionOverride is used when set, otherwise the cluster GitVersion),
  - defaultRules.create is set,
  - defaultRules.rules.kubernetesStorage is set.

The kubelet volume selectors are restricted to namespaces matching
defaultRules.appNamespacesTarget; the persistent volume phase rule is not.
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesStorage }}
{{- $targetNamespace := .Values.defaultRules.appNamespacesTarget }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  {{- /* Name is cut to 63 characters and any trailing '-' left by the cut is removed. */}}
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-storage" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
    {{- include "kube-prometheus-stack.labels" . | nindent 4 }}
    {{- with .Values.defaultRules.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- with .Values.defaultRules.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: kubernetes-storage
    rules:
    {{- /* Available bytes as a percentage of capacity; fires below 3. */}}
    - alert: KubePersistentVolumeUsageCritical
      annotations:
        message: The PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is only {{`{{`}} printf "%0.2f" $value {{`}}`}}% free.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeusagecritical
      expr: |-
        100 * kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}
          /
        kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}
          < 3
      for: 1m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Requires both: under 15% available now, and a 6h linear prediction going below 0 within 4 * 24 * 3600 seconds. */}}
    - alert: KubePersistentVolumeFullInFourDays
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{`{{`}} $labels.persistentvolumeclaim {{`}}`}} in Namespace {{`{{`}} $labels.namespace {{`}}`}} is expected to fill up within four days. Currently {{`{{`}} printf "%0.2f" $value {{`}}`}}% is available.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefullinfourdays
      expr: |-
        100 * (
          kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}
            /
          kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}
        ) < 15
        and
        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}"}[6h], 4 * 24 * 3600) < 0
      for: 5m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    - alert: KubePersistentVolumeErrors
      annotations:
        message: The persistent volume {{`{{`}} $labels.persistentvolume {{`}}`}} has status {{`{{`}} $labels.phase {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumeerrors
      expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
      for: 5m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
{{- end }}
|
|
@ -1,184 +0,0 @@
|
|||
{{- /*
Generated from 'kubernetes-system' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
Do not change in-place! In order to change this file first read following link:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
*/ -}}
{{- /*
PrometheusRule carrying the 'kubernetes-system' alert group (node readiness,
component version skew, API client errors, kubelet pod count, API server
latency and error ratio, client certificate expiry).

The resource is rendered only when all of the following hold:
  - the target Kubernetes version is >=1.10.0-0 and <1.14.0-0
    (kubeTargetVersionOverride is used when set, otherwise the cluster GitVersion),
  - defaultRules.create is set,
  - defaultRules.rules.kubernetesSystem is set.

Several alert names are used more than once, with different thresholds and
severities or different expressions.
*/ -}}
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.kubernetesSystem }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  {{- /* Name is cut to 63 characters and any trailing '-' left by the cut is removed. */}}
  name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "kubernetes-system" | trunc 63 | trimSuffix "-" }}
  namespace: {{ template "kube-prometheus-stack.namespace" . }}
  labels:
    app: {{ template "kube-prometheus-stack.name" . }}
    {{- include "kube-prometheus-stack.labels" . | nindent 4 }}
    {{- with .Values.defaultRules.labels }}
    {{- toYaml . | nindent 4 }}
    {{- end }}
  {{- with .Values.defaultRules.annotations }}
  annotations:
    {{- toYaml . | nindent 4 }}
  {{- end }}
spec:
  groups:
  - name: kubernetes-system
    rules:
    - alert: KubeNodeNotReady
      annotations:
        message: '{{`{{`}} $labels.node {{`}}`}} has been unready for more than an hour.'
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubenodenotready
      expr: kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
      for: 1h
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* label_replace rewrites gitVersion to its captured leading version part, then distinct values are counted; kube-dns and coredns jobs are excluded. */}}
    - alert: KubeVersionMismatch
      annotations:
        message: There are {{`{{`}} $value {{`}}`}} different semantic versions of Kubernetes components running.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeversionmismatch
      expr: count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
      for: 1h
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Share of REST client requests answered with a 5xx code, in percent, per instance/job. */}}
    - alert: KubeClientErrors
      annotations:
        {{- /* The message previously ended with an unbalanced single quote after "errors."; it is removed here. */}}
        message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}}% errors.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
      expr: |-
        (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
          /
        sum(rate(rest_client_requests_total[5m])) by (instance, job))
        * 100 > 1
      for: 15m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Same alert name as above, but driven by the kube-state-metrics scrape error rate. */}}
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{`{{`}} $labels.job {{`}}`}}/{{`{{`}} $labels.instance {{`}}`}}' is experiencing {{`{{`}} printf "%0.0f" $value {{`}}`}} errors / second.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclienterrors
      expr: sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
      for: 15m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Threshold is 90% of the fixed limit of 110 named in the message. */}}
    - alert: KubeletTooManyPods
      annotations:
        message: Kubelet {{`{{`}} $labels.instance {{`}}`}} is running {{`{{`}} $value {{`}}`}} Pods, close to the limit of 110.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubelettoomanypods
      expr: kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
      for: 15m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Warning tier: 0.99 quantile above 1; LIST/WATCH/WATCHLIST/PROXY/CONNECT verbs and the log subresource are excluded. */}}
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh
      expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
      for: 10m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Critical tier of the same expression: 0.99 quantile above 4. */}}
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{`{{`}} $value {{`}}`}} seconds for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapilatencyhigh
      expr: cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
      for: 10m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Overall 5xx percentage of API server requests; critical tier above 3. */}}
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
      expr: |-
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
      for: 10m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Overall 5xx percentage of API server requests; warning tier above 1. */}}
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
      expr: |-
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
      for: 10m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* 5xx percentage per resource/subresource/verb; critical tier above 10. */}}
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
      expr: |-
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
      for: 10m
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* 5xx percentage per resource/subresource/verb; warning tier above 5. */}}
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.verb {{`}}`}} {{`{{`}} $labels.resource {{`}}`}} {{`{{`}} $labels.subresource {{`}}`}}.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeapierrorshigh
      expr: |-
        sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
          /
        sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
      for: 10m
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Warning tier: 0.01 quantile of client certificate lifetime below 604800 seconds (7 days). No "for" clause is set on this rule. */}}
    - alert: KubeClientCertificateExpiration
      annotations:
        message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
      expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
      labels:
        severity: warning
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
    {{- /* Critical tier: same expression with an 86400 second (24 hour) threshold. No "for" clause is set on this rule. */}}
    - alert: KubeClientCertificateExpiration
      annotations:
        message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
        runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubeclientcertificateexpiration
      expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
      labels:
        severity: critical
        {{- with .Values.defaultRules.additionalRuleLabels }}
        {{- toYaml . | nindent 8 }}
        {{- end }}
{{- end }}
|
|
@ -1,57 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'node-network' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.network }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-network" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-network
|
||||
rules:
|
||||
- alert: NetworkReceiveErrors
|
||||
annotations:
|
||||
message: Network interface "{{`{{`}} $labels.device {{`}}`}}" showing receive errors on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}"
|
||||
expr: rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NetworkTransmitErrors
|
||||
annotations:
|
||||
message: Network interface "{{`{{`}} $labels.device {{`}}`}}" showing transmit errors on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}"
|
||||
expr: rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: NodeNetworkInterfaceFlapping
|
||||
annotations:
|
||||
message: Network interface "{{`{{`}} $labels.device {{`}}`}}" changing it's up status often on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}"
|
||||
expr: changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,37 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'node-time' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.time }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node-time" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node-time
|
||||
rules:
|
||||
- alert: ClockSkewDetected
|
||||
annotations:
|
||||
message: Clock skew detected on node-exporter {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod {{`}}`}}. Ensure NTP is configured correctly on this host.
|
||||
expr: abs(node_timex_offset_seconds{job="node-exporter"}) > 0.03
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,202 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'node.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.node }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "node.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: node.rules
|
||||
rules:
|
||||
- expr: sum(min(kube_pod_info) by (node))
|
||||
record: ':kube_pod_info_node_count:'
|
||||
- expr: max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
|
||||
record: 'node_namespace_pod:kube_pod_info:'
|
||||
- expr: |-
|
||||
count by (node) (sum by (node, cpu) (
|
||||
node_cpu_seconds_total{job="node-exporter"}
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
))
|
||||
record: node:node_num_cpu:sum
|
||||
- expr: 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
|
||||
record: :node_cpu_utilisation:avg1m
|
||||
- expr: |-
|
||||
1 - avg by (node) (
|
||||
rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:)
|
||||
record: node:node_cpu_utilisation:avg1m
|
||||
- expr: |-
|
||||
node:node_cpu_utilisation:avg1m
|
||||
*
|
||||
node:node_num_cpu:sum
|
||||
/
|
||||
scalar(sum(node:node_num_cpu:sum))
|
||||
record: node:cluster_cpu_utilisation:ratio
|
||||
- expr: |-
|
||||
sum(node_load1{job="node-exporter"})
|
||||
/
|
||||
sum(node:node_num_cpu:sum)
|
||||
record: ':node_cpu_saturation_load1:'
|
||||
- expr: |-
|
||||
sum by (node) (
|
||||
node_load1{job="node-exporter"}
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
/
|
||||
node:node_num_cpu:sum
|
||||
record: 'node:node_cpu_saturation_load1:'
|
||||
- expr: |-
|
||||
1 -
|
||||
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
||||
/
|
||||
sum(node_memory_MemTotal_bytes{job="node-exporter"})
|
||||
record: ':node_memory_utilisation:'
|
||||
- expr: sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
||||
record: :node_memory_MemFreeCachedBuffers_bytes:sum
|
||||
- expr: sum(node_memory_MemTotal_bytes{job="node-exporter"})
|
||||
record: :node_memory_MemTotal_bytes:sum
|
||||
- expr: |-
|
||||
sum by (node) (
|
||||
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
record: node:node_memory_bytes_available:sum
|
||||
- expr: |-
|
||||
sum by (node) (
|
||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
record: node:node_memory_bytes_total:sum
|
||||
- expr: |-
|
||||
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
||||
/
|
||||
node:node_memory_bytes_total:sum
|
||||
record: node:node_memory_utilisation:ratio
|
||||
- expr: |-
|
||||
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
||||
/
|
||||
scalar(sum(node:node_memory_bytes_total:sum))
|
||||
record: node:cluster_memory_utilisation:ratio
|
||||
- expr: |-
|
||||
1e3 * sum(
|
||||
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
|
||||
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
|
||||
)
|
||||
record: :node_memory_swap_io_bytes:sum_rate
|
||||
- expr: |-
|
||||
1 -
|
||||
sum by (node) (
|
||||
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
/
|
||||
sum by (node) (
|
||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
record: 'node:node_memory_utilisation:'
|
||||
- expr: 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
|
||||
record: 'node:node_memory_utilisation_2:'
|
||||
- expr: |-
|
||||
1e3 * sum by (node) (
|
||||
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
|
||||
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
record: node:node_memory_swap_io_bytes:sum_rate
|
||||
- expr: avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
||||
record: :node_disk_utilisation:avg_irate
|
||||
- expr: |-
|
||||
avg by (node) (
|
||||
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
record: node:node_disk_utilisation:avg_irate
|
||||
- expr: avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
||||
record: :node_disk_saturation:avg_irate
|
||||
- expr: |-
|
||||
avg by (node) (
|
||||
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
record: node:node_disk_saturation:avg_irate
|
||||
- expr: |-
|
||||
max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
|
||||
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
||||
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
||||
record: 'node:node_filesystem_usage:'
|
||||
- expr: max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
||||
record: 'node:node_filesystem_avail:'
|
||||
- expr: |-
|
||||
sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
|
||||
sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
|
||||
record: :node_net_utilisation:sum_irate
|
||||
- expr: |-
|
||||
sum by (node) (
|
||||
(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
|
||||
irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
record: node:node_net_utilisation:sum_irate
|
||||
- expr: |-
|
||||
sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
|
||||
sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
|
||||
record: :node_net_saturation:sum_irate
|
||||
- expr: |-
|
||||
sum by (node) (
|
||||
(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
|
||||
irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
record: node:node_net_saturation:sum_irate
|
||||
- expr: |-
|
||||
max(
|
||||
max(
|
||||
kube_pod_info{job="kube-state-metrics", host_ip!=""}
|
||||
) by (node, host_ip)
|
||||
* on (host_ip) group_right (node)
|
||||
label_replace(
|
||||
(max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
||||
)
|
||||
) by (node)
|
||||
record: 'node:node_inodes_total:'
|
||||
- expr: |-
|
||||
max(
|
||||
max(
|
||||
kube_pod_info{job="kube-state-metrics", host_ip!=""}
|
||||
) by (node, host_ip)
|
||||
* on (host_ip) group_right (node)
|
||||
label_replace(
|
||||
(max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
||||
)
|
||||
) by (node)
|
||||
record: 'node:node_inodes_free:'
|
||||
{{- end }}
|
|
@ -1,49 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'prometheus-operator' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheusOperator }}
|
||||
{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus-operator" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: prometheus-operator
|
||||
rules:
|
||||
- alert: PrometheusOperatorReconcileErrors
|
||||
annotations:
|
||||
message: Errors while reconciling {{`{{`}} $labels.controller {{`}}`}} in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
|
||||
expr: rate(prometheus_operator_reconcile_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusOperatorNodeLookupErrors
|
||||
annotations:
|
||||
message: Errors while reconciling Prometheus in {{`{{`}} $labels.namespace {{`}}`}} Namespace.
|
||||
expr: rate(prometheus_operator_node_address_lookup_errors_total{job="{{ $operatorJob }}",namespace="{{ $namespace }}"}[5m]) > 0.1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -1,139 +0,0 @@
|
|||
{{- /*
|
||||
Generated from 'prometheus.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml
|
||||
Do not change in-place! In order to change this file first read following link:
|
||||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.prometheus }}
|
||||
{{- $prometheusJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "prometheus.rules" | trunc 63 | trimSuffix "-" }}
|
||||
namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
labels:
|
||||
app: {{ template "kube-prometheus-stack.name" . }}
|
||||
{{ include "kube-prometheus-stack.labels" . | indent 4 }}
|
||||
{{- if .Values.defaultRules.labels }}
|
||||
{{ toYaml .Values.defaultRules.labels | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.defaultRules.annotations }}
|
||||
annotations:
|
||||
{{ toYaml .Values.defaultRules.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
spec:
|
||||
groups:
|
||||
- name: prometheus.rules
|
||||
rules:
|
||||
- alert: PrometheusConfigReloadFailed
|
||||
annotations:
|
||||
description: Reloading Prometheus' configuration has failed for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}}
|
||||
summary: Reloading Prometheus' configuration failed
|
||||
expr: prometheus_config_last_reload_successful{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} == 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusNotificationQueueRunningFull
|
||||
annotations:
|
||||
description: Prometheus' alert notification queue is running full for {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}
|
||||
summary: Prometheus' alert notification queue is running full
|
||||
expr: predict_linear(prometheus_notifications_queue_length{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusErrorSendingAlerts
|
||||
annotations:
|
||||
description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.Alertmanager{{`}}`}}
|
||||
summary: Errors while sending alert from Prometheus
|
||||
expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.01
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusErrorSendingAlerts
|
||||
annotations:
|
||||
description: Errors while sending alerts from Prometheus {{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} to Alertmanager {{`{{`}}$labels.Alertmanager{{`}}`}}
|
||||
summary: Errors while sending alerts from Prometheus
|
||||
expr: rate(prometheus_notifications_errors_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) / rate(prometheus_notifications_sent_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0.03
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusNotConnectedToAlertmanagers
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} is not connected to any Alertmanagers
|
||||
summary: Prometheus is not connected to any Alertmanagers
|
||||
expr: prometheus_notifications_alertmanagers_discovered{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} < 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusTSDBReloadsFailing
|
||||
annotations:
|
||||
description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} reload failures over the last four hours.'
|
||||
summary: Prometheus has issues reloading data blocks from disk
|
||||
expr: increase(prometheus_tsdb_reloads_failures_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
|
||||
for: 12h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusTSDBCompactionsFailing
|
||||
annotations:
|
||||
description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} had {{`{{`}}$value | humanize{{`}}`}} compaction failures over the last four hours.'
|
||||
summary: Prometheus has issues compacting sample blocks
|
||||
expr: increase(prometheus_tsdb_compactions_failed_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[2h]) > 0
|
||||
for: 12h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusTSDBWALCorruptions
|
||||
annotations:
|
||||
description: '{{`{{`}}$labels.job{{`}}`}} at {{`{{`}}$labels.instance{{`}}`}} has a corrupted write-ahead log (WAL).'
|
||||
summary: Prometheus write-ahead log is corrupted
|
||||
expr: prometheus_tsdb_wal_corruptions_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"} > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusNotIngestingSamples
|
||||
annotations:
|
||||
description: Prometheus {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}} isn't ingesting samples.
|
||||
summary: Prometheus isn't ingesting samples
|
||||
expr: rate(prometheus_tsdb_head_samples_appended_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) <= 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
- alert: PrometheusTargetScrapesDuplicate
|
||||
annotations:
|
||||
description: '{{`{{`}}$labels.namespace{{`}}`}}/{{`{{`}}$labels.pod{{`}}`}} has many samples rejected due to duplicate timestamps but different values'
|
||||
summary: Prometheus has many samples rejected
|
||||
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="{{ $prometheusJob }}",namespace="{{ $namespace }}"}[5m]) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
{{- end }}
|
|
@ -0,0 +1,97 @@
|
|||
suite: test ingress
|
||||
templates:
|
||||
- alertmanager/ingress.yaml
|
||||
tests:
|
||||
- it: should be empty if alertmanager is not enabled
|
||||
set:
|
||||
alertmanager.enabled: false
|
||||
alertmanager.ingress.enabled: true
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 0
|
||||
- it: should be empty if ingress is not enabled
|
||||
set:
|
||||
alertmanager.enabled: true
|
||||
alertmanager.ingress.enabled: false
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 0
|
||||
- it: should have apiVersion extensions/v1beta1 for k8s < 1.16
|
||||
set:
|
||||
alertmanager.enabled: true
|
||||
alertmanager.ingress.enabled: true
|
||||
capabilities:
|
||||
majorVersion: 1
|
||||
minorVersion: 15
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- isKind:
|
||||
of: Ingress
|
||||
- isAPIVersion:
|
||||
of: extensions/v1beta1
|
||||
- it: should have apiVersion networking.k8s.io/v1beta1 for k8s >= 1.16 < 1.19
|
||||
set:
|
||||
alertmanager.enabled: true
|
||||
alertmanager.ingress.enabled: true
|
||||
capabilities:
|
||||
majorVersion: 1
|
||||
minorVersion: 16
|
||||
apiVersions:
|
||||
- networking.k8s.io/v1beta1
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- isKind:
|
||||
of: Ingress
|
||||
- isAPIVersion:
|
||||
of: networking.k8s.io/v1beta1
|
||||
- it: should have apiVersion networking.k8s.io/v1 for k8s >= 1.19 < 1.22
|
||||
set:
|
||||
alertmanager.enabled: true
|
||||
alertmanager.ingress.enabled: true
|
||||
capabilities:
|
||||
majorVersion: 1
|
||||
minorVersion: 10
|
||||
apiVersions:
|
||||
- networking.k8s.io/v1
|
||||
- networking.k8s.io/v1beta1
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- isKind:
|
||||
of: Ingress
|
||||
- isAPIVersion:
|
||||
of: networking.k8s.io/v1beta1
|
||||
- it: should have apiVersion networking.k8s.io/v1 for k8s >= 1.22
|
||||
set:
|
||||
alertmanager.enabled: true
|
||||
alertmanager.ingress.enabled: true
|
||||
capabilities:
|
||||
majorVersion: 1
|
||||
minorVersion: 22
|
||||
apiVersions:
|
||||
- networking.k8s.io/v1
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- isKind:
|
||||
of: Ingress
|
||||
- isAPIVersion:
|
||||
of: networking.k8s.io/v1
|
||||
|
||||
- it: should have explicit pathType ImplementationSpecific for networking.k8s.io/v1 by default
|
||||
set:
|
||||
alertmanager.enabled: true
|
||||
alertmanager.ingress.enabled: true
|
||||
capabilities:
|
||||
majorVersion: 1
|
||||
minorVersion: 19
|
||||
apiVersions:
|
||||
- networking.k8s.io/v1
|
||||
asserts:
|
||||
- hasDocuments:
|
||||
count: 1
|
||||
- equal:
|
||||
path: spec.rules[0].http.paths[0].pathType
|
||||
value: ImplementationSpecific
|
|
@ -13,3 +13,17 @@ The chart installs the following components:
|
|||
- [Prometheus Adapter](https://github.com/helm/charts/tree/master/stable/prometheus-adapter) - The adapter allows a user to expose custom metrics, resource metrics, and external metrics on the default [Prometheus](https://prometheus.io/) instance to the Kubernetes API Server.
|
||||
|
||||
For more information, review the Helm README of this chart.
|
||||
|
||||
## Upgrading from 100.0.0+up16.6.0 to 100.1.0+up19.0.3
|
||||
|
||||
### Noticeable changes:
|
||||
Grafana:
|
||||
- `sidecar.dashboards.searchNamespace`, `sidecar.datasources.searchNamespace` and `sidecar.notifiers.searchNamespace` support a list of namespaces now.
|
||||
|
||||
Kube-state-metrics
|
||||
- the type of `collectors` is changed from Dictionary to List.
|
||||
- `kubeStateMetrics.serviceMonitor.namespaceOverride` was replaced by `kube-state-metrics.namespaceOverride`.
|
||||
|
||||
### Known issues:
|
||||
- Occasionally, the upgrade fails with errors related to the webhook `prometheusrulemutate.monitoring.coreos.com`. This is a known issue in the upstream, and the workaround is to trigger the upgrade one more time. [32416](https://github.com/rancher/rancher/issues/32416#issuecomment-828881726)
|
||||
|
||||
|
|
|
@ -43,7 +43,6 @@ metadata:
|
|||
rbac.authorization.k8s.io/aggregate-to-edit: "true"
|
||||
{{- end }}
|
||||
rules:
|
||||
rules:
|
||||
- apiGroups:
|
||||
- monitoring.coreos.com
|
||||
resources:
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
--- charts-original/Chart.yaml
|
||||
+++ charts/Chart.yaml
|
||||
@@ -1,3 +1,35 @@
|
||||
@@ -1,3 +1,38 @@
|
||||
+apiVersion: v2
|
||||
+description: Collects several related Helm charts, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator.
|
||||
+icon: https://raw.githubusercontent.com/prometheus/prometheus.github.io/master/assets/prometheus_logo-cb55bb5c346.png
|
||||
|
@ -20,12 +20,15 @@
|
|||
+ - name: Arvind
|
||||
+ email: arvind.iyengar@suse.com
|
||||
+ url: ""
|
||||
+ - name: Jack
|
||||
+ email: jiaqi.luo@suse.com
|
||||
+ url: "https://github.com/jiaqiluo"
|
||||
+name: rancher-monitoring
|
||||
+sources:
|
||||
+ - https://github.com/prometheus-community/helm-charts
|
||||
+ - https://github.com/prometheus-operator/kube-prometheus
|
||||
+version: 16.6.0
|
||||
+appVersion: 0.48.0
|
||||
+version: 19.0.3
|
||||
+appVersion: 0.50.0
|
||||
+kubeVersion: ">=1.16.0-0"
|
||||
+home: https://github.com/prometheus-operator/kube-prometheus
|
||||
+keywords:
|
||||
|
@ -36,12 +39,12 @@
|
|||
annotations:
|
||||
artifacthub.io/links: |
|
||||
- name: Chart Source
|
||||
@@ -5,8 +37,16 @@
|
||||
@@ -5,8 +40,19 @@
|
||||
- name: Upstream Project
|
||||
url: https://github.com/prometheus-operator/kube-prometheus
|
||||
artifacthub.io/operator: "true"
|
||||
-apiVersion: v2
|
||||
-appVersion: 0.48.0
|
||||
-appVersion: 0.50.0
|
||||
+ catalog.cattle.io/certified: rancher
|
||||
+ catalog.cattle.io/namespace: cattle-monitoring-system
|
||||
+ catalog.cattle.io/release-name: rancher-monitoring
|
||||
|
@ -52,10 +55,13 @@
|
|||
+ catalog.cattle.io/auto-install: rancher-monitoring-crd=match
|
||||
+ catalog.cattle.io/requests-cpu: "4500m"
|
||||
+ catalog.cattle.io/requests-memory: "4000Mi"
|
||||
+ catalog.cattle.io/rancher-version: ">= 2.6.0-0 <=2.6.99-0"
|
||||
+ catalog.cattle.io/kube-version: ">=1.16.0-0"
|
||||
+ catalog.cattle.io/upstream-version: "19.0.3"
|
||||
dependencies:
|
||||
- condition: grafana.enabled
|
||||
name: grafana
|
||||
@@ -71,34 +111,6 @@
|
||||
@@ -71,34 +117,6 @@
|
||||
- condition: rkeScheduler.enabled
|
||||
name: rkeScheduler
|
||||
repository: file://./charts/rkeScheduler
|
||||
|
@ -90,4 +96,4 @@
|
|||
-- https://github.com/prometheus-community/helm-charts
|
||||
-- https://github.com/prometheus-operator/kube-prometheus
|
||||
-type: application
|
||||
-version: 16.6.0
|
||||
-version: 19.0.3
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
--- charts-original/README.md
|
||||
+++ charts/README.md
|
||||
@@ -193,7 +193,39 @@
|
||||
@@ -230,7 +230,39 @@
|
||||
helm show values prometheus-community/kube-prometheus-stack
|
||||
```
|
||||
|
||||
|
|
|
@ -1,11 +0,0 @@
|
|||
--- charts-original/charts/windowsExporter/values.yaml
|
||||
+++ charts/charts/windowsExporter/values.yaml
|
||||
@@ -25,7 +25,7 @@
|
||||
port: 9796
|
||||
image:
|
||||
repository: rancher/windows_exporter-package
|
||||
- tag: v0.0.2
|
||||
+ tag: v0.0.3
|
||||
os: "windows"
|
||||
|
||||
# Specify the IP addresses of nodes that you want to collect metrics from
|
|
@ -25,7 +25,7 @@
|
|||
{{- end }}
|
||||
paused: {{ .Values.alertmanager.alertmanagerSpec.paused }}
|
||||
@@ -104,8 +106,8 @@
|
||||
- {key: prometheus, operator: In, values: [{{ template "kube-prometheus-stack.fullname" . }}-alertmanager]}
|
||||
- {key: alertmanager, operator: In, values: [{{ template "kube-prometheus-stack.fullname" . }}-alertmanager]}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
+ tolerations: {{ include "linux-node-tolerations" . | nindent 4 }}
|
||||
|
|
|
@ -20,7 +20,7 @@
|
|||
{{ toYaml .Values.alertmanager.secret.annotations | indent 4 }}
|
||||
{{- end }}
|
||||
labels:
|
||||
@@ -20,4 +26,4 @@
|
||||
@@ -24,4 +30,4 @@
|
||||
{{- range $key, $val := .Values.alertmanager.templateFiles }}
|
||||
{{ $key }}: {{ $val | b64enc | quote }}
|
||||
{{- end }}
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
--- charts-original/templates/exporters/kube-state-metrics/serviceMonitor.yaml
|
||||
+++ charts/templates/exporters/kube-state-metrics/serviceMonitor.yaml
|
||||
@@ -1,4 +1,7 @@
|
||||
{{- if .Values.kubeStateMetrics.enabled }}
|
||||
+{{- if .Values.kubeStateMetrics.serviceMonitor.namespaceOverride }}
|
||||
+{{- fail "kubeStateMetrics.serviceMonitor.namespaceOverride was removed. Please use kube-state-metrics.namespaceOverride instead." }}
|
||||
+{{- end }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: ServiceMonitor
|
||||
metadata:
|
|
@ -4,7 +4,7 @@
|
|||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) $dashboardName | trunc 63 | trimSuffix "-" }}
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" $ }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" $ }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
labels:
|
||||
{{- if $.Values.grafana.sidecar.dashboards.label }}
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
kind: ConfigMap
|
||||
metadata:
|
||||
name: {{ template "kube-prometheus-stack.fullname" . }}-grafana-datasource
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ default .Values.grafana.sidecar.datasources.searchNamespace (include "kube-prometheus-stack.namespace" .) }}
|
||||
{{- if .Values.grafana.sidecar.datasources.annotations }}
|
||||
annotations:
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
--- charts-original/templates/grafana/dashboards-1.14/alertmanager-overview.yaml
|
||||
+++ charts/templates/grafana/dashboards-1.14/alertmanager-overview.yaml
|
||||
@@ -5,10 +5,11 @@
|
||||
*/ -}}
|
||||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }}
|
||||
{{- if and (or .Values.grafana.enabled .Values.grafana.forceDeployDashboards) (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.grafana.defaultDashboardsEnabled }}
|
||||
+{{- if and .Values.alertmanager.enabled .Values.alertmanager.serviceMonitor.selfMonitor }}
|
||||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "alertmanager-overview" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
@@ -607,4 +608,5 @@
|
||||
"uid": "alertmanager-overview",
|
||||
"version": 0
|
||||
}
|
||||
-{{- end }}
|
||||
\ No newline at end of file
|
||||
+{{- end }}
|
||||
+{{- end }}
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "apiserver" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,8 +4,17 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "cluster-total" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
@@ -1831,7 +1831,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
- "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
+ "query": "label_values(up{job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 0,
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "controller-manager" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
@ -32,29 +32,29 @@
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
|
||||
+ "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"{{ include "exporter.kubeControllerManager.jobName" . }}\", instance=~\"$instance\"}[5m])) by (instance, name)",
|
||||
- "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
|
||||
+ "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"{{ include "exporter.kubeControllerManager.jobName" . }}\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
@@ -282,7 +287,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name)",
|
||||
+ "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"{{ include "exporter.kubeControllerManager.jobName" . }}\", instance=~\"$instance\"}[5m])) by (instance, name)",
|
||||
- "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
|
||||
+ "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"{{ include "exporter.kubeControllerManager.jobName" . }}\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
@@ -388,7 +393,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (instance, name, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeControllerManager.jobName" . }}\", instance=~\"$instance\"}[5m])) by (instance, name, le))",
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-controller-manager\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeControllerManager.jobName" . }}\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} {{`{{`}}name{{`}}`}}",
|
||||
@@ -494,28 +499,28 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
|
@ -133,12 +133,21 @@
|
|||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -1100,7 +1105,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
- "query": "label_values(up{job=\"kube-controller-manager\"}, cluster)",
|
||||
+ "query": "label_values(up{job=\"{{ include "exporter.kubeControllerManager.jobName" . }}\"}, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
@@ -1126,7 +1131,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
- "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"kube-controller-manager\"}, instance)",
|
||||
+ "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"{{ include "exporter.kubeControllerManager.jobName" . }}\"}, instance)",
|
||||
- "query": "label_values(up{cluster=\"$cluster\", job=\"kube-controller-manager\"}, instance)",
|
||||
+ "query": "label_values(up{cluster=\"$cluster\", job=\"{{ include "exporter.kubeControllerManager.jobName" . }}\"}, instance)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "etcd" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-coredns" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,8 +4,17 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-cluster" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
@@ -2973,7 +2973,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
- "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
+ "query": "label_values(up{job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-namespace" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-node" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-pod" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-workload" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "k8s-resources-workloads-namespace" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -10,268 +10,277 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "kubelet" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
@@ -107,7 +108,7 @@
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})",
|
||||
+ "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
@@ -191,7 +192,7 @@
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
+ "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -275,7 +276,7 @@
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
+ "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -359,7 +360,7 @@
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})",
|
||||
+ "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -443,7 +444,7 @@
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})",
|
||||
+ "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -527,7 +528,7 @@
|
||||
"tableColumn": "",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))",
|
||||
+ "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -609,7 +610,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)",
|
||||
+ "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -702,7 +703,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)",
|
||||
+ "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -808,7 +809,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -914,14 +915,14 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} pod",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} worker",
|
||||
@@ -1014,14 +1015,14 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} pod",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} worker",
|
||||
@@ -1129,7 +1130,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
|
||||
+ "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
|
||||
@@ -1224,7 +1225,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
|
||||
+ "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
|
||||
@@ -1332,7 +1333,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
|
||||
@@ -1438,7 +1439,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)",
|
||||
+ "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -1531,7 +1532,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -1638,7 +1639,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -1731,7 +1732,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -1837,7 +1838,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -1943,28 +1944,28 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
|
||||
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "2xx",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
|
||||
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "3xx",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
|
||||
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "4xx",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
|
||||
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "5xx",
|
||||
@@ -2070,7 +2071,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}",
|
||||
@@ -2176,7 +2177,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
|
||||
+ "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -2269,7 +2270,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])",
|
||||
+ "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -2362,7 +2363,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
|
||||
+ "expr": "go_goroutines{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -2482,7 +2483,7 @@
|
||||
@@ -87,7 +88,7 @@
|
||||
"pluginVersion": "7",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\"})",
|
||||
+ "expr": "sum(kubelet_node_name{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "",
|
||||
@@ -144,7 +145,7 @@
|
||||
"pluginVersion": "7",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
+ "expr": "sum(kubelet_running_pods{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_pod_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -201,7 +202,7 @@
|
||||
"pluginVersion": "7",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
+ "expr": "sum(kubelet_running_containers{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}) OR sum(kubelet_running_container_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -258,7 +259,7 @@
|
||||
"pluginVersion": "7",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})",
|
||||
+ "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\", state=\"actual_state_of_world\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -315,7 +316,7 @@
|
||||
"pluginVersion": "7",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})",
|
||||
+ "expr": "sum(volume_manager_total_volumes{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",state=\"desired_state_of_world\"})",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -372,7 +373,7 @@
|
||||
"pluginVersion": "7",
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))",
|
||||
+ "expr": "sum(rate(kubelet_node_config_error{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -431,7 +432,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)",
|
||||
+ "expr": "sum(rate(kubelet_runtime_operations_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (operation_type, instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -526,7 +527,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)",
|
||||
+ "expr": "sum(rate(kubelet_runtime_operations_errors_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -621,7 +622,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_runtime_operations_duration_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -716,14 +717,14 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} pod",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(kubelet_pod_worker_duration_seconds_count{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} worker",
|
||||
@@ -818,14 +819,14 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_start_duration_seconds_count{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} pod",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} worker",
|
||||
@@ -922,7 +923,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
|
||||
+ "expr": "sum(rate(storage_operation_duration_seconds_count{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
|
||||
@@ -1019,7 +1020,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
|
||||
+ "expr": "sum(rate(storage_operation_errors_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
|
||||
@@ -1116,7 +1117,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(storage_operation_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_name, volume_plugin, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_name{{`}}`}} {{`{{`}}volume_plugin{{`}}`}}",
|
||||
@@ -1211,7 +1212,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)",
|
||||
+ "expr": "sum(rate(kubelet_cgroup_manager_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -1306,7 +1307,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_cgroup_manager_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, operation_type, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}operation_type{{`}}`}}",
|
||||
@@ -1402,7 +1403,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(kubelet_pleg_relist_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -1497,7 +1498,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_interval_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -1592,7 +1593,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -1687,28 +1688,28 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
|
||||
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"2..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "2xx",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
|
||||
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"3..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "3xx",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
|
||||
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"4..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "4xx",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
|
||||
+ "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\",code=~\"5..\"}[5m]))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "5xx",
|
||||
@@ -1803,7 +1804,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\", instance=~\"$instance\"}[5m])) by (instance, verb, url, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} {{`{{`}}verb{{`}}`}} {{`{{`}}url{{`}}`}}",
|
||||
@@ -1898,7 +1899,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
|
||||
+ "expr": "process_resident_memory_bytes{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -1993,7 +1994,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])",
|
||||
+ "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}[5m])",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -2088,7 +2089,7 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "go_goroutines{cluster=\"$cluster\",job=\"kubelet\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
|
||||
+ "expr": "go_goroutines{cluster=\"$cluster\",job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\",instance=~\"$instance\"}",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -2177,7 +2178,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
- "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics\"}, cluster)",
|
||||
+ "query": "label_values(up{job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics\"}, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
@@ -2203,7 +2204,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
|
@ -280,7 +289,7 @@
|
|||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
@@ -2530,4 +2531,5 @@
|
||||
@@ -2251,4 +2252,5 @@
|
||||
"uid": "3138fa155d5915769fbded898ac09fd9",
|
||||
"version": 0
|
||||
}
|
||||
|
|
|
@ -4,8 +4,17 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "namespace-by-pod" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
@@ -1301,7 +1301,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
- "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
+ "query": "label_values(up{job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 0,
|
||||
|
|
|
@ -4,8 +4,17 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "namespace-by-workload" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
@@ -1541,7 +1541,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
- "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
+ "query": "label_values(up{job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 0,
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "node-cluster-rsrc-use" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "node-rsrc-use" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "nodes" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "persistentvolumesusage" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,8 +4,17 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "pod-total" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
{{ toYaml .Values.grafana.sidecar.dashboards.annotations | indent 4 }}
|
||||
@@ -1033,7 +1033,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
- "query": "label_values(up{job=\"kubelet\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
+ "query": "label_values(up{job=\"{{ include "exporter.kubelet.jobName" . }}\", metrics_path=\"/metrics/cadvisor\"}, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 0,
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "prometheus-remote-write" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "prometheus" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "proxy" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -10,7 +10,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "scheduler" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
@ -32,68 +32,68 @@
|
|||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
- "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
|
||||
+ "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} e2e",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
- "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
|
||||
+ "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} binding",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
- "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
|
||||
+ "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
- "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
+ "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\", instance=~\"$instance\"}[5m])) by (instance)",
|
||||
- "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"kube-scheduler\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
|
||||
+ "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\", instance=~\"$instance\"}[5m])) by (cluster, instance)",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} volume",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume",
|
||||
@@ -290,28 +295,28 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} e2e",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} e2e",
|
||||
"refId": "A"
|
||||
},
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} binding",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} binding",
|
||||
"refId": "B"
|
||||
},
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} scheduling algorithm",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} scheduling algorithm",
|
||||
"refId": "C"
|
||||
},
|
||||
{
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\",instance=~\"$instance\"}[5m])) by (instance, le))",
|
||||
- "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"kube-scheduler\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
|
||||
+ "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"{{ include "exporter.kubeScheduler.jobName" . }}\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))",
|
||||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}} volume",
|
||||
"legendFormat": "{{`{{`}}cluster{{`}}`}} {{`{{`}}instance{{`}}`}} volume",
|
||||
@@ -417,28 +422,28 @@
|
||||
"steppedLine": false,
|
||||
"targets": [
|
||||
|
@ -172,6 +172,15 @@
|
|||
"format": "time_series",
|
||||
"intervalFactor": 2,
|
||||
"legendFormat": "{{`{{`}}instance{{`}}`}}",
|
||||
@@ -1023,7 +1028,7 @@
|
||||
"options": [
|
||||
|
||||
],
|
||||
- "query": "label_values(up{job=\"kube-scheduler\"}, cluster)",
|
||||
+ "query": "label_values(up{job=\"{{ include "exporter.kubeScheduler.jobName" . }}\"}, cluster)",
|
||||
"refresh": 2,
|
||||
"regex": "",
|
||||
"sort": 1,
|
||||
@@ -1049,7 +1054,7 @@
|
||||
"options": [
|
||||
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "statefulset" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
- namespace: {{ template "kube-prometheus-stack.namespace" . }}
|
||||
- namespace: {{ template "kube-prometheus-stack-grafana.namespace" . }}
|
||||
+ namespace: {{ .Values.grafana.defaultDashboards.namespace }}
|
||||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" $) "workload-total" | trunc 63 | trimSuffix "-" }}
|
||||
annotations:
|
||||
|
|
|
@ -22,9 +22,21 @@
|
|||
- - --prometheus-config-reloader={{ .Values.prometheusOperator.prometheusConfigReloaderImage.repository }}:{{ .Values.prometheusOperator.prometheusConfigReloaderImage.tag }}
|
||||
+ - --prometheus-config-reloader={{ template "system_default_registry" . }}{{ .Values.prometheusOperator.prometheusConfigReloaderImage.repository }}:{{ .Values.prometheusOperator.prometheusConfigReloaderImage.tag }}
|
||||
{{- end }}
|
||||
- --config-reloader-cpu={{ .Values.prometheusOperator.configReloaderCpu }}
|
||||
- --config-reloader-memory={{ .Values.prometheusOperator.configReloaderMemory }}
|
||||
@@ -130,16 +130,16 @@
|
||||
- --config-reloader-cpu-request={{ .Values.prometheusOperator.configReloaderCpu }}
|
||||
- --config-reloader-cpu-limit={{ .Values.prometheusOperator.configReloaderCpu }}
|
||||
@@ -81,9 +81,9 @@
|
||||
- --prometheus-instance-namespaces={{ .Values.prometheusOperator.prometheusInstanceNamespaces | join "," }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheusOperator.thanosImage.sha }}
|
||||
- - --thanos-default-base-image={{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }}@sha256:{{ .Values.prometheusOperator.thanosImage.sha }}
|
||||
+ - --thanos-default-base-image={{ template "system_default_registry" . }}{{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }}@sha256:{{ .Values.prometheusOperator.thanosImage.sha }}
|
||||
{{- else }}
|
||||
- - --thanos-default-base-image={{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }}
|
||||
+ - --thanos-default-base-image={{ template "system_default_registry" . }}{{ .Values.prometheusOperator.thanosImage.repository }}:{{ .Values.prometheusOperator.thanosImage.tag }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheusOperator.thanosRulerInstanceNamespaces }}
|
||||
- --thanos-ruler-instance-namespaces={{ .Values.prometheusOperator.thanosRulerInstanceNamespaces | join "," }}
|
||||
@@ -137,16 +137,16 @@
|
||||
hostNetwork: true
|
||||
dnsPolicy: ClusterFirstWithHostNet
|
||||
{{- end }}
|
||||
|
|
|
@ -1,15 +1,24 @@
|
|||
--- charts-original/templates/prometheus/prometheus.yaml
|
||||
+++ charts/templates/prometheus/prometheus.yaml
|
||||
@@ -32,7 +32,7 @@
|
||||
{{ toYaml .Values.prometheus.prometheusSpec.apiserverConfig | indent 4}}
|
||||
@@ -33,13 +33,13 @@
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.prometheusSpec.image }}
|
||||
- image: {{ .Values.prometheus.prometheusSpec.image.repository }}:{{ .Values.prometheus.prometheusSpec.image.tag }}
|
||||
+ image: {{ template "system_default_registry" . }}{{ .Values.prometheus.prometheusSpec.image.repository }}:{{ .Values.prometheus.prometheusSpec.image.tag }}
|
||||
{{- if and .Values.prometheus.prometheusSpec.image.tag .Values.prometheus.prometheusSpec.image.sha }}
|
||||
- image: "{{ .Values.prometheus.prometheusSpec.image.repository }}:{{ .Values.prometheus.prometheusSpec.image.tag }}@sha256:{{ .Values.prometheus.prometheusSpec.image.sha }}"
|
||||
+ image: "{{ template "system_default_registry" . }}{{ .Values.prometheus.prometheusSpec.image.repository }}:{{ .Values.prometheus.prometheusSpec.image.tag }}@sha256:{{ .Values.prometheus.prometheusSpec.image.sha }}"
|
||||
{{- else if .Values.prometheus.prometheusSpec.image.sha }}
|
||||
- image: "{{ .Values.prometheus.prometheusSpec.image.repository }}@sha256:{{ .Values.prometheus.prometheusSpec.image.sha }}"
|
||||
+ image: "{{ template "system_default_registry" . }}{{ .Values.prometheus.prometheusSpec.image.repository }}@sha256:{{ .Values.prometheus.prometheusSpec.image.sha }}"
|
||||
{{- else if .Values.prometheus.prometheusSpec.image.tag }}
|
||||
- image: "{{ .Values.prometheus.prometheusSpec.image.repository }}:{{ .Values.prometheus.prometheusSpec.image.tag }}"
|
||||
+ image: "{{ template "system_default_registry" . }}{{ .Values.prometheus.prometheusSpec.image.repository }}:{{ .Values.prometheus.prometheusSpec.image.tag }}"
|
||||
{{- else }}
|
||||
- image: "{{ .Values.prometheus.prometheusSpec.image.repository }}"
|
||||
+ image: "{{ template "system_default_registry" . }}{{ .Values.prometheus.prometheusSpec.image.repository }}"
|
||||
{{- end }}
|
||||
version: {{ .Values.prometheus.prometheusSpec.image.tag }}
|
||||
{{- if .Values.prometheus.prometheusSpec.image.sha }}
|
||||
sha: {{ .Values.prometheus.prometheusSpec.image.sha }}
|
||||
@@ -56,11 +56,13 @@
|
||||
@@ -64,11 +64,13 @@
|
||||
externalUrl: "{{ tpl .Values.prometheus.prometheusSpec.externalUrl . }}"
|
||||
{{- else if and .Values.prometheus.ingress.enabled .Values.prometheus.ingress.hosts }}
|
||||
externalUrl: "http://{{ tpl (index .Values.prometheus.ingress.hosts 0) . }}{{ .Values.prometheus.prometheusSpec.routePrefix }}"
|
||||
|
@ -24,7 +33,7 @@
|
|||
{{ toYaml .Values.prometheus.prometheusSpec.nodeSelector | indent 4 }}
|
||||
{{- end }}
|
||||
paused: {{ .Values.prometheus.prometheusSpec.paused }}
|
||||
@@ -232,8 +234,8 @@
|
||||
@@ -244,8 +246,8 @@
|
||||
- {key: prometheus, operator: In, values: [{{ template "kube-prometheus-stack.fullname" . }}-prometheus]}
|
||||
{{- end }}
|
||||
{{- end }}
|
||||
|
@ -34,7 +43,7 @@
|
|||
{{ toYaml .Values.prometheus.prometheusSpec.tolerations | indent 4 }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.prometheusSpec.topologySpreadConstraints }}
|
||||
@@ -266,7 +268,7 @@
|
||||
@@ -284,7 +286,7 @@
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.prometheusSpec.containers }}
|
||||
containers:
|
||||
|
@ -43,7 +52,7 @@
|
|||
{{- end }}
|
||||
{{- if .Values.prometheus.prometheusSpec.initContainers }}
|
||||
initContainers:
|
||||
@@ -282,6 +284,7 @@
|
||||
@@ -300,6 +302,7 @@
|
||||
{{- if .Values.prometheus.prometheusSpec.disableCompaction }}
|
||||
disableCompaction: {{ .Values.prometheus.prometheusSpec.disableCompaction }}
|
||||
{{- end }}
|
||||
|
@ -51,7 +60,7 @@
|
|||
portName: {{ .Values.prometheus.prometheusSpec.portName }}
|
||||
{{- end }}
|
||||
{{- if .Values.prometheus.prometheusSpec.volumes }}
|
||||
@@ -326,3 +329,4 @@
|
||||
@@ -356,3 +359,4 @@
|
||||
{{- if .Values.prometheus.prometheusSpec.allowOverlappingBlocks }}
|
||||
allowOverlappingBlocks: {{ .Values.prometheus.prometheusSpec.allowOverlappingBlocks }}
|
||||
{{- end }}
|
||||
|
|
|
@ -0,0 +1,18 @@
|
|||
--- charts-original/templates/prometheus/rules-1.14/alertmanager.rules.yaml
|
||||
+++ charts/templates/prometheus/rules-1.14/alertmanager.rules.yaml
|
||||
@@ -7,6 +7,7 @@
|
||||
{{- if and (semverCompare ">=1.14.0-0" $kubeTargetVersion) (semverCompare "<9.9.9-9" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }}
|
||||
{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }}
|
||||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }}
|
||||
+{{- if and .Values.alertmanager.enabled .Values.alertmanager.serviceMonitor.selfMonitor }}
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
@@ -172,4 +173,5 @@
|
||||
{{- if .Values.defaultRules.additionalRuleLabels }}
|
||||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }}
|
||||
{{- end }}
|
||||
-{{- end }}
|
||||
\ No newline at end of file
|
||||
+{{- end }}
|
||||
+{{- end }}
|
|
@ -4,12 +4,12 @@
|
|||
rules:
|
||||
- expr: |-
|
||||
sum by (cluster, namespace, pod, container) (
|
||||
- rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
|
||||
+ rate(container_cpu_usage_seconds_total{job="{{ include "exporter.kubelet.jobName" . }}", metrics_path="/metrics/cadvisor", image!=""}[5m])
|
||||
- irate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}[5m])
|
||||
+ irate(container_cpu_usage_seconds_total{job="{{ include "exporter.kubelet.jobName" . }}", metrics_path="/metrics/cadvisor", image!=""}[5m])
|
||||
) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
|
||||
1, max by(cluster, namespace, pod, node) (kube_pod_info{node!=""})
|
||||
)
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
|
||||
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_irate
|
||||
- expr: |-
|
||||
- container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
|
||||
+ container_memory_working_set_bytes{job="{{ include "exporter.kubelet.jobName" . }}", metrics_path="/metrics/cadvisor", image!=""}
|
||||
|
|
|
@ -1,18 +1,25 @@
|
|||
--- charts-original/templates/prometheus/rules-1.14/kubernetes-storage.yaml
|
||||
+++ charts/templates/prometheus/rules-1.14/kubernetes-storage.yaml
|
||||
@@ -31,9 +31,9 @@
|
||||
@@ -31,13 +31,12 @@
|
||||
runbook_url: {{ .Values.defaultRules.runbookUrl }}alert-name-kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
- kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
- (
|
||||
- kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
- /
|
||||
- kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
- ) < 0.03
|
||||
+ kubelet_volume_stats_available_bytes{job="{{ include "exporter.kubelet.jobName" . }}", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
/
|
||||
- kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
+ /
|
||||
+ kubelet_volume_stats_capacity_bytes{job="{{ include "exporter.kubelet.jobName" . }}", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
< 0.03
|
||||
+ < 0.03
|
||||
and
|
||||
- kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
+ kubelet_volume_stats_used_bytes{job="{{ include "exporter.kubelet.jobName" . }}", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
for: 1m
|
||||
labels:
|
||||
@@ -48,12 +48,12 @@
|
||||
severity: critical
|
||||
@@ -51,14 +50,14 @@
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
|
@ -23,6 +30,9 @@
|
|||
+ kubelet_volume_stats_capacity_bytes{job="{{ include "exporter.kubelet.jobName" . }}", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
- kubelet_volume_stats_used_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
+ kubelet_volume_stats_used_bytes{job="{{ include "exporter.kubelet.jobName" . }}", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"} > 0
|
||||
and
|
||||
- predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
+ predict_linear(kubelet_volume_stats_available_bytes{job="{{ include "exporter.kubelet.jobName" . }}", namespace=~"{{ $targetNamespace }}", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
for: 1h
|
||||
|
|
|
@ -463,7 +463,7 @@
|
|||
pspEnabled: true
|
||||
pspAnnotations: {}
|
||||
## Specify pod annotations
|
||||
@@ -187,25 +625,76 @@
|
||||
@@ -187,25 +625,77 @@
|
||||
## ref: https://prometheus.io/docs/alerting/notifications/
|
||||
## https://prometheus.io/docs/alerting/notification_examples/
|
||||
##
|
||||
|
@ -477,7 +477,7 @@
|
|||
- # {{- $root := . -}}
|
||||
- # {{ range .Alerts }}
|
||||
- # *Alert:* {{ .Annotations.summary }} - `{{ .Labels.severity }}`
|
||||
- # *Cluster:* {{ template "cluster" $root }}
|
||||
- # *Cluster:* {{ template "cluster" $root }}
|
||||
- # *Description:* {{ .Annotations.description }}
|
||||
- # *Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:>
|
||||
- # *Runbook:* <{{ .Annotations.runbook }}|:spiral_note_pad:>
|
||||
|
@ -486,6 +486,7 @@
|
|||
- # {{ end }}
|
||||
- # {{ end }}
|
||||
- # {{ end }}
|
||||
+
|
||||
+ templateFiles:
|
||||
+ rancher_defaults.tmpl: |-
|
||||
+ {{- define "slack.rancher.text" -}}
|
||||
|
@ -559,7 +560,7 @@
|
|||
|
||||
ingress:
|
||||
enabled: false
|
||||
@@ -395,7 +884,7 @@
|
||||
@@ -397,7 +887,7 @@
|
||||
## Image of Alertmanager
|
||||
##
|
||||
image:
|
||||
|
@ -568,7 +569,7 @@
|
|||
tag: v0.22.2
|
||||
sha: ""
|
||||
|
||||
@@ -507,9 +996,13 @@
|
||||
@@ -509,9 +999,13 @@
|
||||
## Define resources requests and limits for single Pods.
|
||||
## ref: https://kubernetes.io/docs/user-guide/compute-resources/
|
||||
##
|
||||
|
@ -585,7 +586,7 @@
|
|||
|
||||
## Pod anti-affinity can prevent the scheduler from placing Prometheus replicas on the same node.
|
||||
## The default value "soft" means that the scheduler should *prefer* to not schedule two replica pods onto the same node but no guarantee is provided.
|
||||
@@ -613,6 +1106,30 @@
|
||||
@@ -625,6 +1119,30 @@
|
||||
enabled: true
|
||||
namespaceOverride: ""
|
||||
|
||||
|
@ -616,7 +617,7 @@
|
|||
## ForceDeployDatasources Create datasource configmap even if grafana deployment has been disabled
|
||||
##
|
||||
forceDeployDatasources: false
|
||||
@@ -625,6 +1142,18 @@
|
||||
@@ -637,6 +1155,18 @@
|
||||
##
|
||||
defaultDashboardsEnabled: true
|
||||
|
||||
|
@ -632,10 +633,10 @@
|
|||
+ # Ignore if useExistingNamespace is true
|
||||
+ cleanupOnUninstall: false
|
||||
+
|
||||
adminPassword: prom-operator
|
||||
|
||||
ingress:
|
||||
@@ -664,6 +1193,7 @@
|
||||
## Timezone for the default dashboards
|
||||
## Other options are: browser or a specific timezone, i.e. Europe/Luxembourg
|
||||
##
|
||||
@@ -681,6 +1211,7 @@
|
||||
dashboards:
|
||||
enabled: true
|
||||
label: grafana_dashboard
|
||||
|
@ -643,7 +644,7 @@
|
|||
|
||||
## Annotations for Grafana dashboard configmaps
|
||||
##
|
||||
@@ -716,7 +1246,60 @@
|
||||
@@ -739,7 +1270,60 @@
|
||||
## Passed to grafana subchart and used by servicemonitor below
|
||||
##
|
||||
service:
|
||||
|
@ -705,11 +706,10 @@
|
|||
|
||||
## If true, create a serviceMonitor for grafana
|
||||
##
|
||||
@@ -746,6 +1329,14 @@
|
||||
# targetLabel: nodename
|
||||
@@ -773,6 +1357,17 @@
|
||||
# replacement: $1
|
||||
# action: replace
|
||||
+
|
||||
|
||||
+ resources:
|
||||
+ limits:
|
||||
+ memory: 200Mi
|
||||
|
@ -717,10 +717,14 @@
|
|||
+ requests:
|
||||
+ memory: 100Mi
|
||||
+ cpu: 100m
|
||||
|
||||
+
|
||||
+ testFramework:
|
||||
+ enabled: false
|
||||
+
|
||||
## Component scraping the kube api server
|
||||
##
|
||||
@@ -907,7 +1498,7 @@
|
||||
kubeApiServer:
|
||||
@@ -952,7 +1547,7 @@
|
||||
## Component scraping the kube controller manager
|
||||
##
|
||||
kubeControllerManager:
|
||||
|
@ -729,7 +733,7 @@
|
|||
|
||||
## If your kube controller manager is not deployed as a pod, specify IPs it can be found on
|
||||
##
|
||||
@@ -1054,7 +1645,7 @@
|
||||
@@ -1110,7 +1705,7 @@
|
||||
## Component scraping etcd
|
||||
##
|
||||
kubeEtcd:
|
||||
|
@ -738,7 +742,7 @@
|
|||
|
||||
## If your etcd is not deployed as a pod, specify IPs it can be found on
|
||||
##
|
||||
@@ -1119,7 +1710,7 @@
|
||||
@@ -1177,7 +1772,7 @@
|
||||
## Component scraping kube scheduler
|
||||
##
|
||||
kubeScheduler:
|
||||
|
@ -747,7 +751,7 @@
|
|||
|
||||
## If your kube scheduler is not deployed as a pod, specify IPs it can be found on
|
||||
##
|
||||
@@ -1177,7 +1768,7 @@
|
||||
@@ -1237,7 +1832,7 @@
|
||||
## Component scraping kube proxy
|
||||
##
|
||||
kubeProxy:
|
||||
|
@ -756,7 +760,7 @@
|
|||
|
||||
## If your kube proxy is not deployed as a pod, specify IPs it can be found on
|
||||
##
|
||||
@@ -1266,6 +1857,13 @@
|
||||
@@ -1337,6 +1932,13 @@
|
||||
create: true
|
||||
podSecurityPolicy:
|
||||
enabled: true
|
||||
|
@ -770,7 +774,7 @@
|
|||
|
||||
## Deploy node exporter as a daemonset to all nodes
|
||||
##
|
||||
@@ -1319,6 +1917,16 @@
|
||||
@@ -1392,6 +1994,16 @@
|
||||
extraArgs:
|
||||
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/.+)($|/)
|
||||
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|bpf|cgroup2?|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|iso9660|mqueue|nsfs|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|selinuxfs|squashfs|sysfs|tracefs)$
|
||||
|
@ -787,7 +791,7 @@
|
|||
|
||||
## Manages Prometheus and Alertmanager components
|
||||
##
|
||||
@@ -1331,8 +1939,8 @@
|
||||
@@ -1404,8 +2016,8 @@
|
||||
enabled: true
|
||||
# Value must match version names from https://golang.org/pkg/crypto/tls/#pkg-constants
|
||||
tlsMinVersion: VersionTLS13
|
||||
|
@ -798,16 +802,19 @@
|
|||
|
||||
## Admission webhook support for PrometheusRules resources added in Prometheus Operator 0.30 can be enabled to prevent incorrectly formatted
|
||||
## rules from making their way into prometheus and potentially preventing the container from starting
|
||||
@@ -1349,7 +1957,7 @@
|
||||
@@ -1422,9 +2034,9 @@
|
||||
patch:
|
||||
enabled: true
|
||||
image:
|
||||
- repository: jettech/kube-webhook-certgen
|
||||
+ repository: rancher/mirrored-jettech-kube-webhook-certgen
|
||||
tag: v1.5.2
|
||||
sha: ""
|
||||
- repository: k8s.gcr.io/ingress-nginx/kube-webhook-certgen
|
||||
+ repository: rancher/mirrored-ingress-nginx-kube-webhook-certgen
|
||||
tag: v1.0
|
||||
- sha: "f3b6b39a6062328c095337b4cadcefd1612348fdd5190b1dcbcb9b9e90bd8068"
|
||||
+ sha: ""
|
||||
pullPolicy: IfNotPresent
|
||||
@@ -1498,13 +2106,13 @@
|
||||
resources: {}
|
||||
## Provide a priority class name to the webhook patching job
|
||||
@@ -1571,13 +2183,13 @@
|
||||
|
||||
## Resource limits & requests
|
||||
##
|
||||
|
@ -828,25 +835,34 @@
|
|||
|
||||
# Required for use in managed kubernetes clusters (such as AWS EKS) with custom CNI (such as calico),
|
||||
# because control-plane managed by AWS cannot communicate with pods' IP CIDR and admission webhooks are not working
|
||||
@@ -1557,7 +2165,7 @@
|
||||
@@ -1630,7 +2242,7 @@
|
||||
## Prometheus-operator image
|
||||
##
|
||||
image:
|
||||
- repository: quay.io/prometheus-operator/prometheus-operator
|
||||
+ repository: rancher/mirrored-prometheus-operator-prometheus-operator
|
||||
tag: v0.48.0
|
||||
tag: v0.50.0
|
||||
sha: ""
|
||||
pullPolicy: IfNotPresent
|
||||
@@ -1573,7 +2181,7 @@
|
||||
@@ -1646,7 +2258,7 @@
|
||||
## Prometheus-config-reloader image to use for config and rule reloading
|
||||
##
|
||||
prometheusConfigReloaderImage:
|
||||
- repository: quay.io/prometheus-operator/prometheus-config-reloader
|
||||
+ repository: rancher/mirrored-prometheus-operator-prometheus-config-reloader
|
||||
tag: v0.48.0
|
||||
tag: v0.50.0
|
||||
sha: ""
|
||||
|
||||
@@ -1659,7 +2267,7 @@
|
||||
@@ -1661,7 +2273,7 @@
|
||||
## Thanos side-car image when configured
|
||||
##
|
||||
thanosImage:
|
||||
- repository: quay.io/thanos/thanos
|
||||
+ repository: rancher/mirrored-thanos-thanos
|
||||
tag: v0.17.2
|
||||
sha: ""
|
||||
|
||||
@@ -1781,7 +2393,7 @@
|
||||
port: 9090
|
||||
|
||||
## To be used with a proxy extraContainer port
|
||||
|
@ -855,28 +871,16 @@
|
|||
|
||||
## List of IP addresses at which the Prometheus server service is available
|
||||
## Ref: https://kubernetes.io/docs/user-guide/services/#external-ips
|
||||
@@ -1916,7 +2524,7 @@
|
||||
@@ -2054,7 +2666,7 @@
|
||||
## Image of Prometheus.
|
||||
##
|
||||
image:
|
||||
- repository: quay.io/prometheus/prometheus
|
||||
+ repository: rancher/mirrored-prometheus-prometheus
|
||||
tag: v2.27.1
|
||||
tag: v2.28.1
|
||||
sha: ""
|
||||
|
||||
@@ -1979,6 +2587,11 @@
|
||||
##
|
||||
externalUrl: ""
|
||||
|
||||
+ ## Ignore NamespaceSelector settings from the PodMonitor and ServiceMonitor configs
|
||||
+ ## If true, PodMonitors and ServiceMonitors can only discover Pods and Services within the namespace they are deployed into
|
||||
+ ##
|
||||
+ ignoreNamespaceSelectors: false
|
||||
+
|
||||
## Define which Nodes the Pods are scheduled on.
|
||||
## ref: https://kubernetes.io/docs/user-guide/node-selection/
|
||||
##
|
||||
@@ -2011,7 +2624,7 @@
|
||||
@@ -2149,7 +2761,7 @@
|
||||
## prometheus resource to be created with selectors based on values in the helm deployment,
|
||||
## which will also match the PrometheusRule resources created
|
||||
##
|
||||
|
@ -885,7 +889,7 @@
|
|||
|
||||
## PrometheusRules to be selected for target discovery.
|
||||
## If {}, select all PrometheusRules
|
||||
@@ -2036,7 +2649,7 @@
|
||||
@@ -2174,7 +2786,7 @@
|
||||
## prometheus resource to be created with selectors based on values in the helm deployment,
|
||||
## which will also match the servicemonitors created
|
||||
##
|
||||
|
@ -894,7 +898,7 @@
|
|||
|
||||
## ServiceMonitors to be selected for target discovery.
|
||||
## If {}, select all ServiceMonitors
|
||||
@@ -2059,7 +2672,7 @@
|
||||
@@ -2197,7 +2809,7 @@
|
||||
## prometheus resource to be created with selectors based on values in the helm deployment,
|
||||
## which will also match the podmonitors created
|
||||
##
|
||||
|
@ -903,7 +907,7 @@
|
|||
|
||||
## PodMonitors to be selected for target discovery.
|
||||
## If {}, select all PodMonitors
|
||||
@@ -2190,9 +2803,13 @@
|
||||
@@ -2328,9 +2940,13 @@
|
||||
|
||||
## Resource limits & requests
|
||||
##
|
||||
|
@ -912,7 +916,7 @@
|
|||
- # memory: 400Mi
|
||||
+ resources:
|
||||
+ limits:
|
||||
+ memory: 1500Mi
|
||||
+ memory: 3000Mi
|
||||
+ cpu: 1000m
|
||||
+ requests:
|
||||
+ memory: 750Mi
|
||||
|
@ -920,7 +924,7 @@
|
|||
|
||||
## Prometheus StorageSpec for persistent data
|
||||
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/user-guides/storage.md
|
||||
@@ -2215,7 +2832,13 @@
|
||||
@@ -2353,7 +2969,13 @@
|
||||
# medium: Memory
|
||||
|
||||
# Additional volumes on the output StatefulSet definition.
|
||||
|
@ -935,9 +939,9 @@
|
|||
|
||||
# Additional VolumeMounts on the output StatefulSet definition.
|
||||
volumeMounts: []
|
||||
@@ -2322,9 +2945,34 @@
|
||||
##
|
||||
thanos: {}
|
||||
@@ -2475,9 +3097,34 @@
|
||||
# fileName: "objstore.yaml"
|
||||
# objectStorageConfigFile: /var/secrets/object-store.yaml
|
||||
|
||||
+ proxy:
|
||||
+ image:
|
||||
|
@ -945,7 +949,7 @@
|
|||
+ tag: 1.21.1-alpine
|
||||
+
|
||||
## Containers allows injecting additional containers. This is meant to allow adding an authentication proxy to a Prometheus pod.
|
||||
## if using proxy extraContainer update targetPort with proxy container port
|
||||
## if using proxy extraContainer update targetPort with proxy container port
|
||||
- containers: []
|
||||
+ containers: |
|
||||
+ - name: prometheus-proxy
|
||||
|
@ -971,7 +975,7 @@
|
|||
|
||||
## InitContainers allows injecting additional initContainers. This is meant to allow doing some changes
|
||||
## (permissions, dir tree) on mounted volumes before starting prometheus
|
||||
@@ -2332,7 +2980,7 @@
|
||||
@@ -2485,7 +3132,7 @@
|
||||
|
||||
## PortName to use for Prometheus.
|
||||
##
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
url: https://github.com/prometheus-community/helm-charts.git
|
||||
subdirectory: charts/kube-prometheus-stack
|
||||
commit: ba91bdb2d79ca4419cf72078f5f4bfcc426d4599
|
||||
version: 100.0.1
|
||||
commit: c6208979d494156a3869d2e5faab669ce4301c68
|
||||
version: 100.1.0
|
||||
additionalCharts:
|
||||
- workingDir: charts-crd
|
||||
crdOptions:
|
||||
templateDirectory: crd-template
|
||||
crdDirectory: crd-manifest
|
||||
addCRDValidationToMainChart: true
|
||||
useTarArchive: true
|
||||
|
|
|
@ -49,7 +49,9 @@ spec:
|
|||
- -c
|
||||
- >
|
||||
echo "Applying CRDs...";
|
||||
kubectl apply -f /etc/config/crd-manifest.yaml;
|
||||
mkdir -p /etc/crd;
|
||||
base64 -d /etc/config/crd-manifest.tgz.b64 | tar -xzv -C /etc/crd;
|
||||
kubectl apply -Rf /etc/crd;
|
||||
|
||||
echo "Waiting for CRDs to be recognized before finishing installation...";
|
||||
|
||||
|
@ -68,7 +70,13 @@ spec:
|
|||
mountPath: /etc/config
|
||||
restartPolicy: OnFailure
|
||||
nodeSelector: {{ include "linux-node-selector" . | nindent 8 }}
|
||||
{{- if .Values.nodeSelector }}
|
||||
{{- toYaml .Values.nodeSelector | nindent 8 }}
|
||||
{{- end }}
|
||||
tolerations: {{ include "linux-node-tolerations" . | nindent 8 }}
|
||||
{{- if .Values.tolerations }}
|
||||
{{- toYaml .Values.tolerations | nindent 8 }}
|
||||
{{- end }}
|
||||
volumes:
|
||||
- name: crd-manifest
|
||||
configMap:
|
||||
|
@ -121,14 +129,23 @@ spec:
|
|||
- /bin/sh
|
||||
- -c
|
||||
- >
|
||||
kubectl delete -f /etc/config/crd-manifest.yaml
|
||||
echo "Deleting CRDs...";
|
||||
mkdir -p /etc/crd;
|
||||
base64 -d /etc/config/crd-manifest.tgz.b64 | tar -xzv -C /etc/crd;
|
||||
kubectl delete -Rf /etc/crd;
|
||||
volumeMounts:
|
||||
- name: crd-manifest
|
||||
readOnly: true
|
||||
mountPath: /etc/config
|
||||
restartPolicy: OnFailure
|
||||
nodeSelector: {{ include "linux-node-selector" . | nindent 8 }}
|
||||
{{- if .Values.nodeSelector }}
|
||||
{{- toYaml .Values.nodeSelector | nindent 8 }}
|
||||
{{- end }}
|
||||
tolerations: {{ include "linux-node-tolerations" . | nindent 8 }}
|
||||
{{- if .Values.tolerations }}
|
||||
{{- toYaml .Values.tolerations | nindent 8 }}
|
||||
{{- end }}
|
||||
volumes:
|
||||
- name: crd-manifest
|
||||
configMap:
|
||||
|
|
|
@ -4,11 +4,5 @@ metadata:
|
|||
name: {{ .Chart.Name }}-manifest
|
||||
namespace: {{ .Release.Namespace }}
|
||||
data:
|
||||
crd-manifest.yaml: |
|
||||
{{- $currentScope := . -}}
|
||||
{{- $crds := (.Files.Glob "crd-manifest/**.yaml") -}}
|
||||
{{- range $path, $_ := $crds -}}
|
||||
{{- with $currentScope -}}
|
||||
{{ .Files.Get $path | nindent 4 }}
|
||||
---
|
||||
{{- end -}}{{- end -}}
|
||||
crd-manifest.tgz.b64:
|
||||
{{- .Files.Get "files/crd-manifest.tgz" | b64enc | indent 4 }}
|
||||
|
|
|
@ -9,3 +9,7 @@ global:
|
|||
image:
|
||||
repository: rancher/shell
|
||||
tag: v0.1.8
|
||||
|
||||
nodeSelector: {}
|
||||
|
||||
tolerations: []
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
service:
|
||||
targetPort: 9102
|
||||
port: 9102
|
|
@ -7,11 +7,16 @@
|
|||
+ catalog.rancher.io/release-name: rancher-node-exporter
|
||||
+ catalog.cattle.io/hidden: "true"
|
||||
+ catalog.cattle.io/os: linux
|
||||
apiVersion: v1
|
||||
appVersion: 1.1.2
|
||||
apiVersion: v2
|
||||
appVersion: 1.2.2
|
||||
description: A Helm chart for prometheus node-exporter
|
||||
-name: prometheus-node-exporter
|
||||
+name: rancher-node-exporter
|
||||
version: 1.18.1
|
||||
version: 2.2.0
|
||||
type: application
|
||||
home: https://github.com/prometheus/node_exporter/
|
||||
sources:
|
||||
@@ -16,3 +22,4 @@
|
||||
name: gianrubio
|
||||
- name: vsliouniaev
|
||||
- name: bismarck
|
||||
+
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
--- charts-original/templates/daemonset.yaml
|
||||
+++ charts/templates/daemonset.yaml
|
||||
@@ -35,7 +35,7 @@
|
||||
@@ -36,7 +36,7 @@
|
||||
{{- end }}
|
||||
containers:
|
||||
- name: node-exporter
|
||||
|
@ -9,7 +9,7 @@
|
|||
imagePullPolicy: {{ .Values.image.pullPolicy }}
|
||||
args:
|
||||
- --path.procfs=/host/proc
|
||||
@@ -133,18 +133,18 @@
|
||||
@@ -134,18 +134,18 @@
|
||||
affinity:
|
||||
{{ toYaml .Values.affinity | indent 8 }}
|
||||
{{- end }}
|
||||
|
|
|
@ -12,10 +12,10 @@
|
|||
image:
|
||||
- repository: quay.io/prometheus/node-exporter
|
||||
+ repository: rancher/mirrored-prometheus-node-exporter
|
||||
tag: v1.1.2
|
||||
tag: v1.2.2
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
@@ -126,6 +131,8 @@
|
||||
@@ -135,6 +140,8 @@
|
||||
tolerations:
|
||||
- effect: NoSchedule
|
||||
operator: Exists
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
url: https://github.com/prometheus-community/helm-charts.git
|
||||
subdirectory: charts/prometheus-node-exporter
|
||||
commit: a05f7b8888d6174827b815aa097d64b94f00af3e
|
||||
commit: cd13facd9c6042f7f15978024572cf297fbac6c0
|
||||
version: 100.0.0
|
||||
doNotRelease: true
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
rules:
|
||||
external:
|
||||
- seriesQuery: '{__name__=~"^some_metric_count$"}'
|
||||
resources:
|
||||
template: <<.Resource>>
|
||||
name:
|
||||
matches: ""
|
||||
as: "my_custom_metric"
|
||||
metricsQuery: sum(<<.Series>>{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
|
@ -10,6 +10,6 @@
|
|||
apiVersion: v1
|
||||
-name: prometheus-adapter
|
||||
+name: rancher-prometheus-adapter
|
||||
version: 2.14.0
|
||||
appVersion: v0.8.4
|
||||
version: 2.17.0
|
||||
appVersion: v0.9.0
|
||||
description: A Helm chart for k8s prometheus adapter
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
--- charts-original/templates/deployment.yaml
|
||||
+++ charts/templates/deployment.yaml
|
||||
@@ -40,7 +40,7 @@
|
||||
@@ -41,7 +41,7 @@
|
||||
{{- end}}
|
||||
containers:
|
||||
- name: {{ .Chart.Name }}
|
||||
|
@ -9,7 +9,7 @@
|
|||
imagePullPolicy: {{ .Values.image.pullPolicy }}
|
||||
args:
|
||||
- /adapter
|
||||
@@ -102,13 +102,17 @@
|
||||
@@ -105,8 +105,10 @@
|
||||
name: volume-serving-cert
|
||||
readOnly: true
|
||||
{{- end }}
|
||||
|
@ -22,6 +22,10 @@
|
|||
affinity:
|
||||
{{- toYaml .Values.affinity | nindent 8 }}
|
||||
priorityClassName: {{ .Values.priorityClassName }}
|
||||
@@ -114,8 +116,10 @@
|
||||
securityContext:
|
||||
{{- toYaml .Values.podSecurityContext | nindent 8 }}
|
||||
{{- end }}
|
||||
- tolerations:
|
||||
- {{- toYaml .Values.tolerations | nindent 8 }}
|
||||
+ tolerations: {{ include "linux-node-tolerations" . | nindent 8 }}
|
||||
|
|
|
@ -9,8 +9,8 @@
|
|||
affinity: {}
|
||||
|
||||
image:
|
||||
- repository: directxman12/k8s-prometheus-adapter-amd64
|
||||
+ repository: rancher/mirrored-directxman12-k8s-prometheus-adapter
|
||||
tag: v0.8.4
|
||||
- repository: k8s.gcr.io/prometheus-adapter/prometheus-adapter
|
||||
+ repository: rancher/mirrored-prometheus-adapter-prometheus-adapter
|
||||
tag: v0.9.0
|
||||
pullPolicy: IfNotPresent
|
||||
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
url: https://github.com/prometheus-community/helm-charts.git
|
||||
subdirectory: charts/prometheus-adapter
|
||||
commit: 50b719af447594abbae7beeb4d6458a19f8e9689
|
||||
commit: 843d2f3d302ca24d745abf539a86ec680ae4f8c0
|
||||
version: 100.0.0
|
||||
doNotRelease: true
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
apiVersion: v1
|
||||
version: 0.1.4
|
||||
version: 0.1.5
|
||||
appVersion: 0.1.0
|
||||
annotations:
|
||||
catalog.rancher.io/certified: rancher
|
||||
|
|
|
@ -82,7 +82,6 @@ k8s-app: {{ template "pushProxy.proxy.name" . }}
|
|||
|
||||
{{- define "pushProxy.serviceMonitor.labels" -}}
|
||||
app: {{ template "pushprox.serviceMonitor.name" . }}
|
||||
release: {{ .Release.Name | quote }}
|
||||
{{ template "pushProxy.commonLabels" . }}
|
||||
{{- end -}}
|
||||
|
||||
|
|
|
@ -1,2 +1,2 @@
|
|||
url: local
|
||||
version: 100.0.0
|
||||
version: 100.0.1
|
||||
|
|
|
@ -25,7 +25,7 @@ clients:
|
|||
port: 9796
|
||||
image:
|
||||
repository: rancher/windows_exporter-package
|
||||
tag: v0.0.2
|
||||
tag: v0.0.3
|
||||
os: "windows"
|
||||
|
||||
# Specify the IP addresses of nodes that you want to collect metrics from
|
||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue