Add Grafana dashboard for Fleet performance (#4106)

pull/4120/head
Patrick Seidensal 2024-06-21 14:33:23 +02:00 committed by GitHub
parent f82ffa9afc
commit f68ec410bd
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 910 additions and 2 deletions

View File

@ -0,0 +1,454 @@
{
"description": "Controller Runtime",
"graphTooltip": 1,
"panels": [
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "controller_runtime_active_workers{job=\"$job\", namespace=\"$namespace\"}",
"legendFormat": "{{controller}} {{instance}}"
}
],
"title": "Number of Workers in Use",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 8
},
"id": 2,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(controller_runtime_reconcile_errors_total{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, pod)",
"legendFormat": "{{instance}} {{pod}}"
}
],
"title": "Reconciliation Error Count per Controller",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 16
},
"id": 3,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(controller_runtime_reconcile_total{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, pod)",
"legendFormat": "{{instance}} {{pod}}"
}
],
"title": "Total Reconciliation Count per Controller",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 24
},
"id": 4,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "workqueue_depth{job=\"$job\", namespace=\"$namespace\"}",
"legendFormat": "{{instance}} {{pod}}"
}
],
"title": "WorkQueue Depth",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 32
},
"id": 5,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.50, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P50 {{name}}"
}
],
"title": "Seconds Items Stay in Queue (before being requested) P50",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 40
},
"id": 6,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.90, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P90 {{name}}"
}
],
"title": "Seconds Items Stay in Queue (before being requested) P90",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 48
},
"id": 7,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P99 {{name}}"
}
],
"title": "Seconds Items Stay in Queue (before being requested) P99",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 56
},
"id": 8,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(workqueue_adds_total{job=\"$job\", namespace=\"$namespace\"}[2m])) by (instance, name)",
"legendFormat": "{{name}} {{instance}}"
}
],
"title": "Work Queue Add Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 64
},
"id": 9,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(workqueue_unfinished_work_seconds{job=\"$job\", namespace=\"$namespace\"}[5m])",
"legendFormat": "{{name}} {{instance}}"
}
],
"title": "Unfinished Seconds",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 72
},
"id": 10,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.50, sum(rate(workqueue_work_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P50 {{name}}"
}
],
"title": "Seconds Processing Items from WorkQueue - 50th Percentile",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 80
},
"id": 11,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.90, sum(rate(workqueue_work_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P90 {{name}}"
}
],
"title": "Seconds Processing Items from WorkQueue - 90th Percentile",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 88
},
"id": 12,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.99, sum(rate(workqueue_work_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P99 {{name}}"
}
],
"title": "Seconds Processing Items from WorkQueue - 99th Percentile",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 96
},
"id": 13,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(workqueue_retries_total{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name)",
"legendFormat": "{{name}} {{instance}}"
}
],
"title": "Work Queue Retries Rate",
"type": "timeseries"
}
],
"schemaVersion": 39,
"templating": {
"list": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"name": "namespace",
"query": "label_values(controller_runtime_reconcile_total, namespace)",
"refresh": 2,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"name": "job",
"query": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\"}, job)",
"refresh": 2,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timezone": "utc",
"title": "Fleet / Controller-Runtime",
"uid": "fleet-controller-runtime"
}

View File

@ -16369,7 +16369,7 @@ entries:
catalog.cattle.io/upstream-version: 45.31.1
apiVersion: v2
appVersion: v0.65.1
created: "2024-06-14T12:26:55.367402441+02:00"
created: "2024-06-21T10:33:45.247640617+02:00"
dependencies:
- condition: grafana.enabled
name: grafana
@ -16463,7 +16463,7 @@ entries:
and Prometheus rules combined with documentation and scripts to provide easy
to operate end-to-end Kubernetes cluster monitoring with Prometheus using the
Prometheus Operator.
digest: 52a951a481140284364185f050ef67c2832b3f2eff9da2d283614a208604adaa
digest: c8647120ec147163717b16a83c4276bb8e3b81656e3067f25ef70e8b9b0fffd5
home: https://github.com/prometheus-operator/kube-prometheus
icon: file://assets/logos/rancher-monitoring.png
keywords:

View File

@ -0,0 +1,454 @@
{
"description": "Controller Runtime",
"graphTooltip": 1,
"panels": [
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 0
},
"id": 1,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "controller_runtime_active_workers{job=\"$job\", namespace=\"$namespace\"}",
"legendFormat": "{{controller}} {{instance}}"
}
],
"title": "Number of Workers in Use",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 8
},
"id": 2,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(controller_runtime_reconcile_errors_total{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, pod)",
"legendFormat": "{{instance}} {{pod}}"
}
],
"title": "Reconciliation Error Count per Controller",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 16
},
"id": 3,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(controller_runtime_reconcile_total{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, pod)",
"legendFormat": "{{instance}} {{pod}}"
}
],
"title": "Total Reconciliation Count per Controller",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": 0,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 24
},
"id": 4,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "workqueue_depth{job=\"$job\", namespace=\"$namespace\"}",
"legendFormat": "{{instance}} {{pod}}"
}
],
"title": "WorkQueue Depth",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 32
},
"id": 5,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.50, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P50 {{name}}"
}
],
"title": "Seconds Items Stay in Queue (before being requested) P50",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 40
},
"id": 6,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.90, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P90 {{name}}"
}
],
"title": "Seconds Items Stay in Queue (before being requested) P90",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 48
},
"id": 7,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P99 {{name}}"
}
],
"title": "Seconds Items Stay in Queue (before being requested) P99",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 56
},
"id": 8,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(workqueue_adds_total{job=\"$job\", namespace=\"$namespace\"}[2m])) by (instance, name)",
"legendFormat": "{{name}} {{instance}}"
}
],
"title": "Work Queue Add Rate",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 64
},
"id": 9,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "rate(workqueue_unfinished_work_seconds{job=\"$job\", namespace=\"$namespace\"}[5m])",
"legendFormat": "{{name}} {{instance}}"
}
],
"title": "Unfinished Seconds",
"type": "stat"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 72
},
"id": 10,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.50, sum(rate(workqueue_work_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P50 {{name}}"
}
],
"title": "Seconds Processing Items from WorkQueue - 50th Percentile",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 80
},
"id": 11,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.90, sum(rate(workqueue_work_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P90 {{name}}"
}
],
"title": "Seconds Processing Items from WorkQueue - 90th Percentile",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 88
},
"id": 12,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "histogram_quantile(0.99, sum(rate(workqueue_work_duration_seconds_bucket{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name, le))",
"legendFormat": "P99 {{name}}"
}
],
"title": "Seconds Processing Items from WorkQueue - 99th Percentile",
"type": "timeseries"
},
{
"datasource": {
"type": "datasource",
"uid": "-- Mixed --"
},
"fieldConfig": {
"defaults": {
"decimals": null,
"unit": null
}
},
"gridPos": {
"h": 8,
"w": 24,
"x": 0,
"y": 96
},
"id": 13,
"pluginVersion": "v11.0.0",
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"expr": "sum(rate(workqueue_retries_total{job=\"$job\", namespace=\"$namespace\"}[5m])) by (instance, name)",
"legendFormat": "{{name}} {{instance}}"
}
],
"title": "Work Queue Retries Rate",
"type": "timeseries"
}
],
"schemaVersion": 39,
"templating": {
"list": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"name": "namespace",
"query": "label_values(controller_runtime_reconcile_total, namespace)",
"refresh": 2,
"type": "query"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"name": "job",
"query": "label_values(controller_runtime_reconcile_total{namespace=~\"$namespace\"}, job)",
"refresh": 2,
"type": "query"
}
]
},
"time": {
"from": "now-1h",
"to": "now"
},
"timezone": "utc",
"title": "Fleet / Controller-Runtime",
"uid": "fleet-controller-runtime"
}