From 4238db1e30a43c45d5d28ece5975eccac7b28036 Mon Sep 17 00:00:00 2001 From: Hasan Khan Date: Sat, 27 Jun 2026 18:29:30 -0700 Subject: [PATCH 1/3] feat(helm): package Grafana dashboards Signed-off-by: Hasan Khan --- helm/PREREQUISITES.md | 15 + helm/README.md | 43 +++ helm/dashboards/nico-api-performance.json | 256 ++++++++++++++++ helm/dashboards/nico-lifecycle.json | 243 +++++++++++++++ helm/dashboards/nico-overview.json | 287 ++++++++++++++++++ helm/templates/_helpers.tpl | 34 +++ helm/templates/grafana-dashboards.yaml | 19 ++ .../tests/fixtures/grafana-custom-values.yaml | 10 + helm/tests/grafana_dashboards_test.yaml | 66 ++++ helm/values.yaml | 21 ++ 10 files changed, 994 insertions(+) create mode 100644 helm/dashboards/nico-api-performance.json create mode 100644 helm/dashboards/nico-lifecycle.json create mode 100644 helm/dashboards/nico-overview.json create mode 100644 helm/templates/_helpers.tpl create mode 100644 helm/templates/grafana-dashboards.yaml create mode 100644 helm/tests/fixtures/grafana-custom-values.yaml create mode 100644 helm/tests/grafana_dashboards_test.yaml diff --git a/helm/PREREQUISITES.md b/helm/PREREQUISITES.md index ebc8e4960d..41b9c8dce4 100644 --- a/helm/PREREQUISITES.md +++ b/helm/PREREQUISITES.md @@ -41,6 +41,21 @@ If you want Prometheus metrics collection, install the [Prometheus Operator](htt - **nico-hardware-health** also exposes an optional `telemetryServiceMonitor` (disabled by default) that scrapes `/telemetry` for per-machine sensor gauge data (temperature, power, fans, etc.) from the Prometheus sink. Use `serviceMonitor` for `/metrics` operational metrics only. - NICo functions normally without the Prometheus Operator installed. +### Grafana Dashboard Sidecar (Optional) + +The umbrella chart can install its packaged Grafana dashboards as a ConfigMap +when `grafanaDashboards.enabled=true`. This requires an existing Grafana +installation with a dashboard sidecar watching the ConfigMap namespace and +labels. 
The default `grafana_dashboard: "1"` label matches the common +`kube-prometheus-stack` selector. To place dashboards in the configured NICo +folder, the sidecar must read the `grafana_folder` annotation (or the chart's +`grafanaDashboards.folderAnnotation` value and the sidecar's `folderAnnotation` setting must both name the same alternative key). + +Grafana is not installed by the NICo chart. If `grafanaDashboards.namespace` +targets a namespace other than the NICo release namespace, create that +namespace first and configure the sidecar to search it. See +[`README.md`](./README.md#grafana-dashboards) for values and namespace examples. + --- ## 2. PostgreSQL Database diff --git a/helm/README.md b/helm/README.md index 413a648cf7..57166a7758 100644 --- a/helm/README.md +++ b/helm/README.md @@ -74,6 +74,49 @@ Top-level `global:` values are automatically passed to all subcharts. | `global.spiffe.trustDomain` | SPIFFE trust domain for mTLS | `nico.local` | | `global.labels` | Common labels applied to all resources | See `values.yaml` | +### Grafana Dashboards + +The chart packages three dashboards built from NICo's exported Prometheus +metrics: a site overview, object lifecycle diagnostics, and API performance. +They are disabled by default because this chart does not install Grafana. +The source JSON files live in [`dashboards/`](./dashboards/) and can also be +imported into Grafana directly. + +To expose the dashboards to a Grafana dashboard sidecar in the release +namespace: + +```yaml +grafanaDashboards: + enabled: true +``` + +The default `grafana_dashboard: "1"` label matches the dashboard-sidecar +selector used by `kube-prometheus-stack`. The chart also adds the conventional +`grafana_folder: NICo` annotation; configure the Grafana sidecar's +`folderAnnotation` setting if it does not already read that key. 
If Grafana +watches a different namespace or selector, configure them explicitly: + +```yaml +grafanaDashboards: + enabled: true + namespace: monitoring + folder: Infrastructure/NICo + folderAnnotation: grafana_folder + labels: + grafana_dashboard: "1" + annotations: {} +``` + +The target namespace must exist before Helm runs, and the Helm identity must be +allowed to create ConfigMaps there. The Grafana sidecar must also watch that +namespace; for `kube-prometheus-stack`, configure +`grafana.sidecar.dashboards.searchNamespace` accordingly. + +Each dashboard provides a Prometheus data-source selector, a NICo scrape-job +selector, and an editable metric-prefix variable. The prefix defaults to +`carbide`, which is the prefix currently emitted by NICo. Set it to `nico` (or +another configured value) when using the `alt_metric_prefix` site setting. + ### Subchart Enable/Disable Flags Each subchart can be independently enabled or disabled. All core NICo services are enabled by default. Infrastructure services (`unbound`) that may already be provided by the environment are disabled by default. 
diff --git a/helm/dashboards/nico-api-performance.json b/helm/dashboards/nico-api-performance.json new file mode 100644 index 0000000000..6ad0e0ae7a --- /dev/null +++ b/helm/dashboards/nico-api-performance.json @@ -0,0 +1,256 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Request summary", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [ { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "READY" } }, "type": "value" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, + "id": 2, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_api_ready{job=~\"$job\"})", "instant": true, "legendFormat": "NICo API", "range": false, "refId": "A" } + ], + "title": "API status", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "reqps" }, 
"overrides": [] }, + "gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 }, + "id": 3, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_api_grpc_server_duration_milliseconds_count{job=~\"$job\"}[$__rate_interval]))", "instant": true, "legendFormat": "Requests / s", "range": false, "refId": "A" } + ], + "title": "Request rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 250 }, { "color": "red", "value": 1000 } ] }, "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 }, + "id": 4, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum by (le) (rate(${metric_prefix}_api_grpc_server_duration_milliseconds_bucket{job=~\"$job\"}[$__rate_interval])))", "instant": true, "legendFormat": "p95", "range": false, "refId": "A" } + ], + "title": "Request latency p95", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "gRPC responses whose status code is neither Ok nor Unknown.", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": 
"red", "value": 0.001 } ] }, "unit": "reqps" }, "overrides": [] }, + "gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 }, + "id": 5, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_api_grpc_server_duration_milliseconds_count{job=~\"$job\",grpc_status_code!=\"Ok\",grpc_status_code!=\"Unknown\"}[$__rate_interval]))", "instant": true, "legendFormat": "Errors / s", "range": false, "refId": "A" } + ], + "title": "gRPC error rate", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "reqps" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 5 }, + "id": 6, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum by (grpc_method, grpc_status_code) 
(rate(${metric_prefix}_api_grpc_server_duration_milliseconds_count{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "{{grpc_method}} — {{grpc_status_code}}", "range": true, "refId": "A" } + ], + "title": "Requests by method and status", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 5 }, + "id": 7, + "options": { "legend": { "calcs": [ "lastNotNull", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.50, sum by (le) (rate(${metric_prefix}_api_grpc_server_duration_milliseconds_bucket{job=~\"$job\"}[$__rate_interval])))", "instant": false, "legendFormat": "p50", "range": true, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum by (le) (rate(${metric_prefix}_api_grpc_server_duration_milliseconds_bucket{job=~\"$job\"}[$__rate_interval])))", "instant": false, "legendFormat": "p95", "range": true, "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": 
"${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.99, sum by (le) (rate(${metric_prefix}_api_grpc_server_duration_milliseconds_bucket{job=~\"$job\"}[$__rate_interval])))", "instant": false, "legendFormat": "p99", "range": true, "refId": "C" } + ], + "title": "Request latency", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, + "id": 8, + "panels": [], + "title": "Database", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Database queries attributed to gRPC request spans.", + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "ops" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 15 }, + "id": 9, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum by (grpc_method) (rate(${metric_prefix}_api_db_queries_total{job=~\"$job\",operation=\"grpc\"}[$__rate_interval]))", "instant": false, "legendFormat": "{{grpc_method}}", "range": true, "refId": "A" } + ], + "title": "Database queries / second", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" 
}, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 15 }, + "id": 10, + "options": { "legend": { "calcs": [ "lastNotNull", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum by (le, grpc_method) (rate(${metric_prefix}_api_db_span_query_time_milliseconds_bucket{job=~\"$job\",operation=\"grpc\"}[$__rate_interval])))", "instant": false, "legendFormat": "{{grpc_method}}", "range": true, "refId": "A" } + ], + "title": "Database time per request p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 12, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, 
"thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 7, "w": 24, "x": 0, "y": 24 }, + "id": 11, + "options": { "legend": { "calcs": [ "lastNotNull" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_db_pool_total_conns{job=~\"$job\"})", "instant": false, "legendFormat": "Total connections", "range": true, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_db_pool_idle_conns{job=~\"$job\"})", "instant": false, "legendFormat": "Idle connections", "range": true, "refId": "B" } + ], + "title": "Database connection pool", + "type": "timeseries" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 31 }, + "id": 12, + "panels": [], + "title": "Vault and transport", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "ops" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 8, "x": 0, "y": 32 }, + "id": 13, + "options": { 
"legend": { "calcs": [ "lastNotNull" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_api_vault_requests_attempted_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "Attempted / s", "range": true, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_api_vault_requests_failed_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "Failed / s", "range": true, "refId": "B" } + ], + "title": "Vault requests", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "yellow", "value": 300 }, { "color": "green", "value": 900 } ] }, "unit": "s" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 4, "x": 8, "y": 32 }, + "id": 14, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "min(${metric_prefix}_api_vault_token_time_until_refresh_seconds{job=~\"$job\"})", "instant": true, "legendFormat": "Until refresh", "range": false, "refId": "A" } + ], + "title": "Vault token refresh", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", 
"axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 6, "x": 12, "y": 32 }, + "id": 15, + "options": { "legend": { "calcs": [ "lastNotNull", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum by (le) (rate(${metric_prefix}_api_vault_request_duration_milliseconds_bucket{job=~\"$job\"}[$__rate_interval])))", "instant": false, "legendFormat": "p95", "range": true, "refId": "A" } + ], + "title": "Vault request latency", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "ops" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 6, "x": 
18, "y": 32 }, + "id": 16, + "options": { "legend": { "calcs": [ "lastNotNull" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_api_tls_connection_attempted_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "Attempted / s", "range": true, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_api_tls_connection_success_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "Succeeded / s", "range": true, "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_api_tls_connection_fail_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "Failed / s", "range": true, "refId": "C" } + ], + "title": "TLS connections", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ "nico", "api", "performance" ], + "templating": { + "list": [ + { + "current": {}, + "label": "Prometheus", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { "selected": true, "text": "carbide", "value": "carbide" }, + "description": "Prefix configured for NICo core metrics.", + "label": "Metric prefix", + "name": "metric_prefix", + "options": [ { "selected": true, "text": "carbide", "value": "carbide" } ], + "query": "carbide", + "type": "textbox" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(${metric_prefix}_api_ready, job)", + "includeAll": true, + "label": "NICo scrape job", + 
"multi": true, + "name": "job", + "options": [], + "query": { "query": "label_values(${metric_prefix}_api_ready, job)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "NICo / API Performance", + "uid": "nico-api-performance", + "version": 1 +} diff --git a/helm/dashboards/nico-lifecycle.json b/helm/dashboards/nico-lifecycle.json new file mode 100644 index 0000000000..fe4b4c9fd7 --- /dev/null +++ b/helm/dashboards/nico-lifecycle.json @@ -0,0 +1,243 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Current $object_type state", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Total objects in the selected state controller's latest fresh snapshot.", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 4, "w": 8, "x": 0, "y": 1 }, + "id": 2, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_${object_type}_total{job=~\"$job\",fresh=\"true\"})", "instant": true, 
"legendFormat": "Total", "range": false, "refId": "A" } + ], + "title": "Objects", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Objects that have remained in their current state beyond its configured SLA.", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 4, "w": 8, "x": 8, "y": 1 }, + "id": 3, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(max by (state, substate) (${metric_prefix}_${object_type}_per_state_above_sla{job=~\"$job\",fresh=\"true\"}))", "instant": true, "legendFormat": "Above SLA", "range": false, "refId": "A" } + ], + "title": "Objects above SLA", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Objects whose most recent state-handler run failed.", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 4, "w": 8, "x": 16, "y": 1 }, + "id": 4, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(max by (state, substate) 
(${metric_prefix}_${object_type}_with_state_handling_errors_per_state{job=~\"$job\",fresh=\"true\",error=\"any\"}))", "instant": true, "legendFormat": "Handler errors", "range": false, "refId": "A" } + ], + "title": "Current handling errors", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 5 }, + "id": 5, + "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max by (state, substate) (${metric_prefix}_${object_type}_per_state{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "{{state}} / {{substate}}", "range": false, "refId": "A" } + ], + "title": "Objects by state", + "type": "bargauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Only states with objects above SLA are shown.", + "fieldConfig": { "defaults": { "color": { "mode": "continuous-RdYlGr" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 5 }, + "id": 6, + "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": 
"auto", "valueMode": "color" }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max by (state, substate) (${metric_prefix}_${object_type}_per_state_above_sla{job=~\"$job\",fresh=\"true\"} > 0)", "instant": true, "legendFormat": "{{state}} / {{substate}}", "range": false, "refId": "A" } + ], + "title": "Above SLA by state", + "type": "bargauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "The latest handler failures by state and concrete error type. The aggregate error=any series is excluded.", + "fieldConfig": { "defaults": { "color": { "mode": "continuous-RdYlGr" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 14 }, + "id": 7, + "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max by (state, substate, error) (${metric_prefix}_${object_type}_with_state_handling_errors_per_state{job=~\"$job\",fresh=\"true\",error!=\"any\"} > 0)", "instant": true, "legendFormat": "{{state}} / {{substate}} — {{error}}", "range": false, "refId": "A" } + ], + "title": "State-handler errors", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, + "id": 8, + "panels": [], + "title": "Transitions and latency", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": 
false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 12, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "ops" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 23 }, + "id": 9, + "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum by (state, substate) (rate(${metric_prefix}_${object_type}_state_entered_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "{{state}} / {{substate}}", "range": true, "refId": "A" } + ], + "title": "State entries / second", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "ms" }, "overrides": [] }, + "gridPos": 
{ "h": 9, "w": 12, "x": 12, "y": 23 }, + "id": 10, + "options": { "legend": { "calcs": [ "lastNotNull", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum by (le, state, substate) (rate(${metric_prefix}_${object_type}_handler_latency_in_state_milliseconds_bucket{job=~\"$job\"}[$__rate_interval])))", "instant": false, "legendFormat": "{{state}} / {{substate}}", "range": true, "refId": "A" } + ], + "title": "State-handler latency p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Time objects spent in a state before transitioning. A series is emitted only when a transition occurs.", + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "s" }, "overrides": [] }, + "gridPos": { "h": 9, "w": 12, "x": 0, "y": 32 }, + "id": 11, + "options": { "legend": { "calcs": [ "lastNotNull", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum by (le, state, substate, 
next_state, next_substate) (rate(${metric_prefix}_${object_type}_time_in_state_seconds_bucket{job=~\"$job\"}[$__rate_interval])))", "instant": false, "legendFormat": "{{state}}/{{substate}} → {{next_state}}/{{next_substate}}", "range": true, "refId": "A" } + ], + "title": "Time in state before transition p95", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] } }, "overrides": [] }, + "gridPos": { "h": 9, "w": 12, "x": 12, "y": 32 }, + "id": 12, + "options": { "legend": { "calcs": [ "lastNotNull" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_${object_type}_object_tasks_enqueued_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "Enqueued / s", "range": true, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_${object_type}_object_tasks_dispatched_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "Dispatched / s", "range": true, "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, 
"editorMode": "code", "expr": "sum(rate(${metric_prefix}_${object_type}_object_tasks_completed_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "Completed / s", "range": true, "refId": "C" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(rate(${metric_prefix}_${object_type}_object_tasks_requeued_total{job=~\"$job\"}[$__rate_interval]))", "instant": false, "legendFormat": "Requeued / s", "range": true, "refId": "D" } + ], + "title": "Controller work rate", + "type": "timeseries" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "lineInterpolation": "linear", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "ms" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 24, "x": 0, "y": 41 }, + "id": 13, + "options": { "legend": { "calcs": [ "lastNotNull", "max" ], "displayMode": "table", "placement": "bottom", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "desc" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum by (le) (rate(${metric_prefix}_${object_type}_iteration_latency_milliseconds_bucket{job=~\"$job\"}[$__rate_interval])))", "instant": false, "legendFormat": "Processor iteration p95", "range": true, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": 
"${datasource}" }, "editorMode": "code", "expr": "histogram_quantile(0.95, sum by (le) (rate(${metric_prefix}_${object_type}_enqueuer_iteration_latency_milliseconds_bucket{job=~\"$job\"}[$__rate_interval])))", "instant": false, "legendFormat": "Enqueuer iteration p95", "range": true, "refId": "B" } + ], + "title": "Controller iteration latency p95", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ "nico", "lifecycle", "state-controller" ], + "templating": { + "list": [ + { + "current": {}, + "label": "Prometheus", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { "selected": true, "text": "carbide", "value": "carbide" }, + "description": "Prefix configured for NICo core metrics.", + "label": "Metric prefix", + "name": "metric_prefix", + "options": [ { "selected": true, "text": "carbide", "value": "carbide" } ], + "query": "carbide", + "type": "textbox" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(${metric_prefix}_api_ready, job)", + "includeAll": true, + "label": "NICo scrape job", + "multi": true, + "name": "job", + "options": [], + "query": { "query": "label_values(${metric_prefix}_api_ready, job)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + }, + { + "current": { "selected": true, "text": "machines", "value": "machines" }, + "description": "State-controller object family.", + "label": "Object type", + "name": "object_type", + "options": [ + { "selected": true, "text": "machines", "value": "machines" }, + { "selected": false, "text": "network_segments", "value": "network_segments" }, + { "selected": false, "text": "vpc_prefixes", "value": "vpc_prefixes" }, + { "selected": false, "text": "ib_partitions", 
"value": "ib_partitions" }, + { "selected": false, "text": "switches", "value": "switches" }, + { "selected": false, "text": "racks", "value": "racks" }, + { "selected": false, "text": "power_shelves", "value": "power_shelves" } + ], + "query": "machines,network_segments,vpc_prefixes,ib_partitions,switches,racks,power_shelves", + "type": "custom" + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "NICo / Object Lifecycle", + "uid": "nico-lifecycle", + "version": 1 +} diff --git a/helm/dashboards/nico-overview.json b/helm/dashboards/nico-overview.json new file mode 100644 index 0000000000..758fff538f --- /dev/null +++ b/helm/dashboards/nico-overview.json @@ -0,0 +1,287 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { "type": "grafana", "uid": "-- Grafana --" }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "graphTooltip": 1, + "links": [], + "panels": [ + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, + "id": 1, + "panels": [], + "title": "Service and site health", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "1 means that the NICo API process is serving metrics.", + "fieldConfig": { + "defaults": { + "color": { "mode": "thresholds" }, + "mappings": [ + { "options": { "0": { "color": "red", "text": "DOWN" }, "1": { "color": "green", "text": "READY" } }, "type": "value" } + ], + "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } + }, + "overrides": [] + }, + "gridPos": { "h": 4, "w": 4, "x": 0, "y": 1 }, + "id": 2, + "options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, 
"textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_api_ready{job=~\"$job\"})", "instant": true, "legendFormat": "NICo API", "range": false, "refId": "A" } + ], + "title": "API status", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Build version and Git SHA reported by the running NICo API.", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] } }, "overrides": [] }, + "gridPos": { "h": 4, "w": 8, "x": 4, "y": 1 }, + "id": 3, + "options": { "colorMode": "none", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "name", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max by (build_version, git_sha) (${metric_prefix}_api_version{job=~\"$job\"})", "instant": true, "legendFormat": "{{build_version}} ({{git_sha}})", "range": false, "refId": "A" } + ], + "title": "Running version", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Managed hosts with one or more active health alerts, summed across tenant-assignment states.", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] }, + "gridPos": { "h": 4, "w": 4, "x": 12, "y": 1 }, + "id": 4, + "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", 
"wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum(max by (healthy, in_use) (${metric_prefix}_hosts_health_status_count{job=~\"$job\",fresh=\"true\",healthy=\"false\"}))", "instant": true, "legendFormat": "Unhealthy hosts", "range": false, "refId": "A" } + ], + "title": "Unhealthy hosts", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "DPUs with a recent health report (less than five minutes old).", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } }, "overrides": [] }, + "gridPos": { "h": 4, "w": 4, "x": 16, "y": 1 }, + "id": 5, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_dpus_up_count{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Online DPUs", "range": false, "refId": "A" } + ], + "title": "DPUs online", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "DPUs whose most recent report declared them healthy.", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "green", "value": 1 } ] } }, "overrides": [] }, + "gridPos": { "h": 4, "w": 4, "x": 20, "y": 1 }, + "id": 6, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, 
"textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_dpus_healthy_count{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Healthy DPUs", "range": false, "refId": "A" } + ], + "title": "DPUs healthy", + "type": "stat" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, + "id": 7, + "panels": [], + "title": "Capacity", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 5, "w": 7, "x": 0, "y": 6 }, + "id": 8, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_machines_total{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Managed", "range": false, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_hosts_usable_count{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Usable", "range": false, "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_hosts_in_use_count{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "In use", "range": false, "refId": "C" } + ], + "title": "Host capacity", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { 
"color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 5, "w": 7, "x": 7, "y": 6 }, + "id": 9, + "options": { "colorMode": "value", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "textMode": "auto", "wideLayout": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_gpus_total_count{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Managed", "range": false, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_gpus_usable_count{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Usable", "range": false, "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_gpus_in_use_count{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "In use", "range": false, "refId": "C" } + ], + "title": "GPU capacity", + "type": "stat" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Free and allocated values in each NICo resource pool.", + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 5, "w": 10, "x": 14, "y": 6 }, + "id": 10, + "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" 
}, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max by (pool) (${metric_prefix}_resourcepool_free_count{job=~\"$job\"})", "instant": true, "legendFormat": "{{pool}} free", "range": false, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max by (pool) (${metric_prefix}_resourcepool_used_count{job=~\"$job\"})", "instant": true, "legendFormat": "{{pool}} used", "range": false, "refId": "B" } + ], + "title": "Resource pools", + "type": "bargauge" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, + "id": 11, + "panels": [], + "title": "Tenancy and inventory", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Allocated hosts and GPUs by tenant organization ID.", + "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "auto", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] } }, "overrides": [] }, + "gridPos": { "h": 8, "w": 10, "x": 0, "y": 12 }, + "id": 12, + "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max by (tenant_org_id) (${metric_prefix}_hosts_in_use_by_tenant_count{job=~\"$job\",fresh=\"true\"})", "format": "table", "instant": true, "legendFormat": "Hosts", "range": false, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max by (tenant_org_id) (${metric_prefix}_gpus_in_use_by_tenant_count{job=~\"$job\",fresh=\"true\"})", "format": "table", "instant": true, "legendFormat": "GPUs", "range": false, "refId": "B" } + ], + "title": "Tenant 
allocations", + "type": "table" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 7, "x": 10, "y": 12 }, + "id": 13, + "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_machines_total{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Hosts", "range": false, "refId": "A" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_network_segments_total{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Network segments", "range": false, "refId": "B" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_vpc_prefixes_total{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "VPC prefixes", "range": false, "refId": "C" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_ib_partitions_total{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "IB partitions", "range": false, "refId": "D" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_switches_total{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Switches", "range": false, "refId": "E" }, + { "datasource": { "type": "prometheus", 
"uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_racks_total{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Racks", "range": false, "refId": "F" }, + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max(${metric_prefix}_power_shelves_total{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "Power shelves", "range": false, "refId": "G" } + ], + "title": "Managed entities", + "type": "bargauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Current managed-host lifecycle distribution. Select the detailed lifecycle dashboard to inspect SLA and handling errors.", + "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "hideFrom": { "legend": false, "tooltip": false, "viz": false } }, "mappings": [] }, "overrides": [] }, + "gridPos": { "h": 8, "w": 7, "x": 17, "y": 12 }, + "id": 14, + "options": { "displayLabels": [ "name", "percent" ], "legend": { "displayMode": "list", "placement": "right", "showLegend": true }, "pieType": "pie", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "tooltip": { "mode": "single", "sort": "none" } }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "max by (state, substate) (${metric_prefix}_machines_per_state{job=~\"$job\",fresh=\"true\"})", "instant": true, "legendFormat": "{{state}} / {{substate}}", "range": false, "refId": "A" } + ], + "title": "Host lifecycle states", + "type": "piechart" + }, + { + "collapsed": false, + "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }, + "id": 15, + "panels": [], + "title": "Health alerts", + "type": "row" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Managed hosts reporting each health probe alert. 
Counts are combined across tenant-assignment states.", + "fieldConfig": { "defaults": { "color": { "mode": "continuous-GrYlRd" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, + "id": 16, + "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": "code", "expr": "sum by (probe_id, probe_target) (max by (probe_id, probe_target, in_use) (${metric_prefix}_hosts_unhealthy_by_probe_id_count{job=~\"$job\",fresh=\"true\"}))", "instant": true, "legendFormat": "{{probe_id}} / {{probe_target}}", "range": false, "refId": "A" } + ], + "title": "Unhealthy hosts by probe", + "type": "bargauge" + }, + { + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "description": "Managed hosts with active health classifications, combined across tenant-assignment states.", + "fieldConfig": { "defaults": { "color": { "mode": "continuous-GrYlRd" }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] }, "unit": "short" }, "overrides": [] }, + "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 }, + "id": 17, + "options": { "displayMode": "gradient", "maxVizHeight": 300, "minVizHeight": 16, "minVizWidth": 8, "namePlacement": "auto", "orientation": "horizontal", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showUnfilled": true, "sizing": "auto", "valueMode": "color" }, + "targets": [ + { "datasource": { "type": "prometheus", "uid": "${datasource}" }, "editorMode": 
"code", "expr": "sum by (classification) (max by (classification, in_use) (${metric_prefix}_hosts_unhealthy_by_classification_count{job=~\"$job\",fresh=\"true\"}))", "instant": true, "legendFormat": "{{classification}}", "range": false, "refId": "A" } + ], + "title": "Unhealthy hosts by classification", + "type": "bargauge" + } + ], + "refresh": "30s", + "schemaVersion": 39, + "tags": [ "nico", "infrastructure", "overview" ], + "templating": { + "list": [ + { + "current": {}, + "label": "Prometheus", + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "current": { "selected": true, "text": "carbide", "value": "carbide" }, + "description": "Prefix configured for NICo core metrics (carbide by default; alt_metric_prefix can add another prefix).", + "label": "Metric prefix", + "name": "metric_prefix", + "options": [ { "selected": true, "text": "carbide", "value": "carbide" } ], + "query": "carbide", + "type": "textbox" + }, + { + "allValue": ".*", + "current": { "selected": true, "text": "All", "value": "$__all" }, + "datasource": { "type": "prometheus", "uid": "${datasource}" }, + "definition": "label_values(${metric_prefix}_api_ready, job)", + "includeAll": true, + "label": "NICo scrape job", + "multi": true, + "name": "job", + "options": [], + "query": { "query": "label_values(${metric_prefix}_api_ready, job)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, + "refresh": 1, + "regex": "", + "sort": 1, + "type": "query" + } + ] + }, + "time": { "from": "now-6h", "to": "now" }, + "timepicker": {}, + "timezone": "browser", + "title": "NICo / Site Overview", + "uid": "nico-overview", + "version": 1 +} diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl new file mode 100644 index 0000000000..93407abf70 --- /dev/null +++ b/helm/templates/_helpers.tpl @@ -0,0 +1,34 @@ +{{/* +Create the chart name and version as used by the chart label. 
+*/}} +{{- define "nico.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Create a release-scoped name for the packaged Grafana dashboards. +*/}} +{{- define "nico.grafanaDashboardsName" -}} +{{- printf "%s-grafana-dashboards" .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- end -}} + +{{/* +Labels for the Grafana dashboard ConfigMap. User-provided global and dashboard +labels override the chart defaults, in that order. +*/}} +{{- define "nico.grafanaDashboardLabels" -}} +{{- $labels := dict + "helm.sh/chart" (include "nico.chart" .) + "app.kubernetes.io/name" .Chart.Name + "app.kubernetes.io/instance" .Release.Name + "app.kubernetes.io/component" "observability" + "app.kubernetes.io/managed-by" .Release.Service +-}} +{{- with .Values.global.labels }} +{{- $labels = mergeOverwrite $labels . }} +{{- end }} +{{- with .Values.grafanaDashboards.labels }} +{{- $labels = mergeOverwrite $labels . }} +{{- end }} +{{- toYaml $labels -}} +{{- end -}} diff --git a/helm/templates/grafana-dashboards.yaml b/helm/templates/grafana-dashboards.yaml new file mode 100644 index 0000000000..436d1e219e --- /dev/null +++ b/helm/templates/grafana-dashboards.yaml @@ -0,0 +1,19 @@ +{{- if .Values.grafanaDashboards.enabled }} +{{- $annotations := deepCopy (.Values.grafanaDashboards.annotations | default dict) }} +{{- if and .Values.grafanaDashboards.folderAnnotation .Values.grafanaDashboards.folder }} +{{- $_ := set $annotations .Values.grafanaDashboards.folderAnnotation .Values.grafanaDashboards.folder }} +{{- end }} +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ include "nico.grafanaDashboardsName" . }} + namespace: {{ .Values.grafanaDashboards.namespace | default .Release.Namespace | quote }} + labels: + {{- include "nico.grafanaDashboardLabels" . | nindent 4 }} + {{- with $annotations }} + annotations: + {{- toYaml . 
| nindent 4 }} + {{- end }} +data: + {{- (.Files.Glob "dashboards/*.json").AsConfig | nindent 2 }} +{{- end }} diff --git a/helm/tests/fixtures/grafana-custom-values.yaml b/helm/tests/fixtures/grafana-custom-values.yaml new file mode 100644 index 0000000000..0523f891fe --- /dev/null +++ b/helm/tests/fixtures/grafana-custom-values.yaml @@ -0,0 +1,10 @@ +grafanaDashboards: + enabled: true + namespace: monitoring + folder: Infrastructure/NICo + folderAnnotation: sidecar.example.com/folder + labels: + grafana_dashboard: null + sidecar.example.com/dashboard: "true" + annotations: + example.com/owner: sre diff --git a/helm/tests/grafana_dashboards_test.yaml b/helm/tests/grafana_dashboards_test.yaml new file mode 100644 index 0000000000..991f4a4815 --- /dev/null +++ b/helm/tests/grafana_dashboards_test.yaml @@ -0,0 +1,66 @@ +suite: packaged Grafana dashboards +templates: + - templates/grafana-dashboards.yaml +release: + name: test-nico + namespace: nico-system +tests: + - it: does not install dashboards by default + asserts: + - hasDocuments: + count: 0 + + - it: packages all dashboards for the default Grafana sidecar + set: + grafanaDashboards.enabled: true + asserts: + - isKind: + of: ConfigMap + - equal: + path: metadata.name + value: test-nico-grafana-dashboards + - equal: + path: metadata.namespace + value: nico-system + - equal: + path: metadata.labels["grafana_dashboard"] + value: "1" + - equal: + path: metadata.labels["app.kubernetes.io/component"] + value: observability + - equal: + path: metadata.annotations["grafana_folder"] + value: NICo + - isNotEmpty: + path: data["nico-overview.json"] + - isNotEmpty: + path: data["nico-lifecycle.json"] + - isNotEmpty: + path: data["nico-api-performance.json"] + + - it: supports a Grafana namespace and sidecar-specific metadata + values: + - fixtures/grafana-custom-values.yaml + asserts: + - equal: + path: metadata.namespace + value: monitoring + - equal: + path: metadata.labels["sidecar.example.com/dashboard"] + value: 
"true" + - notExists: + path: metadata.labels["grafana_dashboard"] + - equal: + path: metadata.annotations["sidecar.example.com/folder"] + value: Infrastructure/NICo + - equal: + path: metadata.annotations["example.com/owner"] + value: sre + + - it: can omit the folder annotation + set: + grafanaDashboards.enabled: true + grafanaDashboards.folderAnnotation: "" + asserts: + - notExists: + path: metadata.annotations diff --git a/helm/values.yaml b/helm/values.yaml index e3ab5cdc46..92e7a81bdc 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -52,6 +52,27 @@ global: app.kubernetes.io/managed-by: Helm app.kubernetes.io/part-of: site-controller +## --------------------------------------------------------------------------- +## Grafana dashboards (optional) +## --------------------------------------------------------------------------- +## Packages the NICo dashboards in a ConfigMap for discovery by a Grafana +## dashboard sidecar, such as the one included with kube-prometheus-stack. +## Grafana itself is not installed by this chart. +grafanaDashboards: + enabled: false + ## Namespace where the dashboard ConfigMap is installed. An empty value uses + ## the Helm release namespace. A non-default namespace must already exist. + namespace: "" + ## Folder metadata understood by common Grafana dashboard sidecars. Set + ## folderAnnotation or folder to an empty string to omit the annotation. + folder: NICo + folderAnnotation: grafana_folder + ## The defaults match kube-prometheus-stack's dashboard sidecar selector. + ## Add or replace labels here if the local sidecar uses another selector. + labels: + grafana_dashboard: "1" + annotations: {} + ## --------------------------------------------------------------------------- ## nico-api — Core API server (gRPC + REST) ## Manages machines, provisioning, networking, firmware updates, and web UI. 
From e578288cf8658ec6f0572559c495b4d826ff0c1d Mon Sep 17 00:00:00 2001 From: Hasan Khan Date: Sat, 27 Jun 2026 18:36:54 -0700 Subject: [PATCH 2/3] fix(helm): scope dashboard ConfigMap name Signed-off-by: Hasan Khan --- helm/templates/_helpers.tpl | 6 ++++-- helm/tests/grafana_dashboards_test.yaml | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl index 93407abf70..750b6e1b9b 100644 --- a/helm/templates/_helpers.tpl +++ b/helm/templates/_helpers.tpl @@ -6,10 +6,12 @@ Create the chart name and version as used by the chart label. {{- end -}} {{/* -Create a release-scoped name for the packaged Grafana dashboards. +Create a namespace- and release-scoped name for the packaged Grafana +dashboards. The ConfigMap may be installed into a shared monitoring namespace, +where release name alone is not unique; capped at the 253-char ConfigMap limit. */}} {{- define "nico.grafanaDashboardsName" -}} -{{- printf "%s-grafana-dashboards" .Release.Name | trunc 63 | trimSuffix "-" -}} +{{- printf "%s-%s-grafana-dashboards" .Release.Namespace .Release.Name | trunc 253 | trimSuffix "-" -}} {{- end -}} {{/* diff --git a/helm/tests/grafana_dashboards_test.yaml b/helm/tests/grafana_dashboards_test.yaml index 991f4a4815..f8972f7caf 100644 --- a/helm/tests/grafana_dashboards_test.yaml +++ b/helm/tests/grafana_dashboards_test.yaml @@ -18,7 +18,7 @@ tests: of: ConfigMap - equal: path: metadata.name - value: test-nico-grafana-dashboards + value: nico-system-test-nico-grafana-dashboards - equal: path: metadata.namespace value: nico-system From 4d0f7ea3c33ed381199f814f831127195d09b97c Mon Sep 17 00:00:00 2001 From: Hasan Khan Date: Sat, 27 Jun 2026 19:08:06 -0700 Subject: [PATCH 3/3] docs(metrics): sync Site Explorer status metric Signed-off-by: Hasan Khan --- docs/observability/core_metrics.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/observability/core_metrics.md b/docs/observability/core_metrics.md index 
a6a36ac2ad..0c3a2dfe8b 100644 --- a/docs/observability/core_metrics.md +++ b/docs/observability/core_metrics.md @@ -123,6 +123,7 @@ This file contains a list of metrics exported by NVIDIA Infra Controller (NICo). carbide_site_explorer_created_power_shelves_countgaugeThe amount of Power Shelves that had been created by Site Explorer after being identified carbide_site_explorer_enabledgaugeWhether site-explorer is enabled (1) or paused (0) carbide_site_explorer_iteration_latency_millisecondshistogramThe time it took to perform one site explorer iteration +carbide_site_explorer_last_run_statusgaugeThe status of the latest Site Explorer run carbide_site_explorer_phase_latency_millisecondshistogramThe time it took to perform one site explorer iteration phase carbide_site_explorer_update_explored_endpoints_countgaugeCounts from the last update_explored_endpoints phase by kind carbide_switches_enqueuer_iteration_latency_millisecondshistogramThe overall time it took to enqueue state handling tasks for all carbide_switches in the system