Skip to content

dNationCloud

dNation Kubernetes Monitoring Docs

This documentation is generated from the configuration files of Kubernetes Monitoring.

Each configuration parameter can be overridden by providing a custom values.yaml during Helm installation.


PropertyValue
blackboxMonitoring
enabled: false
clusterMonitoringclusterMonitoring
commonLabels
{}
dnation-kubernetes-jsonnet-translatordnation-kubernetes-jsonnet-translator
fullnameOverride
""
grafanaDashboardsgrafanaDashboards
hostMonitoringhostMonitoring
kaasMonitoringkaasMonitoring
nameOverride
""
namespaceOverride
""
prometheusRulesprometheusRules
templatestemplates
testbedMonitoring
enabled: false


clusterMonitoring

PropertyValue
clusters
- apps: [] description: Kubernetes cluster monitoring label: observer-cluster name: K8sCluster
enabled
true


dnation-kubernetes-jsonnet-translator

PropertyValue
enabled
true
imageimage


image

PropertyValue
args
- --libsonnet - https://github.com/grafana/grafonnet-lib/grafonnet@daad85cf3fad3580e58029414630e29956aefe21 - https://github.com/thelastpickle/grafonnet-polystat-panel@275a48de57afdac0d72219d82863d8ab8bd0e682


grafanaDashboards

PropertyValue
colorcolor
constantsconstants
dataLinkCommonArgs
"refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to"
dataLinkCommonArgsBlackbox
"refresh=10s&var-datasource=$datasource&var-target=$target&from=$__from&to=$__to"
dataLinkCommonArgsNoCluster
"refresh=10s&var-datasource=$datasource&from=$__from&to=$__to"
editable
true
enable
true
idsids
isLoki
true
labelGrafana
grafana_dashboard: '1'
labelJsonnet
grafana_dashboard_jsonnet: '1'
refresh
"10s"
selectorsselectors
severityColorsseverityColors
tagstags
templateRefresh
"time"
templateSort
5
time_from
"now-5m"
tooltip
"shared_crosshair"


color

PropertyValue
black
"#000000"
blue
"#5794f2"
gray
"#858187"
green
"#56a64b"
lightblue
"#8ab8ff"
orange
"#ff780a"
pink
"#fce2de"
purple
"#a352cc"
red
"#e02f44"
white
"#ffffff"
yellow
"#fade2a"


constants

PropertyValue
infinity
100000000000000005366162204393472
maxWarnings
10000


ids

PropertyValue
alertClusterOverview
"alertclusteroverview"
alertHostOverview
"alerthostoverview"
alertKaasOverview
"alertkaasoverview"
alertTestbedOverview
"alerttestbedoverview"
alertVMOverview
"alertvmoverview"
apache
"apache"
apiServer
"apiserver"
autoscaler
"autoscaler"
cAdvisor
"cadvisor"
containerDetail
"containerdetail"
containerOverview
"containeroverview"
controllerManager
"controllermanager"
cpuNamespaceOverview
"cpunamespaceoverview"
cpuOverview
"cpuoverview"
daemonSetOverview
"daemonsetoverview"
deploymentOverview
"deploymentoverview"
diskOverview
"diskoverview"
etcd
"etcd"
harbor
"harbor"
hostMonitoring
"hostmonitoring"
javaActuator
"javaactuator"
jobOverview
"joboverview"
jvm
"jvm"
k8sMonitoring
"k8smonitoring"
kaasL1Monitoring
"kaasl1monitoring"
kaasMonitoring
"kaas-monitoring"
kubelet
"kubelet"
lokiDistributed
"loki-distributed"
memoryNamespaceOverview
"memorynamespaceoverview"
memoryOverview
"memoryoverview"
monitoring
"monitoring"
mysqlExporter
"mysqlexporter"
networkNamespaceOverview
"networknamespaceoverview"
networkOverview
"networkoverview"
nginxIngress
"nginxingress"
nginxNrpe
"nginxnrpe"
nginxVts
"nginxvts"
nginxVtsEnhanced
"nginxvtsenhanced"
nginxVtsEnhancedLegacy
"nginxvtsenhancedlegacy"
nginxVtsLegacy
"nginxvtslegacy"
nodeExporter
"nodeexporter"
nodeOverview
"nodeoverview"
persistentVolumes
"persistentvolumes"
phpFpm
"phpfpm"
podOverview
"podoverview"
postfix
"postfix"
prometheus
"prometheus"
proxy
"proxy"
pvcOverview
"pvcoverview"
pythonFlask
"pythonflask"
rabbitmq
"rabbitmq"
scheduler
"scheduler"
sslExporter
"ssl-exporter"
statefulSet
"statefulset"
statefulSetOverview
"statefulsetoverview"
testbed
"testbed"
vmMonitoring
"vmmonitoring"
websocket
"websocket"


selectors

PropertyValue
apiServer
"job=\"apiserver\""
controllerManager
"job=\"kube-controller-manager\""
etcd
"job=\"kube-etcd\""
kubelet
"job=\"kubelet\""
proxy
"job=\"kube-proxy\""
scheduler
"job=\"kube-scheduler\""


severityColors

PropertyValue
critical
"red"
default
"green"
invalid
"black"
warning
"orange"


tags

PropertyValue
k8sApps
- k8s - app - L1
k8sAppsMain
- k8s - app - L0
k8sContainer
- k8s - container - L3
k8sHostsMain
- k8s - host - L1
k8sMonitoring
- k8s - monitoring - L1
k8sMonitoringMain
- k8s - cluster - host - L0
k8sNodeExporter
- k8s - nodeexporter - L3
k8sOverview
- k8s - overview - L2
k8sPVC
- k8s - pvc - L3
k8sStatefulSet
- k8s - statefulset - L3
k8sSystem
- k8s - system - L2
k8sVMs
- k8s - vm - L2
kaasMonitoring
- kaas - monitoring - L1
kaasMonitoringMain
- kaas - cluster - L0
testbed
- testbed - L0
testbedAlert
- testbed - L1


hostMonitoring

PropertyValue
enabled
false
hosts
[]


kaasMonitoring

PropertyValue
clusters
- description: KaaS monitoring name: KaasCluster
enabled
false


prometheusRules

PropertyValue
alertGroupCluster
"Cluster"
alertGroupClusterApp
"ClusterApp"
alertGroupClusterVM
"ClusterVM"
alertGroupClusterVMApp
"ClusterVMApp"
alertGroupHost
"Host"
alertGroupHostApp
"HostApp"
alertInterval
"5m"
alertNamePrefix
"KubernetesMonitoring"
enable
true
labelJsonnet
prometheus_rule_jsonnet: '1'
labelPrometheus
prometheus_rule: '1'


templates

PropertyValue
L0L0
L1L1
L2L2
RecordRules
- expr: node_uname_info{job=~"node-exporter"} and on(nodename) label_replace(kube_node_role{role=~"control-plane"}, "nodename", "$1", "node", "(.+)") record: master_uname_info - expr: node_uname_info{job=~"node-exporter"} unless on(nodename) label_replace(kube_node_role{role=~"control-plane"}, "nodename", "$1", "node", "(.+)") record: worker_uname_info
commonThresholdscommonThresholds
templateBasestemplateBases


L0

PropertyValue
blackboxblackbox
hosthost
k8sk8s
kaaskaas
testbedtestbed


blackbox

PropertyValue
mainmain


main

PropertyValue
panel
expr: probe_success{target=~"%(target)s", endpoint="http"} graphMode: none gridPos: h: 3 w: 4 mappings: - from: -1 text: '-' to: -1 type: 2 value: '' - from: 0 text: Critical to: 0 type: 2 value: '' - from: 1 text: OK to: 1 type: 2 value: '' thresholds: critical: 1 lowest: 0 operator: < unit: none


host

PropertyValue
mainmain


main

PropertyValue
panel
expr: ((sum(up{job=~"%(job)s"}) or on() vector(0)) == bool 0) * (-1) + sum(ALERTS{alertname!="Watchdog", alertstate="firing", severity="warning", job=~"%(job)s", alertgroup=~"%(groupHost)s|%(groupHostApp)s"} OR on() vector(0)) + sum(ALERTS{alertname!="Watchdog", alertstate="firing", severity="critical", job=~"%(job)s", alertgroup=~"%(groupHost)s|%(groupHostApp)s"} OR on() vector(0)) * %(maxWarnings)d graphMode: none gridPos: h: 3 w: 4 mappings: - from: -1 text: Down to: -1 type: 2 value: '' - from: 0 text: OK to: 0 type: 2 value: '' - from: 1 text: Warning to: 9999 type: 2 value: '' - from: 10000 text: Critical to: 100000000000000005366162204393472 type: 2 value: '' thresholds: critical: 10000 lowest: 0 operator: '>=' warning: 1 unit: none


k8s

PropertyValue
mainmain


main

PropertyValue
panel
expr: ((sum(up{job=~"node-exporter", cluster="%(cluster)s"}) or on() vector(0)) == bool 0) * (-1) + sum(ALERTS{alertname!="Watchdog", cluster="%(cluster)s", alertstate="firing", severity="warning", alertgroup=~"%(groupCluster)s|%(groupApp)s"} OR on() vector(0)) + sum(ALERTS{alertname!="Watchdog", cluster="%(cluster)s", alertstate="firing", severity="critical", alertgroup=~"%(groupCluster)s|%(groupApp)s"} OR on() vector(0)) * %(maxWarnings)d graphMode: none gridPos: h: 3 w: 4 mappings: - from: -1 text: Down to: -1 type: 2 value: '' - from: 0 text: OK to: 0 type: 2 value: '' - from: 1 text: Warning to: 9999 type: 2 value: '' - from: 10000 text: Critical to: 100000000000000005366162204393472 type: 2 value: '' thresholds: critical: 10000 lowest: 0 operator: '>=' warning: 1 unit: none


kaas

PropertyValue
mainmain


main

PropertyValue
panel
expr: ((sum(kaas{cluster="%(cluster)s"} unless up{job=~"node-exporter", cluster="%(cluster)s"}) or on() vector(0)) == bool 0) * (-1) + ((sum(kaas{cluster="%(cluster)s"}) or on() vector(0)) == bool 0) * (-1) + sum(ALERTS{alertname!="Watchdog", cluster="%(cluster)s", alertstate="firing", severity="warning", alertgroup=~"%(groupCluster)s|%(groupApp)s"} OR on() vector(0)) + sum(ALERTS{alertname!="Watchdog", cluster="%(cluster)s", alertstate="firing", severity="critical", alertgroup=~"%(groupCluster)s|%(groupApp)s"} OR on() vector(0)) * %(maxWarnings)d graphMode: none gridPos: h: 3 w: 4 mappings: - from: -2 text: '-' to: -2 type: 2 value: '' - from: -1 text: Down to: -1 type: 2 value: '' - from: 0 text: OK to: 0 type: 2 value: '' - from: 1 text: Warning to: 9999 type: 2 value: '' - from: 10000 text: Critical to: 100000000000000005366162204393472 type: 2 value: '' thresholds: critical: 10000 lowest: 0 operator: '>=' warning: 1 unit: none


testbed

PropertyValue
mainmain


main

PropertyValue
panel
expr: ((sum(up{infrastructure="testbed"}) or on() vector(0)) == bool 0) * (-1) + sum(ALERTS{alertname!="Watchdog", infrastructure="testbed", alertstate="firing", severity="warning"} OR on() vector(0)) + sum(ALERTS{alertname!="Watchdog", infrastructure="testbed", alertstate="firing", severity="critical"} OR on() vector(0)) * %(maxWarnings)d graphMode: none gridPos: h: 3 w: 4 mappings: - from: -1 text: Down to: -1 type: 2 value: '' - from: 0 text: OK to: 0 type: 2 value: '' - from: 1 text: Warning to: 9999 type: 2 value: '' - from: 10000 text: Critical to: 100000000000000005366162204393472 type: 2 value: '' thresholds: critical: 10000 lowest: 0 operator: '>=' warning: 1 unit: none


L1

PropertyValue
hosthost
hostAppshostApps
k8sk8s
k8sAppsk8sApps
vmvm
vmAppsvmApps


host

PropertyValue
overallNetworkErrorsoverallNetworkErrors
overallUtilizationCPUoverallUtilizationCPU
overallUtilizationDiskoverallUtilizationDisk
overallUtilizationRAMoverallUtilizationRAM
targetDowntargetDown
totalCorestotalCores
totalDisktotalDisk
totalRAMtotalRAM
usedCoresusedCores
usedDiskusedDisk
usedRAMusedRAM


overallNetworkErrors

PropertyValue
alert
customLables: alertgroup: Host expr: sum(rate(node_network_transmit_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info) ) by (job, nodename) linkGetParams: var-instance={{ $labels.nodename }} message: 'Host {{ $labels.nodename }}: High Overall Network Errors Count {{ $value }}%' name: HostNetworkOverallErrorsHigh thresholds: critical: 15 operator: '>=' warning: 10
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: sum(rate(node_network_transmit_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info) ) by (job, nodename) gridPos: x: 18 y: 6 thresholds: critical: 15 operator: '>=' warning: 10 title: Overall Errors unit: pps


overallUtilizationCPU

PropertyValue
alert
customLables: alertgroup: Host expr: round((1 - (avg(irate(node_cpu_seconds_total{job=~"%s", mode="idle"}[5m]) * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename) )) * 100) linkGetParams: var-instance={{ $labels.nodename }} message: 'Host {{ $labels.nodename }}: High CPU Overall Utilization {{ $value }}%' name: HostCPUOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"$job", mode="idle"}[5m]) * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename) )) * 100)) gridPos: x: 0 y: 6 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallUtilizationDisk

PropertyValue
alert
customLables: alertgroup: Host expr: round((sum(node_filesystem_size_bytes{job=~"%s"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"%s"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device)) / ((sum(node_filesystem_size_bytes{job=~"%s"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"%s"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device)) + sum(node_filesystem_avail_bytes{job=~"%s"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device)) * 100 > 0) linkGetParams: var-instance={{ $labels.nodename }} message: 'Host {{ $labels.nodename }}: High Disk Overall Utilization {{ $value }}%' name: HostDiskOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the disk utilization is calculated using the fraction:\n\ ```\nused/(used + available)\n```\nThe value of available\ \ is reduced by 5% of the available disk capacity, because \nthe file system\ \ marks 5% of the available disk capacity as reserved. \nIf less than 5% is free,\ \ using the remaining reserved space requires root privileges.\nAny non-privileged\ \ users and processes are unable to write new data to the partition. See the list\ \ of explicitly ignored mount points and file systems [here](https://github.com/dNationCloud/kubernetes-monitoring-stack/blob/main/chart/values.yaml)" expr: max(round((sum(node_filesystem_size_bytes{job=~"$job"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"$job"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device)) / ((sum(node_filesystem_size_bytes{job=~"$job"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"$job"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device)) + sum(node_filesystem_avail_bytes{job=~"$job"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device)) * 100 > 0)) gridPos: x: 12 y: 6 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallUtilizationRAM

PropertyValue
alert
customLables: alertgroup: Host expr: round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"%s"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) / sum by (job, nodename, cluster) (node_memory_MemTotal_bytes{job=~"%s"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info))) * 100) linkGetParams: var-instance={{ $labels.nodename }} message: 'Host {{ $labels.nodename }}: High RAM Overall Utilization {{ $value }}%' name: HostRAMOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the memory utilization is calculated by:\n```\n1 -\ \ (available memory/total memory)\n```" expr: avg(round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"$job"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) / sum by (job, nodename, cluster) (node_memory_MemTotal_bytes{job=~"$job"} * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info))) * 100)) gridPos: x: 6 y: 6 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


targetDown

PropertyValue
alert
customLables: alertgroup: Host expr: 100 * (count by(job, namespace, service) (up{job=~"%s"} == 0) / count by(job, namespace, service) (up{job=~"%s"})) message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' name: HostTargetDown thresholds: critical: 90 operator: '>=' warning: 10
panel
null


totalCores

PropertyValue
panel
colorMode: value expr: count(node_cpu_seconds_total{job=~"$job", mode="system"}) graphMode: none gridPos: h: 2 w: 3 x: 3 y: 9 thresholds: color: '#858187' value: title: Total Cores unit: none


totalDisk

PropertyValue
panel
colorMode: value expr: sum(node_filesystem_size_bytes{job=~"$job"}) graphMode: none gridPos: h: 2 w: 3 x: 15 y: 9 thresholds: color: '#858187' value: title: Total unit: bytes


totalRAM

PropertyValue
panel
colorMode: value expr: sum(node_memory_MemTotal_bytes{job=~"$job"}) graphMode: none gridPos: h: 2 w: 3 x: 9 y: 9 thresholds: color: '#858187' value: title: Total unit: bytes


usedCores

PropertyValue
panel
colorMode: value expr: (1 - (avg(irate(node_cpu_seconds_total{job=~"$job", mode="idle"}[5m])))) * count(node_cpu_seconds_total{job=~"$job", mode="system"}) graphMode: none gridPos: h: 2 w: 3 x: 0 y: 9 thresholds: color: '#858187' value: title: Used Cores unit: none


usedDisk

PropertyValue
panel
colorMode: value expr: sum(node_filesystem_size_bytes{job=~"$job"}) - sum(node_filesystem_free_bytes{job=~"$job"}) graphMode: none gridPos: h: 2 w: 3 x: 12 y: 9 thresholds: color: '#858187' value: title: Used unit: bytes


usedRAM

PropertyValue
panel
colorMode: value expr: sum(node_memory_MemTotal_bytes{job=~"$job"}) * (((1 - sum(node_memory_MemAvailable_bytes{job=~"$job"}) / sum(node_memory_MemTotal_bytes{job=~"$job"})))) graphMode: none gridPos: h: 2 w: 3 x: 6 y: 9 thresholds: color: '#858187' value: title: Used unit: bytes


hostApps

PropertyValue
apacheapache
autoscalerautoscaler
cAdvisorcAdvisor
genericAppgenericApp
harborharbor
javaActuatorjavaActuator
jvmjvm
lokiDistributedlokiDistributed
mysqlExportermysqlExporter
nginxIngressnginxIngress
nginxIngressCertificateExpirynginxIngressCertificateExpiry
nginxNrpenginxNrpe
nginxVtsnginxVts
nginxVtsEnhancednginxVtsEnhanced
nginxVtsEnhancedLegacynginxVtsEnhancedLegacy
nginxVtsLegacynginxVtsLegacy
phpFpmphpFpm
postfixpostfix
prometheusprometheus
pythonFlaskpythonFlask
rabbitmqrabbitmq
sslExportersslExporter
websocketwebsocket


apache

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


autoscaler

PropertyValue
alert
customLables: alertgroup: HostApp expr: (sum by (job, cluster) (autoscaler_healthy{job=~".+"}) / sum by (job, cluster) (autoscaler_instances{job=~".+"}) * 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Autoscaler Health Low {{ $value }}%' name: HostAppAutoscalerHealthLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- autoscaler
panel
expr: (sum by (job) (autoscaler_healthy{cluster="$cluster", %(job)s}) / sum by (job) (autoscaler_instances{cluster="$cluster", %(job)s}) * 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


cAdvisor

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


genericApp

PropertyValue
alert
{}
default
false
panel
description: GenericApp template. Used when application monitoring is requested but appropriate template was not found. expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 gridPos: w: 4 thresholds: critical: 95 operator: < warning: 99


harbor

PropertyValue
alert
customLables: alertgroup: HostApp expr: harbor_up{job=~".+"} linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Harbor component "{{ $labels.component }}" is down' name: HostAppHarborComponentDown thresholds: critical: 0 operator: == warning: 0
default
false
linkTo
- harbor
panel
expr: (sum(harbor_up{cluster="$cluster", %(job)s}) / count(harbor_up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


javaActuator

PropertyValue
alert
customLables: alertgroup: HostApp expr: (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="heap"})*100/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="nonheap"}) > sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="nonheap"})*100/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="heap"}) or (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="nonheap"})*100)/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="heap"})) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Java Actuator Heap High {{ $value }}%' name: HostAppJavaActuatorHeapHigh thresholds: critical: 90 lowest: 0 operator: '>=' warning: 75
default
false
linkTo
- javaactuator
panel
expr: (sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="heap"})*100/sum by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="nonheap"}) > sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100/sum by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"}) or (sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100)/sum by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"})) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 90 lowest: 0 operator: '>=' warning: 75


jvm

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


lokiDistributed

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


mysqlExporter

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


nginxIngress

PropertyValue
alert
customLables: alertgroup: HostApp expr: ((sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+", status!~"[4-5].*"}[5m])) / sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m])) + 100)) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Nginx Ingress Success Rate (non-4|5xx responses) Low {{ printf "%.0f" $value }}%' name: HostAppNginxIngressSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxingress
panel
expr: ((sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s, status!~"[4-5].*"}[5m])) / sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s}[5m])) + 100)) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxIngressCertificateExpiry

PropertyValue
alert
customLables: alertgroup: HostApp expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{job=~".+"} - time()) / 60 / 60 / 24 linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Nginx Ingress Certificate Expiry in {{ printf "%.2f" $value }} days' name: HostAppNginxIngressCertificateExpiry thresholds: critical: 0 lowest: -100000000000000005366162204393472 operator: < warning: 8
default
false
linkTo
- nginxingress
panel
dataLinks: - title: Detail url: /d/nginxingress?var-job=%(job)s&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to decimals: 0 expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{cluster="$cluster", %(job)s} - time()) OR on() vector(-100000000000000005366162204393472) gridPos: w: 4 mappings: - text: '-' type: 1 value: -100000000000000005366162204393472 thresholds: critical: 0 lowest: -100000000000000005366162204393472 operator: < warning: 691200 unit: s


nginxNrpe

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


nginxVts

PropertyValue
alert
customLables: alertgroup: HostApp expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: HostAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvts
panel
expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxVtsEnhanced

PropertyValue
alert
customLables: alertgroup: HostApp expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: HostAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvtsenhanced
panel
expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxVtsEnhancedLegacy

PropertyValue
alert
customLables: alertgroup: HostApp expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: HostAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvtsenhancedlegacy
panel
expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxVtsLegacy

PropertyValue
alert
customLables: alertgroup: HostApp expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: HostAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvtslegacy
panel
expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


phpFpm

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


postfix

PropertyValue
alert
customLables: alertgroup: HostApp expr: (sum by (job, cluster) (postfix_size{job=~".+"})) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} mappings: - text: '-' type: 1 value: -1 message: 'HostApp {{ $labels.job }}: Postfix Queue Size High {{ $value }}%' name: HostAppPostfixQueueSizeHigh thresholds: critical: 10 lowest: 0 operator: '>=' warning: 5
default
false
linkTo
- postfix
panel
expr: (sum by (job) (postfix_size{cluster="$cluster", %(job)s})) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 10 lowest: 0 operator: '>=' warning: 5 unit: mailq


prometheus

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


pythonFlask

PropertyValue
alert
customLables: alertgroup: HostApp expr: (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+",status!~"[4-5].*"}[5m])) / sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'HostApp {{ $labels.job }}: Python Flask Success Rate (non-4|5xx responses) Low {{ $value }}%' name: HostAppPythonFlaskSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- pythonflask
panel
expr: (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster", %(job)s,status!~"[4-5].*"}[5m])) / sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster", %(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


rabbitmq

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


sslExporter

PropertyValue
alert
{}
default
false
linkTo
- ssl-exporter
panel
decimals: 0 expr: bottomk(1,ssl_cert_not_after{cluster="$cluster"}-time() OR ssl_file_cert_not_after{cluster="$cluster"}-time() OR ssl_kubeconfig_cert_not_after{cluster="$cluster"}-time() OR ssl_kubernetes_cert_not_after{cluster="$cluster"}-time()) gridPos: w: 4 mappings: - text: '-' type: 1 value: -100000000000000005366162204393472 thresholds: critical: 0 lowest: -100000000000000005366162204393472 operator: < warning: 691200 unit: s


websocket

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


k8s

PropertyValue
apiServerHealthapiServerHealth
controllerManagerHealthcontrollerManagerHealth
daemonSetsHealthdaemonSetsHealth
deploymentsHealthdeploymentsHealth
etcdHealthetcdHealth
kubeletHealthkubeletHealth
mostUtilizedMasterNodeCPUmostUtilizedMasterNodeCPU
mostUtilizedMasterNodeDiskmostUtilizedMasterNodeDisk
mostUtilizedMasterNodeNetworkErrorsmostUtilizedMasterNodeNetworkErrors
mostUtilizedMasterNodeRAMmostUtilizedMasterNodeRAM
mostUtilizedPVCmostUtilizedPVC
mostUtilizedWorkerNodeCPUmostUtilizedWorkerNodeCPU
mostUtilizedWorkerNodeDiskmostUtilizedWorkerNodeDisk
mostUtilizedWorkerNodeNetworkErrorsmostUtilizedWorkerNodeNetworkErrors
mostUtilizedWorkerNodeRAMmostUtilizedWorkerNodeRAM
nodeHealthnodeHealth
overallMasterNodesNetworkErrorsoverallMasterNodesNetworkErrors
overallUtilizationMasterNodesCPUoverallUtilizationMasterNodesCPU
overallUtilizationMasterNodesDiskoverallUtilizationMasterNodesDisk
overallUtilizationMasterNodesRAMoverallUtilizationMasterNodesRAM
overallUtilizationWorkerNodesCPUoverallUtilizationWorkerNodesCPU
overallUtilizationWorkerNodesDiskoverallUtilizationWorkerNodesDisk
overallUtilizationWorkerNodesRAMoverallUtilizationWorkerNodesRAM
overallWorkerNodesNetworkErrorsoverallWorkerNodesNetworkErrors
proxyHealthproxyHealth
pvcBoundpvcBound
runningContainersrunningContainers
runningPodsrunningPods
runningStatefulSetsrunningStatefulSets
schedulerHealthschedulerHealth
succeededJobssucceededJobs
targetDowntargetDown
totalCoresMasterNodestotalCoresMasterNodes
totalCoresWorkerNodestotalCoresWorkerNodes
totalDiskMasterNodestotalDiskMasterNodes
totalDiskWorkerNodestotalDiskWorkerNodes
totalRAMMasterNodestotalRAMMasterNodes
totalRAMWorkerNodestotalRAMWorkerNodes
usedCoresMasterNodesusedCoresMasterNodes
usedCoresWorkerNodesusedCoresWorkerNodes
usedDiskMasterNodesusedDiskMasterNodes
usedDiskWorkerNodesusedDiskWorkerNodes
usedRAMMasterNodesusedRAMMasterNodes
usedRAMWorkerNodesusedRAMWorkerNodes


apiServerHealth

PropertyValue
alert
customLables: alertgroup: Cluster expr: (sum(up{job="apiserver"}) by (cluster) / count(up{job="apiserver"}) by (cluster)) * 100 linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Api Server Health Low {{ $value }}% name: ClusterApiServerHealthLow thresholds: critical: 95 lowest: 0 operator: < warning: 99
linkTo
- apiserver
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: (sum(up{cluster="$cluster", job="apiserver"}) / count(up{cluster="$cluster", job="apiserver"})) * 100 OR on() vector(-1) gridPos: w: 4 x: 0 y: 5 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99 title: API Server


controllerManagerHealth

PropertyValue
alert
customLables: alertgroup: Cluster expr: (sum(up{job="kube-controller-manager"}) by (cluster) / count(up{job="kube-controller-manager"}) by (cluster)) * 100 linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Controller Manager Health Low {{ $value }}% name: ClusterControllerManagerHealthLow thresholds: critical: 95 lowest: 0 operator: < warning: 99
linkTo
- controllermanager
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: (sum(up{cluster="$cluster", job="kube-controller-manager"}) / count(up{cluster="$cluster", job="kube-controller-manager"})) * 100 OR on() vector(-1) gridPos: w: 4 x: 4 y: 5 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99 title: Controller Manager


daemonSetsHealth

PropertyValue
alert
customLables: alertgroup: Cluster expr: round((sum(kube_daemonset_status_updated_number_scheduled OR kube_daemonset_updated_number_scheduled) by (cluster) + sum(kube_daemonset_status_number_available) by (cluster)) / (2 * sum(kube_daemonset_status_desired_number_scheduled) by (cluster)) * 100) linkGetParams: var-cluster={{ $labels.cluster }} message: DaemonSets Health Low {{ $value }}% name: RunningDaemonSetsHealthLow thresholds: critical: 95 operator: < warning: 99
linkTo
- daemonSetOverviewTable
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: round((sum(kube_daemonset_status_updated_number_scheduled{cluster="$cluster"} OR kube_daemonset_updated_number_scheduled{cluster="$cluster"}) + sum(kube_daemonset_status_number_available{cluster="$cluster"})) / (2 * sum(kube_daemonset_status_desired_number_scheduled{cluster="$cluster"})) * 100) gridPos: x: 6 y: 12 thresholds: critical: 95 operator: < warning: 99 title: DaemonSets Health


deploymentsHealth

PropertyValue
alert
customLables: alertgroup: Cluster expr: round((sum(kube_deployment_status_replicas_updated) by (cluster) + sum(kube_deployment_status_replicas_available) by (cluster)) / (2 * sum(kube_deployment_status_replicas) by (cluster)) * 100) linkGetParams: var-cluster={{ $labels.cluster }} message: Running Deployments Health Low {{ $value }}% name: RunningDeploymentsHealthLow thresholds: critical: 95 operator: < warning: 99
linkTo
- deploymentOverviewTable
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: round((sum(kube_deployment_status_replicas_updated{cluster="$cluster"}) + sum(kube_deployment_status_replicas_available{cluster="$cluster"})) / (2 * sum(kube_deployment_status_replicas{cluster="$cluster"})) * 100) gridPos: x: 0 y: 12 thresholds: critical: 95 operator: < warning: 99 title: Deployments Health


etcdHealth

PropertyValue
alert
customLables: alertgroup: Cluster expr: (sum(up{job="kube-etcd"}) by (cluster) / count(up{job="kube-etcd"}) by (cluster)) * 100 linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Etcd Health Low {{ $value }}% name: ClusterEtcdHealthLow thresholds: critical: 95 lowest: 0 operator: < warning: 99
linkTo
- etcd
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: (sum(up{cluster="$cluster", job="kube-etcd"}) / count(up{cluster="$cluster", job="kube-etcd"})) * 100 OR on() vector(-1) gridPos: w: 4 x: 8 y: 5 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99 title: Etcd


kubeletHealth

PropertyValue
alert
customLables: alertgroup: Cluster expr: (sum(up{job="kubelet", metrics_path="/metrics"}) by (cluster) / count(up{job="kubelet", metrics_path="/metrics"}) by (cluster)) * 100 linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Kubelet Health Low {{ $value }}% name: ClusterKubeletHealthLow thresholds: critical: 95 lowest: 0 operator: < warning: 99
linkTo
- kubelet
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: (sum(up{cluster="$cluster", job="kubelet", metrics_path="/metrics"}) / count(up{cluster="$cluster", job="kubelet", metrics_path="/metrics"})) * 100 OR on() vector(-1) gridPos: w: 4 x: 12 y: 5 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99 title: Kubelet


mostUtilizedMasterNodeCPU

PropertyValue
alert
customLables: alertgroup: Cluster expr: round((1 - (avg(irate(node_cpu_seconds_total{job=~"node-exporter", mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, cluster) )) * 100) linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster }} message: 'Cluster Master Node {{ $labels.nodename }}: High CPU Utilization {{ $value }}%' name: ClusterMasterNodeCPUUtilizationHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/cpuoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/cpunamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: max(round((1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename) )) * 100)) gridPos: w: 3 x: 3 y: 17 thresholds: critical: 90 operator: '>=' warning: 75 title: Most Utilized Node


mostUtilizedMasterNodeDisk

PropertyValue
alert
customLables: alertgroup: Cluster expr: round((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster)) / ((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster)) + sum(node_filesystem_avail_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster)) * 100) linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster }} message: 'Cluster Master Node {{ $labels.nodename }}: High Disk Utilization {{ $value }}%' name: ClusterMasterNodeDiskUtilizationHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/diskoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All description: "The percentage of the disk utilization is calculated using the fraction:\n\ ```\nspace_used/(space_used + space_free)\n```\nThe value of space_free\ \ is reduced by 5% of the available disk capacity, because \nthe file system\ \ marks 5% of the available disk capacity as reserved. \nIf less than 5% is free,\ \ using the remaining reserved space requires root privileges.\nAny non-privileged\ \ users and processes are unable to write new data to the partition. See the list\ \ of explicitly ignored mount points and file systems [here](https://github.com/dNationCloud/kubernetes-monitoring-stack/blob/main/chart/values.yaml)" expr: max(round((sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device)) / ((sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device)) + sum(node_filesystem_avail_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device)) * 100)) gridPos: w: 3 x: 15 y: 17 thresholds: critical: 90 operator: '>=' warning: 75 title: Most Utilized Node


mostUtilizedMasterNodeNetworkErrors

PropertyValue
alert
customLables: alertgroup: Cluster expr: sum(rate(node_network_transmit_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename, cluster) + sum(rate(node_network_receive_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename, cluster) linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster }} message: 'Cluster Master Node {{ $labels.nodename }}: High Network Errors Count {{ $value }}%' name: ClusterMasterNodeNetworkErrorsHigh thresholds: critical: 15 operator: '>=' warning: 10
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/networkoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/networknamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: max(sum(rate(node_network_transmit_errs_total{cluster="$cluster", job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{cluster="$cluster", job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename)) gridPos: w: 3 x: 21 y: 17 thresholds: critical: 15 operator: '>=' warning: 10 title: Most Affected Node unit: pps


mostUtilizedMasterNodeRAM

PropertyValue
alert
customLables: alertgroup: Cluster expr: round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) / sum by (job, nodename, cluster) (node_memory_MemTotal_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info))) * 100) linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster }} message: 'Cluster Master Node {{ $labels.nodename }}: High RAM Utilization {{ $value }}%' name: ClusterMasterNodesRAMUtilizationHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/memoryoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/memorynamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the memory utilization is calculated by:\n```\n1 -\ \ (memory_available/memory_total)\n```" expr: max(round((1 - sum by (job, nodename) (node_memory_MemAvailable_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) / sum by (job, nodename) (node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info))) * 100)) gridPos: w: 3 x: 9 y: 17 thresholds: critical: 90 operator: '>=' warning: 75 title: Most Utilized Node


mostUtilizedPVC

PropertyValue
alert
customLables: alertgroup: Cluster expr: sum(((kubelet_volume_stats_capacity_bytes - kubelet_volume_stats_available_bytes) / kubelet_volume_stats_capacity_bytes) * 100) by (persistentvolumeclaim, cluster) linkGetParams: var-pvc={{ $labels.persistentvolumeclaim }}&var-cluster={{ $labels.cluster }} message: '"{{ $labels.persistentvolumeclaim }}": High PVC Utilization {{ $value }}%' name: PVCUtilizationHigh thresholds: critical: 97 lowest: 0 operator: '>=' warning: 85
linkTo
- pvcOverviewTable
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: max(sum(((kubelet_volume_stats_capacity_bytes{cluster="$cluster"} - kubelet_volume_stats_available_bytes{cluster="$cluster"}) / kubelet_volume_stats_capacity_bytes{cluster="$cluster"}) * 100) by (persistentvolumeclaim)) OR on() vector(-1) gridPos: w: 3 x: 21 y: 12 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 97 lowest: 0 operator: '>=' warning: 85 title: Most Utilized PVC


mostUtilizedWorkerNodeCPU

PropertyValue
alert
customLables: alertgroup: Cluster expr: round((1 - (avg(irate(node_cpu_seconds_total{job=~"node-exporter", mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, cluster) )) * 100) linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster }} message: 'Cluster Worker Node {{ $labels.nodename }}: High CPU Utilization {{ $value }}%' name: ClusterWorkerNodeCPUUtilizationHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/cpuoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/cpunamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: max(round((1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename) )) * 100)) gridPos: w: 3 x: 3 y: 24 thresholds: critical: 90 operator: '>=' warning: 75 title: Most Utilized Node


mostUtilizedWorkerNodeDisk

PropertyValue
alert
customLables: alertgroup: Cluster expr: round((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster)) / ((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster)) + sum(node_filesystem_avail_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster)) * 100) linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster }} message: 'Cluster Worker Node {{ $labels.nodename }}: High Disk Utilization {{ $value }}%' name: ClusterWorkerNodeDiskUtilizationHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/diskoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All description: "The percentage of the disk utilization is calculated using the fraction:\n\ ```\nspace_used/(space_used + space_free)\n```\nThe value of space_free\ \ is reduced by 5% of the available disk capacity, because \nthe file system\ \ marks 5% of the available disk capacity as reserved. \nIf less than 5% is free,\ \ using the remaining reserved space requires root privileges.\nAny non-privileged\ \ users and processes are unable to write new data to the partition. See the list\ \ of explicitly ignored mount points and file systems [here](https://github.com/dNationCloud/kubernetes-monitoring-stack/blob/main/chart/values.yaml)" expr: max(round((sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device)) / ((sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device)) + sum(node_filesystem_avail_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device)) * 100)) gridPos: w: 3 x: 15 y: 24 thresholds: critical: 90 operator: '>=' warning: 75 title: Most Utilized Node


mostUtilizedWorkerNodeNetworkErrors

PropertyValue
alert
customLables: alertgroup: Cluster expr: sum(rate(node_network_transmit_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename, cluster) + sum(rate(node_network_receive_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename, cluster) linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster }} message: 'Cluster Worker Node {{ $labels.nodename }}: High Network Errors Count {{ $value }}%' name: ClusterWorkerNodeNetworkErrorsHigh thresholds: critical: 15 operator: '>=' warning: 10
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/networkoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/networknamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: max(sum(rate(node_network_transmit_errs_total{cluster="$cluster", job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{cluster="$cluster", job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename)) gridPos: w: 3 x: 21 y: 24 thresholds: critical: 15 operator: '>=' warning: 10 title: Most Affected Node unit: pps


mostUtilizedWorkerNodeRAM

PropertyValue
alert
customLables: alertgroup: Cluster expr: round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) / sum by (job, nodename, cluster) (node_memory_MemTotal_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info))) * 100) linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster }} message: 'Cluster Worker Node {{ $labels.nodename }}: High RAM Utilization {{ $value }}%' name: ClusterWorkerNodesRAMUtilizationHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/memoryoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/memorynamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the memory utilization is calculated by:\n```\n1 -\ \ (memory_available/memory_total)\n```" expr: max(round((1 - sum by (job, nodename) (node_memory_MemAvailable_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) / sum by (job, nodename) (node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info))) * 100)) gridPos: w: 3 x: 9 y: 24 thresholds: critical: 90 operator: '>=' warning: 75 title: Most Utilized Node


nodeHealth

PropertyValue
alert
customLables: alertgroup: Cluster expr: round(sum(kube_node_info) by (cluster) / (sum(kube_node_info) by (cluster) + sum(kube_node_spec_unschedulable) by (cluster) + sum(kube_node_status_condition{condition=~"DiskPressure|MemoryPressure|PIDPressure", status=~"true|unknown"}) by (cluster) + sum(kube_node_status_condition{condition="Ready", status=~"false|unknown"}) by (cluster)) * 100) linkGetParams: var-cluster={{ $labels.cluster }} message: Nodes Health Low {{ $value }}% name: NodesHealthLow thresholds: critical: 95 operator: < warning: 99
linkTo
- nodeOverviewTable
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: round(sum(kube_node_info{cluster="$cluster"}) / (sum(kube_node_info{cluster="$cluster"}) + sum(kube_node_spec_unschedulable{cluster="$cluster"}) + sum(kube_node_status_condition{cluster="$cluster", condition=~"DiskPressure|MemoryPressure|PIDPressure", status=~"true|unknown"}) + sum(kube_node_status_condition{cluster="$cluster", condition="Ready", status=~"false|unknown"}) ) * 100) gridPos: x: 0 y: 9 thresholds: critical: 95 operator: < warning: 99 title: Nodes Health


overallMasterNodesNetworkErrors

PropertyValue
alert
customLables: alertgroup: Cluster expr: sum(sum(rate(node_network_transmit_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename, cluster) + sum(rate(node_network_receive_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename, cluster)) by (cluster) linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Master Nodes High Overall Network Errors Count {{ $value }}% name: ClusterMasterNodesNetworkOverallErrorsHigh thresholds: critical: 15 operator: '>=' warning: 10
linkTo
- networkPerNodePolystat
panel
dataLinks: - title: System Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/networknamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: sum(sum(rate(node_network_transmit_errs_total{cluster="$cluster", job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{cluster="$cluster", job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename)) gridPos: w: 3 x: 18 y: 17 thresholds: critical: 15 operator: '>=' warning: 10 title: Overall Errors unit: pps


overallUtilizationMasterNodesCPU

PropertyValue
alert
customLables: alertgroup: Cluster expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"node-exporter", mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, cluster) )) * 100)) by (cluster) linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Master Nodes High CPU Overall Utilization {{ $value }}% name: ClusterMasterNodesCPUOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- cpuPerNodePolystat
panel
dataLinks: - title: System Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/cpunamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename) )) * 100)) gridPos: w: 3 x: 0 y: 17 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallUtilizationMasterNodesDisk

PropertyValue
alert
customLables: alertgroup: Cluster expr: avg(round((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster)) / ((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster)) + sum(node_filesystem_avail_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster)) * 100 > 0)) by (cluster) linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Master Nodes High Disk Overall Utilization {{ $value }}% name: ClusterMasterNodesDiskOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- diskPerNodePolystat
panel
dataLinks: - title: System Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All description: "The percentage of the disk utilization is calculated using the fraction:\n\ ```\nspace_used/(space_used + space_free)\n```\nThe value of space_free\ \ is reduced by 5% of the available disk capacity, because \nthe file system\ \ marks 5% of the available disk capacity as reserved. \nIf less than 5% is free,\ \ using the remaining reserved space requires root privileges.\nAny non-privileged\ \ users and processes are unable to write new data to the partition. See the list\ \ of explicitly ignored mount points and file systems [here](https://github.com/dNationCloud/kubernetes-monitoring-stack/blob/main/chart/values.yaml)" expr: avg(round((sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device)) / ((sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device)) + sum(node_filesystem_avail_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename, device)) * 100 > 0)) gridPos: w: 3 x: 12 y: 17 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallUtilizationMasterNodesRAM

PropertyValue
alert
customLables: alertgroup: Cluster expr: avg(round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info)) / sum by (job, nodename, cluster) (node_memory_MemTotal_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (master_uname_info))) * 100)) by (cluster) linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Master Nodes High RAM Overall Utilization {{ $value }}% name: ClusterMasterNodesRAMOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- memoryPerNodePolystat
panel
dataLinks: - title: System Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/memorynamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the memory utilization is calculated by:\n```\n1 -\ \ (memory_available/memory_total)\n```" expr: avg(round((1 - sum by (job, nodename) (node_memory_MemAvailable_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info)) / sum by (job, nodename) (node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (master_uname_info))) * 100)) gridPos: w: 3 x: 6 y: 17 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallUtilizationWorkerNodesCPU

PropertyValue
alert
customLables: alertgroup: Cluster expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"node-exporter", mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, cluster) )) * 100)) by (cluster) linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Worker Nodes High CPU Overall Utilization {{ $value }}% name: ClusterWorkerNodesCPUOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- cpuPerNodePolystat
panel
dataLinks: - title: System Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/cpunamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename) )) * 100)) gridPos: w: 3 x: 0 y: 24 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallUtilizationWorkerNodesDisk

PropertyValue
alert
customLables: alertgroup: Cluster expr: avg(round((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster)) / ((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster)) + sum(node_filesystem_avail_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster)) * 100 > 0)) by (cluster) linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Worker Nodes High Disk Overall Utilization {{ $value }}% name: ClusterWorkerNodesDiskOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- diskPerNodePolystat
panel
dataLinks: - title: System Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All description: "The percentage of the disk utilization is calculated using the fraction:\n\ ```\n<space used>/(<space used> + <space free>)\n```\nThe value of <space free>\ \ is reduced by 5% of the available disk capacity, because \nthe file system\ \ marks 5% of the available disk capacity as reserved. \nIf less than 5% is free,\ \ using the remaining reserved space requires root privileges.\nAny non-privileged\ \ users and processes are unable to write new data to the partition. See the list\ \ of explicitly ignored mount points and file systems [here](https://github.com/dNationCloud/kubernetes-monitoring-stack/blob/main/chart/values.yaml)" expr: avg(round((sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device)) / ((sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device)) + sum(node_filesystem_avail_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device)) * 100 > 0)) gridPos: w: 3 x: 12 y: 24 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallUtilizationWorkerNodesRAM

PropertyValue
alert
customLables: alertgroup: Cluster expr: avg(round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info)) / sum by (job, nodename, cluster) (node_memory_MemTotal_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename) (worker_uname_info))) * 100)) by (cluster) linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Worker Nodes High RAM Overall Utilization {{ $value }}% name: ClusterWorkerNodesRAMOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- memoryPerNodePolystat
panel
dataLinks: - title: System Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/memorynamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the memory utilization is calculated by:\n```\n1 -\ \ (<memory available>/<memory total>)\n```" expr: avg(round((1 - sum by (job, nodename) (node_memory_MemAvailable_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info)) / sum by (job, nodename) (node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"} * on(instance, pod) group_left(nodename) (worker_uname_info))) * 100)) gridPos: w: 3 x: 6 y: 24 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallWorkerNodesNetworkErrors

PropertyValue
alert
customLables: alertgroup: Cluster expr: sum(sum(rate(node_network_transmit_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename, cluster) + sum(rate(node_network_receive_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename, cluster)) by (cluster) linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Worker Nodes High Overall Network Errors Count {{ $value }}% name: ClusterWorkerNodesNetworkOverallErrorsHigh thresholds: critical: 15 operator: '>=' warning: 10
linkTo
- networkPerNodePolystat
panel
dataLinks: - title: System Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All - title: K8s Overview url: /d/networknamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: sum(sum(rate(node_network_transmit_errs_total{cluster="$cluster", job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{cluster="$cluster", job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename)) gridPos: w: 3 x: 18 y: 24 thresholds: critical: 15 operator: '>=' warning: 10 title: Overall Errors unit: pps


proxyHealth

PropertyValue
alert
customLables: alertgroup: Cluster expr: (sum(up{job="kube-proxy"}) by (cluster) / count(up{job="kube-proxy"}) by (cluster)) * 100 linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Proxy Health Low {{ $value }}% name: ClusterProxyHealthLow thresholds: critical: 95 lowest: 0 operator: < warning: 99
linkTo
- proxy
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: (sum(up{cluster="$cluster", job="kube-proxy"}) / count(up{cluster="$cluster", job="kube-proxy"})) * 100 OR on() vector(-1) gridPos: w: 4 x: 16 y: 5 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99 title: Proxy


pvcBound

PropertyValue
alert
customLables: alertgroup: Cluster expr: "round(sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) by (cluster)\ \ / (\nsum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) by (cluster)\ \ + sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) by (cluster)\ \ +\nsum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) by (cluster)\n\ ) * 100)" linkGetParams: var-cluster={{ $labels.cluster }} message: PVC Bound Rate Low {{ $value }}% name: PVCBoundRateLow thresholds: critical: 95 lowest: 0 operator: < warning: 99
linkTo
- pvcOverviewTable
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: "round(sum(kube_persistentvolumeclaim_status_phase{cluster=\"$cluster\", phase=\"\ Bound\"}) / (\nsum(kube_persistentvolumeclaim_status_phase{cluster=\"$cluster\"\ , phase=\"Bound\"}) + sum(kube_persistentvolumeclaim_status_phase{cluster=\"$cluster\"\ , phase=\"Pending\"}) +\nsum(kube_persistentvolumeclaim_status_phase{cluster=\"\ $cluster\", phase=\"Lost\"})\n) * 100) OR on() vector(-1)" gridPos: w: 3 x: 18 y: 12 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99 title: PVC Bound


runningContainers

PropertyValue
alert
customLables: alertgroup: Cluster expr: round(sum(kube_pod_container_status_running) by (cluster) / (sum(kube_pod_container_status_running) by (cluster) + (count(kube_pod_container_status_terminated) by (cluster) - count(kube_pod_container_status_terminated unless ignoring(reason) kube_pod_container_status_terminated_reason{reason!="Completed"}) by (cluster)) + sum(kube_pod_container_status_waiting) by (cluster)) * 100) linkGetParams: var-cluster={{ $labels.cluster }} message: Running Containers Health Low {{ $value }}% name: RunningContainersHealthLow thresholds: critical: 95 operator: < warning: 99
linkTo
- containerOverviewTable
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: round(sum(kube_pod_container_status_running{cluster="$cluster"}) / (sum(kube_pod_container_status_running{cluster="$cluster"}) + (sum(kube_pod_container_status_terminated_reason{cluster="$cluster", reason!="Completed"}) OR vector(0)) + sum(kube_pod_container_status_waiting{cluster="$cluster"})) * 100) gridPos: x: 12 y: 12 thresholds: critical: 95 operator: < warning: 99 title: Running Containers


runningPods

PropertyValue
alert
customLables: alertgroup: Cluster expr: round(sum(kube_pod_status_phase{phase="Running"}) by (cluster) / (sum(kube_pod_status_phase{phase="Running"}) by (cluster) + sum(kube_pod_status_phase{phase="Pending"}) by (cluster) + sum(kube_pod_status_phase{phase="Failed"}) by (cluster) + sum(kube_pod_status_phase{phase="Unknown"}) by (cluster)) * 100) linkGetParams: var-cluster={{ $labels.cluster }} message: Pods Health Low {{ $value }}% name: RunningPodsHealthLow thresholds: critical: 95 operator: < warning: 99
linkTo
- podOverviewTable
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: round(sum(kube_pod_status_phase{cluster="$cluster", phase="Running"}) / (sum(kube_pod_status_phase{cluster="$cluster", phase="Running"}) + sum(kube_pod_status_phase{cluster="$cluster", phase="Pending"}) + sum(kube_pod_status_phase{cluster="$cluster", phase="Failed"}) + sum(kube_pod_status_phase{cluster="$cluster", phase="Unknown"})) * 100) gridPos: x: 12 y: 9 thresholds: critical: 95 operator: < warning: 99 title: Running Pods


runningStatefulSets

PropertyValue
alert
customLables: alertgroup: Cluster expr: round(sum(kube_statefulset_status_replicas_ready) by (cluster) / sum(kube_statefulset_status_replicas) by (cluster) * 100) linkGetParams: var-cluster={{ $labels.cluster }} message: StatefulSets Health Low {{ $value }}% name: RunningStatefulSetsHealthLow thresholds: critical: 95 operator: < warning: 99
linkTo
- statefulSetOverviewTable
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: round(sum(kube_statefulset_status_replicas_ready{cluster="$cluster"}) / sum(kube_statefulset_status_replicas{cluster="$cluster"}) * 100) gridPos: x: 6 y: 9 thresholds: critical: 95 operator: < warning: 99 title: Running StatefulSets


schedulerHealth

PropertyValue
alert
customLables: alertgroup: Cluster expr: (sum(up{job="kube-scheduler"}) by (cluster) / count(up{job="kube-scheduler"}) by (cluster)) * 100 linkGetParams: var-cluster={{ $labels.cluster }} message: Cluster Scheduler Health Low {{ $value }}% name: ClusterSchedulerHealthLow thresholds: critical: 95 lowest: 0 operator: < warning: 99
linkTo
- scheduler
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: (sum(up{cluster="$cluster", job="kube-scheduler"}) / count(up{cluster="$cluster", job="kube-scheduler"})) * 100 OR on() vector(-1) gridPos: w: 4 x: 20 y: 5 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99 title: Scheduler


succeededJobs

PropertyValue
alert
customLables: alertgroup: Cluster expr: round(sum(kube_job_status_succeeded) by (cluster) / (sum(kube_job_status_succeeded) by (cluster) + sum(kube_job_status_failed) by (cluster)) * 100) linkGetParams: var-cluster={{ $labels.cluster }} message: Succeeded Jobs Rate Low {{ $value }}% name: SucceededJobsRateLow thresholds: critical: 95 lowest: 0 operator: < warning: 99
linkTo
- jobOverviewTable
panel
dataLinks: - title: K8s Overview url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: round(sum(kube_job_status_succeeded{cluster="$cluster"}) / (sum(kube_job_status_succeeded{cluster="$cluster"}) + sum(kube_job_status_failed{cluster="$cluster"})) * 100) OR on() vector(-1) gridPos: x: 18 y: 9 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99 title: Succeeded Jobs


targetDown

PropertyValue
alert
customLables: alertgroup: Cluster expr: 100 * (count by(job, namespace, service, cluster) (up{pod!~"virt-launcher.*|"} == 0) / count by(job, namespace, service, cluster) (up{pod!~"virt-launcher.*|"})) message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' name: ClusterTargetDown thresholds: critical: 90 operator: '>=' warning: 10
panel
null


totalCoresMasterNodes

PropertyValue
panel
colorMode: value expr: count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system", instance=~"$masterInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 3 y: 20 thresholds: color: '#858187' value: title: Total Cores unit: none


totalCoresWorkerNodes

PropertyValue
panel
colorMode: value expr: count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system", instance=~"$workerInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 3 y: 27 thresholds: color: '#858187' value: title: Total Cores unit: none


totalDiskMasterNodes

PropertyValue
panel
colorMode: value expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 15 y: 20 thresholds: color: '#858187' value: title: Total unit: bytes


totalDiskWorkerNodes

PropertyValue
panel
colorMode: value expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 15 y: 27 thresholds: color: '#858187' value: title: Total unit: bytes


totalRAMMasterNodes

PropertyValue
panel
colorMode: value expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 9 y: 20 thresholds: color: '#858187' value: title: Total unit: bytes


totalRAMWorkerNodes

PropertyValue
panel
colorMode: value expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 9 y: 27 thresholds: color: '#858187' value: title: Total unit: bytes


usedCoresMasterNodes

PropertyValue
panel
colorMode: value expr: (1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle", instance=~"$masterInstance"}[5m])))) * count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system", instance=~"$masterInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 0 y: 20 thresholds: color: '#858187' value: title: Used Cores unit: none


usedCoresWorkerNodes

PropertyValue
panel
colorMode: value expr: (1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle", instance=~"$workerInstance"}[5m])))) * count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system", instance=~"$workerInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 0 y: 27 thresholds: color: '#858187' value: title: Used Cores unit: none


usedDiskMasterNodes

PropertyValue
panel
colorMode: value expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"}) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 12 y: 20 thresholds: color: '#858187' value: title: Used unit: bytes


usedDiskWorkerNodes

PropertyValue
panel
colorMode: value expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"}) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"}) graphMode: none gridPos: h: 2 w: 3 x: 12 y: 27 thresholds: color: '#858187' value: title: Used unit: bytes


usedRAMMasterNodes

PropertyValue
panel
colorMode: value expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"}) * (((1 - sum(node_memory_MemAvailable_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"}) / sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"})))) graphMode: none gridPos: h: 2 w: 3 x: 6 y: 20 thresholds: color: '#858187' value: title: Used unit: bytes


usedRAMWorkerNodes

PropertyValue
panel
colorMode: value expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"}) * (((1 - sum(node_memory_MemAvailable_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"}) / sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"})))) graphMode: none gridPos: h: 2 w: 3 x: 6 y: 27 thresholds: color: '#858187' value: title: Used unit: bytes


k8sApps

PropertyValue
apacheapache
autoscalerautoscaler
cAdvisorcAdvisor
genericAppgenericApp
harborharbor
javaActuatorjavaActuator
jvmjvm
lokiDistributedlokiDistributed
mysqlExportermysqlExporter
nginxIngressnginxIngress
nginxIngressCertificateExpirynginxIngressCertificateExpiry
nginxNrpenginxNrpe
nginxVtsnginxVts
nginxVtsEnhancednginxVtsEnhanced
nginxVtsEnhancedLegacynginxVtsEnhancedLegacy
nginxVtsLegacynginxVtsLegacy
phpFpmphpFpm
postfixpostfix
prometheusprometheus
pythonFlaskpythonFlask
rabbitmqrabbitmq
sslExportersslExporter
websocketwebsocket


apache

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


autoscaler

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: (sum by (job, cluster) (autoscaler_healthy{job=~".+"}) / sum by (job, cluster) (autoscaler_instances{job=~".+"}) * 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Autoscaler Health Low {{ $value }}%' name: ClusterAppAutoscalerHealthLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- autoscaler
panel
expr: (sum by (job) (autoscaler_healthy{cluster="$cluster", %(job)s}) / sum by (job) (autoscaler_instances{cluster="$cluster", %(job)s}) * 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


cAdvisor

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


genericApp

PropertyValue
alert
{}
default
false
panel
description: GenericApp template. Used when application monitoring is requested but appropriate template was not found. expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 gridPos: w: 4 thresholds: critical: 95 operator: < warning: 99


harbor

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: harbor_up{job=~".+"} linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Harbor component "{{ $labels.component }}" is down' name: ClusterAppHarborComponentDown thresholds: critical: 0 operator: == warning: 0
default
false
linkTo
- harbor
panel
expr: (sum(harbor_up{cluster="$cluster", %(job)s}) / count(harbor_up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


javaActuator

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="heap"})*100/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="nonheap"}) > sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="nonheap"})*100/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="heap"}) or (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="nonheap"})*100)/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="heap"})) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Java Actuator Heap High {{ $value }}%' name: ClusterAppJavaActuatorHeapHigh thresholds: critical: 90 lowest: 0 operator: '>=' warning: 75
default
false
linkTo
- javaactuator
panel
expr: (sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="heap"})*100/sum by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="nonheap"}) > sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100/sum by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"}) or (sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100)/sum by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"})) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 90 lowest: 0 operator: '>=' warning: 75


jvm

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


lokiDistributed

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


mysqlExporter

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


nginxIngress

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: ((sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+", status!~"[4-5].*"}[5m])) / sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m])) + 100)) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Nginx Ingress Success Rate (non-4|5xx responses) Low {{ printf "%.0f" $value }}%' name: ClusterAppNginxIngressSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxingress
panel
expr: ((sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s, status!~"[4-5].*"}[5m])) / sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s}[5m])) + 100)) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxIngressCertificateExpiry

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{job=~".+"} - time()) / 60 / 60 / 24 linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Nginx Ingress Certificate Expiry in {{ printf "%.2f" $value }} days' name: ClusterAppNginxIngressCertificateExpiry thresholds: critical: 0 lowest: -100000000000000005366162204393472 operator: < warning: 8
default
false
linkTo
- nginxingress
panel
dataLinks: - title: Detail url: /d/nginxingress?var-job=%(job)s&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to decimals: 0 expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{cluster="$cluster", %(job)s} - time()) OR on() vector(-100000000000000005366162204393472) gridPos: w: 4 mappings: - text: '-' type: 1 value: -100000000000000005366162204393472 thresholds: critical: 0 lowest: -100000000000000005366162204393472 operator: < warning: 691200 unit: s


nginxNrpe

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


nginxVts

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvts
panel
expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxVtsEnhanced

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvtsenhanced
panel
expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxVtsEnhancedLegacy

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvtsenhancedlegacy
panel
expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxVtsLegacy

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvtslegacy
panel
expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


phpFpm

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


postfix

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: (sum by (job, cluster) (postfix_size{job=~".+"})) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} mappings: - text: '-' type: 1 value: -1 message: 'ClusterApp {{ $labels.job }}: Postfix Queue Size High {{ $value }}%' name: ClusterAppPostfixQueueSizeHigh thresholds: critical: 10 lowest: 0 operator: '>=' warning: 5
default
false
linkTo
- postfix
panel
expr: (sum by (job) (postfix_size{cluster="$cluster", %(job)s})) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 10 lowest: 0 operator: '>=' warning: 5 unit: mailq


prometheus

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


pythonFlask

PropertyValue
alert
customLables: alertgroup: ClusterApp expr: (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+",status!~"[4-5].*"}[5m])) / sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterApp {{ $labels.job }}: Python Flask Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterAppPythonFlaskSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- pythonflask
panel
expr: (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster", %(job)s,status!~"[4-5].*"}[5m])) / sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster", %(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


rabbitmq

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


sslExporter

PropertyValue
alert
{}
default
false
linkTo
- ssl-exporter
panel
decimals: 0 expr: bottomk(1,ssl_cert_not_after{cluster="$cluster"}-time() OR ssl_file_cert_not_after{cluster="$cluster"}-time() OR ssl_kubeconfig_cert_not_after{cluster="$cluster"}-time() OR ssl_kubernetes_cert_not_after{cluster="$cluster"}-time()) gridPos: w: 4 mappings: - text: '-' type: 1 value: -100000000000000005366162204393472 thresholds: critical: 0 lowest: -100000000000000005366162204393472 operator: < warning: 691200 unit: s


websocket

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


vm

PropertyValue
mainmain


main

PropertyValue
panel
expr: sum(ALERTS{alertname!="Watchdog", alertstate="firing", severity="warning", job=~"%(job)s", alertgroup=~"%(groupVM)s|%(groupVMApp)s"} OR on() vector(0)) + sum(ALERTS{alertname!="Watchdog", alertstate="firing", severity="critical", job=~"%(job)s", alertgroup=~"%(groupVM)s|%(groupVMApp)s"} OR on() vector(0)) * %(maxWarnings)d graphMode: none gridPos: h: 3 w: 4 mappings: - from: 0 text: OK to: 0 type: 2 value: '' - from: 1 text: Warning to: 9999 type: 2 value: '' - from: 10000 text: Critical to: 100000000000000005366162204393472 type: 2 value: '' thresholds: critical: 10000 operator: '>=' warning: 1 unit: none


vmApps

PropertyValue
apacheapache
autoscalerautoscaler
cAdvisorcAdvisor
genericAppgenericApp
harborharbor
javaActuatorjavaActuator
jvmjvm
lokiDistributedlokiDistributed
mysqlExportermysqlExporter
nginxIngressnginxIngress
nginxIngressCertificateExpirynginxIngressCertificateExpiry
nginxNrpenginxNrpe
nginxVtsnginxVts
nginxVtsEnhancednginxVtsEnhanced
nginxVtsEnhancedLegacynginxVtsEnhancedLegacy
nginxVtsLegacynginxVtsLegacy
phpFpmphpFpm
postfixpostfix
prometheusprometheus
pythonFlaskpythonFlask
rabbitmqrabbitmq
sslExportersslExporter
websocketwebsocket


apache

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


autoscaler

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: (sum by (job, cluster) (autoscaler_healthy{job=~".+"}) / sum by (job, cluster) (autoscaler_instances{job=~".+"}) * 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Autoscaler Health Low {{ $value }}%' name: ClusterVMAppAutoscalerHealthLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- autoscaler
panel
expr: (sum by (job) (autoscaler_healthy{cluster="$cluster", %(job)s}) / sum by (job) (autoscaler_instances{cluster="$cluster", %(job)s}) * 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


cAdvisor

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


genericApp

PropertyValue
alert
{}
default
false
panel
description: GenericApp template. Used when application monitoring is requested but appropriate template was not found. expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 gridPos: w: 4 thresholds: critical: 95 operator: < warning: 99


harbor

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: harbor_up{job=~".+"} linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Harbor component "{{ $labels.component }}" is down' name: ClusterVMAppHarborComponentDown thresholds: critical: 0 operator: == warning: 0
default
false
linkTo
- harbor
panel
expr: (sum(harbor_up{cluster="$cluster", %(job)s}) / count(harbor_up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


javaActuator

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="heap"})*100/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="nonheap"}) > sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="nonheap"})*100/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="heap"}) or (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="nonheap"})*100)/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="heap"})) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Java Actuator Heap High {{ $value }}%' name: ClusterVMAppJavaActuatorHeapHigh thresholds: critical: 90 lowest: 0 operator: '>=' warning: 75
default
false
linkTo
- javaactuator
panel
expr: (sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="heap"})*100/sum by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="nonheap"}) > sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100/sum by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"}) or (sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100)/sum by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"})) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 90 lowest: 0 operator: '>=' warning: 75


jvm

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


lokiDistributed

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


mysqlExporter

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


nginxIngress

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: ((sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+", status!~"[4-5].*"}[5m])) / sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m])) + 100)) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Nginx Ingress Success Rate (non-4|5xx responses) Low {{ printf "%.0f" $value }}%' name: ClusterVMAppNginxIngressSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxingress
panel
expr: ((sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s, status!~"[4-5].*"}[5m])) / sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s}[5m])) + 100)) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxIngressCertificateExpiry

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{job=~".+"} - time()) / 60 / 60 / 24 linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Nginx Ingress Certificate Expiry in {{ printf "%.2f" $value }} days' name: ClusterVMAppNginxIngressCertificateExpiry thresholds: critical: 0 lowest: -100000000000000005366162204393472 operator: < warning: 8
default
false
linkTo
- nginxingress
panel
dataLinks: - title: Detail url: /d/nginxingress?var-job=%(job)s&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to decimals: 0 expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{cluster="$cluster", %(job)s} - time()) OR on() vector(-100000000000000005366162204393472) gridPos: w: 4 mappings: - text: '-' type: 1 value: -100000000000000005366162204393472 thresholds: critical: 0 lowest: -100000000000000005366162204393472 operator: < warning: 691200 unit: s


nginxNrpe

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


nginxVts

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterVMAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvts
panel
expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxVtsEnhanced

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterVMAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvtsenhanced
panel
expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxVtsEnhancedLegacy

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterVMAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvtsenhancedlegacy
panel
expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


nginxVtsLegacy

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*", code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterVMAppNginxVTSSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- nginxvtslegacy
panel
expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


phpFpm

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


postfix

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: (sum by (job, cluster) (postfix_size{job=~".+"})) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} mappings: - text: '-' type: 1 value: -1 message: 'ClusterVMApp {{ $labels.job }}: Postfix Queue Size High {{ $value }}%' name: ClusterVMAppPostfixQueueSizeHigh thresholds: critical: 10 lowest: 0 operator: '>=' warning: 5
default
false
linkTo
- postfix
panel
expr: (sum by (job) (postfix_size{cluster="$cluster", %(job)s})) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 10 lowest: 0 operator: '>=' warning: 5 unit: mailq


prometheus

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


pythonFlask

PropertyValue
alert
customLables: alertgroup: ClusterVMApp expr: (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+",status!~"[4-5].*"}[5m])) / sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m])) + 100) linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }} message: 'ClusterVMApp {{ $labels.job }}: Python Flask Success Rate (non-4|5xx responses) Low {{ $value }}%' name: ClusterVMAppPythonFlaskSuccessRateLow thresholds: critical: 85 lowest: 0 operator: < warning: 95
default
false
linkTo
- pythonflask
panel
expr: (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster", %(job)s,status!~"[4-5].*"}[5m])) / sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster", %(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster", %(job)s}[5m])) + 100) OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 85 lowest: 0 operator: < warning: 95


rabbitmq

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


sslExporter

PropertyValue
alert
{}
default
false
linkTo
- ssl-exporter
panel
decimals: 0 expr: bottomk(1,ssl_cert_not_after{cluster="$cluster"}-time() OR ssl_file_cert_not_after{cluster="$cluster"}-time() OR ssl_kubeconfig_cert_not_after{cluster="$cluster"}-time() OR ssl_kubernetes_cert_not_after{cluster="$cluster"}-time()) gridPos: w: 4 mappings: - text: '-' type: 1 value: -100000000000000005366162204393472 thresholds: critical: 0 lowest: -100000000000000005366162204393472 operator: < warning: 691200 unit: s


websocket

PropertyValue
alert
{}
default
false
panel
expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1) gridPos: w: 4 mappings: - text: '-' type: 1 value: -1 thresholds: critical: 95 lowest: 0 operator: < warning: 99


L2

PropertyValue
containerOverviewcontainerOverview
cpuPerNodecpuPerNode
daemonSetOverviewdaemonSetOverview
deploymentOverviewdeploymentOverview
diskPerNodediskPerNode
jobOverviewjobOverview
memoryPerNodememoryPerNode
networkPerNodenetworkPerNode
nodeOverviewnodeOverview
podOverviewpodOverview
pvcOverviewpvcOverview
statefulSetOverviewstatefulSetOverview
vmvm


containerOverview

PropertyValue
containerOverviewTablecontainerOverviewTable


containerOverviewTable

PropertyValue
base
"baseTableTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(kube_pod_container_info{cluster="$cluster", namespace=~"$namespace", pod=~"$pod"}, container)
panel
expr: - "sum by (container, namespace, pod) ((kube_pod_container_status_terminated * 0 or\ \ kube_pod_container_status_terminated_reason{cluster=\"$cluster\", namespace=~\"\ $namespace\", pod=~\"$pod\", container=~\"$container\", reason=\"Completed\"}) *\ \ 1) + \nsum by (container, namespace, pod) (kube_pod_container_status_running{cluster=\"\ $cluster\"} * 2) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\ \ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\ ContainerCreating\"}) * 3) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\ \ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\ CrashLoopBackOff\"}) * 4) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\ \ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\ CreateContainerConfigError\"}) * 5) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\ \ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\ ErrImagePull\"}) * 6) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\ \ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\ ImagePullBackOff\"}) * 7) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\ \ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\ CreateContainerError\"}) * 8) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\ \ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\ InvalidImageName\"}) * 9) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\ \ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\ CrashLoopBackOff\"}) * 10) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\ \ * 0 or 
kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\ OOMKilled\"}) * 11) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\ \ * 0 or kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\ Error\"}) * 12) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\ \ * 0 or kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\ ContainerCannotRun\"}) * 13) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\ \ * 0 or kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\ DeadlineExceeded\"}) * 14) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\ \ * 0 or kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\ Evicted\"}) * 15)" - sum by (container, namespace, pod) (kube_pod_container_status_restarts_total{cluster="$cluster", namespace=~"$namespace", pod=~"$pod", container=~"$container"}) sort: col: 5 desc: true styles: - pattern: Time type: hidden - alias: Status colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 1 pattern: 'Value #A' thresholds: - 4 - 4 type: string valueMaps: - text: Terminated (Completed) value: 1 - text: Running value: 2 - text: Waiting (ContainerCreating) value: 3 - text: Waiting (CrashLoopBackOff) value: 4 - text: Waiting (CreateContainerConfigError) value: 5 - text: Waiting (ErrImagePull) value: 6 - text: Waiting (ImagePullBackOff) value: 7 - text: Waiting (CreateContainerError) value: 8 - text: Waiting (InvalidImageName) value: 9 - text: Waiting (CrashLoopBackOff) value: 10 - text: Terminated (OOMKilled) value: 11 - text: Terminated (Error) value: 12 - text: Terminated (ContainerCannotRun) value: 13 - text: Terminated (DeadlineExceeded) value: 14 - text: Terminated (Evicted) value: 15 - alias: Restarts colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' pattern: 'Value #B' thresholds: - 5 - 10 type: 
number - alias: Container link: true linkTooltip: Detail linkUrl: /d/containerdetail?var-container=${__cell_3}&var-namespace=${__cell_1}&var-pod=${__cell_2}&var-view=container&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to pattern: container - alias: Namespace pattern: namespace type: string - alias: Pod pattern: pod type: string title: Containers transformations: - id: merge options: {} - id: organize options: excludeByName: Time: false indexByName: Time: 0 'Value #A': 4 'Value #B': 5 container: 3 namespace: 1 pod: 2 renameByName: {}


cpuPerNode

PropertyValue
cpuPerNodePolystatcpuPerNodePolystat


cpuPerNodePolystat

PropertyValue
base
"basePolystatTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(node_uname_info{cluster="$cluster", job=~"$job"}, nodename)
panel
default_click_through: /d/nodeexporter?var-job=$job&var-instance=${__cell_name}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: "avg(round((1 - (avg by (instance, pod) (irate(node_cpu_seconds_total{cluster=\"\ $cluster\", job=~\"$job\", mode=\"idle\"}[5m])))) * 100)\n* on(instance, pod) group_left(nodename)\ \ \n node_uname_info{cluster=\"$cluster\", nodename=~\"$instance\"}) by (nodename)" fontColor: '#ffffff' global_thresholds: - color: '#56a64b' state: 0 value: 0 - color: '#ff780a' state: 1 value: 75 - color: '#e02f44' state: 2 value: 90 global_unit_format: percent title: CPU per Node


daemonSetOverview

PropertyValue
daemonSetOverviewTabledaemonSetOverviewTable


daemonSetOverviewTable

PropertyValue
base
"baseTableTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(kube_daemonset_status_desired_number_scheduled{cluster="$cluster", namespace=~"$namespace"}, daemonset)
panel
expr: - sum by (daemonset, namespace) (kube_daemonset_status_number_misscheduled{cluster="$cluster", namespace=~"$namespace", daemonset=~"$daemonset"}) - sum by (daemonset, namespace) (kube_daemonset_status_desired_number_scheduled{cluster="$cluster", namespace=~"$namespace", daemonset=~"$daemonset"}) - sum by (daemonset, namespace) (kube_daemonset_updated_number_scheduled{cluster="$cluster", namespace=~"$namespace", daemonset=~"$daemonset"}) - sum by (daemonset, namespace) (kube_daemonset_status_desired_number_scheduled{cluster="$cluster", namespace=~"$namespace", daemonset=~"$daemonset"}) - sum by (daemonset, namespace) (kube_daemonset_status_number_available{cluster="$cluster", namespace=~"$namespace", daemonset=~"$daemonset"}) - sum by (daemonset, namespace) (kube_daemonset_status_desired_number_scheduled{cluster="$cluster", namespace=~"$namespace", daemonset=~"$daemonset"}) - sum by (daemonset, namespace) (kube_daemonset_status_number_ready{cluster="$cluster", namespace=~"$namespace", daemonset=~"$daemonset"}) sort: col: 5 desc: true styles: - pattern: Time type: hidden - alias: Scheduled colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 2 pattern: 'Value #A' rangeMaps: - from: 0 text: OK to: 0 - from: 1 text: Failed to: 100000000000000005366162204393472 thresholds: - 1 - 1 type: string - alias: Updated colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 2 pattern: 'Value #B' rangeMaps: - from: 0 text: OK to: 0 - from: 1 text: Failed to: 100000000000000005366162204393472 thresholds: - 1 - 1 type: string - alias: Available colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 2 pattern: 'Value #C' rangeMaps: - from: 0 text: OK to: 0 - from: 1 text: Failed to: 100000000000000005366162204393472 thresholds: - 1 - 1 type: string - alias: Ready colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 2 pattern: 'Value #D' rangeMaps: - from: 0 text: OK to: 0 - from: 1 text: Failed to: 
100000000000000005366162204393472 thresholds: - 1 - 1 type: string - alias: DaemonSet pattern: daemonset type: string - alias: Namespace link: true linkTooltip: Detail linkUrl: /d/containerdetail?var-namespace=$__cell&var-pod=All&var-view=pod&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to pattern: namespace title: DaemonSets transformations: - id: merge options: {} - id: organize options: excludeByName: Time: true indexByName: Time: 0 'Value #A': 3 'Value #B': 4 'Value #C': 5 'Value #D': 6 daemonset: 2 namespace: 1 renameByName: {}


deploymentOverview

PropertyValue
deploymentOverviewTabledeploymentOverviewTable


deploymentOverviewTable

PropertyValue
base
"baseTableTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(kube_deployment_status_replicas{cluster="$cluster", namespace=~"$namespace"}, deployment)
panel
expr: - sum by (deployment, namespace) (kube_deployment_status_replicas{cluster="$cluster", namespace=~"$namespace", deployment=~"$deployment"}) - sum by (deployment, namespace) (kube_deployment_status_replicas_updated{cluster="$cluster", namespace=~"$namespace", deployment=~"$deployment"}) - sum by (deployment, namespace) (kube_deployment_status_replicas{cluster="$cluster", namespace=~"$namespace", deployment=~"$deployment"}) - sum by (deployment, namespace) (kube_deployment_status_replicas_available{cluster="$cluster", namespace=~"$namespace", deployment=~"$deployment"}) sort: col: 3 desc: true styles: - pattern: Time type: hidden - alias: Updated colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 2 pattern: 'Value #A' rangeMaps: - from: 0 text: OK to: 0 - from: 1 text: Failed to: 100000000000000005366162204393472 thresholds: - 1 - 1 type: string - alias: Available colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 2 pattern: 'Value #B' rangeMaps: - from: 0 text: OK to: 0 - from: 1 text: Failed to: 100000000000000005366162204393472 thresholds: - 1 - 1 type: string - alias: Deployment pattern: deployment type: string - alias: Namespace link: true linkTooltip: Detail linkUrl: /d/containerdetail?var-namespace=$__cell&var-pod=All&var-view=pod&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to pattern: namespace title: Deployments transformations: - id: merge options: {} - id: organize options: excludeByName: Time: true indexByName: Time: 0 'Value #A': 3 'Value #B': 4 deployment: 2 namespace: 1 renameByName: {}


diskPerNode

PropertyValue
diskPerNodePolystatdiskPerNodePolystat


diskPerNodePolystat

PropertyValue
base
"basePolystatTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(node_uname_info{cluster="$cluster", job=~"$job"}, nodename)
panel
default_click_through: /d/nodeexporter?var-job=$job&var-instance=${__cell_name}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the disk utilization is calculated using the fraction:\n\ ```\nused/(used + available)\n```\nThe value of available\ \ is reduced by 5% of the available disk capacity, because \nthe file system\ \ marks 5% of the available disk capacity as reserved. \nIf less than 5% is free,\ \ using the remaining reserved space requires root privileges.\nAny non-privileged\ \ users and processes are unable to write new data to the partition. See the list\ \ of explicitly ignored mount points and file systems [here](https://github.com/dNationCloud/kubernetes-monitoring-stack/blob/main/chart/values.yaml)" expr: "max(round(\n(sum(node_filesystem_size_bytes{cluster=\"$cluster\", job=~\"$job\"\ }) by (instance, device, pod) - sum(node_filesystem_free_bytes{cluster=\"$cluster\"\ , job=~\"$job\"}) by (instance, device, pod)) /\n(sum(node_filesystem_size_bytes{cluster=\"\ $cluster\", job=~\"$job\"}) by (instance, device, pod) - sum(node_filesystem_free_bytes{cluster=\"\ $cluster\", job=~\"$job\"}) by (instance, device, pod) +\nsum(node_filesystem_avail_bytes{cluster=\"\ $cluster\", job=~\"$job\"}) by (instance, device, pod))\n * 100\n) * on(instance,\ \ pod) group_left(nodename) \n node_uname_info{cluster=\"$cluster\", nodename=~\"\ $instance\"}) by (nodename)" fontColor: '#ffffff' global_thresholds: - color: '#56a64b' state: 0 value: 0 - color: '#ff780a' state: 1 value: 75 - color: '#e02f44' state: 2 value: 90 global_unit_format: percent title: Disk per Node


jobOverview

PropertyValue
jobOverviewTablejobOverviewTable


jobOverviewTable

PropertyValue
base
"baseTableTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(kube_job_info{cluster="$cluster", namespace=~"$namespace"}, job_name)
panel
expr: - "sum by (job_name, namespace) (clamp_max(kube_job_status_succeeded{cluster=\"$cluster\"\ , namespace=~\"$namespace\", job_name=~\"$job_name\"}, 1) * 1) * on(job_name, namespace)\ \ group_left(owner_name) kube_job_owner{cluster=\"$cluster\", namespace=~\"$namespace\"\ , job_name=~\"$job_name\"} +\nsum by (job_name, namespace) (clamp_max(kube_job_status_active{cluster=\"\ $cluster\", namespace=~\"$namespace\", job_name=~\"$job_name\"}, 1) * 2) * on(job_name,\ \ namespace) group_left(owner_name) kube_job_owner{cluster=\"$cluster\", namespace=~\"\ $namespace\", job_name=~\"$job_name\"} +\nsum by (job_name, namespace) (clamp_max(kube_job_status_failed{cluster=\"\ $cluster\", namespace=~\"$namespace\", job_name=~\"$job_name\"}, 1) * 3) * on(job_name,\ \ namespace) group_left(owner_name) kube_job_owner{cluster=\"$cluster\", namespace=~\"\ $namespace\", job_name=~\"$job_name\"}\n" sort: col: 3 desc: true styles: - pattern: Time type: hidden - alias: Status colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 1 pattern: Value thresholds: - 3 - 3 type: string valueMaps: - text: Succeeded value: 1 - text: Active value: 2 - text: Failed value: 3 - alias: Job name pattern: job_name type: string - alias: Owner pattern: owner_name type: string - alias: Namespace link: true linkTooltip: Detail linkUrl: /d/containerdetail?var-namespace=$__cell&var-container=All&var-view=container&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to pattern: namespace title: Jobs transformations: - id: organize options: excludeByName: Time: true indexByName: Time: 0 Value: 4 job_name: 2 namespace: 1 owner_name: 3 renameByName: {}


memoryPerNode

PropertyValue
memoryPerNodePolystatmemoryPerNodePolystat


memoryPerNodePolystat

PropertyValue
base
"basePolystatTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(node_uname_info{cluster="$cluster", job=~"$job"}, nodename)
panel
default_click_through: /d/nodeexporter?var-job=$job&var-instance=${__cell_name}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the memory utilization is calculated by:\n```\n1 -\ \ (<memory available>/<memory total>)\n```" expr: "avg(round((1 - (sum(node_memory_MemAvailable_bytes{cluster=\"$cluster\", job=~\"\ $job\"}) by (instance, pod) / sum(node_memory_MemTotal_bytes{cluster=\"$cluster\"\ , job=~\"$job\"}) by (instance, pod) )) * 100)\n* on(instance, pod) group_left(nodename)\ \ \n node_uname_info{cluster=\"$cluster\", nodename=~\"$instance\"}) by (nodename)" fontColor: '#ffffff' global_thresholds: - color: '#56a64b' state: 0 value: 0 - color: '#ff780a' state: 1 value: 75 - color: '#e02f44' state: 2 value: 90 global_unit_format: percent title: Memory per Node


networkPerNode

PropertyValue
networkPerNodePolystatnetworkPerNodePolystat


networkPerNodePolystat

PropertyValue
base
"basePolystatTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(node_uname_info{cluster="$cluster", job=~"$job"}, nodename)
panel
default_click_through: /d/nodeexporter?var-job=$job&var-instance=${__cell_name}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: "avg((sum(rate(node_network_transmit_errs_total{cluster=\"$cluster\", job=~\"\ $job\", device!~\"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+\"}[5m])) \ \ by (instance, pod) \n + sum(rate(node_network_receive_errs_total{cluster=\"\ $cluster\", job=~\"$job\", device!~\"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+\"\ }[5m])) by (instance, pod))\n* on(instance, pod) group_left(nodename) \n node_uname_info{cluster=\"\ $cluster\", nodename=~\"$instance\"}) by (nodename)" fontColor: '#ffffff' global_thresholds: - color: '#56a64b' state: 0 value: 0 - color: '#ff780a' state: 1 value: 10 - color: '#e02f44' state: 2 value: 30 global_unit_format: pps title: Network Errors per Node


nodeOverview

PropertyValue
nodeOverviewTablenodeOverviewTable


nodeOverviewTable

PropertyValue
base
"baseTableTemplate"
dashboardInfo
{}
panel
expr: - sum by (node) (kube_node_spec_unschedulable{cluster="$cluster"}) - sum by (node) (kube_node_status_condition{cluster="$cluster", condition="DiskPressure", status=~"true|unknown"}) - sum by (node) (kube_node_status_condition{cluster="$cluster", condition="MemoryPressure", status=~"true|unknown"}) - sum by (node) (kube_node_status_condition{cluster="$cluster", condition="PIDPressure", status=~"true|unknown"}) - sum by (node) (kube_node_status_condition{cluster="$cluster", condition="Ready", status=~"false|unknown"}) sort: col: 6 desc: true styles: - pattern: Time type: hidden - alias: Schedulable colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 1 pattern: 'Value #A' thresholds: - 1 - 1 type: string valueMaps: - text: Failed value: 1 - text: OK value: 0 - alias: Disk Pressure colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 1 pattern: 'Value #B' thresholds: - 1 - 1 type: string valueMaps: - text: Failed value: 1 - text: OK value: 0 - alias: Memory Pressure colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 1 pattern: 'Value #C' thresholds: - 1 - 1 type: string valueMaps: - text: Failed value: 1 - text: OK value: 0 - alias: PID Pressure colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 1 pattern: 'Value #D' thresholds: - 1 - 1 type: string valueMaps: - text: Failed value: 1 - text: OK value: 0 - alias: Ready colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 1 pattern: 'Value #E' thresholds: - 1 - 1 type: string valueMaps: - text: Failed value: 1 - text: OK value: 0 - alias: Node link: true linkTooltip: Detail linkUrl: /d/containerdetail?var-view=pod&var-instance=$__cell&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to pattern: node title: Nodes


podOverview

PropertyValue
podOverviewTablepodOverviewTable


podOverviewTable

PropertyValue
base
"baseTableTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(kube_pod_info{cluster="$cluster", namespace=~"$namespace"}, pod)
panel
expr: - "sum by (namespace, pod) (kube_pod_status_phase{cluster=\"$cluster\", namespace=~\"\ $namespace\", phase=\"Running\"} * 1) +\nsum by (namespace, pod) (kube_pod_status_phase{cluster=\"\ $cluster\", namespace=~\"$namespace\", phase=\"Succeeded\"} * 2) +\nsum by (namespace,\ \ pod) (kube_pod_status_phase{cluster=\"$cluster\", namespace=~\"$namespace\", phase=\"\ Unknown\"} * 3) +\nsum by (namespace, pod) (kube_pod_status_phase{cluster=\"$cluster\"\ , namespace=~\"$namespace\", phase=\"Failed\"} * 4) +\nsum by (namespace, pod) (kube_pod_status_phase{cluster=\"\ $cluster\", namespace=~\"$namespace\", pod=~\"$pod\", phase=\"Pending\"} * 5)\n" sort: col: 3 desc: true styles: - pattern: Time type: hidden - alias: Status colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 1 pattern: Value thresholds: - 3 - 3 type: string valueMaps: - text: Running value: 1 - text: Succeeded value: 2 - text: Unknown value: 3 - text: Failed value: 4 - text: Pending value: 5 - alias: Namespace pattern: namespace type: string - alias: Pod link: true linkTooltip: Detail linkUrl: /d/containerdetail?var-container=All&var-view=pod&var-namespace=${__cell_1}&var-pod=${__cell_2}&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to pattern: pod title: Pods


pvcOverview

PropertyValue
pvcOverviewTablepvcOverviewTable


pvcOverviewTable

PropertyValue
base
"baseTableTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(kube_persistentvolumeclaim_info{cluster="$cluster", namespace=~"$namespace"}, persistentvolumeclaim)
panel
description: Capacity is available only for remote pvc. expr: - sum by (persistentvolumeclaim, namespace) (((kubelet_volume_stats_capacity_bytes{cluster="$cluster", namespace=~"$namespace", persistentvolumeclaim=~"$pvc"} - kubelet_volume_stats_available_bytes{cluster="$cluster", namespace=~"$namespace", persistentvolumeclaim=~"$pvc"}) / kubelet_volume_stats_capacity_bytes{cluster="$cluster", namespace=~"$namespace", persistentvolumeclaim=~"$pvc"}) * 100) - "sum by (persistentvolumeclaim, namespace) (kube_persistentvolumeclaim_status_phase{cluster=\"\ $cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\"$pvc\", phase=\"\ Bound\"} * 1) +\nsum by (persistentvolumeclaim, namespace) (kube_persistentvolumeclaim_status_phase{cluster=\"\ $cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\"$pvc\", phase=\"\ Lost\"} * 2) +\nsum by (persistentvolumeclaim, namespace) (kube_persistentvolumeclaim_status_phase{cluster=\"\ $cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\"$pvc\", phase=\"\ Pending\"} * 3)\n" sort: col: 3 desc: true styles: - pattern: Time type: hidden - alias: Capacity colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' pattern: 'Value #A' thresholds: - 85 - 97 type: number unit: percent - alias: Status colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 1 pattern: 'Value #B' thresholds: - 2 - 2 type: string valueMaps: - text: Bound value: 1 - text: Lost value: 2 - text: Pending value: 3 - alias: PVC link: true linkTooltip: Detail linkUrl: /d/persistentvolumes?var-namespace=${__cell_1}&var-pvc=${__cell_2}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to pattern: persistentvolumeclaim - alias: Namespace pattern: namespace type: string title: Persistent Volumes


statefulSetOverview

PropertyValue
statefulSetOverviewTablestatefulSetOverviewTable


statefulSetOverviewTable

PropertyValue
base
"baseTableTemplate"
dashboardInfo
grafanaTemplateQuery: label_values(kube_statefulset_status_replicas{cluster="$cluster", namespace=~"$namespace"}, statefulset)
panel
expr: - sum by (statefulset, namespace) (kube_statefulset_status_replicas_updated{cluster="$cluster", namespace=~"$namespace", statefulset=~"$statefulset"}) - sum by (statefulset, namespace) (kube_statefulset_status_replicas{cluster="$cluster", namespace=~"$namespace", statefulset=~"$statefulset"}) - sum by (statefulset, namespace) (kube_statefulset_status_replicas_ready{cluster="$cluster", namespace=~"$namespace", statefulset=~"$statefulset"}) sort: col: 4 desc: true styles: - pattern: Time type: hidden - alias: Updated pattern: 'Value #A' type: number - alias: Ready colorMode: cell colors: - '#56a64b' - '#ff780a' - '#e02f44' mappingType: 2 pattern: 'Value #B' rangeMaps: - from: 0 text: OK to: 0 - from: 1 text: Failed to: 100000000000000005366162204393472 thresholds: - 1 - 1 type: string - alias: StatefulSet link: true linkTooltip: Detail linkUrl: /d/statefulset?var-namespace=${__cell_1}&var-statefulset=${__cell_2}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to pattern: statefulset - alias: Namespace link: true linkTooltip: Detail linkUrl: /d/containerdetail?var-namespace=$__cell&var-pod=All&var-view=pod&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to pattern: namespace title: StatefulSets


vm

PropertyValue
mostUtilizedVMCPUmostUtilizedVMCPU
mostUtilizedVMDiskmostUtilizedVMDisk
mostUtilizedVMNetworkErrorsmostUtilizedVMNetworkErrors
mostUtilizedVMRAMmostUtilizedVMRAM
overallNetworkErrorsoverallNetworkErrors
overallUtilizationCPUoverallUtilizationCPU
overallUtilizationDiskoverallUtilizationDisk
overallUtilizationRAMoverallUtilizationRAM
targetDowntargetDown
totalCorestotalCores
totalDisktotalDisk
totalRAMtotalRAM
usedCoresusedCores
usedDiskusedDisk
usedRAMusedRAM


mostUtilizedVMCPU

PropertyValue
alert
customLables: alertgroup: ClusterVM expr: round((1 - (avg(irate(node_cpu_seconds_total{job=~"%s", mode="idle"}[5m]) * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename) )) * 100) linkGetParams: var-instance={{ $labels.nodename }} message: 'VM {{ $labels.nodename }}: High CPU Utilization {{ $value }}%' name: VMCPUUtilizationHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: max(round((1 - (avg(irate(node_cpu_seconds_total{job=~"$job", mode="idle"}[5m]) * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename) )) * 100)) gridPos: w: 3 x: 3 y: 6 thresholds: critical: 90 operator: '>=' warning: 75 title: Most Utilized VM


mostUtilizedVMDisk

PropertyValue
alert
customLables: alertgroup: ClusterVM expr: round((sum(node_filesystem_size_bytes{job=~"%s"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"%s"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device)) / ((sum(node_filesystem_size_bytes{job=~"%s"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"%s"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device)) + sum(node_filesystem_avail_bytes{job=~"%s"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device)) * 100 > 0) linkGetParams: var-instance={{ $labels.nodename }} message: 'VM {{ $labels.nodename }}: High Disk Utilization {{ $value }}%' name: VMDiskUtilizationHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the disk utilization is calculated using the fraction:\n\ ```\n<space used>/(<space used> + <space free>)\n```\nThe value of <space free>\ \ is reduced by 5% of the available disk capacity, because \nthe file system\ \ marks 5% of the available disk capacity as reserved. \nIf less than 5% is free,\ \ using the remaining reserved space requires root privileges.\nAny non-privileged\ \ users and processes are unable to write new data to the partition. See the list\ \ of explicitly ignored mount points and file systems [here](https://github.com/dNationCloud/kubernetes-monitoring-stack/blob/main/chart/values.yaml)" expr: max(round((sum(node_filesystem_size_bytes{job=~"$job"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"$job"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device)) / ((sum(node_filesystem_size_bytes{job=~"$job"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"$job"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device)) + sum(node_filesystem_avail_bytes{job=~"$job"} * on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device)) * 100 > 0)) gridPos: w: 3 x: 15 y: 6 thresholds: critical: 90 operator: '>=' warning: 75 title: Most Utilized VM


mostUtilizedVMNetworkErrors

PropertyValue
alert
customLables: alertgroup: ClusterVM expr: sum(rate(node_network_transmit_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance) group_left(nodename) (node_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance) group_left(nodename) (node_uname_info) ) by (job, nodename) linkGetParams: var-instance={{ $labels.nodename }} message: 'VM {{ $labels.nodename }}: High Network Errors Count {{ $value }}%' name: VMNetworkErrorsHigh thresholds: critical: 15 operator: '>=' warning: 10
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: max(sum(rate(node_network_transmit_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance) group_left(nodename) (node_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance) group_left(nodename) (node_uname_info) ) by (job, nodename)) gridPos: w: 3 x: 21 y: 6 thresholds: critical: 15 operator: '>=' warning: 10 title: Most Affected VM unit: pps


mostUtilizedVMRAM

PropertyValue
alert
customLables: alertgroup: ClusterVM expr: round((1 - sum by (job, nodename) (node_memory_MemAvailable_bytes{job=~"%s"} * on(instance) group_left(nodename) (node_uname_info)) / sum by (job, nodename) (node_memory_MemTotal_bytes{job=~"%s"} * on(instance) group_left(nodename) (node_uname_info))) * 100) linkGetParams: var-instance={{ $labels.nodename }} message: 'VM {{ $labels.nodename }}: High RAM Utilization {{ $value }}%' name: VMRAMUtilizationHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the memory utilization is calculated by:\n```\n1 -\ \ (<memory available>/<memory total>)\n```" expr: max(round((1 - sum by (job, nodename) (node_memory_MemAvailable_bytes{job=~"$job"} * on(instance) group_left(nodename) (node_uname_info)) / sum by (job, nodename) (node_memory_MemTotal_bytes{job=~"$job"} * on(instance) group_left(nodename) (node_uname_info))) * 100)) gridPos: w: 3 x: 9 y: 6 thresholds: critical: 90 operator: '>=' warning: 75 title: Most Utilized VM


overallNetworkErrors

PropertyValue
alert
customLables: alertgroup: ClusterVM expr: sum(sum(rate(node_network_transmit_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, cluster) group_left(nodename) (node_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, cluster) group_left(nodename) (node_uname_info) ) by (job, nodename)) message: VM High Overall Network Errors Count {{ $value }}% name: VMNetworkOverallErrorsHigh thresholds: critical: 15 operator: '>=' warning: 10
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: sum(sum(rate(node_network_transmit_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance, cluster) group_left(nodename) (node_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m]) * on(instance, cluster) group_left(nodename) (node_uname_info) ) by (job, nodename)) gridPos: w: 3 x: 18 y: 6 thresholds: critical: 15 operator: '>=' warning: 10 title: Overall Errors unit: pps


overallUtilizationCPU

PropertyValue
alert
customLables: alertgroup: ClusterVM expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"%s", mode="idle"}[5m]) * on(instance, cluster) group_left(nodename) (node_uname_info)) by (job, nodename) )) * 100)) message: VM High CPU Overall Utilization {{ $value }}% name: VMCPUOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"$job", mode="idle"}[5m]) * on(instance, cluster) group_left(nodename) (node_uname_info)) by (job, nodename) )) * 100)) gridPos: w: 3 x: 0 y: 6 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallUtilizationDisk

PropertyValue
alert
customLables: alertgroup: ClusterVM expr: round((sum(node_filesystem_size_bytes{job=~"%s"}) - sum(node_filesystem_free_bytes{job=~"%s"})) / (sum(node_filesystem_size_bytes{job=~"%s"}) - sum(node_filesystem_free_bytes{job=~"%s"}) + sum(node_filesystem_avail_bytes{job=~"%s"})) * 100 > 0) message: VM High Disk Overall Utilization {{ $value }}% name: VMDiskOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the disk utilization is calculated using the fraction:\n\ ```\n<space used>/(<space used> + <space free>)\n```\nThe value of <space free>\ \ is reduced by 5% of the available disk capacity, because \nthe file system\ \ marks 5% of the available disk capacity as reserved. \nIf less than 5% is free,\ \ using the remaining reserved space requires root privileges.\nAny non-privileged\ \ users and processes are unable to write new data to the partition. See the list\ \ of explicitly ignored mount points and file systems [here](https://github.com/dNationCloud/kubernetes-monitoring-stack/blob/main/chart/values.yaml)" expr: round((sum(node_filesystem_size_bytes{job=~"$job"}) - sum(node_filesystem_free_bytes{job=~"$job"})) / (sum(node_filesystem_size_bytes{job=~"$job"}) - sum(node_filesystem_free_bytes{job=~"$job"}) + sum(node_filesystem_avail_bytes{job=~"$job"})) * 100 > 0) gridPos: w: 3 x: 12 y: 6 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


overallUtilizationRAM

PropertyValue
alert
customLables: alertgroup: ClusterVM expr: avg(round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"%s"} * on(instance, cluster) group_left(nodename) (node_uname_info)) / sum by (job, nodename, cluster) (node_memory_MemTotal_bytes{job=~"%s"} * on(instance, cluster) group_left(nodename) (node_uname_info))) * 100)) message: VM High RAM Overall Utilization {{ $value }}% name: VMRAMOverallHigh thresholds: critical: 90 operator: '>=' warning: 75
linkTo
- nodeexporter
panel
dataLinks: - title: System Overview url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to description: "The percentage of the memory utilization is calculated by:\n```\n1 -\ \ (<memory available>/<memory total>)\n```" expr: avg(round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"$job"} * on(instance, cluster) group_left(nodename) (node_uname_info)) / sum by (job, nodename, cluster) (node_memory_MemTotal_bytes{job=~"$job"} * on(instance, cluster) group_left(nodename) (node_uname_info))) * 100)) gridPos: w: 3 x: 6 y: 6 thresholds: critical: 90 operator: '>=' warning: 75 title: Overall Utilization


targetDown

PropertyValue
alert
customLables: alertgroup: ClusterVM expr: 100 * (count by(job, namespace, service) (up{job=~"%s"} == 0) / count by(job, namespace, service) (up{job=~"%s"})) message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.' name: VMTargetDown thresholds: critical: 90 operator: '>=' warning: 10
panel
null


totalCores

PropertyValue
panel
colorMode: value expr: count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system"}) graphMode: none gridPos: h: 2 w: 3 x: 3 y: 9 thresholds: color: '#858187' value: title: Total Cores unit: none


totalDisk

PropertyValue
panel
colorMode: value expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"}) graphMode: none gridPos: h: 2 w: 3 x: 15 y: 9 thresholds: color: '#858187' value: title: Total unit: bytes


totalRAM

PropertyValue
panel
colorMode: value expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"}) graphMode: none gridPos: h: 2 w: 3 x: 9 y: 9 thresholds: color: '#858187' value: title: Total unit: bytes


usedCores

PropertyValue
panel
colorMode: value expr: (1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle"}[5m])))) * count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system"}) graphMode: none gridPos: h: 2 w: 3 x: 0 y: 9 thresholds: color: '#858187' value: title: Used Cores unit: none


usedDisk

PropertyValue
panel
colorMode: value expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"}) - sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job"}) graphMode: none gridPos: h: 2 w: 3 x: 12 y: 9 thresholds: color: '#858187' value: title: Used unit: bytes


usedRAM

PropertyValue
panel
colorMode: value expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"}) * (((1 - sum(node_memory_MemAvailable_bytes{cluster="$cluster", job=~"$job"}) / sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"})))) graphMode: none gridPos: h: 2 w: 3 x: 6 y: 9 thresholds: color: '#858187' value: title: Used unit: bytes


commonThresholds

PropertyValue
appapp
controlPlanecontrolPlane
criticalPanelcriticalPanel
k8sk8s
nodenode
warningPanelwarningPanel


app

PropertyValue
critical
95
operator
"<"
warning
99


controlPlane

PropertyValue
critical
95
lowest
0
operator
"<"
warning
99


criticalPanel

PropertyValue
critical
1
operator
">="


k8s

PropertyValue
critical
95
operator
"<"
warning
99


node

PropertyValue
critical
90
operator
">="
warning
75


warningPanel

PropertyValue
operator
">="
warning
1


templateBases

PropertyValue
baseAlertbaseAlert
basePolystatTemplatebasePolystatTemplate
baseStatsTemplatebaseStatsTemplate
baseTableTemplatebaseTableTemplate


baseAlert

PropertyValue
customLables
{}
expr
""
linkGetParams
""
message
""
name
"error must be overwritten"
thresholds
{}


basePolystatTemplate

PropertyValue
default
true
enabled
true
panelpanel


panel

PropertyValue
datasource
"$datasource"
default_click_through
""
description
""
expr
""
fontAutoColor
false
fontColor
"white"
globalDecimals
null
global_thresholds
{}
global_unit_format
""
gridPos
h: 6 w: 24 x: 0 y: 0
hexagon_sort_by_direction
2
hexagon_sort_by_field
"value"
polygon_border_size
0
title
"error must be overwritten"
tooltip_timestamp_enabled
false


baseStatsTemplate

PropertyValue
alert
{}
default
true
enabled
true
panelpanel


panel

PropertyValue
colorMode
"background"
dataLinks
[]
datasource
"$datasource"
decimals
null
description
""
expr
""
graphMode
"area"
gridPos
h: 3 w: 6 x: error must be overwritten y: error must be overwritten
mappings
[]
thresholds
{}
title
"error must be overwritten"
unit
"percent"


baseTableTemplate

PropertyValue
default
true
enabled
true
panelpanel


panel

PropertyValue
datasource
"$datasource"
description
""
expr
[]
gridPos
h: 19 w: 24 x: 0 y: 1
sort
{}
styles
[]
title
"error must be overwritten"
transformations
[]