dNation Kubernetes Monitoring Docs
This documentation is generated from the configuration files of Kubernetes Monitoring.
Each configuration parameter can be overridden by providing a custom values.yaml during Helm installation.
Property | Value |
---|---|
blackboxMonitoring | enabled: false
|
clusterMonitoring | clusterMonitoring |
commonLabels | {}
|
dnation-kubernetes-jsonnet-translator | dnation-kubernetes-jsonnet-translator |
fullnameOverride | "" |
grafanaDashboards | grafanaDashboards |
hostMonitoring | hostMonitoring |
kaasMonitoring | kaasMonitoring |
nameOverride | "" |
namespaceOverride | "" |
prometheusRules | prometheusRules |
templates | templates |
testbedMonitoring | enabled: false
|
clusterMonitoring
Property | Value |
---|---|
clusters | - apps: []
description: Kubernetes cluster monitoring
label: observer-cluster
name: K8sCluster
|
enabled | true |
dnation-kubernetes-jsonnet-translator
Property | Value |
---|---|
enabled | true |
image | image |
image
Property | Value |
---|---|
args | - --libsonnet
- https://github.com/grafana/grafonnet-lib/grafonnet@daad85cf3fad3580e58029414630e29956aefe21
- https://github.com/thelastpickle/grafonnet-polystat-panel@275a48de57afdac0d72219d82863d8ab8bd0e682
|
grafanaDashboards
Property | Value |
---|---|
color | color |
constants | constants |
dataLinkCommonArgs | "refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to" |
dataLinkCommonArgsBlackbox | "refresh=10s&var-datasource=$datasource&var-target=$target&from=$__from&to=$__to" |
dataLinkCommonArgsNoCluster | "refresh=10s&var-datasource=$datasource&from=$__from&to=$__to" |
editable | true |
enable | true |
ids | ids |
isLoki | true |
labelGrafana | grafana_dashboard: '1'
|
labelJsonnet | grafana_dashboard_jsonnet: '1'
|
refresh | "10s" |
selectors | selectors |
severityColors | severityColors |
tags | tags |
templateRefresh | "time" |
templateSort | 5 |
time_from | "now-5m" |
tooltip | "shared_crosshair" |
color
Property | Value |
---|---|
black | "#000000" |
blue | "#5794f2" |
gray | "#858187" |
green | "#56a64b" |
lightblue | "#8ab8ff" |
orange | "#ff780a" |
pink | "#fce2de" |
purple | "#a352cc" |
red | "#e02f44" |
white | "#ffffff" |
yellow | "#fade2a" |
constants
Property | Value |
---|---|
infinity | 100000000000000005366162204393472 |
maxWarnings | 10000 |
ids
Property | Value |
---|---|
alertClusterOverview | "alertclusteroverview" |
alertHostOverview | "alerthostoverview" |
alertKaasOverview | "alertkaasoverview" |
alertTestbedOverview | "alerttestbedoverview" |
alertVMOverview | "alertvmoverview" |
apache | "apache" |
apiServer | "apiserver" |
autoscaler | "autoscaler" |
cAdvisor | "cadvisor" |
containerDetail | "containerdetail" |
containerOverview | "containeroverview" |
controllerManager | "controllermanager" |
cpuNamespaceOverview | "cpunamespaceoverview" |
cpuOverview | "cpuoverview" |
daemonSetOverview | "daemonsetoverview" |
deploymentOverview | "deploymentoverview" |
diskOverview | "diskoverview" |
etcd | "etcd" |
harbor | "harbor" |
hostMonitoring | "hostmonitoring" |
javaActuator | "javaactuator" |
jobOverview | "joboverview" |
jvm | "jvm" |
k8sMonitoring | "k8smonitoring" |
kaasL1Monitoring | "kaasl1monitoring" |
kaasMonitoring | "kaas-monitoring" |
kubelet | "kubelet" |
lokiDistributed | "loki-distributed" |
memoryNamespaceOverview | "memorynamespaceoverview" |
memoryOverview | "memoryoverview" |
monitoring | "monitoring" |
mysqlExporter | "mysqlexporter" |
networkNamespaceOverview | "networknamespaceoverview" |
networkOverview | "networkoverview" |
nginxIngress | "nginxingress" |
nginxNrpe | "nginxnrpe" |
nginxVts | "nginxvts" |
nginxVtsEnhanced | "nginxvtsenhanced" |
nginxVtsEnhancedLegacy | "nginxvtsenhancedlegacy" |
nginxVtsLegacy | "nginxvtslegacy" |
nodeExporter | "nodeexporter" |
nodeOverview | "nodeoverview" |
persistentVolumes | "persistentvolumes" |
phpFpm | "phpfpm" |
podOverview | "podoverview" |
postfix | "postfix" |
prometheus | "prometheus" |
proxy | "proxy" |
pvcOverview | "pvcoverview" |
pythonFlask | "pythonflask" |
rabbitmq | "rabbitmq" |
scheduler | "scheduler" |
sslExporter | "ssl-exporter" |
statefulSet | "statefulset" |
statefulSetOverview | "statefulsetoverview" |
testbed | "testbed" |
vmMonitoring | "vmmonitoring" |
websocket | "websocket" |
selectors
Property | Value |
---|---|
apiServer | "job=\"apiserver\"" |
controllerManager | "job=\"kube-controller-manager\"" |
etcd | "job=\"kube-etcd\"" |
kubelet | "job=\"kubelet\"" |
proxy | "job=\"kube-proxy\"" |
scheduler | "job=\"kube-scheduler\"" |
severityColors
Property | Value |
---|---|
critical | "red" |
default | "green" |
invalid | "black" |
warning | "orange" |
tags
Property | Value |
---|---|
k8sApps | - k8s
- app
- L1
|
k8sAppsMain | - k8s
- app
- L0
|
k8sContainer | - k8s
- container
- L3
|
k8sHostsMain | - k8s
- host
- L1
|
k8sMonitoring | - k8s
- monitoring
- L1
|
k8sMonitoringMain | - k8s
- cluster
- host
- L0
|
k8sNodeExporter | - k8s
- nodeexporter
- L3
|
k8sOverview | - k8s
- overview
- L2
|
k8sPVC | - k8s
- pvc
- L3
|
k8sStatefulSet | - k8s
- statefulset
- L3
|
k8sSystem | - k8s
- system
- L2
|
k8sVMs | - k8s
- vm
- L2
|
kaasMonitoring | - kaas
- monitoring
- L1
|
kaasMonitoringMain | - kaas
- cluster
- L0
|
testbed | - testbed
- L0
|
testbedAlert | - testbed
- L1
|
hostMonitoring
Property | Value |
---|---|
enabled | false |
hosts | []
|
kaasMonitoring
Property | Value |
---|---|
clusters | - description: KaaS monitoring
name: KaasCluster
|
enabled | false |
prometheusRules
Property | Value |
---|---|
alertGroupCluster | "Cluster" |
alertGroupClusterApp | "ClusterApp" |
alertGroupClusterVM | "ClusterVM" |
alertGroupClusterVMApp | "ClusterVMApp" |
alertGroupHost | "Host" |
alertGroupHostApp | "HostApp" |
alertInterval | "5m" |
alertNamePrefix | "KubernetesMonitoring" |
enable | true |
labelJsonnet | prometheus_rule_jsonnet: '1'
|
labelPrometheus | prometheus_rule: '1'
|
templates
Property | Value |
---|---|
L0 | L0 |
L1 | L1 |
L2 | L2 |
RecordRules | - expr: node_uname_info{job=~"node-exporter"} and on(nodename) label_replace(kube_node_role{role=~"control-plane"},
"nodename", "$1", "node", "(.+)")
record: master_uname_info
- expr: node_uname_info{job=~"node-exporter"} unless on(nodename) label_replace(kube_node_role{role=~"control-plane"},
"nodename", "$1", "node", "(.+)")
record: worker_uname_info
|
commonThresholds | commonThresholds |
templateBases | templateBases |
L0
Property | Value |
---|---|
blackbox | blackbox |
host | host |
k8s | k8s |
kaas | kaas |
testbed | testbed |
blackbox
Property | Value |
---|---|
main | main |
Property | Value |
---|---|
panel | expr: probe_success{target=~"%(target)s", endpoint="http"}
graphMode: none
gridPos:
h: 3
w: 4
mappings:
- from: -1
text: '-'
to: -1
type: 2
value: ''
- from: 0
text: Critical
to: 0
type: 2
value: ''
- from: 1
text: OK
to: 1
type: 2
value: ''
thresholds:
critical: 1
lowest: 0
operator: <
unit: none
|
host
Property | Value |
---|---|
main | main |
Property | Value |
---|---|
panel | expr: ((sum(up{job=~"%(job)s"}) or on() vector(0)) == bool 0) * (-1) + sum(ALERTS{alertname!="Watchdog",
alertstate="firing", severity="warning", job=~"%(job)s", alertgroup=~"%(groupHost)s|%(groupHostApp)s"}
OR on() vector(0)) + sum(ALERTS{alertname!="Watchdog", alertstate="firing", severity="critical",
job=~"%(job)s", alertgroup=~"%(groupHost)s|%(groupHostApp)s"} OR on() vector(0))
* %(maxWarnings)d
graphMode: none
gridPos:
h: 3
w: 4
mappings:
- from: -1
text: Down
to: -1
type: 2
value: ''
- from: 0
text: OK
to: 0
type: 2
value: ''
- from: 1
text: Warning
to: 9999
type: 2
value: ''
- from: 10000
text: Critical
to: 100000000000000005366162204393472
type: 2
value: ''
thresholds:
critical: 10000
lowest: 0
operator: '>='
warning: 1
unit: none
|
k8s
Property | Value |
---|---|
main | main |
Property | Value |
---|---|
panel | expr: ((sum(up{job=~"node-exporter", cluster="%(cluster)s"}) or on() vector(0)) ==
bool 0) * (-1) + sum(ALERTS{alertname!="Watchdog", cluster="%(cluster)s", alertstate="firing",
severity="warning", alertgroup=~"%(groupCluster)s|%(groupApp)s"} OR on() vector(0))
+ sum(ALERTS{alertname!="Watchdog", cluster="%(cluster)s", alertstate="firing",
severity="critical", alertgroup=~"%(groupCluster)s|%(groupApp)s"} OR on() vector(0))
* %(maxWarnings)d
graphMode: none
gridPos:
h: 3
w: 4
mappings:
- from: -1
text: Down
to: -1
type: 2
value: ''
- from: 0
text: OK
to: 0
type: 2
value: ''
- from: 1
text: Warning
to: 9999
type: 2
value: ''
- from: 10000
text: Critical
to: 100000000000000005366162204393472
type: 2
value: ''
thresholds:
critical: 10000
lowest: 0
operator: '>='
warning: 1
unit: none
|
kaas
Property | Value |
---|---|
main | main |
Property | Value |
---|---|
panel | expr: ((sum(kaas{cluster="%(cluster)s"} unless up{job=~"node-exporter", cluster="%(cluster)s"})
or on() vector(0)) == bool 0) * (-1) + ((sum(kaas{cluster="%(cluster)s"}) or on()
vector(0)) == bool 0) * (-1) + sum(ALERTS{alertname!="Watchdog", cluster="%(cluster)s",
alertstate="firing", severity="warning", alertgroup=~"%(groupCluster)s|%(groupApp)s"}
OR on() vector(0)) + sum(ALERTS{alertname!="Watchdog", cluster="%(cluster)s", alertstate="firing",
severity="critical", alertgroup=~"%(groupCluster)s|%(groupApp)s"} OR on() vector(0))
* %(maxWarnings)d
graphMode: none
gridPos:
h: 3
w: 4
mappings:
- from: -2
text: '-'
to: -2
type: 2
value: ''
- from: -1
text: Down
to: -1
type: 2
value: ''
- from: 0
text: OK
to: 0
type: 2
value: ''
- from: 1
text: Warning
to: 9999
type: 2
value: ''
- from: 10000
text: Critical
to: 100000000000000005366162204393472
type: 2
value: ''
thresholds:
critical: 10000
lowest: 0
operator: '>='
warning: 1
unit: none
|
testbed
Property | Value |
---|---|
main | main |
Property | Value |
---|---|
panel | expr: ((sum(up{infrastructure="testbed"}) or on() vector(0)) == bool 0) * (-1) + sum(ALERTS{alertname!="Watchdog",
infrastructure="testbed", alertstate="firing", severity="warning"} OR on() vector(0))
+ sum(ALERTS{alertname!="Watchdog", infrastructure="testbed", alertstate="firing",
severity="critical"} OR on() vector(0)) * %(maxWarnings)d
graphMode: none
gridPos:
h: 3
w: 4
mappings:
- from: -1
text: Down
to: -1
type: 2
value: ''
- from: 0
text: OK
to: 0
type: 2
value: ''
- from: 1
text: Warning
to: 9999
type: 2
value: ''
- from: 10000
text: Critical
to: 100000000000000005366162204393472
type: 2
value: ''
thresholds:
critical: 10000
lowest: 0
operator: '>='
warning: 1
unit: none
|
L1
Property | Value |
---|---|
host | host |
hostApps | hostApps |
k8s | k8s |
k8sApps | k8sApps |
vm | vm |
vmApps | vmApps |
host
Property | Value |
---|---|
overallNetworkErrors | overallNetworkErrors |
overallUtilizationCPU | overallUtilizationCPU |
overallUtilizationDisk | overallUtilizationDisk |
overallUtilizationRAM | overallUtilizationRAM |
targetDown | targetDown |
totalCores | totalCores |
totalDisk | totalDisk |
totalRAM | totalRAM |
usedCores | usedCores |
usedDisk | usedDisk |
usedRAM | usedRAM |
Property | Value |
---|---|
alert | customLables:
alertgroup: Host
expr: sum(rate(node_network_transmit_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info) )
by (job, nodename) + sum(rate(node_network_receive_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, job, cluster, pod) group_left(nodename) (node_uname_info) ) by (job,
nodename)
linkGetParams: var-instance={{ $labels.nodename }}
message: 'Host {{ $labels.nodename }}: High Overall Network Errors Count {{ $value
}}%'
name: HostNetworkOverallErrorsHigh
thresholds:
critical: 15
operator: '>='
warning: 10
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: sum(rate(node_network_transmit_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance, job, cluster, pod) group_left(nodename) (node_uname_info) )
by (job, nodename) + sum(rate(node_network_receive_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, job, cluster, pod) group_left(nodename) (node_uname_info) ) by (job,
nodename)
gridPos:
x: 18
y: 6
thresholds:
critical: 15
operator: '>='
warning: 10
title: Overall Errors
unit: pps
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Host
expr: round((1 - (avg(irate(node_cpu_seconds_total{job=~"%s", mode="idle"}[5m]) *
on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job,
nodename) )) * 100)
linkGetParams: var-instance={{ $labels.nodename }}
message: 'Host {{ $labels.nodename }}: High CPU Overall Utilization {{ $value }}%'
name: HostCPUOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"$job", mode="idle"}[5m])
* on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job,
nodename) )) * 100))
gridPos:
x: 0
y: 6
thresholds:
critical: 90
operator: '>='
warning: 75
title: Overall Utilization
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Host
expr: round((sum(node_filesystem_size_bytes{job=~"%s"} * on(instance, job, cluster,
pod) group_left(nodename) (node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"%s"}
* on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job,
nodename, device)) / ((sum(node_filesystem_size_bytes{job=~"%s"} * on(instance,
job, cluster, pod) group_left(nodename) (node_uname_info)) by (job, nodename, device)
- sum(node_filesystem_free_bytes{job=~"%s"} * on(instance, job, cluster, pod) group_left(nodename)
(node_uname_info)) by (job, nodename, device)) + sum(node_filesystem_avail_bytes{job=~"%s"}
* on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) by (job,
nodename, device)) * 100 > 0)
linkGetParams: var-instance={{ $labels.nodename }}
message: 'Host {{ $labels.nodename }}: High Disk Overall Utilization {{ $value }}%'
name: HostDiskOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the disk utilization is calculated using the fraction:\n\
```\n |
Property | Value |
---|---|
alert | customLables:
alertgroup: Host
expr: round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"%s"}
* on(instance, job, cluster, pod) group_left(nodename) (node_uname_info)) / sum
by (job, nodename, cluster) (node_memory_MemTotal_bytes{job=~"%s"} * on(instance,
job, cluster, pod) group_left(nodename) (node_uname_info))) * 100)
linkGetParams: var-instance={{ $labels.nodename }}
message: 'Host {{ $labels.nodename }}: High RAM Overall Utilization {{ $value }}%'
name: HostRAMOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the memory utilization is calculated by:\n```\n1 -\
\ ( |
Property | Value |
---|---|
alert | customLables:
alertgroup: Host
expr: 100 * (count by(job, namespace, service) (up{job=~"%s"} == 0) / count by(job,
namespace, service) (up{job=~"%s"}))
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
}} targets in {{ $labels.namespace }} namespace are down.'
name: HostTargetDown
thresholds:
critical: 90
operator: '>='
warning: 10
|
panel | null |
Property | Value |
---|---|
panel | colorMode: value
expr: count(node_cpu_seconds_total{job=~"$job", mode="system"})
graphMode: none
gridPos:
h: 2
w: 3
x: 3
y: 9
thresholds:
color: '#858187'
value:
title: Total Cores
unit: none
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_filesystem_size_bytes{job=~"$job"})
graphMode: none
gridPos:
h: 2
w: 3
x: 15
y: 9
thresholds:
color: '#858187'
value:
title: Total
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_memory_MemTotal_bytes{job=~"$job"})
graphMode: none
gridPos:
h: 2
w: 3
x: 9
y: 9
thresholds:
color: '#858187'
value:
title: Total
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: (1 - (avg(irate(node_cpu_seconds_total{job=~"$job", mode="idle"}[5m])))) * count(node_cpu_seconds_total{job=~"$job",
mode="system"})
graphMode: none
gridPos:
h: 2
w: 3
x: 0
y: 9
thresholds:
color: '#858187'
value:
title: Used Cores
unit: none
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_filesystem_size_bytes{job=~"$job"}) - sum(node_filesystem_free_bytes{job=~"$job"})
graphMode: none
gridPos:
h: 2
w: 3
x: 12
y: 9
thresholds:
color: '#858187'
value:
title: Used
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_memory_MemTotal_bytes{job=~"$job"}) * (((1 - sum(node_memory_MemAvailable_bytes{job=~"$job"})
/ sum(node_memory_MemTotal_bytes{job=~"$job"}))))
graphMode: none
gridPos:
h: 2
w: 3
x: 6
y: 9
thresholds:
color: '#858187'
value:
title: Used
unit: bytes
|
hostApps
Property | Value |
---|---|
apache | apache |
autoscaler | autoscaler |
cAdvisor | cAdvisor |
genericApp | genericApp |
harbor | harbor |
javaActuator | javaActuator |
jvm | jvm |
lokiDistributed | lokiDistributed |
mysqlExporter | mysqlExporter |
nginxIngress | nginxIngress |
nginxIngressCertificateExpiry | nginxIngressCertificateExpiry |
nginxNrpe | nginxNrpe |
nginxVts | nginxVts |
nginxVtsEnhanced | nginxVtsEnhanced |
nginxVtsEnhancedLegacy | nginxVtsEnhancedLegacy |
nginxVtsLegacy | nginxVtsLegacy |
phpFpm | phpFpm |
postfix | postfix |
prometheus | prometheus |
pythonFlask | pythonFlask |
rabbitmq | rabbitmq |
sslExporter | sslExporter |
websocket | websocket |
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: (sum by (job, cluster) (autoscaler_healthy{job=~".+"}) / sum by (job, cluster)
(autoscaler_instances{job=~".+"}) * 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Autoscaler Health Low {{ $value }}%'
name: HostAppAutoscalerHealthLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - autoscaler
|
panel | expr: (sum by (job) (autoscaler_healthy{cluster="$cluster", %(job)s}) / sum by (job)
(autoscaler_instances{cluster="$cluster", %(job)s}) * 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | description: GenericApp template. Used when application monitoring is requested but
appropriate template was not found.
expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
gridPos:
w: 4
thresholds:
critical: 95
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: harbor_up{job=~".+"}
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Harbor component "{{ $labels.component }}" is
down'
name: HostAppHarborComponentDown
thresholds:
critical: 0
operator: ==
warning: 0
|
default | false |
linkTo | - harbor
|
panel | expr: (sum(harbor_up{cluster="$cluster", %(job)s}) / count(harbor_up{cluster="$cluster",
%(job)s}))*100 OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="heap"})*100/sum by
(job, cluster) (jvm_memory_max_bytes{job=~".+", area="nonheap"}) > sum by (job,
cluster) (jvm_memory_used_bytes{job=~".+", area="nonheap"})*100/sum by (job, cluster)
(jvm_memory_max_bytes{job=~".+", area="heap"}) or (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+",
area="nonheap"})*100)/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="heap"}))
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Java Actuator Heap High {{ $value }}%'
name: HostAppJavaActuatorHeapHigh
thresholds:
critical: 90
lowest: 0
operator: '>='
warning: 75
|
default | false |
linkTo | - javaactuator
|
panel | expr: (sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="heap"})*100/sum by
(job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="nonheap"}) > sum by
(job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100/sum by
(job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"}) or (sum by
(job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100)/sum
by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"})) OR on()
vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 90
lowest: 0
operator: '>='
warning: 75
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: ((sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+", status!~"[4-5].*"}[5m]))
/ sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m]))
* 100) > 0 OR (sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m]))
+ 100))
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Nginx Ingress Success Rate (non-4|5xx responses)
Low {{ printf "%.0f" $value }}%'
name: HostAppNginxIngressSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxingress
|
panel | expr: ((sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s,
status!~"[4-5].*"}[5m])) / sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster",
%(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster",
%(job)s}[5m])) + 100)) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{job=~".+"} - time())
/ 60 / 60 / 24
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Nginx Ingress Certificate Expiry in {{ printf
"%.2f" $value }} days'
name: HostAppNginxIngressCertificateExpiry
thresholds:
critical: 0
lowest: -100000000000000005366162204393472
operator: <
warning: 8
|
default | false |
linkTo | - nginxingress
|
panel | dataLinks:
- title: Detail
url: /d/nginxingress?var-job=%(job)s&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
decimals: 0
expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{cluster="$cluster",
%(job)s} - time()) OR on() vector(-100000000000000005366162204393472)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -100000000000000005366162204393472
thresholds:
critical: 0
lowest: -100000000000000005366162204393472
operator: <
warning: 691200
unit: s
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: HostAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvts
|
panel | expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s,
code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: HostAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvtsenhanced
|
panel | expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s,
code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: HostAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvtsenhancedlegacy
|
panel | expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*",
code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: HostAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvtslegacy
|
panel | expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*",
code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: (sum by (job, cluster) (postfix_size{job=~".+"}))
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
mappings:
- text: '-'
type: 1
value: -1
message: 'HostApp {{ $labels.job }}: Postfix Queue Size High {{ $value }}%'
name: HostAppPostfixQueueSizeHigh
thresholds:
critical: 10
lowest: 0
operator: '>='
warning: 5
|
default | false |
linkTo | - postfix
|
panel | expr: (sum by (job) (postfix_size{cluster="$cluster", %(job)s})) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 10
lowest: 0
operator: '>='
warning: 5
unit: mailq
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: HostApp
expr: (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+",status!~"[4-5].*"}[5m]))
/ sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m]))
* 100) > 0 OR (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'HostApp {{ $labels.job }}: Python Flask Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: HostAppPythonFlaskSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - pythonflask
|
panel | expr: (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster",
%(job)s,status!~"[4-5].*"}[5m])) / sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster",
%(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
linkTo | - ssl-exporter
|
panel | decimals: 0
expr: bottomk(1,ssl_cert_not_after{cluster="$cluster"}-time() OR ssl_file_cert_not_after{cluster="$cluster"}-time()
OR ssl_kubeconfig_cert_not_after{cluster="$cluster"}-time() OR ssl_kubernetes_cert_not_after{cluster="$cluster"}-time())
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -100000000000000005366162204393472
thresholds:
critical: 0
lowest: -100000000000000005366162204393472
operator: <
warning: 691200
unit: s
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
k8s
Property | Value |
---|---|
apiServerHealth | apiServerHealth |
controllerManagerHealth | controllerManagerHealth |
daemonSetsHealth | daemonSetsHealth |
deploymentsHealth | deploymentsHealth |
etcdHealth | etcdHealth |
kubeletHealth | kubeletHealth |
mostUtilizedMasterNodeCPU | mostUtilizedMasterNodeCPU |
mostUtilizedMasterNodeDisk | mostUtilizedMasterNodeDisk |
mostUtilizedMasterNodeNetworkErrors | mostUtilizedMasterNodeNetworkErrors |
mostUtilizedMasterNodeRAM | mostUtilizedMasterNodeRAM |
mostUtilizedPVC | mostUtilizedPVC |
mostUtilizedWorkerNodeCPU | mostUtilizedWorkerNodeCPU |
mostUtilizedWorkerNodeDisk | mostUtilizedWorkerNodeDisk |
mostUtilizedWorkerNodeNetworkErrors | mostUtilizedWorkerNodeNetworkErrors |
mostUtilizedWorkerNodeRAM | mostUtilizedWorkerNodeRAM |
nodeHealth | nodeHealth |
overallMasterNodesNetworkErrors | overallMasterNodesNetworkErrors |
overallUtilizationMasterNodesCPU | overallUtilizationMasterNodesCPU |
overallUtilizationMasterNodesDisk | overallUtilizationMasterNodesDisk |
overallUtilizationMasterNodesRAM | overallUtilizationMasterNodesRAM |
overallUtilizationWorkerNodesCPU | overallUtilizationWorkerNodesCPU |
overallUtilizationWorkerNodesDisk | overallUtilizationWorkerNodesDisk |
overallUtilizationWorkerNodesRAM | overallUtilizationWorkerNodesRAM |
overallWorkerNodesNetworkErrors | overallWorkerNodesNetworkErrors |
proxyHealth | proxyHealth |
pvcBound | pvcBound |
runningContainers | runningContainers |
runningPods | runningPods |
runningStatefulSets | runningStatefulSets |
schedulerHealth | schedulerHealth |
succeededJobs | succeededJobs |
targetDown | targetDown |
totalCoresMasterNodes | totalCoresMasterNodes |
totalCoresWorkerNodes | totalCoresWorkerNodes |
totalDiskMasterNodes | totalDiskMasterNodes |
totalDiskWorkerNodes | totalDiskWorkerNodes |
totalRAMMasterNodes | totalRAMMasterNodes |
totalRAMWorkerNodes | totalRAMWorkerNodes |
usedCoresMasterNodes | usedCoresMasterNodes |
usedCoresWorkerNodes | usedCoresWorkerNodes |
usedDiskMasterNodes | usedDiskMasterNodes |
usedDiskWorkerNodes | usedDiskWorkerNodes |
usedRAMMasterNodes | usedRAMMasterNodes |
usedRAMWorkerNodes | usedRAMWorkerNodes |
apiServerHealth
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: (sum(up{job="apiserver"}) by (cluster) / count(up{job="apiserver"}) by (cluster))
* 100
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Api Server Health Low {{ $value }}%
name: ClusterApiServerHealthLow
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
linkTo | - apiserver
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: (sum(up{cluster="$cluster", job="apiserver"}) / count(up{cluster="$cluster",
job="apiserver"})) * 100 OR on() vector(-1)
gridPos:
w: 4
x: 0
y: 5
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
title: API Server
|
controllerManagerHealth
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: (sum(up{job="kube-controller-manager"}) by (cluster) / count(up{job="kube-controller-manager"})
by (cluster)) * 100
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Controller Manager Health Low {{ $value }}%
name: ClusterControllerManagerHealthLow
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
linkTo | - controllermanager
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: (sum(up{cluster="$cluster", job="kube-controller-manager"}) / count(up{cluster="$cluster",
job="kube-controller-manager"})) * 100 OR on() vector(-1)
gridPos:
w: 4
x: 4
y: 5
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
title: Controller Manager
|
daemonSetsHealth
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round((sum(kube_daemonset_status_updated_number_scheduled OR kube_daemonset_updated_number_scheduled)
by (cluster) + sum(kube_daemonset_status_number_available) by (cluster)) / (2 *
sum(kube_daemonset_status_desired_number_scheduled) by (cluster)) * 100)
linkGetParams: var-cluster={{ $labels.cluster }}
message: DaemonSets Health Low {{ $value }}%
name: RunningDaemonSetsHealthLow
thresholds:
critical: 95
operator: <
warning: 99
|
linkTo | - daemonSetOverviewTable
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: round((sum(kube_daemonset_status_updated_number_scheduled{cluster="$cluster"}
OR kube_daemonset_updated_number_scheduled{cluster="$cluster"}) + sum(kube_daemonset_status_number_available{cluster="$cluster"}))
/ (2 * sum(kube_daemonset_status_desired_number_scheduled{cluster="$cluster"}))
* 100)
gridPos:
x: 6
y: 12
thresholds:
critical: 95
operator: <
warning: 99
title: DaemonSets Health
|
deploymentsHealth
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round((sum(kube_deployment_status_replicas_updated) by (cluster) + sum(kube_deployment_status_replicas_available)
by (cluster)) / (2 * sum(kube_deployment_status_replicas) by (cluster)) * 100)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Running Deployments Health Low {{ $value }}%
name: RunningDeploymentsHealthLow
thresholds:
critical: 95
operator: <
warning: 99
|
linkTo | - deploymentOverviewTable
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: round((sum(kube_deployment_status_replicas_updated{cluster="$cluster"}) + sum(kube_deployment_status_replicas_available{cluster="$cluster"}))
/ (2 * sum(kube_deployment_status_replicas{cluster="$cluster"})) * 100)
gridPos:
x: 0
y: 12
thresholds:
critical: 95
operator: <
warning: 99
title: Deployments Health
|
etcdHealth
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: (sum(up{job="kube-etcd"}) by (cluster) / count(up{job="kube-etcd"}) by (cluster))
* 100
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Etcd Health Low {{ $value }}%
name: ClusterEtcdHealthLow
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
linkTo | - etcd
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: (sum(up{cluster="$cluster", job="kube-etcd"}) / count(up{cluster="$cluster",
job="kube-etcd"})) * 100 OR on() vector(-1)
gridPos:
w: 4
x: 8
y: 5
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
title: Etcd
|
kubeletHealth
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: (sum(up{job="kubelet", metrics_path="/metrics"}) by (cluster) / count(up{job="kubelet",
metrics_path="/metrics"}) by (cluster)) * 100
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Kubelet Health Low {{ $value }}%
name: ClusterKubeletHealthLow
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
linkTo | - kubelet
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: (sum(up{cluster="$cluster", job="kubelet", metrics_path="/metrics"}) / count(up{cluster="$cluster",
job="kubelet", metrics_path="/metrics"})) * 100 OR on() vector(-1)
gridPos:
w: 4
x: 12
y: 5
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
title: Kubelet
|
mostUtilizedMasterNodeCPU
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round((1 - (avg(irate(node_cpu_seconds_total{job=~"node-exporter", mode="idle"}[5m])
* on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename,
cluster) )) * 100)
linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster
}}
message: 'Cluster Master Node {{ $labels.nodename }}: High CPU Utilization {{ $value
}}%'
name: ClusterMasterNodeCPUUtilizationHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/cpuoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/cpunamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: max(round((1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job",
mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info))
by (job, nodename) )) * 100))
gridPos:
w: 3
x: 3
y: 17
thresholds:
critical: 90
operator: '>='
warning: 75
title: Most Utilized Node
|
mostUtilizedMasterNodeDisk
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod)
group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster) -
sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(master_uname_info)) by (job, nodename, device, cluster)) / ((sum(node_filesystem_size_bytes{job=~"node-exporter"}
* on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename,
device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance,
pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster))
+ sum(node_filesystem_avail_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(master_uname_info)) by (job, nodename, device, cluster)) * 100)
linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster
}}
message: 'Cluster Master Node {{ $labels.nodename }}: High Disk Utilization {{ $value
}}%'
name: ClusterMasterNodeDiskUtilizationHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/diskoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
description: "The percentage of the disk utilization is calculated using the fraction:\n\
```\n |
mostUtilizedMasterNodeNetworkErrors
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: sum(rate(node_network_transmit_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename,
cluster) + sum(rate(node_network_receive_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename,
cluster)
linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster
}}
message: 'Cluster Master Node {{ $labels.nodename }}: High Network Errors Count {{
$value }}%'
name: ClusterMasterNodeNetworkErrorsHigh
thresholds:
critical: 15
operator: '>='
warning: 10
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/networkoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/networknamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: max(sum(rate(node_network_transmit_errs_total{cluster="$cluster", job=~"$job",
device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance,
pod) group_left(nodename) (master_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{cluster="$cluster",
job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename))
gridPos:
w: 3
x: 21
y: 17
thresholds:
critical: 15
operator: '>='
warning: 10
title: Most Affected Node
unit: pps
|
mostUtilizedMasterNodeRAM
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"node-exporter"}
* on(instance, pod) group_left(nodename) (master_uname_info)) / sum by (job, nodename,
cluster) (node_memory_MemTotal_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(master_uname_info))) * 100)
linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster
}}
message: 'Cluster Master Node {{ $labels.nodename }}: High RAM Utilization {{ $value
}}%'
name: ClusterMasterNodesRAMUtilizationHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/memoryoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/memorynamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the memory utilization is calculated by:\n```\n1 -\
\ ( |
mostUtilizedPVC
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: sum(((kubelet_volume_stats_capacity_bytes - kubelet_volume_stats_available_bytes)
/ kubelet_volume_stats_capacity_bytes) * 100) by (persistentvolumeclaim, cluster)
linkGetParams: var-pvc={{ $labels.persistentvolumeclaim }}&var-cluster={{ $labels.cluster
}}
message: '"{{ $labels.persistentvolumeclaim }}": High PVC Utilization {{ $value }}%'
name: PVCUtilizationHigh
thresholds:
critical: 97
lowest: 0
operator: '>='
warning: 85
|
linkTo | - pvcOverviewTable
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: max(sum(((kubelet_volume_stats_capacity_bytes{cluster="$cluster"} - kubelet_volume_stats_available_bytes{cluster="$cluster"})
/ kubelet_volume_stats_capacity_bytes{cluster="$cluster"}) * 100) by (persistentvolumeclaim))
OR on() vector(-1)
gridPos:
w: 3
x: 21
y: 12
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 97
lowest: 0
operator: '>='
warning: 85
title: Most Utilized PVC
|
mostUtilizedWorkerNodeCPU
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round((1 - (avg(irate(node_cpu_seconds_total{job=~"node-exporter", mode="idle"}[5m])
* on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename,
cluster) )) * 100)
linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster
}}
message: 'Cluster Worker Node {{ $labels.nodename }}: High CPU Utilization {{ $value
}}%'
name: ClusterWorkerNodeCPUUtilizationHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/cpuoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/cpunamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: max(round((1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job",
mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info))
by (job, nodename) )) * 100))
gridPos:
w: 3
x: 3
y: 24
thresholds:
critical: 90
operator: '>='
warning: 75
title: Most Utilized Node
|
mostUtilizedWorkerNodeDisk
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance, pod)
group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster) -
sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(worker_uname_info)) by (job, nodename, device, cluster)) / ((sum(node_filesystem_size_bytes{job=~"node-exporter"}
* on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename,
device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance,
pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster))
+ sum(node_filesystem_avail_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(worker_uname_info)) by (job, nodename, device, cluster)) * 100)
linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster
}}
message: 'Cluster Worker Node {{ $labels.nodename }}: High Disk Utilization {{ $value
}}%'
name: ClusterWorkerNodeDiskUtilizationHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/diskoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
description: "The percentage of the disk utilization is calculated using the fraction:\n\
```\n |
mostUtilizedWorkerNodeNetworkErrors
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: sum(rate(node_network_transmit_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename,
cluster) + sum(rate(node_network_receive_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename,
cluster)
linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster
}}
message: 'Cluster Worker Node {{ $labels.nodename }}: High Network Errors Count {{
$value }}%'
name: ClusterWorkerNodeNetworkErrorsHigh
thresholds:
critical: 15
operator: '>='
warning: 10
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/networkoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/networknamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: max(sum(rate(node_network_transmit_errs_total{cluster="$cluster", job=~"$job",
device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance,
pod) group_left(nodename) (worker_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{cluster="$cluster",
job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename))
gridPos:
w: 3
x: 21
y: 24
thresholds:
critical: 15
operator: '>='
warning: 10
title: Most Affected Node
unit: pps
|
mostUtilizedWorkerNodeRAM
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"node-exporter"}
* on(instance, pod) group_left(nodename) (worker_uname_info)) / sum by (job, nodename,
cluster) (node_memory_MemTotal_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(worker_uname_info))) * 100)
linkGetParams: var-instance={{ $labels.nodename }}&var-cluster={{ $labels.cluster
}}
message: 'Cluster Worker Node {{ $labels.nodename }}: High RAM Utilization {{ $value
}}%'
name: ClusterWorkerNodesRAMUtilizationHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/memoryoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/memorynamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the memory utilization is calculated by:\n```\n1 -\
\ ( |
nodeHealth
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round(sum(kube_node_info) by (cluster) / (sum(kube_node_info) by (cluster) +
sum(kube_node_spec_unschedulable) by (cluster) + sum(kube_node_status_condition{condition=~"DiskPressure|MemoryPressure|PIDPressure",
status=~"true|unknown"}) by (cluster) + sum(kube_node_status_condition{condition="Ready",
status=~"false|unknown"}) by (cluster)) * 100)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Nodes Health Low {{ $value }}%
name: NodesHealthLow
thresholds:
critical: 95
operator: <
warning: 99
|
linkTo | - nodeOverviewTable
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: round(sum(kube_node_info{cluster="$cluster"}) / (sum(kube_node_info{cluster="$cluster"})
+ sum(kube_node_spec_unschedulable{cluster="$cluster"}) + sum(kube_node_status_condition{cluster="$cluster",
condition=~"DiskPressure|MemoryPressure|PIDPressure", status=~"true|unknown"}) +
sum(kube_node_status_condition{cluster="$cluster", condition="Ready", status=~"false|unknown"})
) * 100)
gridPos:
x: 0
y: 9
thresholds:
critical: 95
operator: <
warning: 99
title: Nodes Health
|
overallMasterNodesNetworkErrors
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: sum(sum(rate(node_network_transmit_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename,
cluster) + sum(rate(node_network_receive_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename,
cluster)) by (cluster)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Master Nodes High Overall Network Errors Count {{ $value }}%
name: ClusterMasterNodesNetworkOverallErrorsHigh
thresholds:
critical: 15
operator: '>='
warning: 10
|
linkTo | - networkPerNodePolystat
|
panel | dataLinks:
- title: System Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/networknamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: sum(sum(rate(node_network_transmit_errs_total{cluster="$cluster", job=~"$job",
device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance,
pod) group_left(nodename) (master_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{cluster="$cluster",
job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, pod) group_left(nodename) (master_uname_info) ) by (job, nodename))
gridPos:
w: 3
x: 18
y: 17
thresholds:
critical: 15
operator: '>='
warning: 10
title: Overall Errors
unit: pps
|
overallUtilizationMasterNodesCPU
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"node-exporter", mode="idle"}[5m])
* on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename,
cluster) )) * 100)) by (cluster)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Master Nodes High CPU Overall Utilization {{ $value }}%
name: ClusterMasterNodesCPUOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - cpuPerNodePolystat
|
panel | dataLinks:
- title: System Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/cpunamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job",
mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (master_uname_info))
by (job, nodename) )) * 100))
gridPos:
w: 3
x: 0
y: 17
thresholds:
critical: 90
operator: '>='
warning: 75
title: Overall Utilization
|
overallUtilizationMasterNodesDisk
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: avg(round((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance,
pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster)
- sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(master_uname_info)) by (job, nodename, device, cluster)) / ((sum(node_filesystem_size_bytes{job=~"node-exporter"}
* on(instance, pod) group_left(nodename) (master_uname_info)) by (job, nodename,
device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance,
pod) group_left(nodename) (master_uname_info)) by (job, nodename, device, cluster))
+ sum(node_filesystem_avail_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(master_uname_info)) by (job, nodename, device, cluster)) * 100 > 0)) by (cluster)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Master Nodes High Disk Overall Utilization {{ $value }}%
name: ClusterMasterNodesDiskOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - diskPerNodePolystat
|
panel | dataLinks:
- title: System Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
description: "The percentage of the disk utilization is calculated using the fraction:\n\
```\n |
overallUtilizationMasterNodesRAM
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: avg(round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"node-exporter"}
* on(instance, pod) group_left(nodename) (master_uname_info)) / sum by (job, nodename,
cluster) (node_memory_MemTotal_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(master_uname_info))) * 100)) by (cluster)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Master Nodes High RAM Overall Utilization {{ $value }}%
name: ClusterMasterNodesRAMOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - memoryPerNodePolystat
|
panel | dataLinks:
- title: System Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/memorynamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the memory utilization is calculated by:\n```\n1 -\
\ ( |
overallUtilizationWorkerNodesCPU
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"node-exporter", mode="idle"}[5m])
* on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename,
cluster) )) * 100)) by (cluster)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Worker Nodes High CPU Overall Utilization {{ $value }}%
name: ClusterWorkerNodesCPUOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - cpuPerNodePolystat
|
panel | dataLinks:
- title: System Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/cpunamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job",
mode="idle"}[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info))
by (job, nodename) )) * 100))
gridPos:
w: 3
x: 0
y: 24
thresholds:
critical: 90
operator: '>='
warning: 75
title: Overall Utilization
|
overallUtilizationWorkerNodesDisk
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: avg(round((sum(node_filesystem_size_bytes{job=~"node-exporter"} * on(instance,
pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster)
- sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(worker_uname_info)) by (job, nodename, device, cluster)) / ((sum(node_filesystem_size_bytes{job=~"node-exporter"}
* on(instance, pod) group_left(nodename) (worker_uname_info)) by (job, nodename,
device, cluster) - sum(node_filesystem_free_bytes{job=~"node-exporter"} * on(instance,
pod) group_left(nodename) (worker_uname_info)) by (job, nodename, device, cluster))
+ sum(node_filesystem_avail_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(worker_uname_info)) by (job, nodename, device, cluster)) * 100 > 0)) by (cluster)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Worker Nodes High Disk Overall Utilization {{ $value }}%
name: ClusterWorkerNodesDiskOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - diskPerNodePolystat
|
panel | dataLinks:
- title: System Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
description: "The percentage of the disk utilization is calculated using the fraction:\n\
```\n |
overallUtilizationWorkerNodesRAM
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: avg(round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"node-exporter"}
* on(instance, pod) group_left(nodename) (worker_uname_info)) / sum by (job, nodename,
cluster) (node_memory_MemTotal_bytes{job=~"node-exporter"} * on(instance, pod) group_left(nodename)
(worker_uname_info))) * 100)) by (cluster)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Worker Nodes High RAM Overall Utilization {{ $value }}%
name: ClusterWorkerNodesRAMOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - memoryPerNodePolystat
|
panel | dataLinks:
- title: System Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/memorynamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the memory utilization is calculated by:\n```\n1 -\
\ ( |
overallWorkerNodesNetworkErrors
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: sum(sum(rate(node_network_transmit_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename,
cluster) + sum(rate(node_network_receive_errs_total{job=~"node-exporter", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename,
cluster)) by (cluster)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Worker Nodes High Overall Network Errors Count {{ $value }}%
name: ClusterWorkerNodesNetworkOverallErrorsHigh
thresholds:
critical: 15
operator: '>='
warning: 10
|
linkTo | - networkPerNodePolystat
|
panel | dataLinks:
- title: System Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to&var-instance=All
- title: K8s Overview
url: /d/networknamespaceoverview?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: sum(sum(rate(node_network_transmit_errs_total{cluster="$cluster", job=~"$job",
device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"} [5m]) * on(instance,
pod) group_left(nodename) (worker_uname_info) ) by (job, nodename) + sum(rate(node_network_receive_errs_total{cluster="$cluster",
job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, pod) group_left(nodename) (worker_uname_info) ) by (job, nodename))
gridPos:
w: 3
x: 18
y: 24
thresholds:
critical: 15
operator: '>='
warning: 10
title: Overall Errors
unit: pps
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: (sum(up{job="kube-proxy"}) by (cluster) / count(up{job="kube-proxy"}) by (cluster))
* 100
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Proxy Health Low {{ $value }}%
name: ClusterProxyHealthLow
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
linkTo | - proxy
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: (sum(up{cluster="$cluster", job="kube-proxy"}) / count(up{cluster="$cluster",
job="kube-proxy"})) * 100 OR on() vector(-1)
gridPos:
w: 4
x: 16
y: 5
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
title: Proxy
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: "round(sum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) by (cluster)\
\ / (\nsum(kube_persistentvolumeclaim_status_phase{phase=\"Bound\"}) by (cluster)\
\ + sum(kube_persistentvolumeclaim_status_phase{phase=\"Pending\"}) by (cluster)\
\ +\nsum(kube_persistentvolumeclaim_status_phase{phase=\"Lost\"}) by (cluster)\n\
) * 100)"
linkGetParams: var-cluster={{ $labels.cluster }}
message: PVC Bound Rate Low {{ $value }}%
name: PVCBoundRateLow
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
linkTo | - pvcOverviewTable
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: "round(sum(kube_persistentvolumeclaim_status_phase{cluster=\"$cluster\", phase=\"\
Bound\"}) / (\nsum(kube_persistentvolumeclaim_status_phase{cluster=\"$cluster\"\
, phase=\"Bound\"}) + sum(kube_persistentvolumeclaim_status_phase{cluster=\"$cluster\"\
, phase=\"Pending\"}) +\nsum(kube_persistentvolumeclaim_status_phase{cluster=\"\
$cluster\", phase=\"Lost\"})\n) * 100) OR on() vector(-1)"
gridPos:
w: 3
x: 18
y: 12
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
title: PVC Bound
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round(sum(kube_pod_container_status_running) by (cluster) / (sum(kube_pod_container_status_running)
by (cluster) + (count(kube_pod_container_status_terminated) by (cluster) - count(kube_pod_container_status_terminated
unless ignoring(reason) kube_pod_container_status_terminated_reason{reason!="Completed"})
by (cluster)) + sum(kube_pod_container_status_waiting) by (cluster)) * 100)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Running Containers Health Low {{ $value }}%
name: RunningContainersHealthLow
thresholds:
critical: 95
operator: <
warning: 99
|
linkTo | - containerOverviewTable
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: round(sum(kube_pod_container_status_running{cluster="$cluster"}) / (sum(kube_pod_container_status_running{cluster="$cluster"})
+ (sum(kube_pod_container_status_terminated_reason{cluster="$cluster", reason!="Completed"})
OR vector(0)) + sum(kube_pod_container_status_waiting{cluster="$cluster"})) * 100)
gridPos:
x: 12
y: 12
thresholds:
critical: 95
operator: <
warning: 99
title: Running Containers
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round(sum(kube_pod_status_phase{phase="Running"}) by (cluster) / (sum(kube_pod_status_phase{phase="Running"})
by (cluster) + sum(kube_pod_status_phase{phase="Pending"}) by (cluster) + sum(kube_pod_status_phase{phase="Failed"})
by (cluster) + sum(kube_pod_status_phase{phase="Unknown"}) by (cluster)) * 100)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Pods Health Low {{ $value }}%
name: RunningPodsHealthLow
thresholds:
critical: 95
operator: <
warning: 99
|
linkTo | - podOverviewTable
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: round(sum(kube_pod_status_phase{cluster="$cluster", phase="Running"}) / (sum(kube_pod_status_phase{cluster="$cluster",
phase="Running"}) + sum(kube_pod_status_phase{cluster="$cluster", phase="Pending"})
+ sum(kube_pod_status_phase{cluster="$cluster", phase="Failed"}) + sum(kube_pod_status_phase{cluster="$cluster",
phase="Unknown"})) * 100)
gridPos:
x: 12
y: 9
thresholds:
critical: 95
operator: <
warning: 99
title: Running Pods
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round(sum(kube_statefulset_status_replicas_ready) by (cluster) / sum(kube_statefulset_status_replicas)
by (cluster) * 100)
linkGetParams: var-cluster={{ $labels.cluster }}
message: StatefulSets Health Low {{ $value }}%
name: RunningStatefulSetsHealthLow
thresholds:
critical: 95
operator: <
warning: 99
|
linkTo | - statefulSetOverviewTable
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: round(sum(kube_statefulset_status_replicas_ready{cluster="$cluster"}) / sum(kube_statefulset_status_replicas{cluster="$cluster"})
* 100)
gridPos:
x: 6
y: 9
thresholds:
critical: 95
operator: <
warning: 99
title: Running StatefulSets
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: (sum(up{job="kube-scheduler"}) by (cluster) / count(up{job="kube-scheduler"})
by (cluster)) * 100
linkGetParams: var-cluster={{ $labels.cluster }}
message: Cluster Scheduler Health Low {{ $value }}%
name: ClusterSchedulerHealthLow
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
linkTo | - scheduler
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: (sum(up{cluster="$cluster", job="kube-scheduler"}) / count(up{cluster="$cluster",
job="kube-scheduler"})) * 100 OR on() vector(-1)
gridPos:
w: 4
x: 20
y: 5
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
title: Scheduler
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: round(sum(kube_job_status_succeeded) by (cluster) / (sum(kube_job_status_succeeded)
by (cluster) + sum(kube_job_status_failed) by (cluster)) * 100)
linkGetParams: var-cluster={{ $labels.cluster }}
message: Succeeded Jobs Rate Low {{ $value }}%
name: SucceededJobsRateLow
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
linkTo | - jobOverviewTable
|
panel | dataLinks:
- title: K8s Overview
url: /d/{}?refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: round(sum(kube_job_status_succeeded{cluster="$cluster"}) / (sum(kube_job_status_succeeded{cluster="$cluster"})
+ sum(kube_job_status_failed{cluster="$cluster"})) * 100) OR on() vector(-1)
gridPos:
x: 18
y: 9
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
title: Succeeded Jobs
|
Property | Value |
---|---|
alert | customLables:
alertgroup: Cluster
expr: 100 * (count by(job, namespace, service, cluster) (up{pod!~"virt-launcher.*|"}
== 0) / count by(job, namespace, service, cluster) (up{pod!~"virt-launcher.*|"}))
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
}} targets in {{ $labels.namespace }} namespace are down.'
name: ClusterTargetDown
thresholds:
critical: 90
operator: '>='
warning: 10
|
panel | null |
Property | Value |
---|---|
panel | colorMode: value
expr: count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system",
instance=~"$masterInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 3
y: 20
thresholds:
color: '#858187'
value:
title: Total Cores
unit: none
|
Property | Value |
---|---|
panel | colorMode: value
expr: count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system",
instance=~"$workerInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 3
y: 27
thresholds:
color: '#858187'
value:
title: Total Cores
unit: none
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 15
y: 20
thresholds:
color: '#858187'
value:
title: Total
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 15
y: 27
thresholds:
color: '#858187'
value:
title: Total
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 9
y: 20
thresholds:
color: '#858187'
value:
title: Total
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 9
y: 27
thresholds:
color: '#858187'
value:
title: Total
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: (1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle",
instance=~"$masterInstance"}[5m])))) * count(node_cpu_seconds_total{cluster="$cluster",
job=~"$job", mode="system", instance=~"$masterInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 0
y: 20
thresholds:
color: '#858187'
value:
title: Used Cores
unit: none
|
Property | Value |
---|---|
panel | colorMode: value
expr: (1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle",
instance=~"$workerInstance"}[5m])))) * count(node_cpu_seconds_total{cluster="$cluster",
job=~"$job", mode="system", instance=~"$workerInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 0
y: 27
thresholds:
color: '#858187'
value:
title: Used Cores
unit: none
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"})
- sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 12
y: 20
thresholds:
color: '#858187'
value:
title: Used
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"})
- sum(node_filesystem_free_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"})
graphMode: none
gridPos:
h: 2
w: 3
x: 12
y: 27
thresholds:
color: '#858187'
value:
title: Used
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"})
* (((1 - sum(node_memory_MemAvailable_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"})
/ sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$masterInstance"}))))
graphMode: none
gridPos:
h: 2
w: 3
x: 6
y: 20
thresholds:
color: '#858187'
value:
title: Used
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"})
* (((1 - sum(node_memory_MemAvailable_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"})
/ sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job", instance=~"$workerInstance"}))))
graphMode: none
gridPos:
h: 2
w: 3
x: 6
y: 27
thresholds:
color: '#858187'
value:
title: Used
unit: bytes
|
k8sApps
Property | Value |
---|---|
apache | apache |
autoscaler | autoscaler |
cAdvisor | cAdvisor |
genericApp | genericApp |
harbor | harbor |
javaActuator | javaActuator |
jvm | jvm |
lokiDistributed | lokiDistributed |
mysqlExporter | mysqlExporter |
nginxIngress | nginxIngress |
nginxIngressCertificateExpiry | nginxIngressCertificateExpiry |
nginxNrpe | nginxNrpe |
nginxVts | nginxVts |
nginxVtsEnhanced | nginxVtsEnhanced |
nginxVtsEnhancedLegacy | nginxVtsEnhancedLegacy |
nginxVtsLegacy | nginxVtsLegacy |
phpFpm | phpFpm |
postfix | postfix |
prometheus | prometheus |
pythonFlask | pythonFlask |
rabbitmq | rabbitmq |
sslExporter | sslExporter |
websocket | websocket |
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: (sum by (job, cluster) (autoscaler_healthy{job=~".+"}) / sum by (job, cluster)
(autoscaler_instances{job=~".+"}) * 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Autoscaler Health Low {{ $value }}%'
name: ClusterAppAutoscalerHealthLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - autoscaler
|
panel | expr: (sum by (job) (autoscaler_healthy{cluster="$cluster", %(job)s}) / sum by (job)
(autoscaler_instances{cluster="$cluster", %(job)s}) * 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | description: GenericApp template. Used when application monitoring is requested but
appropriate template was not found.
expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
gridPos:
w: 4
thresholds:
critical: 95
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: harbor_up{job=~".+"}
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Harbor component "{{ $labels.component }}"
is down'
name: ClusterAppHarborComponentDown
thresholds:
critical: 0
operator: ==
warning: 0
|
default | false |
linkTo | - harbor
|
panel | expr: (sum(harbor_up{cluster="$cluster", %(job)s}) / count(harbor_up{cluster="$cluster",
%(job)s}))*100 OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="heap"})*100/sum by
(job, cluster) (jvm_memory_max_bytes{job=~".+", area="nonheap"}) > sum by (job,
cluster) (jvm_memory_used_bytes{job=~".+", area="nonheap"})*100/sum by (job, cluster)
(jvm_memory_max_bytes{job=~".+", area="heap"}) or (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+",
area="nonheap"})*100)/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="heap"}))
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Java Actuator Heap High {{ $value }}%'
name: ClusterAppJavaActuatorHeapHigh
thresholds:
critical: 90
lowest: 0
operator: '>='
warning: 75
|
default | false |
linkTo | - javaactuator
|
panel | expr: (sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="heap"})*100/sum by
(job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="nonheap"}) > sum by
(job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100/sum by
(job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"}) or (sum by
(job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100)/sum
by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"})) OR on()
vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 90
lowest: 0
operator: '>='
warning: 75
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: ((sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+", status!~"[4-5].*"}[5m]))
/ sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m]))
* 100) > 0 OR (sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m]))
+ 100))
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Nginx Ingress Success Rate (non-4|5xx responses)
Low {{ printf "%.0f" $value }}%'
name: ClusterAppNginxIngressSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxingress
|
panel | expr: ((sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s,
status!~"[4-5].*"}[5m])) / sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster",
%(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster",
%(job)s}[5m])) + 100)) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{job=~".+"} - time())
/ 60 / 60 / 24
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Nginx Ingress Certificate Expiry in {{ printf
"%.2f" $value }} days'
name: ClusterAppNginxIngressCertificateExpiry
thresholds:
critical: 0
lowest: -100000000000000005366162204393472
operator: <
warning: 8
|
default | false |
linkTo | - nginxingress
|
panel | dataLinks:
- title: Detail
url: /d/nginxingress?var-job=%(job)s&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
decimals: 0
expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{cluster="$cluster",
%(job)s} - time()) OR on() vector(-100000000000000005366162204393472)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -100000000000000005366162204393472
thresholds:
critical: 0
lowest: -100000000000000005366162204393472
operator: <
warning: 691200
unit: s
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvts
|
panel | expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s,
code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvtsenhanced
|
panel | expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s,
code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvtsenhancedlegacy
|
panel | expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*",
code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvtslegacy
|
panel | expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*",
code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: (sum by (job, cluster) (postfix_size{job=~".+"}))
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
mappings:
- text: '-'
type: 1
value: -1
message: 'ClusterApp {{ $labels.job }}: Postfix Queue Size High {{ $value }}%'
name: ClusterAppPostfixQueueSizeHigh
thresholds:
critical: 10
lowest: 0
operator: '>='
warning: 5
|
default | false |
linkTo | - postfix
|
panel | expr: (sum by (job) (postfix_size{cluster="$cluster", %(job)s})) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 10
lowest: 0
operator: '>='
warning: 5
unit: mailq
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterApp
expr: (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+",status!~"[4-5].*"}[5m]))
/ sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m]))
* 100) > 0 OR (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterApp {{ $labels.job }}: Python Flask Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterAppPythonFlaskSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - pythonflask
|
panel | expr: (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster",
%(job)s,status!~"[4-5].*"}[5m])) / sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster",
%(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
linkTo | - ssl-exporter
|
panel | decimals: 0
expr: bottomk(1,ssl_cert_not_after{cluster="$cluster"}-time() OR ssl_file_cert_not_after{cluster="$cluster"}-time()
OR ssl_kubeconfig_cert_not_after{cluster="$cluster"}-time() OR ssl_kubernetes_cert_not_after{cluster="$cluster"}-time())
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -100000000000000005366162204393472
thresholds:
critical: 0
lowest: -100000000000000005366162204393472
operator: <
warning: 691200
unit: s
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
vm
Property | Value |
---|---|
main | main |
Property | Value |
---|---|
panel | expr: sum(ALERTS{alertname!="Watchdog", alertstate="firing", severity="warning", job=~"%(job)s",
alertgroup=~"%(groupVM)s|%(groupVMApp)s"} OR on() vector(0)) + sum(ALERTS{alertname!="Watchdog",
alertstate="firing", severity="critical", job=~"%(job)s", alertgroup=~"%(groupVM)s|%(groupVMApp)s"}
OR on() vector(0)) * %(maxWarnings)d
graphMode: none
gridPos:
h: 3
w: 4
mappings:
- from: 0
text: OK
to: 0
type: 2
value: ''
- from: 1
text: Warning
to: 9999
type: 2
value: ''
- from: 10000
text: Critical
to: 100000000000000005366162204393472
type: 2
value: ''
thresholds:
critical: 10000
operator: '>='
warning: 1
unit: none
|
vmApps
Property | Value |
---|---|
apache | apache |
autoscaler | autoscaler |
cAdvisor | cAdvisor |
genericApp | genericApp |
harbor | harbor |
javaActuator | javaActuator |
jvm | jvm |
lokiDistributed | lokiDistributed |
mysqlExporter | mysqlExporter |
nginxIngress | nginxIngress |
nginxIngressCertificateExpiry | nginxIngressCertificateExpiry |
nginxNrpe | nginxNrpe |
nginxVts | nginxVts |
nginxVtsEnhanced | nginxVtsEnhanced |
nginxVtsEnhancedLegacy | nginxVtsEnhancedLegacy |
nginxVtsLegacy | nginxVtsLegacy |
phpFpm | phpFpm |
postfix | postfix |
prometheus | prometheus |
pythonFlask | pythonFlask |
rabbitmq | rabbitmq |
sslExporter | sslExporter |
websocket | websocket |
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: (sum by (job, cluster) (autoscaler_healthy{job=~".+"}) / sum by (job, cluster)
(autoscaler_instances{job=~".+"}) * 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Autoscaler Health Low {{ $value }}%'
name: ClusterVMAppAutoscalerHealthLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - autoscaler
|
panel | expr: (sum by (job) (autoscaler_healthy{cluster="$cluster", %(job)s}) / sum by (job)
(autoscaler_instances{cluster="$cluster", %(job)s}) * 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | description: GenericApp template. Used when application monitoring is requested but
appropriate template was not found.
expr: (sum(up{%(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
gridPos:
w: 4
thresholds:
critical: 95
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: harbor_up{job=~".+"}
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Harbor component "{{ $labels.component }}"
is down'
name: ClusterVMAppHarborComponentDown
thresholds:
critical: 0
operator: ==
warning: 0
|
default | false |
linkTo | - harbor
|
panel | expr: (sum(harbor_up{cluster="$cluster", %(job)s}) / count(harbor_up{cluster="$cluster",
%(job)s}))*100 OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+", area="heap"})*100/sum by
(job, cluster) (jvm_memory_max_bytes{job=~".+", area="nonheap"}) > sum by (job,
cluster) (jvm_memory_used_bytes{job=~".+", area="nonheap"})*100/sum by (job, cluster)
(jvm_memory_max_bytes{job=~".+", area="heap"}) or (sum by (job, cluster) (jvm_memory_used_bytes{job=~".+",
area="nonheap"})*100)/sum by (job, cluster) (jvm_memory_max_bytes{job=~".+", area="heap"}))
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Java Actuator Heap High {{ $value }}%'
name: ClusterVMAppJavaActuatorHeapHigh
thresholds:
critical: 90
lowest: 0
operator: '>='
warning: 75
|
default | false |
linkTo | - javaactuator
|
panel | expr: (sum by (job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="heap"})*100/sum by
(job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="nonheap"}) > sum by
(job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100/sum by
(job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"}) or (sum by
(job) (jvm_memory_used_bytes{cluster="$cluster", %(job)s, area="nonheap"})*100)/sum
by (job) (jvm_memory_max_bytes{cluster="$cluster", %(job)s, area="heap"})) OR on()
vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 90
lowest: 0
operator: '>='
warning: 75
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: ((sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+", status!~"[4-5].*"}[5m]))
/ sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m]))
* 100) > 0 OR (sum by (job, cluster) (rate(nginx_ingress_controller_requests{job=~".+"}[5m]))
+ 100))
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Nginx Ingress Success Rate (non-4|5xx responses)
Low {{ printf "%.0f" $value }}%'
name: ClusterVMAppNginxIngressSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxingress
|
panel | expr: ((sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster", %(job)s,
status!~"[4-5].*"}[5m])) / sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster",
%(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_ingress_controller_requests{cluster="$cluster",
%(job)s}[5m])) + 100)) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{job=~".+"} - time())
/ 60 / 60 / 24
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Nginx Ingress Certificate Expiry in {{ printf
"%.2f" $value }} days'
name: ClusterVMAppNginxIngressCertificateExpiry
thresholds:
critical: 0
lowest: -100000000000000005366162204393472
operator: <
warning: 8
|
default | false |
linkTo | - nginxingress
|
panel | dataLinks:
- title: Detail
url: /d/nginxingress?var-job=%(job)s&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
decimals: 0
expr: bottomk(1, nginx_ingress_controller_ssl_expire_time_seconds{cluster="$cluster",
%(job)s} - time()) OR on() vector(-100000000000000005366162204393472)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -100000000000000005366162204393472
thresholds:
critical: 0
lowest: -100000000000000005366162204393472
operator: <
warning: 691200
unit: s
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterVMAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvts
|
panel | expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s,
code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_vts_server_requests_total{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterVMAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvtsenhanced
|
panel | expr: (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster", %(job)s,
code!~"[4-5].*", code!="total"}[5m])) / sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_vts_server_requests_total{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterVMAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvtsenhancedlegacy
|
panel | expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*",
code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: (sum by (job, cluster) (rate(nginx_server_requests{job=~".+", code!~"[4-5].*",
code!="total"}[5m])) / sum by (job, cluster) (rate(nginx_server_requests{job=~".+",
code!="total"}[5m])) * 100) > 0 OR (sum by (job, cluster) (rate(nginx_server_requests{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Nginx VTS Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterVMAppNginxVTSSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - nginxvtslegacy
|
panel | expr: (sum by (job) (rate(nginx_server_requests{cluster="$cluster", %(job)s, code!~"[4-5].*",
code!="total"}[5m])) / sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s, code!="total"}[5m])) * 100) > 0 OR (sum by (job) (rate(nginx_server_requests{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: (sum by (job, cluster) (postfix_size{job=~".+"}))
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
mappings:
- text: '-'
type: 1
value: -1
message: 'ClusterVMApp {{ $labels.job }}: Postfix Queue Size High {{ $value }}%'
name: ClusterVMAppPostfixQueueSizeHigh
thresholds:
critical: 10
lowest: 0
operator: '>='
warning: 5
|
default | false |
linkTo | - postfix
|
panel | expr: (sum by (job) (postfix_size{cluster="$cluster", %(job)s})) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 10
lowest: 0
operator: '>='
warning: 5
unit: mailq
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100 OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVMApp
expr: (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+",status!~"[4-5].*"}[5m]))
/ sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m]))
* 100) > 0 OR (sum by (job, cluster) (rate(flask_http_request_duration_seconds_count{job=~".+"}[5m]))
+ 100)
linkGetParams: var-job={{ $labels.job }}&var-cluster={{ $labels.cluster }}
message: 'ClusterVMApp {{ $labels.job }}: Python Flask Success Rate (non-4|5xx responses)
Low {{ $value }}%'
name: ClusterVMAppPythonFlaskSuccessRateLow
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
default | false |
linkTo | - pythonflask
|
panel | expr: (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster",
%(job)s,status!~"[4-5].*"}[5m])) / sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster",
%(job)s}[5m])) * 100) > 0 OR (sum by (job) (rate(flask_http_request_duration_seconds_count{cluster="$cluster",
%(job)s}[5m])) + 100) OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 85
lowest: 0
operator: <
warning: 95
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
Property | Value |
---|---|
alert | {}
|
default | false |
linkTo | - ssl-exporter
|
panel | decimals: 0
expr: bottomk(1,ssl_cert_not_after{cluster="$cluster"}-time() OR ssl_file_cert_not_after{cluster="$cluster"}-time()
OR ssl_kubeconfig_cert_not_after{cluster="$cluster"}-time() OR ssl_kubernetes_cert_not_after{cluster="$cluster"}-time())
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -100000000000000005366162204393472
thresholds:
critical: 0
lowest: -100000000000000005366162204393472
operator: <
warning: 691200
unit: s
|
Property | Value |
---|---|
alert | {}
|
default | false |
panel | expr: (sum(up{cluster="$cluster", %(job)s}) / count(up{cluster="$cluster", %(job)s}))*100
OR on() vector(-1)
gridPos:
w: 4
mappings:
- text: '-'
type: 1
value: -1
thresholds:
critical: 95
lowest: 0
operator: <
warning: 99
|
L2
Property | Value |
---|---|
containerOverview | containerOverview |
cpuPerNode | cpuPerNode |
daemonSetOverview | daemonSetOverview |
deploymentOverview | deploymentOverview |
diskPerNode | diskPerNode |
jobOverview | jobOverview |
memoryPerNode | memoryPerNode |
networkPerNode | networkPerNode |
nodeOverview | nodeOverview |
podOverview | podOverview |
pvcOverview | pvcOverview |
statefulSetOverview | statefulSetOverview |
vm | vm |
containerOverview
Property | Value |
---|---|
containerOverviewTable | containerOverviewTable |
Property | Value |
---|---|
base | "baseTableTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(kube_pod_container_info{cluster="$cluster", namespace=~"$namespace",
pod=~"$pod"}, container)
|
panel | expr:
- "sum by (container, namespace, pod) ((kube_pod_container_status_terminated * 0 or\
\ kube_pod_container_status_terminated_reason{cluster=\"$cluster\", namespace=~\"\
$namespace\", pod=~\"$pod\", container=~\"$container\", reason=\"Completed\"}) *\
\ 1) + \nsum by (container, namespace, pod) (kube_pod_container_status_running{cluster=\"\
$cluster\"} * 2) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\
\ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\
ContainerCreating\"}) * 3) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\
\ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\
CrashLoopBackOff\"}) * 4) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\
\ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\
CreateContainerConfigError\"}) * 5) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\
\ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\
ErrImagePull\"}) * 6) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\
\ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\
ImagePullBackOff\"}) * 7) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\
\ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\
CreateContainerError\"}) * 8) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\
\ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\
InvalidImageName\"}) * 9) + \nsum by (container, namespace, pod) ((kube_pod_container_status_waiting\
\ * 0 or kube_pod_container_status_waiting_reason{cluster=\"$cluster\", reason=\"\
CrashLoopBackOff\"}) * 10) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\
\ * 0 or kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\
OOMKilled\"}) * 11) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\
\ * 0 or kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\
Error\"}) * 12) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\
\ * 0 or kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\
ContainerCannotRun\"}) * 13) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\
\ * 0 or kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\
DeadlineExceeded\"}) * 14) + \nsum by (container, namespace, pod) ((kube_pod_container_status_terminated\
\ * 0 or kube_pod_container_status_terminated_reason{cluster=\"$cluster\", reason=\"\
Evicted\"}) * 15)"
- sum by (container, namespace, pod) (kube_pod_container_status_restarts_total{cluster="$cluster",
namespace=~"$namespace", pod=~"$pod", container=~"$container"})
sort:
col: 5
desc: true
styles:
- pattern: Time
type: hidden
- alias: Status
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 1
pattern: 'Value #A'
thresholds:
- 4
- 4
type: string
valueMaps:
- text: Terminated (Completed)
value: 1
- text: Running
value: 2
- text: Waiting (ContainerCreating)
value: 3
- text: Waiting (CrashLoopBackOff)
value: 4
- text: Waiting (CreateContainerConfigError)
value: 5
- text: Waiting (ErrImagePull)
value: 6
- text: Waiting (ImagePullBackOff)
value: 7
- text: Waiting (CreateContainerError)
value: 8
- text: Waiting (InvalidImageName)
value: 9
- text: Waiting (CrashLoopBackOff)
value: 10
- text: Terminated (OOMKilled)
value: 11
- text: Terminated (Error)
value: 12
- text: Terminated (ContainerCannotRun)
value: 13
- text: Terminated (DeadlineExceeded)
value: 14
- text: Terminated (Evicted)
value: 15
- alias: Restarts
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
pattern: 'Value #B'
thresholds:
- 5
- 10
type: number
- alias: Container
link: true
linkTooltip: Detail
linkUrl: /d/containerdetail?var-container=${__cell_3}&var-namespace=${__cell_1}&var-pod=${__cell_2}&var-view=container&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
pattern: container
- alias: Namespace
pattern: namespace
type: string
- alias: Pod
pattern: pod
type: string
title: Containers
transformations:
- id: merge
options: {}
- id: organize
options:
excludeByName:
Time: false
indexByName:
Time: 0
'Value #A': 4
'Value #B': 5
container: 3
namespace: 1
pod: 2
renameByName: {}
|
cpuPerNode
Property | Value |
---|---|
cpuPerNodePolystat | cpuPerNodePolystat |
Property | Value |
---|---|
base | "basePolystatTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(node_uname_info{cluster="$cluster", job=~"$job"},
nodename)
|
panel | default_click_through: /d/nodeexporter?var-job=$job&var-instance=${__cell_name}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: "avg(round((1 - (avg by (instance, pod) (irate(node_cpu_seconds_total{cluster=\"\
$cluster\", job=~\"$job\", mode=\"idle\"}[5m])))) * 100)\n* on(instance, pod) group_left(nodename)\
\ \n node_uname_info{cluster=\"$cluster\", nodename=~\"$instance\"}) by (nodename)"
fontColor: '#ffffff'
global_thresholds:
- color: '#56a64b'
state: 0
value: 0
- color: '#ff780a'
state: 1
value: 75
- color: '#e02f44'
state: 2
value: 90
global_unit_format: percent
title: CPU per Node
|
daemonSetOverview
Property | Value |
---|---|
daemonSetOverviewTable | daemonSetOverviewTable |
Property | Value |
---|---|
base | "baseTableTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(kube_daemonset_status_desired_number_scheduled{cluster="$cluster",
namespace=~"$namespace"}, daemonset)
|
panel | expr:
- sum by (daemonset, namespace) (kube_daemonset_status_number_misscheduled{cluster="$cluster",
namespace=~"$namespace", daemonset=~"$daemonset"})
- sum by (daemonset, namespace) (kube_daemonset_status_desired_number_scheduled{cluster="$cluster",
namespace=~"$namespace", daemonset=~"$daemonset"}) - sum by (daemonset, namespace)
(kube_daemonset_updated_number_scheduled{cluster="$cluster", namespace=~"$namespace",
daemonset=~"$daemonset"})
- sum by (daemonset, namespace) (kube_daemonset_status_desired_number_scheduled{cluster="$cluster",
namespace=~"$namespace", daemonset=~"$daemonset"}) - sum by (daemonset, namespace)
(kube_daemonset_status_number_available{cluster="$cluster", namespace=~"$namespace",
daemonset=~"$daemonset"})
- sum by (daemonset, namespace) (kube_daemonset_status_desired_number_scheduled{cluster="$cluster",
namespace=~"$namespace", daemonset=~"$daemonset"}) - sum by (daemonset, namespace)
(kube_daemonset_status_number_ready{cluster="$cluster", namespace=~"$namespace",
daemonset=~"$daemonset"})
sort:
col: 5
desc: true
styles:
- pattern: Time
type: hidden
- alias: Scheduled
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 2
pattern: 'Value #A'
rangeMaps:
- from: 0
text: OK
to: 0
- from: 1
text: Failed
to: 100000000000000005366162204393472
thresholds:
- 1
- 1
type: string
- alias: Updated
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 2
pattern: 'Value #B'
rangeMaps:
- from: 0
text: OK
to: 0
- from: 1
text: Failed
to: 100000000000000005366162204393472
thresholds:
- 1
- 1
type: string
- alias: Available
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 2
pattern: 'Value #C'
rangeMaps:
- from: 0
text: OK
to: 0
- from: 1
text: Failed
to: 100000000000000005366162204393472
thresholds:
- 1
- 1
type: string
- alias: Ready
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 2
pattern: 'Value #D'
rangeMaps:
- from: 0
text: OK
to: 0
- from: 1
text: Failed
to: 100000000000000005366162204393472
thresholds:
- 1
- 1
type: string
- alias: DaemonSet
pattern: daemonset
type: string
- alias: Namespace
link: true
linkTooltip: Detail
linkUrl: /d/containerdetail?var-namespace=$__cell&var-pod=All&var-view=pod&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
pattern: namespace
title: DaemonSets
transformations:
- id: merge
options: {}
- id: organize
options:
excludeByName:
Time: true
indexByName:
Time: 0
'Value #A': 3
'Value #B': 4
'Value #C': 5
'Value #D': 6
daemonset: 2
namespace: 1
renameByName: {}
|
deploymentOverview
Property | Value |
---|---|
deploymentOverviewTable | deploymentOverviewTable |
Property | Value |
---|---|
base | "baseTableTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(kube_deployment_status_replicas{cluster="$cluster",
namespace=~"$namespace"}, deployment)
|
panel | expr:
- sum by (deployment, namespace) (kube_deployment_status_replicas{cluster="$cluster",
namespace=~"$namespace", deployment=~"$deployment"}) - sum by (deployment, namespace)
(kube_deployment_status_replicas_updated{cluster="$cluster", namespace=~"$namespace",
deployment=~"$deployment"})
- sum by (deployment, namespace) (kube_deployment_status_replicas{cluster="$cluster",
namespace=~"$namespace", deployment=~"$deployment"}) - sum by (deployment, namespace)
(kube_deployment_status_replicas_available{cluster="$cluster", namespace=~"$namespace",
deployment=~"$deployment"})
sort:
col: 3
desc: true
styles:
- pattern: Time
type: hidden
- alias: Updated
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 2
pattern: 'Value #A'
rangeMaps:
- from: 0
text: OK
to: 0
- from: 1
text: Failed
to: 100000000000000005366162204393472
thresholds:
- 1
- 1
type: string
- alias: Available
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 2
pattern: 'Value #B'
rangeMaps:
- from: 0
text: OK
to: 0
- from: 1
text: Failed
to: 100000000000000005366162204393472
thresholds:
- 1
- 1
type: string
- alias: Deployment
pattern: deployment
type: string
- alias: Namespace
link: true
linkTooltip: Detail
linkUrl: /d/containerdetail?var-namespace=$__cell&var-pod=All&var-view=pod&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
pattern: namespace
title: Deployments
transformations:
- id: merge
options: {}
- id: organize
options:
excludeByName:
Time: true
indexByName:
Time: 0
'Value #A': 3
'Value #B': 4
deployment: 2
namespace: 1
renameByName: {}
|
diskPerNode
Property | Value |
---|---|
diskPerNodePolystat | diskPerNodePolystat |
Property | Value |
---|---|
base | "basePolystatTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(node_uname_info{cluster="$cluster", job=~"$job"},
nodename)
|
panel | default_click_through: /d/nodeexporter?var-job=$job&var-instance=${__cell_name}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the disk utilization is calculated using the fraction:\n\
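\`\`\`\n [value truncated in the generated documentation; see the chart's configuration files for the full diskPerNodePolystat panel definition] |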
jobOverview
Property | Value |
---|---|
jobOverviewTable | jobOverviewTable |
Property | Value |
---|---|
base | "baseTableTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(kube_job_info{cluster="$cluster", namespace=~"$namespace"},
job_name)
|
panel | expr:
- "sum by (job_name, namespace) (clamp_max(kube_job_status_succeeded{cluster=\"$cluster\"\
, namespace=~\"$namespace\", job_name=~\"$job_name\"}, 1) * 1) * on(job_name, namespace)\
\ group_left(owner_name) kube_job_owner{cluster=\"$cluster\", namespace=~\"$namespace\"\
, job_name=~\"$job_name\"} +\nsum by (job_name, namespace) (clamp_max(kube_job_status_active{cluster=\"\
$cluster\", namespace=~\"$namespace\", job_name=~\"$job_name\"}, 1) * 2) * on(job_name,\
\ namespace) group_left(owner_name) kube_job_owner{cluster=\"$cluster\", namespace=~\"\
$namespace\", job_name=~\"$job_name\"} +\nsum by (job_name, namespace) (clamp_max(kube_job_status_failed{cluster=\"\
$cluster\", namespace=~\"$namespace\", job_name=~\"$job_name\"}, 1) * 3) * on(job_name,\
\ namespace) group_left(owner_name) kube_job_owner{cluster=\"$cluster\", namespace=~\"\
$namespace\", job_name=~\"$job_name\"}\n"
sort:
col: 3
desc: true
styles:
- pattern: Time
type: hidden
- alias: Status
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 1
pattern: Value
thresholds:
- 3
- 3
type: string
valueMaps:
- text: Succeeded
value: 1
- text: Active
value: 2
- text: Failed
value: 3
- alias: Job name
pattern: job_name
type: string
- alias: Owner
pattern: owner_name
type: string
- alias: Namespace
link: true
linkTooltip: Detail
linkUrl: /d/containerdetail?var-namespace=$__cell&var-container=All&var-view=container&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
pattern: namespace
title: Jobs
transformations:
- id: organize
options:
excludeByName:
Time: true
indexByName:
Time: 0
Value: 4
job_name: 2
namespace: 1
owner_name: 3
renameByName: {}
|
memoryPerNode
Property | Value |
---|---|
memoryPerNodePolystat | memoryPerNodePolystat |
Property | Value |
---|---|
base | "basePolystatTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(node_uname_info{cluster="$cluster", job=~"$job"},
nodename)
|
panel | default_click_through: /d/nodeexporter?var-job=$job&var-instance=${__cell_name}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
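description: "The percentage of the memory utilization is calculated by:\n\`\`\`\n1 -\
\ ( [value truncated in the generated documentation; see the chart's configuration files for the full memoryPerNodePolystat panel definition] |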
networkPerNode
Property | Value |
---|---|
networkPerNodePolystat | networkPerNodePolystat |
Property | Value |
---|---|
base | "basePolystatTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(node_uname_info{cluster="$cluster", job=~"$job"},
nodename)
|
panel | default_click_through: /d/nodeexporter?var-job=$job&var-instance=${__cell_name}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: "avg((sum(rate(node_network_transmit_errs_total{cluster=\"$cluster\", job=~\"\
$job\", device!~\"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+\"}[5m])) \
\ by (instance, pod) \n + sum(rate(node_network_receive_errs_total{cluster=\"\
$cluster\", job=~\"$job\", device!~\"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+\"\
}[5m])) by (instance, pod))\n* on(instance, pod) group_left(nodename) \n node_uname_info{cluster=\"\
$cluster\", nodename=~\"$instance\"}) by (nodename)"
fontColor: '#ffffff'
global_thresholds:
- color: '#56a64b'
state: 0
value: 0
- color: '#ff780a'
state: 1
value: 10
- color: '#e02f44'
state: 2
value: 30
global_unit_format: pps
title: Network Errors per Node
|
nodeOverview
Property | Value |
---|---|
nodeOverviewTable | nodeOverviewTable |
Property | Value |
---|---|
base | "baseTableTemplate" |
dashboardInfo | {}
|
panel | expr:
- sum by (node) (kube_node_spec_unschedulable{cluster="$cluster"})
- sum by (node) (kube_node_status_condition{cluster="$cluster", condition="DiskPressure",
status=~"true|unknown"})
- sum by (node) (kube_node_status_condition{cluster="$cluster", condition="MemoryPressure",
status=~"true|unknown"})
- sum by (node) (kube_node_status_condition{cluster="$cluster", condition="PIDPressure",
status=~"true|unknown"})
- sum by (node) (kube_node_status_condition{cluster="$cluster", condition="Ready",
status=~"false|unknown"})
sort:
col: 6
desc: true
styles:
- pattern: Time
type: hidden
- alias: Schedulable
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 1
pattern: 'Value #A'
thresholds:
- 1
- 1
type: string
valueMaps:
- text: Failed
value: 1
- text: OK
value: 0
- alias: Disk Pressure
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 1
pattern: 'Value #B'
thresholds:
- 1
- 1
type: string
valueMaps:
- text: Failed
value: 1
- text: OK
value: 0
- alias: Memory Pressure
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 1
pattern: 'Value #C'
thresholds:
- 1
- 1
type: string
valueMaps:
- text: Failed
value: 1
- text: OK
value: 0
- alias: PID Pressure
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 1
pattern: 'Value #D'
thresholds:
- 1
- 1
type: string
valueMaps:
- text: Failed
value: 1
- text: OK
value: 0
- alias: Ready
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 1
pattern: 'Value #E'
thresholds:
- 1
- 1
type: string
valueMaps:
- text: Failed
value: 1
- text: OK
value: 0
- alias: Node
link: true
linkTooltip: Detail
linkUrl: /d/containerdetail?var-view=pod&var-instance=$__cell&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
pattern: node
title: Nodes
|
podOverview
Property | Value |
---|---|
podOverviewTable | podOverviewTable |
Property | Value |
---|---|
base | "baseTableTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(kube_pod_info{cluster="$cluster", namespace=~"$namespace"},
pod)
|
panel | expr:
- "sum by (namespace, pod) (kube_pod_status_phase{cluster=\"$cluster\", namespace=~\"\
$namespace\", phase=\"Running\"} * 1) +\nsum by (namespace, pod) (kube_pod_status_phase{cluster=\"\
$cluster\", namespace=~\"$namespace\", phase=\"Succeeded\"} * 2) +\nsum by (namespace,\
\ pod) (kube_pod_status_phase{cluster=\"$cluster\", namespace=~\"$namespace\", phase=\"\
Unknown\"} * 3) +\nsum by (namespace, pod) (kube_pod_status_phase{cluster=\"$cluster\"\
, namespace=~\"$namespace\", phase=\"Failed\"} * 4) +\nsum by (namespace, pod) (kube_pod_status_phase{cluster=\"\
$cluster\", namespace=~\"$namespace\", pod=~\"$pod\", phase=\"Pending\"} * 5)\n"
sort:
col: 3
desc: true
styles:
- pattern: Time
type: hidden
- alias: Status
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 1
pattern: Value
thresholds:
- 3
- 3
type: string
valueMaps:
- text: Running
value: 1
- text: Succeeded
value: 2
- text: Unknown
value: 3
- text: Failed
value: 4
- text: Pending
value: 5
- alias: Namespace
pattern: namespace
type: string
- alias: Pod
link: true
linkTooltip: Detail
linkUrl: /d/containerdetail?var-container=All&var-view=pod&var-namespace=${__cell_1}&var-pod=${__cell_2}&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
pattern: pod
title: Pods
|
pvcOverview
Property | Value |
---|---|
pvcOverviewTable | pvcOverviewTable |
Property | Value |
---|---|
base | "baseTableTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(kube_persistentvolumeclaim_info{cluster="$cluster",
namespace=~"$namespace"}, persistentvolumeclaim)
|
panel | description: Capacity is available only for remote pvc.
expr:
- sum by (persistentvolumeclaim, namespace) (((kubelet_volume_stats_capacity_bytes{cluster="$cluster",
namespace=~"$namespace", persistentvolumeclaim=~"$pvc"} - kubelet_volume_stats_available_bytes{cluster="$cluster",
namespace=~"$namespace", persistentvolumeclaim=~"$pvc"}) / kubelet_volume_stats_capacity_bytes{cluster="$cluster",
namespace=~"$namespace", persistentvolumeclaim=~"$pvc"}) * 100)
- "sum by (persistentvolumeclaim, namespace) (kube_persistentvolumeclaim_status_phase{cluster=\"\
$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\"$pvc\", phase=\"\
Bound\"} * 1) +\nsum by (persistentvolumeclaim, namespace) (kube_persistentvolumeclaim_status_phase{cluster=\"\
$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\"$pvc\", phase=\"\
Lost\"} * 2) +\nsum by (persistentvolumeclaim, namespace) (kube_persistentvolumeclaim_status_phase{cluster=\"\
$cluster\", namespace=~\"$namespace\", persistentvolumeclaim=~\"$pvc\", phase=\"\
Pending\"} * 3)\n"
sort:
col: 3
desc: true
styles:
- pattern: Time
type: hidden
- alias: Capacity
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
pattern: 'Value #A'
thresholds:
- 85
- 97
type: number
unit: percent
- alias: Status
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 1
pattern: 'Value #B'
thresholds:
- 2
- 2
type: string
valueMaps:
- text: Bound
value: 1
- text: Lost
value: 2
- text: Pending
value: 3
- alias: PVC
link: true
linkTooltip: Detail
linkUrl: /d/persistentvolumes?var-namespace=${__cell_1}&var-pvc=${__cell_2}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
pattern: persistentvolumeclaim
- alias: Namespace
pattern: namespace
type: string
title: Persistent Volumes
|
statefulSetOverview
Property | Value |
---|---|
statefulSetOverviewTable | statefulSetOverviewTable |
Property | Value |
---|---|
base | "baseTableTemplate" |
dashboardInfo | grafanaTemplateQuery: label_values(kube_statefulset_status_replicas{cluster="$cluster",
namespace=~"$namespace"}, statefulset)
|
panel | expr:
- sum by (statefulset, namespace) (kube_statefulset_status_replicas_updated{cluster="$cluster",
namespace=~"$namespace", statefulset=~"$statefulset"})
- sum by (statefulset, namespace) (kube_statefulset_status_replicas{cluster="$cluster",
namespace=~"$namespace", statefulset=~"$statefulset"}) - sum by (statefulset, namespace)
(kube_statefulset_status_replicas_ready{cluster="$cluster", namespace=~"$namespace",
statefulset=~"$statefulset"})
sort:
col: 4
desc: true
styles:
- pattern: Time
type: hidden
- alias: Updated
pattern: 'Value #A'
type: number
- alias: Ready
colorMode: cell
colors:
- '#56a64b'
- '#ff780a'
- '#e02f44'
mappingType: 2
pattern: 'Value #B'
rangeMaps:
- from: 0
text: OK
to: 0
- from: 1
text: Failed
to: 100000000000000005366162204393472
thresholds:
- 1
- 1
type: string
- alias: StatefulSet
link: true
linkTooltip: Detail
linkUrl: /d/statefulset?var-namespace=${__cell_1}&var-statefulset=${__cell_2}&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
pattern: statefulset
- alias: Namespace
link: true
linkTooltip: Detail
linkUrl: /d/containerdetail?var-namespace=$__cell&var-pod=All&var-view=pod&var-search=&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
pattern: namespace
title: StatefulSets
|
vm
Property | Value |
---|---|
mostUtilizedVMCPU | mostUtilizedVMCPU |
mostUtilizedVMDisk | mostUtilizedVMDisk |
mostUtilizedVMNetworkErrors | mostUtilizedVMNetworkErrors |
mostUtilizedVMRAM | mostUtilizedVMRAM |
overallNetworkErrors | overallNetworkErrors |
overallUtilizationCPU | overallUtilizationCPU |
overallUtilizationDisk | overallUtilizationDisk |
overallUtilizationRAM | overallUtilizationRAM |
targetDown | targetDown |
totalCores | totalCores |
totalDisk | totalDisk |
totalRAM | totalRAM |
usedCores | usedCores |
usedDisk | usedDisk |
usedRAM | usedRAM |
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVM
expr: round((1 - (avg(irate(node_cpu_seconds_total{job=~"%s", mode="idle"}[5m]) *
on(instance) group_left(nodename) (node_uname_info)) by (job, nodename) )) * 100)
linkGetParams: var-instance={{ $labels.nodename }}
message: 'VM {{ $labels.nodename }}: High CPU Utilization {{ $value }}%'
name: VMCPUUtilizationHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: max(round((1 - (avg(irate(node_cpu_seconds_total{job=~"$job", mode="idle"}[5m])
* on(instance) group_left(nodename) (node_uname_info)) by (job, nodename) )) * 100))
gridPos:
w: 3
x: 3
y: 6
thresholds:
critical: 90
operator: '>='
warning: 75
title: Most Utilized VM
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVM
expr: round((sum(node_filesystem_size_bytes{job=~"%s"} * on(instance) group_left(nodename)
(node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"%s"}
* on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device))
/ ((sum(node_filesystem_size_bytes{job=~"%s"} * on(instance) group_left(nodename)
(node_uname_info)) by (job, nodename, device) - sum(node_filesystem_free_bytes{job=~"%s"}
* on(instance) group_left(nodename) (node_uname_info)) by (job, nodename, device))
+ sum(node_filesystem_avail_bytes{job=~"%s"} * on(instance) group_left(nodename)
(node_uname_info)) by (job, nodename, device)) * 100 > 0)
linkGetParams: var-instance={{ $labels.nodename }}
message: 'VM {{ $labels.nodename }}: High Disk Utilization {{ $value }}%'
name: VMDiskUtilizationHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the disk utilization is calculated using the fraction:\n\
```\nused space / (used space + available space)\n```\n"
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVM
expr: sum(rate(node_network_transmit_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance) group_left(nodename) (node_uname_info) ) by (job, nodename)
+ sum(rate(node_network_receive_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance) group_left(nodename) (node_uname_info) ) by (job, nodename)
linkGetParams: var-instance={{ $labels.nodename }}
message: 'VM {{ $labels.nodename }}: High Network Errors Count {{ $value }}%'
name: VMNetworkErrorsHigh
thresholds:
critical: 15
operator: '>='
warning: 10
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: max(sum(rate(node_network_transmit_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance) group_left(nodename) (node_uname_info) ) by (job, nodename)
+ sum(rate(node_network_receive_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance) group_left(nodename) (node_uname_info) ) by (job, nodename))
gridPos:
w: 3
x: 21
y: 6
thresholds:
critical: 15
operator: '>='
warning: 10
title: Most Affected VM
unit: pps
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVM
expr: round((1 - sum by (job, nodename) (node_memory_MemAvailable_bytes{job=~"%s"}
* on(instance) group_left(nodename) (node_uname_info)) / sum by (job, nodename)
(node_memory_MemTotal_bytes{job=~"%s"} * on(instance) group_left(nodename) (node_uname_info)))
* 100)
linkGetParams: var-instance={{ $labels.nodename }}
message: 'VM {{ $labels.nodename }}: High RAM Utilization {{ $value }}%'
name: VMRAMUtilizationHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the memory utilization is calculated by:\n```\n1 -\
\ (available memory / total memory)\n```\n"
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVM
expr: sum(sum(rate(node_network_transmit_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance, cluster) group_left(nodename) (node_uname_info) ) by (job,
nodename) + sum(rate(node_network_receive_errs_total{job=~"%s", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, cluster) group_left(nodename) (node_uname_info) ) by (job, nodename))
message: VM High Overall Network Errors Count {{ $value }}%
name: VMNetworkOverallErrorsHigh
thresholds:
critical: 15
operator: '>='
warning: 10
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: sum(sum(rate(node_network_transmit_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}
[5m]) * on(instance, cluster) group_left(nodename) (node_uname_info) ) by (job,
nodename) + sum(rate(node_network_receive_errs_total{job=~"$job", device!~"lo|veth.+|docker.+|flannel.+|cali.+|cbr.|cni.+|br.+"}[5m])
* on(instance, cluster) group_left(nodename) (node_uname_info) ) by (job, nodename))
gridPos:
w: 3
x: 18
y: 6
thresholds:
critical: 15
operator: '>='
warning: 10
title: Overall Errors
unit: pps
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVM
expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"%s", mode="idle"}[5m])
* on(instance, cluster) group_left(nodename) (node_uname_info)) by (job, nodename)
)) * 100))
message: VM High CPU Overall Utilization {{ $value }}%
name: VMCPUOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
expr: avg(round((1 - (avg(irate(node_cpu_seconds_total{job=~"$job", mode="idle"}[5m])
* on(instance, cluster) group_left(nodename) (node_uname_info)) by (job, nodename)
)) * 100))
gridPos:
w: 3
x: 0
y: 6
thresholds:
critical: 90
operator: '>='
warning: 75
title: Overall Utilization
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVM
expr: round((sum(node_filesystem_size_bytes{job=~"%s"}) - sum(node_filesystem_free_bytes{job=~"%s"}))
/ (sum(node_filesystem_size_bytes{job=~"%s"}) - sum(node_filesystem_free_bytes{job=~"%s"})
+ sum(node_filesystem_avail_bytes{job=~"%s"})) * 100 > 0)
message: VM High Disk Overall Utilization {{ $value }}%
name: VMDiskOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the disk utilization is calculated using the fraction:\n\
```\nused space / (used space + available space)\n```\n"
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVM
expr: avg(round((1 - sum by (job, nodename, cluster) (node_memory_MemAvailable_bytes{job=~"%s"}
* on(instance, cluster) group_left(nodename) (node_uname_info)) / sum by (job, nodename,
cluster) (node_memory_MemTotal_bytes{job=~"%s"} * on(instance, cluster) group_left(nodename)
(node_uname_info))) * 100))
message: VM High RAM Overall Utilization {{ $value }}%
name: VMRAMOverallHigh
thresholds:
critical: 90
operator: '>='
warning: 75
|
linkTo | - nodeexporter
|
panel | dataLinks:
- title: System Overview
url: /d/nodeexporter?var-job=$job&refresh=10s&var-datasource=$datasource&var-cluster=$cluster&from=$__from&to=$__to
description: "The percentage of the memory utilization is calculated by:\n```\n1 -\
\ (available memory / total memory)\n```\n"
|
Property | Value |
---|---|
alert | customLables:
alertgroup: ClusterVM
expr: 100 * (count by(job, namespace, service) (up{job=~"%s"} == 0) / count by(job,
namespace, service) (up{job=~"%s"}))
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
}} targets in {{ $labels.namespace }} namespace are down.'
name: VMTargetDown
thresholds:
critical: 90
operator: '>='
warning: 10
|
panel | null |
Property | Value |
---|---|
panel | colorMode: value
expr: count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system"})
graphMode: none
gridPos:
h: 2
w: 3
x: 3
y: 9
thresholds:
color: '#858187'
value:
title: Total Cores
unit: none
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"})
graphMode: none
gridPos:
h: 2
w: 3
x: 15
y: 9
thresholds:
color: '#858187'
value:
title: Total
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"})
graphMode: none
gridPos:
h: 2
w: 3
x: 9
y: 9
thresholds:
color: '#858187'
value:
title: Total
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: (1 - (avg(irate(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="idle"}[5m]))))
* count(node_cpu_seconds_total{cluster="$cluster", job=~"$job", mode="system"})
graphMode: none
gridPos:
h: 2
w: 3
x: 0
y: 9
thresholds:
color: '#858187'
value:
title: Used Cores
unit: none
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_filesystem_size_bytes{cluster="$cluster", job=~"$job"}) - sum(node_filesystem_free_bytes{cluster="$cluster",
job=~"$job"})
graphMode: none
gridPos:
h: 2
w: 3
x: 12
y: 9
thresholds:
color: '#858187'
value:
title: Used
unit: bytes
|
Property | Value |
---|---|
panel | colorMode: value
expr: sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"}) * (((1 - sum(node_memory_MemAvailable_bytes{cluster="$cluster",
job=~"$job"}) / sum(node_memory_MemTotal_bytes{cluster="$cluster", job=~"$job"}))))
graphMode: none
gridPos:
h: 2
w: 3
x: 6
y: 9
thresholds:
color: '#858187'
value:
title: Used
unit: bytes
|
commonThresholds
Property | Value |
---|---|
app | app |
controlPlane | controlPlane |
criticalPanel | criticalPanel |
k8s | k8s |
node | node |
warningPanel | warningPanel |
app
Property | Value |
---|---|
critical | 95 |
operator | "<" |
warning | 99 |
controlPlane
Property | Value |
---|---|
critical | 95 |
lowest | 0 |
operator | "<" |
warning | 99 |
criticalPanel
Property | Value |
---|---|
critical | 1 |
operator | ">=" |
k8s
Property | Value |
---|---|
critical | 95 |
operator | "<" |
warning | 99 |
node
Property | Value |
---|---|
critical | 90 |
operator | ">=" |
warning | 75 |
warningPanel
Property | Value |
---|---|
operator | ">=" |
warning | 1 |
templateBases
Property | Value |
---|---|
baseAlert | baseAlert |
basePolystatTemplate | basePolystatTemplate |
baseStatsTemplate | baseStatsTemplate |
baseTableTemplate | baseTableTemplate |
baseAlert
Property | Value |
---|---|
customLables | {}
|
expr | "" |
linkGetParams | "" |
message | "" |
name | "error must be overwritten" |
thresholds | {}
|
basePolystatTemplate
Property | Value |
---|---|
default | true |
enabled | true |
panel | panel |
Property | Value |
---|---|
datasource | "$datasource" |
default_click_through | "" |
description | "" |
expr | "" |
fontAutoColor | false |
fontColor | "white" |
globalDecimals | null |
global_thresholds | {}
|
global_unit_format | "" |
gridPos | h: 6
w: 24
x: 0
y: 0
|
hexagon_sort_by_direction | 2 |
hexagon_sort_by_field | "value" |
polygon_border_size | 0 |
title | "error must be overwritten" |
tooltip_timestamp_enabled | false |
baseStatsTemplate
Property | Value |
---|---|
alert | {}
|
default | true |
enabled | true |
panel | panel |
Property | Value |
---|---|
colorMode | "background" |
dataLinks | []
|
datasource | "$datasource" |
decimals | null |
description | "" |
expr | "" |
graphMode | "area" |
gridPos | h: 3
w: 6
x: error must be overwritten
y: error must be overwritten
|
mappings | []
|
thresholds | {}
|
title | "error must be overwritten" |
unit | "percent" |
baseTableTemplate
Property | Value |
---|---|
default | true |
enabled | true |
panel | panel |
Property | Value |
---|---|
datasource | "$datasource" |
description | "" |
expr | []
|
gridPos | h: 19
w: 24
x: 0
y: 1
|
sort | {}
|
styles | []
|
title | "error must be overwritten" |
transformations | []
|