From 7798278204630df1e90e65957393290a8f7a60a6 Mon Sep 17 00:00:00 2001
From: Joe Banks
Date: Mon, 1 Apr 2024 00:57:38 +0100
Subject: Add a sanity alert for when a node becomes unschedulable

---
Notes (review):
    This patch shortens the KubernetesNodeReady `for` window from 5m to 1m
    and appends a KubernetesNodeCordoned rule that pages when
    kube_node_spec_unschedulable == 1 has held for 1m.
    NOTE(review): line breaks and YAML indentation below were reconstructed
    from the hunk header and diffstat counts; the exact indent depth of the
    target file is assumed (2-space flush-sequence layout) -- confirm
    against the repository before applying.

 kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'kubernetes/namespaces')

diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
index 6bfa6d1..08873ea 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
@@ -41,9 +41,18 @@ groups:
 
   - alert: KubernetesNodeReady
     expr: kube_node_status_condition{condition="Ready",status="true"} == 0
-    for: 5m
+    for: 1m
     labels:
       severity: page
     annotations:
       summary: Kubernetes node ({{ $labels.kubernetes_node }} ) is marked as unready
       description: "Node {{ $labels.kubernetes_node }} has been unready for a long time"
+
+  - alert: KubernetesNodeCordoned
+    expr: kube_node_spec_unschedulable == 1
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: Kubernetes node ({{ $labels.kubernetes_node }}) is cordoned
+      description: "Node {{ $labels.kubernetes_node }} has been cordoned"
\ No newline at end of file
-- 
cgit v1.2.3