---
# Prometheus alerting rules for Kubernetes node health.
#
# Every rule in this group evaluates a per-node series
# (kube_node_status_condition or kube_node_spec_unschedulable) and is
# labelled `severity: page`.
#
# NOTE(review): all annotations render `$labels.kubernetes_node`. The
# kube-state-metrics documentation lists `node` as the label identifying the
# node on these metrics; `kubernetes_node` is presumably attached by scrape
# relabeling. Confirm that it names the *affected* node (and not the node
# hosting the exporter pod) before relying on these messages.
groups:
  - name: nodes
    rules:
      # DiskPressure condition has been "true" for at least 1 minute.
      - alert: kubernetes/node-disk-pressure
        expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1'
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Node {{ $labels.kubernetes_node }} is experiencing disk pressure"
          description: "{{ $labels.kubernetes_node }} does not have adequate space to work with."

      # MemoryPressure condition has been "true" for at least 15 seconds.
      - alert: kubernetes/node-memory-pressure
        expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
        for: 15s
        labels:
          severity: page
        annotations:
          summary: "Node {{ $labels.kubernetes_node }} is experiencing memory pressure"
          description: "{{ $labels.kubernetes_node }} does not have adequate RAM to work with."

      # NetworkUnavailable condition has been "true" for at least 15 seconds.
      - alert: kubernetes/node-network-unavailable
        expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1'
        for: 15s
        labels:
          severity: page
        annotations:
          summary: "Node {{ $labels.kubernetes_node }} is experiencing network problems"
          description: "{{ $labels.kubernetes_node }} is experiencing trouble with inbound and outbound connections"

      # PIDPressure condition has been "true" for at least 15 seconds.
      - alert: kubernetes/node-pid-pressure
        expr: 'kube_node_status_condition{condition="PIDPressure",status="true"} == 1'
        for: 15s
        labels:
          severity: page
        annotations:
          summary: "Node {{ $labels.kubernetes_node }} is experiencing PID exhaustion"
          description: "{{ $labels.kubernetes_node }} does not have enough PIDs to work with."

      # The Ready/status="true" series has been 0 for at least 1 minute,
      # i.e. the node is not currently reporting Ready=true.
      - alert: kubernetes/node-not-ready
        expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0'
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Kubernetes node ({{ $labels.kubernetes_node }}) is marked as unready"
          description: "Node {{ $labels.kubernetes_node }} has been unready for a long time"

      # The node has been marked unschedulable for at least 30 minutes.
      - alert: kubernetes/node-cordoned
        expr: 'kube_node_spec_unschedulable == 1'
        for: 30m
        labels:
          severity: page
        annotations:
          summary: "Kubernetes node ({{ $labels.kubernetes_node }}) is cordoned"
          description: "Node {{ $labels.kubernetes_node }} has been cordoned"