about | summary | refs | log | tree | commit | diff | stats
path: root/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
blob: b7de612f69326055717f17bd8cb30a637fd1e9a4 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Alerting rule group "memory": rules about node RAM usage and container
# OOM kills. Each entry under `rules` defines one alert.
groups:
- name: memory
  rules:

  # Pages when a node's active memory stays above 80% of its total memory
  # for the duration given in `for`.
  - alert: node/high-memory-usage
    expr: node_memory_Active_bytes / node_memory_MemTotal_bytes > 0.8
    # The summary annotation quotes this duration; keep the two in sync.
    for: 30s
    labels:
      severity: page
    annotations:
      summary: "Node {{ $labels.kubernetes_node }} has RAM usage >80% for 30 seconds"
      # $value is the ratio from `expr`; humanizePercentage renders it as a percentage.
      description: 'RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}'

  # Pages as soon as a container has restarted at least once in the last
  # 10 minutes while its last-terminated reason was OOMKilled throughout
  # that window.
  - alert: container/oom
    # Left side: restart counter now minus its value 10 minutes ago, kept
    # when >= 1. Right side: series whose OOMKilled last-terminated-reason
    # gauge was 1 for the whole window. `ignoring (reason)` lets the two
    # sides match even though only the right side carries a `reason` label.
    # The folded scalar joins these lines with single spaces.
    expr: >-
      (kube_pod_container_status_restarts_total
      - kube_pod_container_status_restarts_total offset 10m >= 1)
      and ignoring (reason)
      min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
    # No hold period: fire on the first evaluation that matches.
    for: 0m
    labels:
      severity: page
    annotations:
      summary: 'Kubernetes Container oom killer (instance {{ $labels.instance }})'
      # $value is the left-hand side of the `and`, i.e. the restart-count delta.
      description: |-
        Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
          VALUE = {{ $value }}
          LABELS = {{ $labels }}