diff options
| -rw-r--r-- | kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml | 8 | 
1 file changed, 4 insertions, 4 deletions
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
index c5aa3c2..dff5352 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
@@ -12,10 +12,10 @@ groups:
       description: 'RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}'
 
   - alert: ContainerOOMEvent
-    expr: container_oom_events_total{pod=~".+", container_name!="POD", image!=""} > 0
-    for: 30s
+    expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
+    for: 0m
     labels:
       severity: page
     annotations:
-      summary: "Container {{ $labels.container_name }} in {{ $labels.pod }} was OOM killed"
-      description: "{{ $labels.container_name }} inside {{ $labels.pod }} has been OOM killed"
+      summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
+      description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
