about | summary | refs | log | tree | commit | diff | stats
path: root/kubernetes
diff options
context:
space:
mode:
author: Johannes Christ <[email protected]> 2024-08-25 20:14:33 +0200
committer: Joe Banks <[email protected]> 2024-08-25 20:35:07 +0100
commit: ba5208b1795b6824a9e510a5c9a3788eb045ee82 (patch)
tree: a936463376a6807045e6aea37a133f68c473d70a /kubernetes
parent: Automatically label pull requests (diff)
Unify alertmanager naming
Closes #451.
Diffstat (limited to 'kubernetes')
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml | 4
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml | 2
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml | 4
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml | 4
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml | 6
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml | 2
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml | 4
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml | 4
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml | 6
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml | 12
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml | 4
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml | 4
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml | 4
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml | 2
14 files changed, 31 insertions, 31 deletions
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
index ba23a77..e1e9863 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
@@ -2,7 +2,7 @@ groups:
- name: alertmanager
rules:
- - alert: AlertManagerClusterFailedPeers
+ - alert: alert-manager/cluster-failed-peers
expr: alertmanager_cluster_failed_peers > 0
for: 1m
labels:
@@ -11,7 +11,7 @@ groups:
summary: "An Alertmanager node is reporting failed peers"
description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid."
- - alert: AlertManagerHealthScore
+ - alert: alert-manager/health-score
expr: alertmanager_cluster_health_score > 0
for: 1m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
index 38070a5..ad0e9d5 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
@@ -3,7 +3,7 @@ groups:
interval: 1d
rules:
- - alert: CertificateExpiringSoon
+ - alert: cert-manager/certificate-expiring-soon
expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7
for: 0m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
index 9daa660..a530cda 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
@@ -2,7 +2,7 @@ groups:
- name: coredns
rules:
- - alert: CoreDNSPanics
+ - alert: core-dns/panics
expr: increase(coredns_panics_total[1m]) > 0
for: 0m
labels:
@@ -11,7 +11,7 @@ groups:
summary: "CoreDNS is experiencing panic"
description: "Number of CoreDNS panics encountered: {{ $value }}"
- - alert: CoreDNSCacheMisses
+ - alert: core-dns/cache-misses
expr: rate(coredns_cache_misses_total{}[10m]) / rate(coredns_cache_misses_total{}[10m] offset 10m) > 5.00
labels:
severity: page
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
index 5e8868e..0559943 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
@@ -2,7 +2,7 @@ groups:
- name: cpu
rules:
- - alert: HighCPUThrottling
+ - alert: containers/high-cpu-throttling
expr: rate(container_cpu_cfs_throttled_seconds_total{pod=~".+", container_name!="POD", image!=""}[5m]) > 1
for: 5m
labels:
@@ -11,7 +11,7 @@ groups:
summary: "Container {{ $labels.container_name }} in {{ $labels.pod }} high throttling "
description: "{{ $labels.container_name }} inside {{ $labels.pod }} is at {{ $value }}"
- - alert: HighNodeCPU
+ - alert: kubernetes/high-node-cpu
expr: 100 - (avg by (kubernetes_node) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[5m])) * 100) > 80
for: 5m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
index f516d3f..5654068 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
@@ -1,7 +1,7 @@
groups:
- name: django
rules:
- - alert: DjangoErrors
+ - alert: django/errors
expr: increase(django_http_responses_total_by_status_total{status=~"5.."}[5m]) > 0
for: 5m
labels:
@@ -10,7 +10,7 @@ groups:
summary: "Django is experiencing 5xx errors"
description: "Django is experiencing 5xx errors on {{ $labels.namespace }}/{{ $labels.job }}"
- - alert: DjangoLatencyElevated
+ - alert: django/latency-elevated
expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts", view!="api:github-webhook-filter", view!="home:home", view!="content:tag"}[5m])) > 1.0
for: 15m
labels:
@@ -19,7 +19,7 @@ groups:
summary: "Django route is experiencing high latency"
description: "Django route {{ $labels.method }} {{ $labels.view }} has raised latency"
- - alert: DjangoLatencyHigh
+ - alert: django/latency-high
expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts", view!="api:github-webhook-filter", view!="home:home", view!="content:tag"}[5m])) > 10.0
for: 15m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
index 68ef4a5..1799e70 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
@@ -1,7 +1,7 @@
groups:
- name: etcd
rules:
- - alert: EtcdErrorsSpike
+ - alert: etcd/error-spike
expr: rate(etcd_request_error_total[5m]) > 0
for: 5m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
index 723d267..fd16337 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
@@ -1,7 +1,7 @@
groups:
- name: jobs
rules:
- - alert: KubernetesCronjobSuspended
+ - alert: kubernetes/cronjob-suspended
expr: kube_cronjob_spec_suspend != 0
for: 0m
labels:
@@ -10,7 +10,7 @@ groups:
summary: "Kubernetes CronJob suspended: {{ $labels.cronjob }}"
description: "CronJob {{ $labels.kubernetes_namespace }}/{{ $labels.cronjob }} is suspended"
- - alert: KubernetesJobFailed
+ - alert: kubernetes/jobs-failed
expr: kube_job_status_failed > 0
for: 0m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
index dff5352..b7de612 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
@@ -2,7 +2,7 @@ groups:
- name: memory
rules:
- - alert: NodeHighMemoryUsage
+ - alert: node/high-memory-usage
expr: node_memory_Active_bytes / node_memory_MemTotal_bytes > 0.8
for: 30s
labels:
@@ -11,7 +11,7 @@ groups:
summary: "Node {{ $labels.kubernetes_node }} has RAM usage >80% for 5 minutes"
description: 'RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}'
- - alert: ContainerOOMEvent
+ - alert: container/oom
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
for: 0m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
index 20b639e..317a4fe 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
@@ -2,7 +2,7 @@ groups:
- name: nginx
rules:
- - alert: NGINX4XXRequests
+ - alert: nginx/4xx-requests
expr: sum by (service) (rate(nginx_ingress_controller_requests{service!="pixels",status!~"404|444",status=~"^4.."}[1m])) / sum by (service) (rate(nginx_ingress_controller_requests[1m])) > 0.5
for: 1m
labels:
@@ -11,7 +11,7 @@ groups:
summary: "High rate of 4XX requests for inbound requests"
description: "Rate of 4XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
- - alert: NGINX5XXRequests
+ - alert: nginx/5xx-requests
expr: sum(rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])) by (service) / sum(rate(nginx_ingress_controller_requests{}[1m])) by (service) > 0.5
for: 1m
labels:
@@ -20,7 +20,7 @@ groups:
summary: "High rate of 5XX requests for inbound requests"
description: "Rate of 5XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
- - alert: NGINXP99Timing
+ - alert: nginx/p99-timing
expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase|prestashop-svc)"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10
for: 5m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
index 5f64e0c..6661f50 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
@@ -2,7 +2,7 @@ groups:
- name: nodes
rules:
- - alert: KubernetesNodeDiskPressure
+ - alert: kubernetes/node-disk-pressure
expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
for: 1m
labels:
@@ -11,7 +11,7 @@ groups:
summary: Node {{ $labels.kubernetes_node }} is experiencing disk pressure
description: "{{ $labels.kubernetes_node }} does not have adequate space to work with."
- - alert: KubernetesNodeMemoryPressure
+ - alert: kubernetes/node-memory-pressure
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
for: 15s
labels:
@@ -20,7 +20,7 @@ groups:
summary: Node {{ $labels.kubernetes_node }} is experiencing memory pressure
description: "{{ $labels.kubernetes_node }} does not have adequate RAM to work with."
- - alert: KubernetesNodeNetworkUnavailable
+ - alert: kubernetes/node-network-unavailable
expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
for: 15s
labels:
@@ -30,7 +30,7 @@ groups:
description: "{{ $labels.kubernetes_node }} is experiencing trouble with inbound and outbound connections"
- - alert: KubernetesNodePIDPressure
+ - alert: kubernetes/node-pid-pressure
expr: kube_node_status_condition{condition="PIDPressure",status="true"} == 1
for: 15s
labels:
@@ -39,7 +39,7 @@ groups:
summary: Node {{ $labels.kubernetes_node }} is experiencing PID exhaustion
description: "{{ $labels.kubernetes_node }} does not have enough PIDs to work with."
- - alert: KubernetesNodeReady
+ - alert: kubernetes/node-not-ready
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1m
labels:
@@ -48,7 +48,7 @@ groups:
summary: Kubernetes node ({{ $labels.kubernetes_node }} ) is marked as unready
description: "Node {{ $labels.kubernetes_node }} has been unready for a long time"
- - alert: KubernetesNodeCordoned
+ - alert: kubernetes/node-cordoned
expr: kube_node_spec_unschedulable == 1
for: 30m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
index 9efdffa..b03398a 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
@@ -1,7 +1,7 @@
groups:
- name: pods
rules:
- - alert: KubernetesPodNotHealthy
+ - alert: kubernetes/pod-not-healthy
expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[3m:1m]) > 0
for: 3m
labels:
@@ -10,7 +10,7 @@ groups:
summary: "Kubernetes Pod not healthy: {{ $labels.namespace }}/{{ $labels.pod }}"
description: "Pod has been in a non-ready state for longer than 3 minutes."
- - alert: KubernetesPodCrashLooping
+ - alert: kubernetes/pod-crash-looping
expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
for: 1m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
index 6442b13..22e16cf 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
@@ -3,7 +3,7 @@ groups:
rules:
# Alert for any instance that is unreachable for >5 minutes.
- - alert: InstanceDown
+ - alert: prometheus/instance-down
expr: up == 0
for: 5m
labels:
@@ -12,7 +12,7 @@ groups:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
- - alert: PrometheusConfigFailed
+ - alert: prometheus/config-failed
expr: prometheus_config_last_reload_successful == 0
for: 0m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
index 6b946f6..42ce4d1 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
@@ -1,7 +1,7 @@
groups:
- name: redis
rules:
- - alert: RedisDown
+ - alert: redis/down
expr: redis_up == 0
for: 1m
labels:
@@ -10,7 +10,7 @@ groups:
summary: "Redis is offline"
description: "Redis Exporter cannot connect to Redis."
- - alert: RedisOutOfMemory
+ - alert: redis/oom
expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
for: 0m
labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml
index 790d3f7..410879d 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml
@@ -1,7 +1,7 @@
groups:
- name: volumes
rules:
- - alert: KubernetesVolumeOutOfDiskSpace
+ - alert: kubernetes/volume-out-of-space
expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim!="prometheus-storage"} / kubelet_volume_stats_capacity_bytes * 100 < 10
for: 2m
labels: