author | 2024-08-25 20:14:33 +0200
---|---
committer | 2024-08-25 20:35:07 +0100
commit | ba5208b1795b6824a9e510a5c9a3788eb045ee82 (patch)
tree | a936463376a6807045e6aea37a133f68c473d70a /kubernetes
parent | Automatically label pull requests (diff)
Unify alertmanager naming
Closes #451.
Diffstat (limited to 'kubernetes')
14 files changed, 31 insertions, 31 deletions
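The change applies a single convention to every rule file: CamelCase alert names become lowercase `component/kebab-case` names, while expressions, durations, labels, and annotations stay as they were. As a minimal sketch of the pattern, here is the first renamed rule from the alertmanager file in the diff below (only the alert name, expression, and duration are taken from the diff; the surrounding YAML structure and indentation are assumed):

```yaml
groups:
  - name: alertmanager
    rules:
      # renamed from AlertManagerClusterFailedPeers
      - alert: alert-manager/cluster-failed-peers
        expr: alertmanager_cluster_failed_peers > 0
        for: 1m
```

The remaining files follow the same `component/alert-name` scheme (cert-manager, core-dns, containers, django, etcd, kubernetes, node, container, nginx, prometheus, redis).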
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
index ba23a77..e1e9863 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
@@ -2,7 +2,7 @@ groups:
 - name: alertmanager
   rules:
-  - alert: AlertManagerClusterFailedPeers
+  - alert: alert-manager/cluster-failed-peers
     expr: alertmanager_cluster_failed_peers > 0
     for: 1m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "An Alertmanager node is reporting failed peers"
       description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid."
-  - alert: AlertManagerHealthScore
+  - alert: alert-manager/health-score
     expr: alertmanager_cluster_health_score > 0
     for: 1m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
index 38070a5..ad0e9d5 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
@@ -3,7 +3,7 @@ groups:
   interval: 1d
   rules:
-  - alert: CertificateExpiringSoon
+  - alert: cert-manager/certificate-expiring-soon
     expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
index 9daa660..a530cda 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
@@ -2,7 +2,7 @@ groups:
 - name: coredns
   rules:
-  - alert: CoreDNSPanics
+  - alert: core-dns/panics
     expr: increase(coredns_panics_total[1m]) > 0
     for: 0m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "CoreDNS is experiencing panic"
       description: "Number of CoreDNS panics encountered: {{ $value }}"
-  - alert: CoreDNSCacheMisses
+  - alert: core-dns/cache-misses
     expr: rate(coredns_cache_misses_total{}[10m]) / rate(coredns_cache_misses_total{}[10m] offset 10m) > 5.00
     labels:
       severity: page
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
index 5e8868e..0559943 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
@@ -2,7 +2,7 @@ groups:
 - name: cpu
   rules:
-  - alert: HighCPUThrottling
+  - alert: containers/high-cpu-throttling
     expr: rate(container_cpu_cfs_throttled_seconds_total{pod=~".+", container_name!="POD", image!=""}[5m]) > 1
     for: 5m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "Container {{ $labels.container_name }} in {{ $labels.pod }} high throttling "
       description: "{{ $labels.container_name }} inside {{ $labels.pod }} is at {{ $value }}"
-  - alert: HighNodeCPU
+  - alert: kubernetes/high-node-cpu
     expr: 100 - (avg by (kubernetes_node) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[5m])) * 100) > 80
     for: 5m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
index f516d3f..5654068 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: django
   rules:
-  - alert: DjangoErrors
+  - alert: django/errors
     expr: increase(django_http_responses_total_by_status_total{status=~"5.."}[5m]) > 0
     for: 5m
     labels:
@@ -10,7 +10,7 @@ groups:
       summary: "Django is experiencing 5xx errors"
       description: "Django is experiencing 5xx errors on {{ $labels.namespace }}/{{ $labels.job }}"
-  - alert: DjangoLatencyElevated
+  - alert: django/latency-elevated
     expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts", view!="api:github-webhook-filter", view!="home:home", view!="content:tag"}[5m])) > 1.0
     for: 15m
     labels:
@@ -19,7 +19,7 @@ groups:
       summary: "Django route is experiencing high latency"
       description: "Django route {{ $labels.method }} {{ $labels.view }} has raised latency"
-  - alert: DjangoLatencyHigh
+  - alert: django/latency-high
     expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts", view!="api:github-webhook-filter", view!="home:home", view!="content:tag"}[5m])) > 10.0
     for: 15m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
index 68ef4a5..1799e70 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: etcd
   rules:
-  - alert: EtcdErrorsSpike
+  - alert: etcd/error-spike
     expr: rate(etcd_request_error_total[5m]) > 0
     for: 5m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
index 723d267..fd16337 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: jobs
   rules:
-  - alert: KubernetesCronjobSuspended
+  - alert: kubernetes/cronjob-suspended
     expr: kube_cronjob_spec_suspend != 0
     for: 0m
     labels:
@@ -10,7 +10,7 @@ groups:
       summary: "Kubernetes CronJob suspended: {{ $labels.cronjob }}"
       description: "CronJob {{ $labels.kubernetes_namespace }}/{{ $labels.cronjob }} is suspended"
-  - alert: KubernetesJobFailed
+  - alert: kubernetes/jobs-failed
     expr: kube_job_status_failed > 0
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
index dff5352..b7de612 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
@@ -2,7 +2,7 @@ groups:
 - name: memory
   rules:
-  - alert: NodeHighMemoryUsage
+  - alert: node/high-memory-usage
     expr: node_memory_Active_bytes / node_memory_MemTotal_bytes > 0.8
     for: 30s
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "Node {{ $labels.kubernetes_node }} has RAM usage >80% for 5 minutes"
       description: 'RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}'
-  - alert: ContainerOOMEvent
+  - alert: container/oom
     expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
index 20b639e..317a4fe 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
@@ -2,7 +2,7 @@ groups:
 - name: nginx
   rules:
-  - alert: NGINX4XXRequests
+  - alert: nginx/4xx-requests
     expr: sum by (service) (rate(nginx_ingress_controller_requests{service!="pixels",status!~"404|444",status=~"^4.."}[1m])) / sum by (service) (rate(nginx_ingress_controller_requests[1m])) > 0.5
     for: 1m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "High rate of 4XX requests for inbound requests"
       description: "Rate of 4XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
-  - alert: NGINX5XXRequests
+  - alert: nginx/5xx-requests
     expr: sum(rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])) by (service) / sum(rate(nginx_ingress_controller_requests{}[1m])) by (service) > 0.5
     for: 1m
     labels:
@@ -20,7 +20,7 @@ groups:
       summary: "High rate of 5XX requests for inbound requests"
       description: "Rate of 5XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
-  - alert: NGINXP99Timing
+  - alert: nginx/p99-timing
     expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase|prestashop-svc)"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10
     for: 5m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
index 5f64e0c..6661f50 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
@@ -2,7 +2,7 @@ groups:
 - name: nodes
   rules:
-  - alert: KubernetesNodeDiskPressure
+  - alert: kubernetes/node-disk-pressure
     expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
     for: 1m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: Node {{ $labels.kubernetes_node }} is experiencing disk pressure
       description: "{{ $labels.kubernetes_node }} does not have adequate space to work with."
-  - alert: KubernetesNodeMemoryPressure
+  - alert: kubernetes/node-memory-pressure
     expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
     for: 15s
     labels:
@@ -20,7 +20,7 @@ groups:
       summary: Node {{ $labels.kubernetes_node }} is experiencing memory pressure
       description: "{{ $labels.kubernetes_node }} does not have adequate RAM to work with."
-  - alert: KubernetesNodeNetworkUnavailable
+  - alert: kubernetes/node-network-unavailable
     expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
     for: 15s
     labels:
@@ -30,7 +30,7 @@ groups:
       description: "{{ $labels.kubernetes_node }} is experiencing trouble with inbound and outbound connections"
-  - alert: KubernetesNodePIDPressure
+  - alert: kubernetes/node-pid-pressure
     expr: kube_node_status_condition{condition="PIDPressure",status="true"} == 1
     for: 15s
     labels:
@@ -39,7 +39,7 @@ groups:
       summary: Node {{ $labels.kubernetes_node }} is experiencing PID exhaustion
       description: "{{ $labels.kubernetes_node }} does not have enough PIDs to work with."
-  - alert: KubernetesNodeReady
+  - alert: kubernetes/node-not-ready
     expr: kube_node_status_condition{condition="Ready",status="true"} == 0
     for: 1m
     labels:
@@ -48,7 +48,7 @@ groups:
       summary: Kubernetes node ({{ $labels.kubernetes_node }} ) is marked as unready
       description: "Node {{ $labels.kubernetes_node }} has been unready for a long time"
-  - alert: KubernetesNodeCordoned
+  - alert: kubernetes/node-cordoned
     expr: kube_node_spec_unschedulable == 1
     for: 30m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
index 9efdffa..b03398a 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: pods
   rules:
-  - alert: KubernetesPodNotHealthy
+  - alert: kubernetes/pod-not-healthy
     expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[3m:1m]) > 0
     for: 3m
     labels:
@@ -10,7 +10,7 @@ groups:
       summary: "Kubernetes Pod not healthy: {{ $labels.namespace }}/{{ $labels.pod }}"
       description: "Pod has been in a non-ready state for longer than 3 minutes."
-  - alert: KubernetesPodCrashLooping
+  - alert: kubernetes/pod-crash-looping
     expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
     for: 1m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
index 6442b13..22e16cf 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
@@ -3,7 +3,7 @@ groups:
   rules:
   # Alert for any instance that is unreachable for >5 minutes.
-  - alert: InstanceDown
+  - alert: prometheus/instance-down
     expr: up == 0
     for: 5m
     labels:
@@ -12,7 +12,7 @@ groups:
       summary: "Instance {{ $labels.instance }} down"
       description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
-  - alert: PrometheusConfigFailed
+  - alert: prometheus/config-failed
     expr: prometheus_config_last_reload_successful == 0
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
index 6b946f6..42ce4d1 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: redis
   rules:
-  - alert: RedisDown
+  - alert: redis/down
     expr: redis_up == 0
     for: 1m
     labels:
@@ -10,7 +10,7 @@ groups:
       summary: "Redis is offline"
       description: "Redis Exporter cannot connect to Redis."
-  - alert: RedisOutOfMemory
+  - alert: redis/oom
     expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml
index 790d3f7..410879d 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: volumes
   rules:
-  - alert: KubernetesVolumeOutOfDiskSpace
+  - alert: kubernetes/volume-out-of-space
     expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim!="prometheus-storage"} / kubelet_volume_stats_capacity_bytes * 100 < 10
     for: 2m
     labels: