Diffstat (limited to 'kubernetes')
14 files changed, 31 insertions, 31 deletions
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
index ba23a77..e1e9863 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
@@ -2,7 +2,7 @@ groups:
 - name: alertmanager
   rules:
 
-  - alert: AlertManagerClusterFailedPeers
+  - alert: alert-manager/cluster-failed-peers
     expr: alertmanager_cluster_failed_peers > 0
     for: 1m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "An Alertmanager node is reporting failed peers"
       description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid."
 
-  - alert: AlertManagerHealthScore
+  - alert: alert-manager/health-score
     expr: alertmanager_cluster_health_score > 0
     for: 1m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
index 38070a5..ad0e9d5 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
@@ -3,7 +3,7 @@ groups:
   interval: 1d
   rules:
 
-  - alert: CertificateExpiringSoon
+  - alert: cert-manager/certificate-expiring-soon
     expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
index 9daa660..a530cda 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
@@ -2,7 +2,7 @@ groups:
 - name: coredns
   rules:
 
-  - alert: CoreDNSPanics
+  - alert: core-dns/panics
     expr: increase(coredns_panics_total[1m]) > 0
     for: 0m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "CoreDNS is experiencing panic"
       description: "Number of CoreDNS panics encountered: {{ $value }}"
 
-  - alert: CoreDNSCacheMisses
+  - alert: core-dns/cache-misses
     expr: rate(coredns_cache_misses_total{}[10m]) / rate(coredns_cache_misses_total{}[10m] offset 10m) > 5.00
     labels:
       severity: page
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
index 5e8868e..0559943 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
@@ -2,7 +2,7 @@ groups:
 - name: cpu
   rules:
 
-  - alert: HighCPUThrottling
+  - alert: containers/high-cpu-throttling
     expr: rate(container_cpu_cfs_throttled_seconds_total{pod=~".+", container_name!="POD", image!=""}[5m]) > 1
     for: 5m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "Container {{ $labels.container_name }} in {{ $labels.pod }} high throttling "
       description: "{{ $labels.container_name }} inside {{ $labels.pod }} is at {{ $value }}"
 
-  - alert: HighNodeCPU
+  - alert: kubernetes/high-node-cpu
     expr: 100 - (avg by (kubernetes_node) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[5m])) * 100) > 80
     for: 5m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
index f516d3f..5654068 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
@@ -1,7 +1,7 @@
 groups:
   - name: django
     rules:
-      - alert: DjangoErrors
+      - alert: django/errors
         expr: increase(django_http_responses_total_by_status_total{status=~"5.."}[5m]) > 0
         for: 5m
         labels:
@@ -10,7 +10,7 @@ groups:
           summary: "Django is experiencing 5xx errors"
           description: "Django is experiencing 5xx errors on {{ $labels.namespace }}/{{ $labels.job }}"
 
-      - alert: DjangoLatencyElevated
+      - alert: django/latency-elevated
         expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts", view!="api:github-webhook-filter", view!="home:home", view!="content:tag"}[5m])) > 1.0
         for: 15m
         labels:
@@ -19,7 +19,7 @@ groups:
           summary: "Django route is experiencing high latency"
           description: "Django route {{ $labels.method }} {{ $labels.view }} has raised latency"
 
-      - alert: DjangoLatencyHigh
+      - alert: django/latency-high
         expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts", view!="api:github-webhook-filter", view!="home:home", view!="content:tag"}[5m])) > 10.0
         for: 15m
         labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
index 68ef4a5..1799e70 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/etcd.yaml
@@ -1,7 +1,7 @@
 groups:
   - name: etcd
     rules:
-      - alert: EtcdErrorsSpike
+      - alert: etcd/error-spike
         expr: rate(etcd_request_error_total[5m]) > 0
         for: 5m
         labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
index 723d267..fd16337 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: jobs
   rules:
-  - alert: KubernetesCronjobSuspended
+  - alert: kubernetes/cronjob-suspended
     expr: kube_cronjob_spec_suspend != 0
     for: 0m
     labels:
@@ -10,7 +10,7 @@ groups:
       summary: "Kubernetes CronJob suspended: {{ $labels.cronjob }}"
       description: "CronJob {{ $labels.kubernetes_namespace }}/{{ $labels.cronjob }} is suspended"
 
-  - alert: KubernetesJobFailed
+  - alert: kubernetes/jobs-failed
     expr: kube_job_status_failed > 0
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
index dff5352..b7de612 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
@@ -2,7 +2,7 @@ groups:
 - name: memory
   rules:
 
-  - alert: NodeHighMemoryUsage
+  - alert: node/high-memory-usage
     expr: node_memory_Active_bytes / node_memory_MemTotal_bytes > 0.8
     for: 30s
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "Node {{ $labels.kubernetes_node }} has RAM usage >80% for 5 minutes"
       description: 'RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}'
 
-  - alert: ContainerOOMEvent
+  - alert: container/oom
     expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
index 20b639e..317a4fe 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
@@ -2,7 +2,7 @@ groups:
 - name: nginx
   rules:
 
-  - alert: NGINX4XXRequests
+  - alert: nginx/4xx-requests
     expr: sum by (service) (rate(nginx_ingress_controller_requests{service!="pixels",status!~"404|444",status=~"^4.."}[1m])) / sum by (service) (rate(nginx_ingress_controller_requests[1m])) > 0.5
     for: 1m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: "High rate of 4XX requests for inbound requests"
       description: "Rate of 4XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
 
-  - alert: NGINX5XXRequests
+  - alert: nginx/5xx-requests
     expr: sum(rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])) by (service) / sum(rate(nginx_ingress_controller_requests{}[1m])) by (service) > 0.5
     for: 1m
     labels:
@@ -20,7 +20,7 @@ groups:
       summary: "High rate of 5XX requests for inbound requests"
       description: "Rate of 5XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
 
-  - alert: NGINXP99Timing
+  - alert: nginx/p99-timing
     expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase|prestashop-svc)"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10
     for: 5m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
index 5f64e0c..6661f50 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
@@ -2,7 +2,7 @@ groups:
 - name: nodes
   rules:
 
-  - alert: KubernetesNodeDiskPressure
+  - alert: kubernetes/node-disk-pressure
     expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
     for: 1m
     labels:
@@ -11,7 +11,7 @@ groups:
       summary: Node {{ $labels.kubernetes_node }} is experiencing disk pressure
       description: "{{ $labels.kubernetes_node }} does not have adequate space to work with."
 
-  - alert: KubernetesNodeMemoryPressure
+  - alert: kubernetes/node-memory-pressure
     expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
     for: 15s
     labels:
@@ -20,7 +20,7 @@ groups:
       summary: Node {{ $labels.kubernetes_node }} is experiencing memory pressure
       description: "{{ $labels.kubernetes_node }} does not have adequate RAM to work with."
 
-  - alert: KubernetesNodeNetworkUnavailable
+  - alert: kubernetes/node-network-unavailable
     expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
     for: 15s
     labels:
@@ -30,7 +30,7 @@ groups:
       description: "{{ $labels.kubernetes_node }} is experiencing trouble with inbound and outbound connections"
 
 
-  - alert: KubernetesNodePIDPressure
+  - alert: kubernetes/node-pid-pressure
     expr: kube_node_status_condition{condition="PIDPressure",status="true"} == 1
     for: 15s
     labels:
@@ -39,7 +39,7 @@ groups:
       summary: Node {{ $labels.kubernetes_node }} is experiencing PID exhaustion
       description: "{{ $labels.kubernetes_node }} does not have enough PIDs to work with."
 
-  - alert: KubernetesNodeReady
+  - alert: kubernetes/node-not-ready
     expr: kube_node_status_condition{condition="Ready",status="true"} == 0
     for: 1m
     labels:
@@ -48,7 +48,7 @@ groups:
       summary: Kubernetes node ({{ $labels.kubernetes_node }} ) is marked as unready
       description: "Node {{ $labels.kubernetes_node }} has been unready for a long time"
 
-  - alert: KubernetesNodeCordoned
+  - alert: kubernetes/node-cordoned
     expr: kube_node_spec_unschedulable == 1
     for: 30m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
index 9efdffa..b03398a 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: pods
   rules:
-  - alert: KubernetesPodNotHealthy
+  - alert: kubernetes/pod-not-healthy
     expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[3m:1m]) > 0
     for: 3m
     labels:
@@ -10,7 +10,7 @@ groups:
       summary: "Kubernetes Pod not healthy: {{ $labels.namespace }}/{{ $labels.pod }}"
       description: "Pod has been in a non-ready state for longer than 3 minutes."
 
-  - alert: KubernetesPodCrashLooping
+  - alert: kubernetes/pod-crash-looping
     expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
     for: 1m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
index 6442b13..22e16cf 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
@@ -3,7 +3,7 @@ groups:
   rules:
 
   # Alert for any instance that is unreachable for >5 minutes.
-  - alert: InstanceDown
+  - alert: prometheus/instance-down
     expr: up == 0
     for: 5m
     labels:
@@ -12,7 +12,7 @@ groups:
       summary: "Instance {{ $labels.instance }} down"
       description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
 
-  - alert: PrometheusConfigFailed
+  - alert: prometheus/config-failed
     expr: prometheus_config_last_reload_successful == 0
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
index 6b946f6..42ce4d1 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: redis
   rules:
-  - alert: RedisDown
+  - alert: redis/down
     expr: redis_up == 0
     for: 1m
     labels:
@@ -10,7 +10,7 @@ groups:
       summary: "Redis is offline"
       description: "Redis Exporter cannot connect to Redis."
 
-  - alert: RedisOutOfMemory
+  - alert: redis/oom
     expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
     for: 0m
     labels:
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml
index 790d3f7..410879d 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/volumes.yaml
@@ -1,7 +1,7 @@
 groups:
 - name: volumes
   rules:
-  - alert: KubernetesVolumeOutOfDiskSpace
+  - alert: kubernetes/volume-out-of-space
     expr: kubelet_volume_stats_available_bytes{persistentvolumeclaim!="prometheus-storage"} / kubelet_volume_stats_capacity_bytes * 100 < 10
     for: 2m
     labels:
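
For context, every entry touched by this change is an ordinary Prometheus alerting rule and only its alert name changes; a complete rule under the new naming scheme looks roughly like the sketch below, assembled from the alertmanager.yaml hunks above. The expression, duration, and annotation text are copied from the diff; the severity: page label value is an assumption for illustration, since the hunks cut off before the labels block.

    groups:
    - name: alertmanager
      rules:
      - alert: alert-manager/cluster-failed-peers
        expr: alertmanager_cluster_failed_peers > 0
        for: 1m
        labels:
          severity: page  # assumed value; not shown in the hunk above
        annotations:
          summary: "An Alertmanager node is reporting failed peers"
          description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid."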
