Diffstat (limited to 'kubernetes/namespaces/monitoring/alerts/alerts.d')
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml   21
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml   13
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml        20
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml            21
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml           20
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml         12
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml          30
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml          49
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml           20
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml       29
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml     13
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml          20
12 files changed, 268 insertions, 0 deletions
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
new file mode 100644
index 0000000..b3fcad9
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
@@ -0,0 +1,21 @@
+groups:
+- name: alertmanager
+  rules:
+
+  - alert: AlertManagerClusterFailedPeers
+    expr: alertmanager_cluster_failed_peers > 0
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "An Alertmanager node is reporting failed peers"
+      description: "AM {{ $labels.instance }} is reporting that {{ $value }} of its peers are invalid."
+
+  - alert: AlertManagerHealthScore
+    expr: alertmanager_cluster_health_score > 0
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "An Alertmanager node is reporting an unhealthy cluster"
+      description: "AM {{ $labels.instance }} is reporting that the cluster has a health score of {{ $value }} (where 0 is healthy)."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
new file mode 100644
index 0000000..10eb3dd
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
@@ -0,0 +1,13 @@
+groups:
+- name: certificates
+  interval: 1d
+  rules:
+
+  - alert: CertificateExpiringSoon
+    expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7
+    for: 0m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Certificate is expiring in < 7 days"
+      description: "The certificate named {{ $labels.name }} is due for expiry in {{ $value | humanize }} days."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
new file mode 100644
index 0000000..9daa660
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
@@ -0,0 +1,20 @@
+groups:
+- name: coredns
+  rules:
+
+  - alert: CoreDNSPanics
+    expr: increase(coredns_panics_total[1m]) > 0
+    for: 0m
+    labels:
+      severity: page
+    annotations:
+      summary: "CoreDNS is experiencing panics"
+      description: "Number of CoreDNS panics encountered: {{ $value }}"
+
+  - alert: CoreDNSCacheMisses
+    expr: rate(coredns_cache_misses_total{}[10m]) / rate(coredns_cache_misses_total{}[10m] offset 10m) > 5.00
+    labels:
+      severity: page
+    annotations:
+      summary: "High CoreDNS cache misses in the last 10 minutes"
+      description: "This can sometimes indicate networking trouble; the cache-miss rate is currently {{ $value | humanizePercentage }} of the rate in the previous 10-minute window."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
new file mode 100644
index 0000000..5e8868e
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
@@ -0,0 +1,21 @@
+groups:
+- name: cpu
+  rules:
+
+  - alert: HighCPUThrottling
+    expr: rate(container_cpu_cfs_throttled_seconds_total{pod=~".+", container_name!="POD", image!=""}[5m]) > 1
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: "Container {{ $labels.container_name }} in {{ $labels.pod }} is being heavily throttled"
+      description: "{{ $labels.container_name }} inside {{ $labels.pod }} is spending {{ $value }} seconds per second throttled."
+
+  - alert: HighNodeCPU
+    expr: 100 - (avg by (kubernetes_node) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[5m])) * 100) > 80
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: "Node {{ $labels.kubernetes_node }} has had CPU usage over 80% for the last 5 minutes"
+      description: "CPU usage on {{ $labels.kubernetes_node }} is averaging {{ $value }}%"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
new file mode 100644
index 0000000..723d267
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
@@ -0,0 +1,20 @@
+groups:
+- name: jobs
+  rules:
+  - alert: KubernetesCronjobSuspended
+    expr: kube_cronjob_spec_suspend != 0
+    for: 0m
+    labels:
+      severity: page
+    annotations:
+      summary: "Kubernetes CronJob suspended: {{ $labels.cronjob }}"
+      description: "CronJob {{ $labels.kubernetes_namespace }}/{{ $labels.cronjob }} is suspended"
+
+  - alert: KubernetesJobFailed
+    expr: kube_job_status_failed > 0
+    for: 0m
+    labels:
+      severity: page
+    annotations:
+      summary: "Kubernetes Job failed: {{ $labels.job_name }}"
+      description: "Job {{ $labels.kubernetes_namespace }}/{{ $labels.job_name }} failed to complete"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
new file mode 100644
index 0000000..d53da5e
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
@@ -0,0 +1,12 @@
+groups:
+- name: memory
+  rules:
+
+  - alert: NodeHighMemoryUsage
+    expr: node_memory_Active_bytes / node_memory_MemTotal_bytes > 0.8
+    for: 30s
+    labels:
+      severity: page
+    annotations:
+      summary: "Node {{ $labels.kubernetes_node }} has RAM usage over 80%"
+      description: "RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
new file mode 100644
index 0000000..441f7df
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
@@ -0,0 +1,30 @@
+groups:
+- name: nginx
+  rules:
+
+  - alert: NGINX4XXRequests
+    expr: sum by(service) (rate(nginx_ingress_controller_requests{status=~"^4..", status!="404", service!="pixels"}[1m])) / sum by(service) (rate(nginx_ingress_controller_requests[1m])) > 0.5
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "High rate of 4XX responses to inbound requests"
+      description: "Rate of 4XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
+
+  - alert: NGINX5XXRequests
+    expr: sum(rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])) by (service) / sum(rate(nginx_ingress_controller_requests{}[1m])) by (service) > 0.5
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "High rate of 5XX responses to inbound requests"
+      description: "Rate of 5XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
+
+  - alert: NGINXP99Timing
+    expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase|prestashop-svc)", host!="pydis-api.default.svc.cluster.local"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: "Request timing P99 has been over 3 seconds for 5 minutes"
+      description: "Requests to {{ $labels.host }} (service {{ $labels.service }}) have taken over 3 seconds (P99) to complete."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
new file mode 100644
index 0000000..6bfa6d1
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
@@ -0,0 +1,49 @@
+groups:
+- name: nodes
+  rules:
+
+  - alert: KubernetesNodeDiskPressure
+    expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: Node {{ $labels.kubernetes_node }} is experiencing disk pressure
+      description: "{{ $labels.kubernetes_node }} does not have adequate disk space to work with."
+
+  - alert: KubernetesNodeMemoryPressure
+    expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+    for: 15s
+    labels:
+      severity: page
+    annotations:
+      summary: Node {{ $labels.kubernetes_node }} is experiencing memory pressure
+      description: "{{ $labels.kubernetes_node }} does not have adequate RAM to work with."
+
+  - alert: KubernetesNodeNetworkUnavailable
+    expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
+    for: 15s
+    labels:
+      severity: page
+    annotations:
+      summary: Node {{ $labels.kubernetes_node }} is experiencing network problems
+      description: "{{ $labels.kubernetes_node }} is experiencing trouble with inbound and outbound connections."
+
+
+  - alert: KubernetesNodePIDPressure
+    expr: kube_node_status_condition{condition="PIDPressure",status="true"} == 1
+    for: 15s
+    labels:
+      severity: page
+    annotations:
+      summary: Node {{ $labels.kubernetes_node }} is experiencing PID exhaustion
+      description: "{{ $labels.kubernetes_node }} does not have enough PIDs to work with."
+
+  - alert: KubernetesNodeReady
+    expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: Kubernetes node ({{ $labels.kubernetes_node }}) is marked as unready
+      description: "Node {{ $labels.kubernetes_node }} has been unready for more than 5 minutes."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
new file mode 100644
index 0000000..9efdffa
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
@@ -0,0 +1,20 @@
+groups:
+- name: pods
+  rules:
+  - alert: KubernetesPodNotHealthy
+    expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[3m:1m]) > 0
+    for: 3m
+    labels:
+      severity: page
+    annotations:
+      summary: "Kubernetes Pod not healthy: {{ $labels.namespace }}/{{ $labels.pod }}"
+      description: "Pod has been in a non-ready state for longer than 3 minutes."
+
+  - alert: KubernetesPodCrashLooping
+    expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
+    for: 1m
+    labels:
+      severity: warning
+    annotations:
+      summary: "Kubernetes pod crash looping: {{ $labels.kubernetes_namespace }}/{{ $labels.pod }}"
+      description: "Pod {{ $labels.pod }} is crash looping"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml
new file mode 100644
index 0000000..399a84b
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml
@@ -0,0 +1,29 @@
+groups:
+- name: postgres
+  rules:
+  - alert: PostgresUp
+    expr: pg_up == 0
+    for: 0m
+    labels:
+      severity: page
+    annotations:
+      summary: "PostgreSQL is offline"
+      description: "Postgres Exporter cannot connect to PostgreSQL."
+
+  - alert: PostgresTooManyConnections
+    expr: (sum(pg_stat_activity_count) by (instance)) / on (instance) pg_settings_max_connections * 100 > 80
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: PostgreSQL connections near the max_connections setting
+      description: "PostgreSQL instance is near the maximum connection limit, currently at {{ $value | humanize }}% of max_connections"
+
+  - alert: PostgresDeadlockedTable
+    expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 3
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: Too many PostgreSQL deadlocks
+      description: "PostgreSQL has had {{ $value }} deadlocks in the last minute"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
new file mode 100644
index 0000000..25e555d
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
@@ -0,0 +1,13 @@
+groups:
+- name: prometheus
+  rules:
+
+  # Alert for any instance that is unreachable for >5 minutes.
+  - alert: InstanceDown
+    expr: up == 0
+    for: 5m
+    labels:
+      severity: page
+    annotations:
+      summary: "Instance {{ $labels.instance }} down"
+      description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
new file mode 100644
index 0000000..6b946f6
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
@@ -0,0 +1,20 @@
+groups:
+- name: redis
+  rules:
+  - alert: RedisDown
+    expr: redis_up == 0
+    for: 1m
+    labels:
+      severity: page
+    annotations:
+      summary: "Redis is offline"
+      description: "Redis Exporter cannot connect to Redis."
+
+  - alert: RedisOutOfMemory
+    expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
+    for: 0m
+    labels:
+      severity: page
+    annotations:
+      summary: "Redis is approaching its memory limit"
+      description: "Redis is currently using {{ $value | humanizePercentage }} of configured memory."
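Note: this diff only adds the rule files themselves; nothing here wires them into the Prometheus server. As a minimal sketch of how such a directory is usually consumed (the mount path below is an assumption for illustration, not taken from this repository), the server configuration would reference the alerts.d directory through a rule_files glob, and the files can be linted beforehand with `promtool check rules`:

    # Hypothetical prometheus.yml fragment.
    # The /etc/prometheus/alerts.d mount path is assumed, not defined in this change.
    rule_files:
      - /etc/prometheus/alerts.d/*.yaml

With a layout like this, adding a new alert is just a matter of dropping another YAML file into alerts.d and reloading Prometheus.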