about | summary | refs | log | tree | commit | diff | stats
path: root/kubernetes/namespaces
diff options
context:
space:
mode:
author: Joe Banks <[email protected]> 2024-04-02 01:02:00 +0100
committer: Joe Banks <[email protected]> 2024-04-02 01:02:00 +0100
commit: dea9a66482a70bf3fb37779a47328d8e6d51918d (patch)
tree: 1a90c51dad3b31ba9fdf91c6ce1d6d3e98dcfb3f /kubernetes/namespaces
parent: Increase KubernetesNodeCordoned wait duration (diff)
update alert priorities
Diffstat (limited to 'kubernetes/namespaces')
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml 4
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml 3
-rw-r--r-- kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml 13
3 files changed, 15 insertions, 5 deletions
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
index b3fcad9..ba23a77 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
@@ -6,7 +6,7 @@ groups:
expr: alertmanager_cluster_failed_peers > 0
for: 1m
labels:
- severity: warning
+ severity: page
annotations:
summary: "An Alertmanager node is reporting failed peers"
description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid."
@@ -15,7 +15,7 @@ groups:
expr: alertmanager_cluster_health_score > 0
for: 1m
labels:
- severity: warning
+ severity: page
annotations:
summary: "An AlertManagerNode is reporting an unhealthy cluster"
description: "AM {{ $labels.instance }} is reporting that the cluster has a health score of {{ $value }} (where 0 is healthy.)"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
index 10eb3dd..38070a5 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
@@ -7,7 +7,8 @@ groups:
expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7
for: 0m
labels:
- severity: warning
+ # This isn't critical, but if we have reached this point, renewal probably needs manual intervention.
+ severity: page
annotations:
summary: "Certificate is expiring in < 7 days"
description: "The certificate named {{ $labels.name }} is due for expiry in {{ $value | humanize }} days."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
index bad99f5..fd48dfb 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
@@ -10,11 +10,20 @@ groups:
summary: "Django is experiencing 5xx errors"
description: "Django is experiencing 5xx errors on {{ $labels.namespace }}/{{ $labels.job }}"
- - alert: DjangoLatencyHigh
+ - alert: DjangoLatencyElevated
expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts"}[5m])) > 1.0
for: 3m
labels:
+ severity: warning
+ annotations:
+ summary: "Django route is experiencing high latency"
+ description: "Django route {{ $labels.method }} {{ $labels.view }} has raised latency"
+
+ - alert: DjangoLatencyHigh
+ expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts"}[5m])) > 10.0
+ for: 3m
+ labels:
severity: page
annotations:
summary: "Django route is experiencing high latency"
- description: "Django route {{ $labels.method }} {{ $labels.view }} is experiencing high latency"
+ description: "Django route {{ $labels.method }} {{ $labels.view }} is experiencing high latency"
\ No newline at end of file