diff options
author | 2024-04-02 01:02:00 +0100 | |
---|---|---|
committer | 2024-04-02 01:02:00 +0100 | |
commit | dea9a66482a70bf3fb37779a47328d8e6d51918d (patch) | |
tree | 1a90c51dad3b31ba9fdf91c6ce1d6d3e98dcfb3f /kubernetes/namespaces | |
parent | Increase KubernetesNodeCordoned wait duration (diff) |
update alert priorities
Diffstat (limited to 'kubernetes/namespaces')
3 files changed, 15 insertions, 5 deletions
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
index b3fcad9..ba23a77 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
@@ -6,7 +6,7 @@ groups:
     expr: alertmanager_cluster_failed_peers > 0
     for: 1m
     labels:
-      severity: warning
+      severity: page
     annotations:
       summary: "An Alertmanager node is reporting failed peers"
       description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid."
@@ -15,7 +15,7 @@ groups:
     expr: alertmanager_cluster_health_score > 0
     for: 1m
     labels:
-      severity: warning
+      severity: page
     annotations:
       summary: "An AlertManagerNode is reporting an unhealthy cluster"
       description: "AM {{ $labels.instance }} is reporting that the cluster has a health score of {{ $value }} (where 0 is healthy.)"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
index 10eb3dd..38070a5 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
@@ -7,7 +7,8 @@ groups:
     expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7
     for: 0m
     labels:
-      severity: warning
+      # This isn't critical, but if we have reached this point, renewal probably needs manual intervention.
+      severity: page
     annotations:
       summary: "Certificate is expiring in < 7 days"
       description: "The certificate named {{ $labels.name }} is due for expiry in {{ $value | humanize }} days."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
index bad99f5..fd48dfb 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
@@ -10,11 +10,20 @@ groups:
       summary: "Django is experiencing 5xx errors"
       description: "Django is experiencing 5xx errors on {{ $labels.namespace }}/{{ $labels.job }}"
 
-  - alert: DjangoLatencyHigh
+  - alert: DjangoLatencyElevated
     expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts"}[5m])) > 1.0
     for: 3m
     labels:
+      severity: warning
+    annotations:
+      summary: "Django route is experiencing high latency"
+      description: "Django route {{ $labels.method }} {{ $labels.view }} has raised latency"
+
+  - alert: DjangoLatencyHigh
+    expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts"}[5m])) > 10.0
+    for: 3m
+    labels:
       severity: page
     annotations:
       summary: "Django route is experiencing high latency"
-      description: "Django route {{ $labels.method }} {{ $labels.view }} is experiencing high latency"
+      description: "Django route {{ $labels.method }} {{ $labels.view }} is experiencing high latency"
\ No newline at end of file