diff options
Diffstat (limited to 'kubernetes/namespaces/monitoring/alerts/alerts.d')
3 files changed, 15 insertions, 5 deletions
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
index b3fcad9..ba23a77 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
@@ -6,7 +6,7 @@ groups:
     expr: alertmanager_cluster_failed_peers > 0
     for: 1m
     labels:
-      severity: warning
+      severity: page
     annotations:
       summary: "An Alertmanager node is reporting failed peers"
       description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid."
@@ -15,7 +15,7 @@ groups:
     expr: alertmanager_cluster_health_score > 0
     for: 1m
     labels:
-      severity: warning
+      severity: page
     annotations:
       summary: "An AlertManagerNode is reporting an unhealthy cluster"
       description: "AM {{ $labels.instance }} is reporting that the cluster has a health score of {{ $value }} (where 0 is healthy.)"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
index 10eb3dd..38070a5 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
@@ -7,7 +7,8 @@ groups:
     expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7
     for: 0m
     labels:
-      severity: warning
+      # This isn't critical, but if we have reached this point, renewal probably needs manual intervention.
+      severity: page
     annotations:
       summary: "Certificate is expiring in < 7 days"
       description: "The certificate named {{ $labels.name }} is due for expiry in {{ $value | humanize }} days."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
index bad99f5..fd48dfb 100644
--- a/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/django.yaml
@@ -10,11 +10,20 @@ groups:
           summary: "Django is experiencing 5xx errors"
           description: "Django is experiencing 5xx errors on {{ $labels.namespace }}/{{ $labels.job }}"

-      - alert: DjangoLatencyHigh
+      - alert: DjangoLatencyElevated
         expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts"}[5m])) > 1.0
         for: 3m
         labels:
+          severity: warning
+        annotations:
+          summary: "Django route is experiencing high latency"
+          description: "Django route {{ $labels.method }} {{ $labels.view }} has raised latency"
+
+      - alert: DjangoLatencyHigh
+        expr: histogram_quantile(0.95, rate(django_http_requests_latency_seconds_by_view_method_bucket{view!="api:github-artifacts"}[5m])) > 10.0
+        for: 3m
+        labels:
           severity: page
         annotations:
           summary: "Django route is experiencing high latency"
-          description: "Django route {{ $labels.method }} {{ $labels.view }} is experiencing high latency"
+          description: "Django route {{ $labels.method }} {{ $labels.view }} is experiencing high latency"
\ No newline at end of file
