blob: 2138b7031efacdb45816d4c28ed75967328bc1d3 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
|
groups:
  # Alerting rules for traffic served through the NGINX ingress controller.
  # All expressions read the `nginx_ingress_controller_*` metrics.
  - name: nginx
    rules:
      # Pages when a single 4XX status code makes up more than half of ALL
      # requests to a service for 10 minutes.
      # The numerator ignores the `pixels` service and statuses 401, 404 and
      # 444 (NOTE(review): presumably considered expected noise - confirm).
      # The denominator is aggregated by `service` only; dividing by a
      # per-(service, status) total would compare each status with itself and
      # always yield ~1. `group_left` keeps the `status` label on the result so
      # the description template can still reference it.
      - alert: nginx/4xx-requests
        expr: |-
          sum by (service, status) (
            rate(nginx_ingress_controller_requests{service!="pixels",status!~"401|404|444",status=~"^4.."}[1m])
          )
          / on (service) group_left
          sum by (service) (
            rate(nginx_ingress_controller_requests[1m])
          )
          > 0.5
        for: 10m
        labels:
          severity: page
        annotations:
          summary: "High rate of 4XX requests for inbound requests"
          description: "Rate of {{ $labels.status }} errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"

      # Pages when a single 5XX status code makes up more than half of ALL
      # requests to a service for 1 minute.
      # As above, the denominator is the per-service total so that the ratio
      # is a real share of traffic rather than a status divided by itself.
      - alert: nginx/5xx-requests
        expr: |-
          sum by (service, status) (
            rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])
          )
          / on (service) group_left
          sum by (service) (
            rate(nginx_ingress_controller_requests[1m])
          )
          > 0.5
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "High rate of 5XX requests for inbound requests"
          description: "Rate of {{ $labels.status }} errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"

      # Pages when the 99th-percentile request duration per (host, service)
      # stays above 3 seconds for 5 minutes. `grafana` and `metabase` are
      # excluded from the latency check.
      # The `and on (service)` clause only keeps services that have at least
      # one request series whose 5-minute increase exceeds 10.
      # NOTE(review): that guard is evaluated per raw series, not on the
      # service-wide sum - confirm this is the intended volume threshold.
      - alert: nginx/p99-timing
        expr: |-
          histogram_quantile(
            0.99,
            sum by (host, service, le) (
              rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase)"}[5m])
            )
          ) > 3
          and on (service)
          increase(nginx_ingress_controller_requests[5m]) > 10
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Request timing P99 has been over 3 seconds for 5 minutes"
          description: "Requests to service {{ $labels.host }} (to service {{ $labels.service }}) have taken over 3 seconds (P99) to complete."
|