aboutsummaryrefslogtreecommitdiffstats
path: root/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
blob: 2138b7031efacdb45816d4c28ed75967328bc1d3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
# Prometheus alerting rules built on NGINX ingress controller metrics
# (nginx_ingress_controller_*).
groups:
# Rule group `nginx`: the alerts listed under `rules` belong to this group.
- name: nginx
  # Each entry defines one alert: an expression, a hold duration (`for`),
  # labels and the annotations shown when it fires.
  rules:

  # Pages when one 4xx status code makes up more than half of all requests
  # to a service for 10 minutes. Statuses 401, 404 and 444 and the `pixels`
  # service are excluded from the error count.
  - alert: nginx/4xx-requests
    # The denominator is aggregated per service only and joined with
    # `on (service) group_left`, so the numerator's `status` label survives
    # for the description. Aggregating both sides by (service, status) would
    # divide every series by itself and always produce a ratio of 1.
    expr: |-
      sum by (service, status) (
        rate(nginx_ingress_controller_requests{service!="pixels",status!~"401|404|444",status=~"^4.."}[1m])
      )
      / on (service) group_left
      sum by (service) (
        rate(nginx_ingress_controller_requests[1m])
      )
      > 0.5
    for: 10m
    labels:
      severity: page
    annotations:
      summary: "High rate of 4XX requests for inbound requests"
      description: "Rate of {{ $labels.status }} errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"

  # Pages when one 5xx status code makes up more than half of all requests
  # to a service for 1 minute.
  - alert: nginx/5xx-requests
    # Error rate per (service, status) divided by the total request rate of
    # that service. `on (service) group_left` matches each status series to
    # its service total and keeps the `status` label for the description.
    # Aggregating the denominator by status as well would divide every series
    # by itself and always produce a ratio of 1.
    expr: |-
      sum(
        rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])
      ) by (service, status)
      / on (service) group_left
      sum(
        rate(nginx_ingress_controller_requests{}[1m])
      ) by (service)
      > 0.5
    for: 1m
    labels:
      severity: page
    annotations:
      summary: "High rate of 5XX requests for inbound requests"
      description: "Rate of {{ $labels.status }} errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"

  # Pages when the 99th-percentile request duration of a host/service pair
  # stays above 3 seconds for 5 minutes. The grafana and metabase services
  # are excluded from the latency calculation.
  - alert: nginx/p99-timing
    # `and on (service)` keeps a latency series only when some request-counter
    # series of the same service increased by more than 10 in the last
    # 5 minutes, which suppresses the alert for near-idle services.
    # NOTE(review): the guard is evaluated per counter series, not on the
    # per-service total; confirm that is the intended traffic threshold.
    expr: |-
      histogram_quantile(
        0.99,
        sum by (host, service, le) (
          rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase)"}[5m])
        )
      ) > 3
      and on (service)
      increase(nginx_ingress_controller_requests[5m]) > 10
    for: 5m
    labels:
      severity: page
    annotations:
      summary: "Request timing P99 has been over 3 seconds for 5 minutes"
      # The `host` label is a hostname, so it is no longer described as a
      # service in the message.
      description: "Requests to host {{ $labels.host }} (service {{ $labels.service }}) have taken over 3 seconds (P99) to complete."