# Prometheus alerting rules for the "prometheus" rule group.
groups:
  - name: prometheus
    rules:
      # Alert for any instance that is unreachable for >5 minutes.
      # `up == 0` must hold continuously for the whole `for` window
      # before the alert fires.
      - alert: prometheus/instance-down
        expr: up == 0
        for: 5m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."

      # Alert when Prometheus reports that its last config reload failed.
      # `for: 0m` means there is no hold period: the alert fires on the
      # first evaluation where the expression is true.
      # NOTE(review): the annotations read a `kubernetes_pod_name` label,
      # which presumably comes from scrape relabeling configured elsewhere;
      # confirm it is present on this series, otherwise it renders empty.
      - alert: prometheus/config-failed
        expr: prometheus_config_last_reload_successful == 0
        for: 0m
        labels:
          severity: page
        annotations:
          summary: "Prometheus config reload in pod {{ $labels.kubernetes_pod_name }} has failed"
          description: "Prometheus instance {{ $labels.kubernetes_pod_name }} (`{{ $labels.instance }}`) has failed to reload its config."