From a131791e16de17b24df55473a95b7194a85dafda Mon Sep 17 00:00:00 2001
From: jchristgit
Date: Wed, 8 May 2024 10:18:18 +0200
Subject: Configure Prometheus alerting for failed systemd units (#278)

The two services that I would normally exclude are intentionally not
excluded right now to test out the alertmanager setup. If all goes well, we
should receive a notification on Discord.
---
 ansible/host_vars/lovelace/prometheus.yml | 15 +++++++++++++++
 ansible/roles/prometheus/tasks/main.yml   |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'ansible')

diff --git a/ansible/host_vars/lovelace/prometheus.yml b/ansible/host_vars/lovelace/prometheus.yml
index 41c5a97..d079986 100644
--- a/ansible/host_vars/lovelace/prometheus.yml
+++ b/ansible/host_vars/lovelace/prometheus.yml
@@ -73,3 +73,18 @@ prometheus_configuration:
         - source_labels: [__param_target]
           target_label: instance
         - target_label: __address__
+
+prometheus_rules: |
+  {% raw %}
+  groups:
+    - name: node
+      rules:
+        - alert: node/systemd-unit-failed
+          # expr: node_systemd_unit_state{state="failed", name!="openipmi.service", name!="nvmf-autoconnect.service"} != 0
+          expr: node_systemd_unit_state{state="failed"} != 0
+          for: 15m
+          labels:
+            severity: warning
+          annotations:
+            summary: Node systemd unit {{ $labels.name }} has failed (instance {{ $labels.instance }})
+  {% endraw %}
diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml
index 9333c57..7fd32c4 100644
--- a/ansible/roles/prometheus/tasks/main.yml
+++ b/ansible/roles/prometheus/tasks/main.yml
@@ -37,7 +37,7 @@
   copy:
     content: |
       # Ansible managed
-      {{ prometheus_rules | to_nice_yaml }}
+      {{ prometheus_rules }}
     dest: /etc/prometheus/rules.yml
     owner: prometheus
     group: prometheus
-- 
cgit v1.2.3