about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- ansible/host_vars/lovelace/prometheus.yml | 15
-rw-r--r-- ansible/roles/prometheus/tasks/main.yml | 2
2 files changed, 16 insertions, 1 deletion
diff --git a/ansible/host_vars/lovelace/prometheus.yml b/ansible/host_vars/lovelace/prometheus.yml
index 41c5a97..d079986 100644
--- a/ansible/host_vars/lovelace/prometheus.yml
+++ b/ansible/host_vars/lovelace/prometheus.yml
@@ -73,3 +73,18 @@ prometheus_configuration:
- source_labels: [__param_target]
target_label: instance
- target_label: __address__
+
+prometheus_rules: |
+ {% raw %}
+ groups:
+ - name: node
+ rules:
+ - alert: node/systemd-unit-failed
+ # expr: node_systemd_unit_state{state="failed", name!="openipmi.service", name!="nvmf-autoconnect.service"} != 0
+ expr: node_systemd_unit_state{state="failed"} != 0
+ for: 15m
+ labels:
+ severity: warning
+ annotations:
+ summary: Node systemd unit {{ $labels.name }} has failed (instance {{ $labels.instance }})
+ {% endraw %}
diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml
index 9333c57..7fd32c4 100644
--- a/ansible/roles/prometheus/tasks/main.yml
+++ b/ansible/roles/prometheus/tasks/main.yml
@@ -37,7 +37,7 @@
copy:
content: |
# Ansible managed
- {{ prometheus_rules | to_nice_yaml }}
+ {{ prometheus_rules }}
dest: /etc/prometheus/rules.yml
owner: prometheus
group: prometheus