blob: b9e0d58c97ddf71c5a2fd601dee465e0119a204b (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
|
---
# Extra command-line flags appended to the Prometheus service invocation.
# The folded scalar (>-) joins these onto one space-separated line with no
# trailing newline, suitable for interpolation into a systemd ExecStart.
prometheus_cmdline_options: >-
  --config.file=/etc/prometheus/prometheus.yml
  --web.page-title='Python Discord Helper Monitoring And Supervision Service'
  --storage.tsdb.path='/var/lib/prometheus/metrics2/'
# Rendered verbatim (via Jinja2) into /etc/prometheus/prometheus.yml.
# The literal block scalar (|) preserves newlines exactly.
prometheus_configuration: |
  #jinja2: trim_blocks:False
  global:
    scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
    evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
    # scrape_timeout is set to the global default (10s).
  # Alertmanager configuration
  alerting:
    alertmanagers:
      - scheme: https
        static_configs:
          - targets:
              - alertmanager.pydis.wtf
  rule_files:
    - rules.yml
    # - "first_rules.yml"
    # - "second_rules.yml"
  scrape_configs:
    # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
    - job_name: prometheus
      # Override the global default and scrape targets from this job every 5 seconds.
      scrape_interval: 5s
      scrape_timeout: 5s
      # metrics_path defaults to '/metrics'
      # scheme defaults to 'http'.
      static_configs:
        - targets: ['localhost:9090']
    - job_name: node
      # Scrape node exporters on all hosts
      static_configs:
        - targets:
          {%- for host in hostvars.values() %}
            - {{ host['ansible_wg0']['ipv4']['address'] }}:9100
          {%- endfor %}
    - job_name: postgres
      # Scrape PostgreSQL metrics from database hosts
      static_configs:
        - targets:
          {%- for host in groups['databases'] %}
            - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9187
          {%- endfor %}
    - job_name: blackbox-ssh
      metrics_path: /probe
      params:
        module: [ssh_banner]
      static_configs:
        - targets:
            - lovelace.box.pydis.wtf
            - turing.box.pydis.wtf
      relabel_configs:
        # Ensure that the SSH port is included explicitly
        - source_labels: [__address__]
          regex: (.*?)(:.*)?
          replacement: ${1}:22
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        # Point the actual scrape at the local blackbox exporter.
        - target_label: __address__
          replacement: localhost:9115
    - job_name: blackbox-http
      metrics_path: /probe
      params:
        module: [http_2xx]
      static_configs:
        - targets:
            - https://pydis.wtf/
            - https://cloud.native.is.fun.and.easy.pydis.wtf/
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        # FIX: without an explicit replacement, this rule rewrote __address__
        # to an empty string, breaking every blackbox-http scrape. Route the
        # scrape to the local blackbox exporter, mirroring the blackbox-ssh job.
        - target_label: __address__
          replacement: localhost:9115
# Alerting rule groups rendered to rules.yml (loaded via the rule_files entry
# in the Prometheus configuration). {% raw %}/{% endraw %} prevent Jinja2 from
# interpreting Prometheus' own {{ ... }} annotation templates.
prometheus_rules: |
  {% raw %}
  groups:
    - name: node
      rules:
        # Fires when any systemd unit (bar two known-noisy units) stays failed for 15m.
        - alert: node/systemd-unit-failed
          expr: node_systemd_unit_state{state="failed", name!="openipmi.service", name!="nvmf-autoconnect.service"} != 0
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: Node systemd unit {{ $labels.name }} has failed (instance {{ $labels.instance }})
    - name: postgres
      rules:
        # Exporter cannot reach PostgreSQL at all — page immediately (for: 0m).
        - alert: postgres/up
          expr: pg_up == 0
          for: 0m
          labels:
            severity: page
          annotations:
            summary: "PostgreSQL is offline"
            description: "Postgres Exporter cannot connect to PostgreSQL."
        # Connection usage above 80% of max_connections for a sustained minute.
        - alert: postgres/connection-limit
          expr: (sum(pg_stat_activity_count) by (instance)) / on (instance) pg_settings_max_connections * 100 > 80
          for: 1m
          labels:
            severity: page
          annotations:
            summary: PostgreSQL connections near max_connections setting
            description: "PostgreSQL instance is near the maximum connection limit, currently {{ $value }} connections"
        # More than 3 new deadlocks per minute in any non-template user database.
        - alert: postgres/deadlocked-table
          expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 3
          for: 1m
          labels:
            severity: page
          annotations:
            summary: Too many deadlocked tables
            description: "PostgreSQL has dead-locks, value: {{ $value }}"
  {% endraw %}
|