blob: 674a1275daf283ade13484716263a0c0a385fac0 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
|
---
# Command-line flags for the Prometheus server.
# The `>-` folded scalar joins the lines below into one space-separated string
# with no trailing newline. NOTE(review): the single quotes are part of the
# resulting string; confirm the consumer (shell / service unit) strips them.
prometheus_cmdline_options: >-
  --config.file=/etc/prometheus/prometheus.yml
  --web.page-title='Python Discord Helper Monitoring And Supervision Service'
  --storage.tsdb.path='/var/lib/prometheus/metrics2/'
# Jinja2-templated Prometheus server configuration.
#
# Everything under this key is a literal block scalar: the `#` lines inside it
# are part of the rendered Prometheus config, not comments of this vars file.
# The `#jinja2: trim_blocks:False` header must stay on the first line of the
# scalar so the `{%- for %}` loops keep the newline after each tag; each loop
# emits one `<wg0 IPv4 address>:<port>` target per host of the named inventory
# group, at the indentation of the `- {{ ... }}` line.
#
# Both blackbox-* jobs use the same relabelling pattern: the configured target
# becomes the `target` probe parameter and the `instance` label, and
# `__address__` is rewritten to `localhost:9115` so the scrape goes to the
# exporter rather than to the probed host. The final rule of each job needs an
# explicit `replacement`; without one the default replacement yields an empty
# `__address__`.
prometheus_configuration: |
  #jinja2: trim_blocks:False
  global:
    scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
    evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
    # scrape_timeout is set to the global default (10s).
  # Alertmanager configuration
  alerting:
    alertmanagers:
      - scheme: https
        static_configs:
          - targets:
              - alertmanager.pydis.wtf
  rule_files:
    - rules.yml
    # - "first_rules.yml"
    # - "second_rules.yml"
  scrape_configs:
    # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
    - job_name: prometheus
      # Override the global default and scrape targets from this job every 5 seconds.
      scrape_interval: 5s
      scrape_timeout: 5s
      # metrics_path defaults to '/metrics'
      # scheme defaults to 'http'.
      static_configs:
        - targets: ['localhost:9090']
    - job_name: node
      # Scrape node exporters on all hosts
      static_configs:
        - targets:
            {%- for host in groups['netcup'] %}
            - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9100
            {%- endfor %}
    - job_name: postgres
      # Scrape PostgreSQL metrics from database hosts
      static_configs:
        - targets:
            {%- for host in groups['databases'] %}
            - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9187
            {%- endfor %}
    - job_name: postfix
      static_configs:
        - targets:
            {%- for host in groups['mail'] %}
            - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9154
            {%- endfor %}
    - job_name: dmarc
      static_configs:
        - targets:
            {%- for host in groups['mail'] %}
            - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9797
            {%- endfor %}
    - job_name: blackbox-ssh
      metrics_path: /probe
      params:
        module: [ssh_banner]
      static_configs:
        - targets:
            - lovelace.box.pydis.wtf
      relabel_configs:
        # Ensure that the SSH port is included explicitly
        - source_labels: [__address__]
          regex: (.*?)(:.*)?
          replacement: ${1}:22
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: localhost:9115
    - job_name: blackbox-http
      metrics_path: /probe
      params:
        module: [http_2xx]
      static_configs:
        - targets:
            - https://pydis.wtf/
            - https://cloud.native.is.fun.and.easy.pydis.wtf/
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: localhost:9115
# Prometheus alerting rules, grouped by the exporter they are based on
# (node, postgres, postfix).
#
# The body is wrapped in a Jinja2 raw block so that the Prometheus alert
# templates (`{{ $labels.* }}`, `{{ $value }}`) pass through Jinja2 untouched.
# As this is a literal block scalar, any `#` line added inside it would become
# part of the rendered rules file.
#
# The postgres/connection-limit expression yields the percentage of
# max_connections in use, so its description reports `$value` as a percentage.
prometheus_rules: |
  {% raw %}
  groups:
    - name: node
      rules:
        - alert: node/systemd-unit-failed
          expr: node_systemd_unit_state{state="failed", name!="openipmi.service", name!="nvmf-autoconnect.service"} != 0
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: Node systemd unit {{ $labels.name }} has failed (instance {{ $labels.instance }})
    - name: postgres
      rules:
        - alert: postgres/up
          expr: pg_up == 0
          for: 0m
          labels:
            severity: page
          annotations:
            summary: "PostgreSQL is offline"
            description: "Postgres Exporter cannot connect to PostgreSQL."
        - alert: postgres/connection-limit
          expr: (sum(pg_stat_activity_count) by (instance)) / on (instance) pg_settings_max_connections * 100 > 80
          for: 1m
          labels:
            severity: page
          annotations:
            summary: PostgreSQL connections near max_connections setting
            description: "PostgreSQL instance is near the maximum connection limit, currently at {{ $value }}% of max_connections"
        - alert: postgres/deadlocked-table
          expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 3
          for: 1m
          labels:
            severity: page
          annotations:
            summary: Too many deadlocked tables
            description: "PostgreSQL has dead-locks, value: {{ $value }}"
    - name: postfix
      rules:
        - alert: postfix/down
          expr: postfix_up != 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: Postfix is down (instance {{ $labels.instance }})
        - alert: postfix/smtp-temporary-errors
          expr: rate(postfix_smtpd_messages_rejected_total{code=~"^4.*"}[15m]) > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Postfix is rejecting messages due to errors (instance {{ $labels.instance }})
            description: Postfix has seen code {{ $labels.code }} errors recently
              and temporarily rejected emails.
              https://en.wikipedia.org/wiki/List_of_SMTP_server_return_codes and
              `sudo journalctl -xeu postfix@-` may provide more information on
              the current issue.
  {% endraw %}
|