ansible/host_vars/lovelace/prometheus.yml


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172

---
# Command-line flags for the Prometheus daemon: configuration file location,
# web UI page title and TSDB storage directory.
# The folded scalar (`>-`) joins the lines below into one space-separated
# string with no trailing newline.
# NOTE(review): the consumer of this variable is not visible in this file;
# presumably the Prometheus role passes it to the service's command line.
prometheus_cmdline_options: >-
  --config.file=/etc/prometheus/prometheus.yml
  --web.page-title='Python Discord Helper Monitoring And Supervision Service'
  --storage.tsdb.path='/var/lib/prometheus/metrics2/'

# Prometheus server configuration, kept as a literal block because it embeds
# Jinja loops that expand inventory groups into scrape targets.
# The `#jinja2: trim_blocks:False` override keeps the newline after each Jinja
# block tag; the `{%- ... %}` loops further down rely on that so every
# generated target ends up on its own line. Keep it as the first line of the
# value.
# NOTE(review): presumably rendered to the path given by --config.file; the
# task that writes it is not visible in this file.
prometheus_configuration: |
  #jinja2: trim_blocks:False

  global:
    scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
    evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
    # scrape_timeout is set to the global default (10s).

  # Alertmanager configuration: alerts are sent over HTTPS to alertmanager.pydis.wtf.
  alerting:
    alertmanagers:
      - scheme: https
        static_configs:
          - targets:
              - alertmanager.pydis.wtf

  # Alerting rule files loaded by Prometheus.
  rule_files:
    - rules.yml
    # - "first_rules.yml"
    # - "second_rules.yml"

  # List of scrape jobs; each `job_name` entry below defines one job.
  scrape_configs:
    # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
    # This first job scrapes localhost:9090 (Prometheus' own metrics endpoint).
    - job_name: prometheus

      # Override the global default and scrape targets from this job every 5 seconds.
      scrape_interval: 5s
      # Abort a scrape that takes longer than 5 seconds (same as the interval above).
      scrape_timeout: 5s

      # metrics_path defaults to '/metrics'
      # scheme defaults to 'http'.

      static_configs:
        - targets: ['localhost:9090']

    - job_name: node
      # Scrape node exporters (port 9100) on every host in the `netcup`
      # inventory group, addressed by the IPv4 address of its wg0 interface.
      # The loop below emits one list item per host; it needs the
      # `ansible_wg0` fact to be available for each of those hosts.
      static_configs:
        - targets:
          {%- for host in groups['netcup'] %}
          - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9100
          {%- endfor %}

    - job_name: postgres
      # Scrape PostgreSQL metrics (port 9187) from every host in the
      # `databases` inventory group, addressed by its wg0 IPv4 address.
      static_configs:
        - targets:
          {%- for host in groups['databases'] %}
          - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9187
          {%- endfor %}

    - job_name: postfix
      # Scrape port 9154 on every host in the `mail` inventory group,
      # addressed by its wg0 IPv4 address.
      # NOTE(review): presumably a Postfix exporter listens there; the
      # exporter itself is not configured in this file.
      static_configs:
        - targets:
          {%- for host in groups['mail'] %}
          - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9154
          {%- endfor %}

    - job_name: dmarc
      # Scrape port 9797 on every host in the `mail` inventory group,
      # addressed by its wg0 IPv4 address.
      # NOTE(review): presumably a DMARC metrics exporter listens there; the
      # exporter itself is not configured in this file.
      static_configs:
        - targets:
          {%- for host in groups['mail'] %}
          - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9797
          {%- endfor %}

    - job_name: blackbox-ssh
      # Probe SSH reachability through the exporter at localhost:9115: the
      # scrape hits /probe with module=ssh_banner, and the relabel rules below
      # turn each listed host into the probe's `target` parameter.
      metrics_path: /probe
      params:
        module: [ssh_banner]

      static_configs:
        - targets:
            - lovelace.box.pydis.wtf

      relabel_configs:
        # Ensure that the SSH port is included explicitly
        # (the first capture group is the host; any port given in the target
        # is discarded and replaced with :22).
        - source_labels: [__address__]
          regex: (.*?)(:.*)?
          replacement: ${1}:22
          target_label: __param_target

        # Keep the probed host:port as the `instance` label of the results.
        - source_labels: [__param_target]
          target_label: instance

        # Send the actual scrape request to the exporter, not to the probed host.
        - target_label: __address__
          replacement: localhost:9115

    - job_name: blackbox-http
      # Probe HTTP(S) endpoints through the exporter at localhost:9115: the
      # scrape hits /probe with module=http_2xx, and the relabel rules below
      # turn each listed URL into the probe's `target` parameter.
      metrics_path: /probe
      params:
        module: [http_2xx]
      static_configs:
        - targets:
            - https://pydis.wtf/
            - https://cloud.native.is.fun.and.easy.pydis.wtf/
      relabel_configs:
        # Pass the listed URL to the exporter as the `target` query parameter.
        - source_labels: [__address__]
          target_label: __param_target
        # Keep the probed URL as the `instance` label of the results.
        - source_labels: [__param_target]
          target_label: instance
        # Send the actual scrape request to the exporter, not to the probed
        # URL. Without an explicit replacement the rule would write an empty
        # value into __address__, leaving the target without an address.
        - target_label: __address__
          replacement: localhost:9115

# Prometheus alerting rules, grouped by subsystem (node, postgres, postfix).
# The body is wrapped in {% raw %} ... {% endraw %} so that Ansible's Jinja
# pass leaves the Prometheus `{{ $labels... }}` / `{{ $value }}` template
# expressions untouched.
# NOTE(review): presumably rendered as the `rules.yml` file referenced by
# `rule_files` in prometheus_configuration; the task is not visible here.
prometheus_rules: |
  {% raw %}
  groups:
    - name: node
      rules:
        # Warn when a systemd unit has been in the failed state for 15 minutes.
        # openipmi.service and nvmf-autoconnect.service are excluded.
        - alert: node/systemd-unit-failed
          expr: node_systemd_unit_state{state="failed", name!="openipmi.service", name!="nvmf-autoconnect.service"} != 0
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: Node systemd unit {{ $labels.name }} has failed (instance {{ $labels.instance }})

    - name: postgres
      rules:
        # Page immediately when the exporter reports PostgreSQL as unreachable.
        - alert: postgres/up
          expr: pg_up == 0
          for: 0m
          labels:
            severity: page
          annotations:
            summary: "PostgreSQL is offline"
            description: "Postgres Exporter cannot connect to PostgreSQL."

        # Page when an instance has used more than 80% of max_connections for
        # one minute. The expression yields a percentage, so the description
        # reports the value as a percentage rather than a connection count.
        - alert: postgres/connection-limit
          expr: (sum(pg_stat_activity_count) by (instance)) / on (instance) pg_settings_max_connections * 100 > 80
          for: 1m
          labels:
            severity: page
          annotations:
            summary: PostgreSQL connections near max_connections setting
            description: "PostgreSQL instance is near the maximum connection limit, currently at {{ $value }}% of max_connections"

        # Page when a database (template* and postgres excluded) has logged
        # more than 3 new deadlocks within a minute, sustained for one minute.
        - alert: postgres/deadlocked-table
          expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 3
          for: 1m
          labels:
            severity: page
          annotations:
            summary: Too many deadlocked tables
            description: "PostgreSQL has dead-locks, value: {{ $value }}"

    # Alerts based on the postfix_* metrics.
    - name: postfix
      rules:
        # Warn when postfix_up has been anything other than 1 for 5 minutes.
        - alert: postfix/down
          expr: postfix_up != 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: Postfix is down (instance {{ $labels.instance }})
        # Warn as soon as any rejection with a 4xx code shows up in the
        # 15-minute rate window (no `for` delay).
        - alert: postfix/smtp-temporary-errors
          expr: rate(postfix_smtpd_messages_rejected_total{code=~"^4.*"}[15m]) > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Postfix is rejecting messages due to errors (instance {{ $labels.instance }})
            description: Postfix has seen code {{ $labels.code }} errors recently
              and temporarily rejected emails.
              https://en.wikipedia.org/wiki/List_of_SMTP_server_return_codes and
              `sudo journalctl -xeu postfix@-` may provide more information on
              the current issue.
  {% endraw %}