aboutsummaryrefslogtreecommitdiffstats
path: root/ansible/host_vars/lovelace/prometheus.yml
blob: b9e0d58c97ddf71c5a2fd601dee465e0119a204b (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
---

# Command-line flags for the Prometheus server: the configuration file
# location, the web page title, and the TSDB storage directory.
# `>-` folds the lines below into one space-separated string and strips the
# trailing newline; the single quotes are part of the resulting value.
# NOTE(review): how this string is split into arguments depends on the
# consumer (service unit or defaults file), which is not visible here.
prometheus_cmdline_options: >-
  --config.file=/etc/prometheus/prometheus.yml
  --web.page-title='Python Discord Helper Monitoring And Supervision Service'
  --storage.tsdb.path='/var/lib/prometheus/metrics2/'

# Prometheus server configuration, kept as a literal block so the Jinja2 loops
# inside it can be expanded to one scrape target per host before the result is
# parsed as YAML.
# NOTE(review): presumably rendered to the path passed via --config.file; the
# task or template that consumes this variable is not visible in this file.
prometheus_configuration: |
  #jinja2: trim_blocks:False

  global:
    scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
    evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
    # scrape_timeout is set to the global default (10s).

  # Alertmanager configuration
  alerting:
    alertmanagers:
      - scheme: https
        static_configs:
          - targets:
              - alertmanager.pydis.wtf

  rule_files:
    - rules.yml
    # - "first_rules.yml"
    # - "second_rules.yml"

  scrape_configs:
    # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
    - job_name: prometheus

      # Override the global default and scrape targets from this job every 5 seconds.
      scrape_interval: 5s
      scrape_timeout: 5s

      # metrics_path defaults to '/metrics'
      # scheme defaults to 'http'.

      static_configs:
        - targets: ['localhost:9090']

    - job_name: node
      # Scrape node exporters on all hosts
      static_configs:
        - targets:
          {%- for host in hostvars.values() %}
          - {{ host['ansible_wg0']['ipv4']['address'] }}:9100
          {%- endfor %}

    - job_name: postgres
      # Scrape PostgreSQL metrics from database hosts
      static_configs:
        - targets:
          {%- for host in groups['databases'] %}
          - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9187
          {%- endfor %}

    - job_name: blackbox-ssh
      metrics_path: /probe
      params:
        module: [ssh_banner]

      static_configs:
        - targets:
            - lovelace.box.pydis.wtf
            - turing.box.pydis.wtf

      relabel_configs:
        # Ensure that the SSH port is included explicitly
        - source_labels: [__address__]
          regex: (.*?)(:.*)?
          replacement: ${1}:22
          target_label: __param_target

        # Keep the probed host:port as the instance label
        - source_labels: [__param_target]
          target_label: instance

        # Send the scrape itself to the exporter listening on localhost:9115
        - target_label: __address__
          replacement: localhost:9115

    - job_name: blackbox-http
      metrics_path: /probe
      params:
        module: [http_2xx]
      static_configs:
        - targets:
            - https://pydis.wtf/
            - https://cloud.native.is.fun.and.easy.pydis.wtf/
      relabel_configs:
        # Pass the listed URL to the exporter as the `target` query parameter
        - source_labels: [__address__]
          target_label: __param_target
        # Keep the probed URL as the instance label
        - source_labels: [__param_target]
          target_label: instance
        # Send the scrape itself to the exporter on localhost:9115, matching
        # blackbox-ssh; without a replacement the address is rewritten to an
        # empty string and the probe is never scraped
        - target_label: __address__
          replacement: localhost:9115

# Prometheus alerting rules, kept as a literal block. The raw/endraw markers
# stop Ansible's Jinja2 pass from expanding the double-brace placeholders, so
# they reach Prometheus unchanged and are filled in when an alert fires.
# NOTE(review): presumably rendered to the rules.yml file listed under
# rule_files in prometheus_configuration; the consuming task is not visible.
prometheus_rules: |
  {% raw %}
  groups:
    - name: node
      rules:
        # A systemd unit has been in the failed state for 15 minutes;
        # openipmi.service and nvmf-autoconnect.service are excluded
        - alert: node/systemd-unit-failed
          expr: node_systemd_unit_state{state="failed", name!="openipmi.service", name!="nvmf-autoconnect.service"} != 0
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: Node systemd unit {{ $labels.name }} has failed (instance {{ $labels.instance }})

    - name: postgres
      rules:
        # The exporter reports pg_up == 0; fires immediately (for: 0m)
        - alert: postgres/up
          expr: pg_up == 0
          for: 0m
          labels:
            severity: page
          annotations:
            summary: "PostgreSQL is offline"
            description: "Postgres Exporter cannot connect to PostgreSQL."

        # The expression is the share of max_connections in use per instance,
        # as a percentage, so the alert value is a percentage and not a count
        - alert: postgres/connection-limit
          expr: (sum(pg_stat_activity_count) by (instance)) / on (instance) pg_settings_max_connections * 100 > 80
          for: 1m
          labels:
            severity: page
          annotations:
            summary: PostgreSQL connections near max_connections setting
            description: "PostgreSQL instance is near the maximum connection limit, currently at {{ $value }}% of max_connections"

        # More than 3 new deadlocks within one minute on any database other
        # than the template databases and `postgres`
        - alert: postgres/deadlocked-table
          expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 3
          for: 1m
          labels:
            severity: page
          annotations:
            summary: Too many deadlocked tables
            description: "PostgreSQL has dead-locks, value: {{ $value }}"

  {% endraw %}