---
# Command-line options for Prometheus. The folded scalar (`>-`) joins the
# lines below into a single space-separated string with no trailing newline.
prometheus_cmdline_options: >-
  --config.file=/etc/prometheus/prometheus.yml
  --web.page-title='Python Discord Helper Monitoring And Supervision Service'
  --storage.tsdb.path='/var/lib/prometheus/metrics2/'

# Jinja2 template for the Prometheus configuration file.
#
# The `#jinja2: trim_blocks:False` header must remain the very first line of
# the template. Scrape targets are generated from inventory groups using each
# host's `ansible_wg0` IPv4 address; the `{%-` markers strip the whitespace
# before each tag so that only the `- <address>:<port>` lines are emitted.
prometheus_configuration: |
  #jinja2: trim_blocks:False
  global:
    scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
    evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
    # scrape_timeout is set to the global default (10s).

  # Alertmanager configuration
  alerting:
    alertmanagers:
      - scheme: https
        static_configs:
          - targets:
              - alertmanager.pydis.wtf

  rule_files:
    - rules.yml
    # - "first_rules.yml"
    # - "second_rules.yml"

  scrape_configs:
    # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
    - job_name: prometheus

      # Override the global default and scrape targets from this job every 5 seconds.
      scrape_interval: 5s
      scrape_timeout: 5s

      # metrics_path defaults to '/metrics'
      # scheme defaults to 'http'.

      static_configs:
        - targets: ['localhost:9090']

    - job_name: node
      # Scrape node exporters on all hosts
      static_configs:
        - targets:
            {%- for host in groups['netcup'] %}
            - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9100
            {%- endfor %}

    - job_name: postgres
      # Scrape PostgreSQL metrics from database hosts
      static_configs:
        - targets:
            {%- for host in groups['databases'] %}
            - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9187
            {%- endfor %}

    - job_name: postfix
      static_configs:
        - targets:
            {%- for host in groups['mail'] %}
            - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9154
            {%- endfor %}

    - job_name: dmarc
      static_configs:
        - targets:
            {%- for host in groups['mail'] %}
            - {{ hostvars[host]['ansible_wg0']['ipv4']['address'] }}:9797
            {%- endfor %}

    - job_name: blackbox-ssh
      metrics_path: /probe
      params:
        module: [ssh_banner]
      static_configs:
        - targets:
            - lovelace.box.pydis.wtf
      relabel_configs:
        # Ensure that the SSH port is included explicitly
        - source_labels: [__address__]
          regex: (.*?)(:.*)?
          replacement: ${1}:22
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        - target_label: __address__
          replacement: localhost:9115

    - job_name: blackbox-http
      metrics_path: /probe
      params:
        module: [http_2xx]
      static_configs:
        - targets:
            - https://pydis.wtf/
            - https://cloud.native.is.fun.and.easy.pydis.wtf/
      relabel_configs:
        - source_labels: [__address__]
          target_label: __param_target
        - source_labels: [__param_target]
          target_label: instance
        # Send the scrape to the same local prober as the blackbox-ssh job.
        # Without an explicit replacement the rewritten __address__ is empty.
        - target_label: __address__
          replacement: localhost:9115

# Prometheus alerting rule groups (node, postgres, postfix).
#
# The body is wrapped in `{% raw %}` / `{% endraw %}` so that the Prometheus
# template expressions (`{{ $labels.* }}`, `{{ $value }}`) are not evaluated
# by Jinja2. Presumably rendered to the `rules.yml` listed under `rule_files`
# in the configuration above - confirm against the consuming role.
prometheus_rules: |
  {% raw %}
  groups:
    - name: node
      rules:
        - alert: node/systemd-unit-failed
          expr: node_systemd_unit_state{state="failed", name!="openipmi.service", name!="nvmf-autoconnect.service"} != 0
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: Node systemd unit {{ $labels.name }} has failed (instance {{ $labels.instance }})

    - name: postgres
      rules:
        - alert: postgres/up
          expr: pg_up == 0
          for: 0m
          labels:
            severity: page
          annotations:
            summary: "PostgreSQL is offline"
            description: "Postgres Exporter cannot connect to PostgreSQL."

        - alert: postgres/connection-limit
          expr: (sum(pg_stat_activity_count) by (instance)) / on (instance) pg_settings_max_connections * 100 > 80
          for: 1m
          labels:
            severity: page
          annotations:
            summary: PostgreSQL connections near max_connections setting
            description: "PostgreSQL instance is near the maximum connection limit, currently {{ $value }} connections"

        - alert: postgres/deadlocked-table
          expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 3
          for: 1m
          labels:
            severity: page
          annotations:
            summary: Too many deadlocked tables
            description: "PostgreSQL has dead-locks, value: {{ $value }}"

    - name: postfix
      rules:
        - alert: postfix/down
          expr: postfix_up != 1
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: Postfix is down (instance {{ $labels.instance }})

        - alert: postfix/smtp-temporary-errors
          expr: rate(postfix_smtpd_messages_rejected_total{code=~"^4.*"}[15m]) > 0
          for: 0m
          labels:
            severity: warning
          annotations:
            summary: Postfix is rejecting messages due to errors (instance {{ $labels.instance }})
            description: >-
              Postfix has seen code {{ $labels.code }} errors recently and temporarily rejected emails.
              https://en.wikipedia.org/wiki/List_of_SMTP_server_return_codes and
              `sudo journalctl -xeu postfix@-` may provide more information on the current issue.
  {% endraw %}