Diffstat (limited to 'kubernetes/namespaces/monitoring')
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/Makefile | 11
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/README.md | 5
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alertmanager.yaml | 24
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml | 92
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml | 24
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml | 30
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml | 16
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml | bin 0 -> 316 bytes
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml | 28
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml | 14
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml | 21
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml | 13
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml | 20
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml | 21
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml | 20
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml | 12
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml | 30
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml | 49
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml | 20
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml | 29
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml | 13
-rw-r--r--  kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml | 20
-rw-r--r--  kubernetes/namespaces/monitoring/calico-metrics-svc.yaml | 14
-rw-r--r--  kubernetes/namespaces/monitoring/exporters/README.md | 8
-rw-r--r--  kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml | 65
-rw-r--r--  kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml | bin 0 -> 307 bytes
-rw-r--r--  kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml | 54
-rw-r--r--  kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml | bin 0 -> 263 bytes
-rw-r--r--  kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml | 30
-rw-r--r--  kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml | 136
-rw-r--r--  kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml | 15
-rw-r--r--  kubernetes/namespaces/monitoring/kubewatch/README.md | 3
-rw-r--r--  kubernetes/namespaces/monitoring/kubewatch/configmap.yaml | 34
-rw-r--r--  kubernetes/namespaces/monitoring/kubewatch/deployment.yaml | 32
-rw-r--r--  kubernetes/namespaces/monitoring/kubewatch/secrets.yaml | bin 0 -> 316 bytes
-rw-r--r--  kubernetes/namespaces/monitoring/kubewatch/service-account.yaml | 30
-rw-r--r--  kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml | 84
-rw-r--r--  kubernetes/namespaces/monitoring/node_exporter/service.yaml | 18
-rw-r--r--  kubernetes/namespaces/monitoring/prometheus/deployment.yaml | 58
-rw-r--r--  kubernetes/namespaces/monitoring/prometheus/ingress.yaml | 24
-rw-r--r--  kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml | 267
-rw-r--r--  kubernetes/namespaces/monitoring/prometheus/service-account.yaml | 32
-rw-r--r--  kubernetes/namespaces/monitoring/prometheus/service.yaml | 11
-rw-r--r--  kubernetes/namespaces/monitoring/prometheus/volume.yaml | 14
44 files changed, 1441 insertions(+), 0 deletions(-)
diff --git a/kubernetes/namespaces/monitoring/alerts/Makefile b/kubernetes/namespaces/monitoring/alerts/Makefile
new file mode 100644
index 0000000..c599ee6
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/Makefile
@@ -0,0 +1,11 @@
+.PHONY: alerts alertmanager
+
+all: alerts alertmanager
+
+# Upload the alerting rules to the Kubernetes cluster
+alerts:
+ kubectl create configmap -n monitoring prometheus-alert-rules --from-file=alerts.d/ -o yaml --dry-run=client | kubectl apply -f -
+
+# Upload the alertmanager configuration to the Kubernetes cluster
+alertmanager:
+ kubectl create configmap -n monitoring alertmanager-config --from-file=alertmanager.yaml=alertmanager.yaml -o yaml --dry-run=client | kubectl apply -f -
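Both targets render a ConfigMap with a client-side dry run and pipe it into `kubectl apply`. A minimal pre-upload check, assuming `promtool` (shipped with Prometheus) and `kubectl` are available locally:

    # Validate every alerting rule file before uploading it to the cluster
    promtool check rules alerts.d/*.yaml

    # Preview the ConfigMap that `make alerts` would apply
    kubectl create configmap -n monitoring prometheus-alert-rules \
        --from-file=alerts.d/ -o yaml --dry-run=client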
diff --git a/kubernetes/namespaces/monitoring/alerts/README.md b/kubernetes/namespaces/monitoring/alerts/README.md
new file mode 100644
index 0000000..75f70ac
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/README.md
@@ -0,0 +1,5 @@
+# Alerts
+
+This directory contains alerting rules and routing configuration for production.
+
+To build and upload this configuration, see the annotated `Makefile` in this directory.
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml
new file mode 100644
index 0000000..bef166a
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml
@@ -0,0 +1,24 @@
+route:
+ group_by: ['alertname', 'cluster', 'service']
+
+ group_wait: 15s
+
+ group_interval: 1m
+
+ receiver: devops-team
+
+receivers:
+- name: devops-team
+ slack_configs:
+ - api_url_file: "/opt/pydis/alertmanager/webhooks/DEVOPS_HOOK"
+ send_resolved: true
+ title: '{{ if eq .Status "firing" }}[FIRING]{{ else }}[RESOLVED]{{ end }}'
+ text: |
+ {{ if eq .Status "firing" }}{{ range .Alerts }}
+ **{{ .Annotations.summary }}:**
+ {{ .Annotations.description }} [(Link)]({{.GeneratorURL}})
+
+ {{ end }}{{ else }}Alert has resolved.{{ end }}
+ fields:
+ - title: Alert
+ value: "{{ .GroupLabels.alertname }}"
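A sketch of validating this routing configuration before uploading it, assuming `amtool` (distributed with Alertmanager) is installed; the `alertname=InstanceDown` label set is only an example:

    # Check the config file for syntax and template errors
    amtool check-config alertmanager.yaml

    # Show which receiver an example label set would be routed to
    amtool config routes test --config.file=alertmanager.yaml alertname=InstanceDown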
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml
new file mode 100644
index 0000000..4f1c322
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml
@@ -0,0 +1,92 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: alertmanager
+ namespace: monitoring
+spec:
+ replicas: 3
+ selector:
+ matchLabels:
+ app: alertmanager
+ template:
+ metadata:
+ labels:
+ app: alertmanager
+ spec:
+ serviceAccountName: prometheus
+ affinity:
+ podAntiAffinity:
+ preferredDuringSchedulingIgnoredDuringExecution:
+ - podAffinityTerm:
+ labelSelector:
+ matchExpressions:
+ - key: app
+ operator: In
+ values:
+ - alertmanager
+ namespaces:
+ - monitoring
+ topologyKey: kubernetes.io/hostname
+ weight: 100
+ initContainers:
+ - image: debian:bullseye-slim
+ imagePullPolicy: Always
+ name: alertmanager-peering-setup
+ command: [
+ '/opt/pydis/alertmanager/init.d/find-pods.sh'
+ ]
+ volumeMounts:
+ - name: alertmanager-init
+ mountPath: /opt/pydis/alertmanager/init.d
+ - name: alertmanager-tmp
+ mountPath: /tmp
+ securityContext:
+ runAsUser: 0
+ containers:
+ - image: prom/alertmanager:latest
+ imagePullPolicy: Always
+ name: alertmanager
+ command:
+ - /bin/sh
+ - -c
+ - |
+ exec /bin/alertmanager \
+ --config.file=/opt/pydis/alertmanager/config.d/alertmanager.yaml \
+ --web.external-url=https://alertmanager.pythondiscord.com \
+ --storage.path=/data/alertmanager \
+ $(cat /tmp/peers)
+ ports:
+ - name: am
+ containerPort: 9093
+ - name: am-peering
+ containerPort: 9094
+ volumeMounts:
+ - name: alertmanager-config
+ mountPath: /opt/pydis/alertmanager/config.d
+ - name: alertmanager-webhooks
+ mountPath: /opt/pydis/alertmanager/webhooks
+ - name: alertmanager-tmp-data
+ mountPath: /data
+ - name: alertmanager-tmp
+ mountPath: /tmp
+ securityContext:
+ readOnlyRootFilesystem: true
+ restartPolicy: Always
+ volumes:
+ - name: alertmanager-config
+ configMap:
+ name: alertmanager-config
+ - name: alertmanager-webhooks
+ secret:
+ secretName: alert-manager-hook
+ - name: alertmanager-tmp-data
+ emptyDir: {}
+ - name: alertmanager-tmp
+ emptyDir: {}
+ - name: alertmanager-init
+ configMap:
+ name: alertmanager-init
+ defaultMode: 0777
+ securityContext:
+ fsGroup: 1000
+ runAsUser: 1000
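With three replicas, the flags written to /tmp/peers by the init container expand the container command into something like the following (the pod IPs are illustrative):

    /bin/alertmanager \
      --config.file=/opt/pydis/alertmanager/config.d/alertmanager.yaml \
      --web.external-url=https://alertmanager.pythondiscord.com \
      --storage.path=/data/alertmanager \
      --cluster.peer=10.2.1.17:9094 \
      --cluster.peer=10.2.2.5:9094 \
      --cluster.peer=10.2.3.42:9094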
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml
new file mode 100644
index 0000000..fc99e52
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml
@@ -0,0 +1,24 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ annotations:
+ nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
+ nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle"
+ nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+ name: alertmanager
+ namespace: monitoring
+spec:
+ tls:
+ - hosts:
+ - "*.pythondiscord.com"
+ rules:
+ - host: alertmanager.pythondiscord.com
+ http:
+ paths:
+ - path: /
+ pathType: Prefix
+ backend:
+ service:
+ name: alertmanager
+ port:
+ number: 9093
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml
new file mode 100644
index 0000000..f1f36e2
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml
@@ -0,0 +1,30 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: alertmanager-init
+ namespace: monitoring
+data:
+ find-pods.sh: |
+ #!/bin/sh
+
+ # Install curl and jq for JSON parsing
+ apt update && apt install -y curl jq
+
+ # Find the template hash
+ echo Finding template hash...
+ TEMPLATE_HASH=$(echo $HOSTNAME | cut -d- -f2)
+
+ # Query kubernetes API for all matching pods
+ echo Querying Kubernetes API for pods...
+ PODS=$(curl \
+ -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \
+ https://kubernetes.default/api/v1/namespaces/monitoring/pods\?labelSelector=pod-template-hash=$TEMPLATE_HASH\&pretty=false -sk -o /tmp/peers.json)
+
+ echo Finding Alertmanager IPs...
+ AM_IPS=$(jq '.items[].status.podIP' /tmp/peers.json -r)
+
+ echo Generating CLI flags for Alertmanager...
+ PEER_ARGS=$(echo $AM_IPS | sed 's/ /\n/g' | awk '{ print "--cluster.peer="$1":9094" }')
+
+ echo Writing CLI flags to /tmp/peers...
+ echo $PEER_ARGS > /tmp/peers
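The same lookup can be reproduced from a workstation; a rough equivalent of what the script writes to /tmp/peers, assuming kubectl access and with <template-hash> as a placeholder for the ReplicaSet's pod-template-hash:

    kubectl get pods -n monitoring \
      -l pod-template-hash=<template-hash> \
      -o jsonpath='{range .items[*]}--cluster.peer={.status.podIP}:9094 {end}'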
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml
new file mode 100644
index 0000000..8ec901a
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: alertmanager-sd
+ namespace: monitoring
+spec:
+ selector:
+ app: alertmanager
+ clusterIP: None
+ ports:
+ - port: 9093
+ targetPort: 9093
+ name: am
+ - port: 9094
+ targetPort: 9094
+ name: am-peering
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml
new file mode 100644
index 0000000..7cc1d95
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml
Binary files differ
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml
new file mode 100644
index 0000000..3f26311
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml
@@ -0,0 +1,28 @@
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+ name: alertmanager
+rules:
+- apiGroups: [""]
+ resources: ["pods", "endpoints"]
+ verbs: ["get", "list"]
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: alertmanager
+ namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: alertmanager
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: alertmanager
+subjects:
+ - kind: ServiceAccount
+ name: alertmanager
+ namespace: monitoring
diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml
new file mode 100644
index 0000000..145b1e2
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: alertmanager
+ namespace: monitoring
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/port: "9093"
+spec:
+ selector:
+ app: alertmanager
+ ports:
+ - port: 9093
+ targetPort: 9093
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
new file mode 100644
index 0000000..b3fcad9
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml
@@ -0,0 +1,21 @@
+groups:
+- name: alertmanager
+ rules:
+
+ - alert: AlertManagerClusterFailedPeers
+ expr: alertmanager_cluster_failed_peers > 0
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "An Alertmanager node is reporting failed peers"
+ description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid."
+
+ - alert: AlertManagerHealthScore
+ expr: alertmanager_cluster_health_score > 0
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "An AlertManagerNode is reporting an unhealthy cluster"
+ description: "AM {{ $labels.instance }} is reporting that the cluster has a health score of {{ $value }} (where 0 is healthy.)"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
new file mode 100644
index 0000000..10eb3dd
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml
@@ -0,0 +1,13 @@
+groups:
+- name: certificates
+ interval: 1d
+ rules:
+
+ - alert: CertificateExpiringSoon
+ expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7
+ for: 0m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Certificate is expiring in < 7 days"
+ description: "The certificate named {{ $labels.name }} is due for expiry in {{ $value | humanize }} days."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
new file mode 100644
index 0000000..9daa660
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml
@@ -0,0 +1,20 @@
+groups:
+- name: coredns
+ rules:
+
+ - alert: CoreDNSPanics
+ expr: increase(coredns_panics_total[1m]) > 0
+ for: 0m
+ labels:
+ severity: page
+ annotations:
+ summary: "CoreDNS is experiencing panic"
+ description: "Number of CoreDNS panics encountered: {{ $value }}"
+
+ - alert: CoreDNSCacheMisses
+ expr: rate(coredns_cache_misses_total{}[10m]) / rate(coredns_cache_misses_total{}[10m] offset 10m) > 5.00
+ labels:
+ severity: page
+ annotations:
+ summary: "High CoreDNS cache misses in last 10 minutes"
+ description: "This can sometimes be an indication of networking troubles, currently {{ $value | humanizePercentage }} over last 10 minutes."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
new file mode 100644
index 0000000..5e8868e
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml
@@ -0,0 +1,21 @@
+groups:
+- name: cpu
+ rules:
+
+ - alert: HighCPUThrottling
+ expr: rate(container_cpu_cfs_throttled_seconds_total{pod=~".+", container_name!="POD", image!=""}[5m]) > 1
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ summary: "Container {{ $labels.container_name }} in {{ $labels.pod }} high throttling "
+ description: "{{ $labels.container_name }} inside {{ $labels.pod }} is at {{ $value }}"
+
+ - alert: HighNodeCPU
+ expr: 100 - (avg by (kubernetes_node) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[5m])) * 100) > 80
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ summary: "Node {{ $labels.kubernetes_node }} has CPU over 80% for last 5 minute"
+ description: "CPU on {{ $labels.kubernetes_node }} is averaging {{ $value }}"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
new file mode 100644
index 0000000..723d267
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml
@@ -0,0 +1,20 @@
+groups:
+- name: jobs
+ rules:
+ - alert: KubernetesCronjobSuspended
+ expr: kube_cronjob_spec_suspend != 0
+ for: 0m
+ labels:
+ severity: page
+ annotations:
+ summary: "Kubernetes CronJob suspended: {{ $labels.cronjob }}"
+ description: "CronJob {{ $labels.kubernetes_namespace }}/{{ $labels.cronjob }} is suspended"
+
+ - alert: KubernetesJobFailed
+ expr: kube_job_status_failed > 0
+ for: 0m
+ labels:
+ severity: page
+ annotations:
+ summary: "Kubernetes Job failed: {{ $labels.job_name }}"
+ description: "Job {{$labels.kubernetes_namespacenamespace}}/{{$labels.job_name}} failed to complete"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
new file mode 100644
index 0000000..d53da5e
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml
@@ -0,0 +1,12 @@
+groups:
+- name: memory
+ rules:
+
+ - alert: NodeHighMemoryUsage
+ expr: node_memory_Active_bytes / node_memory_MemTotal_bytes > 0.8
+ for: 30s
+ labels:
+ severity: page
+ annotations:
+ summary: "Node {{ $labels.kubernetes_node }} has RAM usage >80% for 5 minutes"
+ description: 'RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}'
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
new file mode 100644
index 0000000..441f7df
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml
@@ -0,0 +1,30 @@
+groups:
+- name: nginx
+ rules:
+
+ - alert: NGINX4XXRequests
+ expr: sum by(service) (rate(nginx_ingress_controller_requests{status=~"^4..", status!="404", service!="pixels"}[1m])) / sum by(service) (rate(nginx_ingress_controller_requests[1m])) > 0.5
+ for: 1m
+ labels:
+ severity: page
+ annotations:
+ summary: "High rate of 4XX requests for inbound requests"
+ description: "Rate of 4XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
+
+ - alert: NGINX5XXRequests
+ expr: sum(rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])) by (service) / sum(rate(nginx_ingress_controller_requests{}[1m])) by (service) > 0.5
+ for: 1m
+ labels:
+ severity: page
+ annotations:
+ summary: "High rate of 5XX requests for inbound requests"
+ description: "Rate of 5XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`"
+
+ - alert: NGINXP99Timing
+ expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase|prestashop-svc)", host!="pydis-api.default.svc.cluster.local"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ summary: "Request timing P99 has been over 3 seconds for 5 minutes"
+ description: "Requests to service {{ $labels.host }} (to service {{ $labels.service }}) have taken over 3 seconds (P99) to complete."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
new file mode 100644
index 0000000..6bfa6d1
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml
@@ -0,0 +1,49 @@
+groups:
+- name: nodes
+ rules:
+
+ - alert: KubernetesNodeDiskPressure
+ expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+ for: 1m
+ labels:
+ severity: page
+ annotations:
+ summary: Node {{ $labels.kubernetes_node }} is experiencing disk pressure
+ description: "{{ $labels.kubernetes_node }} does not have adequate space to work with."
+
+ - alert: KubernetesNodeMemoryPressure
+ expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+ for: 15s
+ labels:
+ severity: page
+ annotations:
+ summary: Node {{ $labels.kubernetes_node }} is experiencing memory pressure
+ description: "{{ $labels.kubernetes_node }} does not have adequate RAM to work with."
+
+ - alert: KubernetesNodeNetworkUnavailable
+ expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
+ for: 15s
+ labels:
+ severity: page
+ annotations:
+ summary: Node {{ $labels.kubernetes_node }} is experiencing network problems
+ description: "{{ $labels.kubernetes_node }} is experiencing trouble with inbound and outbound connections"
+
+
+ - alert: KubernetesNodePIDPressure
+ expr: kube_node_status_condition{condition="PIDPressure",status="true"} == 1
+ for: 15s
+ labels:
+ severity: page
+ annotations:
+ summary: Node {{ $labels.kubernetes_node }} is experiencing PID exhaustion
+ description: "{{ $labels.kubernetes_node }} does not have enough PIDs to work with."
+
+ - alert: KubernetesNodeReady
+ expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+      summary: Kubernetes node {{ $labels.kubernetes_node }} is marked as unready
+      description: "Node {{ $labels.kubernetes_node }} has been unready for more than 5 minutes"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
new file mode 100644
index 0000000..9efdffa
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml
@@ -0,0 +1,20 @@
+groups:
+- name: pods
+ rules:
+ - alert: KubernetesPodNotHealthy
+ expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[3m:1m]) > 0
+ for: 3m
+ labels:
+ severity: page
+ annotations:
+ summary: "Kubernetes Pod not healthy: {{ $labels.namespace }}/{{ $labels.pod }}"
+ description: "Pod has been in a non-ready state for longer than 3 minutes."
+
+ - alert: KubernetesPodCrashLooping
+ expr: increase(kube_pod_container_status_restarts_total[5m]) > 3
+ for: 1m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes pod crash looping: {{ $labels.kubernetes_namespace }}/{{ $labels.pod }}"
+ description: "Pod {{ $labels.pod }} is crash looping"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml
new file mode 100644
index 0000000..399a84b
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml
@@ -0,0 +1,29 @@
+groups:
+- name: postgres
+ rules:
+ - alert: PostgresUp
+ expr: pg_up == 0
+ for: 0m
+ labels:
+ severity: page
+ annotations:
+ summary: "PostgreSQL is offline"
+ description: "Postgres Exporter cannot connect to PostgreSQL."
+
+ - alert: PostgresTooManyConnections
+ expr: (sum(pg_stat_activity_count) by (instance)) / on (instance) pg_settings_max_connections * 100 > 80
+ for: 1m
+ labels:
+ severity: page
+ annotations:
+ summary: PostgreSQL connections near max_connections setting
+ description: "PostgreSQL instance is near the maximum connection limit, currently {{ $value }} connections"
+
+ - alert: PostgresDeadlockedTable
+ expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 3
+ for: 1m
+ labels:
+ severity: page
+ annotations:
+ summary: Too many deadlocked tables
+ description: "PostgreSQL has dead-locks, value: {{ $value }}"
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
new file mode 100644
index 0000000..25e555d
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml
@@ -0,0 +1,13 @@
+groups:
+- name: prometheus
+ rules:
+
+ # Alert for any instance that is unreachable for >5 minutes.
+ - alert: InstanceDown
+ expr: up == 0
+ for: 5m
+ labels:
+ severity: page
+ annotations:
+ summary: "Instance {{ $labels.instance }} down"
+ description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
new file mode 100644
index 0000000..6b946f6
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml
@@ -0,0 +1,20 @@
+groups:
+- name: redis
+ rules:
+ - alert: RedisDown
+ expr: redis_up == 0
+ for: 1m
+ labels:
+ severity: page
+ annotations:
+ summary: "Redis is offline"
+ description: "Redis Exporter cannot connect to Redis."
+
+ - alert: RedisOutOfMemory
+ expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9
+ for: 0m
+ labels:
+ severity: page
+ annotations:
+ summary: "Redis is approaching it's memory limit"
+ description: "Redis is currently using {{ $value | humanizePercentage }} of configured memory."
diff --git a/kubernetes/namespaces/monitoring/calico-metrics-svc.yaml b/kubernetes/namespaces/monitoring/calico-metrics-svc.yaml
new file mode 100644
index 0000000..5690881
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/calico-metrics-svc.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: felix-metrics-svc
+ namespace: kube-system
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/port: "9091"
+spec:
+ selector:
+ k8s-app: calico-node
+ ports:
+ - port: 9091
+ targetPort: 9091
diff --git a/kubernetes/namespaces/monitoring/exporters/README.md b/kubernetes/namespaces/monitoring/exporters/README.md
new file mode 100644
index 0000000..6ed79f5
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/exporters/README.md
@@ -0,0 +1,8 @@
+# Exporters
+This directory contains Prometheus exporters for various services running on our cluster.
+
+If an exporter requires any secrets, they live in a `secrets.yaml` file next to its deployment.
+
+Below is a list of the exporters:
+- [postgres_exporter](https://github.com/wrouesnel/postgres_exporter)
+- [redis_exporter](https://github.com/oliver006/redis_exporter)
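A minimal sketch of rolling out one of these exporters, assuming the secrets files have already been populated (paths are relative to this directory):

    # PostgreSQL exporter and its credentials
    kubectl apply -f postgres/secrets.yaml
    kubectl apply -f postgres/postgres_exporter.yaml

    # Redis exporter follows the same pattern
    kubectl apply -f redis/secrets.yaml
    kubectl apply -f redis/redis_exporter.yaml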
diff --git a/kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml b/kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml
new file mode 100644
index 0000000..5542d74
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml
@@ -0,0 +1,65 @@
+# Exporter for taking statistics on our PostgreSQL instance
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: postgres-exporter
+ namespace: monitoring
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: postgres-exporter
+ template:
+ metadata:
+ labels:
+ app: postgres-exporter
+ spec:
+ containers:
+ - name: postgres-exporter
+ image: quay.io/prometheuscommunity/postgres-exporter:latest
+ imagePullPolicy: Always
+ resources:
+ requests:
+ cpu: 5m
+ memory: 20Mi
+ limits:
+ cpu: 20m
+ memory: 50Mi
+ ports:
+ - containerPort: 9187
+ env:
+ - name: PG_EXPORTER_EXTEND_QUERY_PATH
+ value: /opt/python-discord/queries/queries.yaml
+ envFrom:
+ - secretRef:
+ name: postgres-exporter-env
+ securityContext:
+ readOnlyRootFilesystem: true
+ volumeMounts:
+ - mountPath: /opt/python-discord/queries
+ name: queries
+ securityContext:
+ fsGroup: 2000
+ runAsUser: 1000
+ runAsNonRoot: true
+ volumes:
+ - configMap:
+ defaultMode: 420
+ name: postgres-exporter-queries
+ name: queries
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: postgres-exporter
+ namespace: monitoring
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/port: "9187"
+spec:
+ selector:
+ app: postgres-exporter
+ ports:
+ - protocol: TCP
+ port: 9187
+ targetPort: 9187
diff --git a/kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml b/kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml
new file mode 100644
index 0000000..bec9067
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml
Binary files differ
diff --git a/kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml b/kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml
new file mode 100644
index 0000000..28a8489
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml
@@ -0,0 +1,54 @@
+# Exporter for taking statistics on our Redis instance
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: redis-exporter
+ namespace: monitoring
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: redis-exporter
+ template:
+ metadata:
+ labels:
+ app: redis-exporter
+ spec:
+ containers:
+ - name: redis-exporter
+ image: oliver006/redis_exporter:latest
+ imagePullPolicy: Always
+ resources:
+ requests:
+ cpu: 5m
+ memory: 20Mi
+ limits:
+ cpu: 20m
+ memory: 50Mi
+ ports:
+        - containerPort: 9121
+ envFrom:
+ - secretRef:
+ name: redis-exporter-env
+ securityContext:
+ readOnlyRootFilesystem: true
+ securityContext:
+ fsGroup: 2000
+ runAsUser: 1000
+ runAsNonRoot: true
+---
+apiVersion: v1
+kind: Service
+metadata:
+ name: redis-exporter
+ namespace: monitoring
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/port: "9121"
+spec:
+ selector:
+ app: redis-exporter
+ ports:
+ - protocol: TCP
+ port: 9121
+ targetPort: 9121
diff --git a/kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml b/kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml
new file mode 100644
index 0000000..f6ce9d0
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml
Binary files differ
diff --git a/kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml b/kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml
new file mode 100644
index 0000000..5b5c2e7
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml
@@ -0,0 +1,30 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: kube-state-metrics
+ namespace: monitoring
+spec:
+ selector:
+ matchLabels:
+ app: kube-state-metrics
+ template:
+ metadata:
+ labels:
+ app: kube-state-metrics
+ spec:
+ serviceAccountName: kube-state-metrics
+ containers:
+ - image: ghcr.io/python-discord/kube-state-metrics:v2.1.0
+ imagePullPolicy: Always
+ args:
+ - --metric-labels-allowlist=pods=[*]
+ name: kube-state-metrics
+ securityContext:
+ readOnlyRootFilesystem: true
+ imagePullSecrets:
+ - name: ghcr-pull-secret
+ restartPolicy: Always
+ securityContext:
+ fsGroup: 2000
+ runAsUser: 1000
+ runAsNonRoot: true
diff --git a/kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml b/kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml
new file mode 100644
index 0000000..17b56cb
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml
@@ -0,0 +1,136 @@
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+ name: kube-state-metrics
+ namespace: monitoring
+rules:
+ - apiGroups:
+ - ""
+ resources:
+ - configmaps
+ - secrets
+ - nodes
+ - pods
+ - services
+ - resourcequotas
+ - replicationcontrollers
+ - limitranges
+ - persistentvolumeclaims
+ - persistentvolumes
+ - namespaces
+ - endpoints
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - extensions
+ resources:
+ - daemonsets
+ - deployments
+ - replicasets
+ - ingresses
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - apps
+ resources:
+ - statefulsets
+ - daemonsets
+ - deployments
+ - replicasets
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - batch
+ resources:
+ - cronjobs
+ - jobs
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - autoscaling
+ resources:
+ - horizontalpodautoscalers
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - authentication.k8s.io
+ resources:
+ - tokenreviews
+ verbs:
+ - create
+ - apiGroups:
+ - authorization.k8s.io
+ resources:
+ - subjectaccessreviews
+ verbs:
+ - create
+ - apiGroups:
+ - policy
+ resources:
+ - poddisruptionbudgets
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - certificates.k8s.io
+ resources:
+ - certificatesigningrequests
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - storage.k8s.io
+ resources:
+ - storageclasses
+ - volumeattachments
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - admissionregistration.k8s.io
+ resources:
+ - mutatingwebhookconfigurations
+ - validatingwebhookconfigurations
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - networking.k8s.io
+ resources:
+ - networkpolicies
+ verbs:
+ - list
+ - watch
+ - apiGroups:
+ - coordination.k8s.io
+ resources:
+ - leases
+ verbs:
+ - list
+ - watch
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: kube-state-metrics
+ namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: kube-state-metrics
+ namespace: monitoring
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: kube-state-metrics
+subjects:
+ - kind: ServiceAccount
+ name: kube-state-metrics
+ namespace: monitoring
diff --git a/kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml b/kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml
new file mode 100644
index 0000000..7faa2c1
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: kube-state-metrics
+ namespace: monitoring
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/port: "8080"
+spec:
+ selector:
+ app: kube-state-metrics
+ ports:
+ - protocol: TCP
+ port: 8080
+ targetPort: 8080
diff --git a/kubernetes/namespaces/monitoring/kubewatch/README.md b/kubernetes/namespaces/monitoring/kubewatch/README.md
new file mode 100644
index 0000000..294c666
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/kubewatch/README.md
@@ -0,0 +1,3 @@
+# Kubewatch
+
+> **kubewatch** is a Kubernetes watcher that currently publishes notification to available collaboration hubs/notification channels. Run it in your k8s cluster, and you will get event notifications through webhooks.
diff --git a/kubernetes/namespaces/monitoring/kubewatch/configmap.yaml b/kubernetes/namespaces/monitoring/kubewatch/configmap.yaml
new file mode 100644
index 0000000..902cfbc
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/kubewatch/configmap.yaml
@@ -0,0 +1,34 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: kubewatch-config
+ namespace: monitoring
+data:
+ .kubewatch.yaml: |
+ namespace: ""
+ handler:
+ discord:
+ webhook: ""
+ ignores:
+ - pixels-discord-channel
+ - cert-manager-cainjector-leader-election
+ - cert-manager-controller
+ - ingress-controller-leader-nginx
+ - cluster-autoscaler-status
+ - ingress-controller-leader
+ resource:
+ deployment: true
+ replicationcontroller: true
+ replicaset: true
+ daemonset: true
+ services: true
+ pod: true
+ job: true
+ node: false
+ clusterrole: true
+ serviceaccount: true
+ persistentvolume: true
+ namespace: true
+ secret: true
+ configmap: true
+ ingress: true
diff --git a/kubernetes/namespaces/monitoring/kubewatch/deployment.yaml b/kubernetes/namespaces/monitoring/kubewatch/deployment.yaml
new file mode 100644
index 0000000..a674648
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/kubewatch/deployment.yaml
@@ -0,0 +1,32 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: kubewatch
+ namespace: monitoring
+spec:
+ selector:
+ matchLabels:
+ app: kubewatch
+ template:
+ metadata:
+ labels:
+ app: kubewatch
+ spec:
+ serviceAccountName: kubewatch
+ containers:
+ - image: ghcr.io/python-discord/kubewatch:latest
+ imagePullPolicy: Always
+ name: kubewatch
+ volumeMounts:
+ - name: config-volume
+ mountPath: /root
+ envFrom:
+ - secretRef:
+ name: kubewatch-secrets
+ securityContext:
+ readOnlyRootFilesystem: true
+ restartPolicy: Always
+ volumes:
+ - name: config-volume
+ configMap:
+ name: kubewatch-config
diff --git a/kubernetes/namespaces/monitoring/kubewatch/secrets.yaml b/kubernetes/namespaces/monitoring/kubewatch/secrets.yaml
new file mode 100644
index 0000000..7427da2
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/kubewatch/secrets.yaml
Binary files differ
diff --git a/kubernetes/namespaces/monitoring/kubewatch/service-account.yaml b/kubernetes/namespaces/monitoring/kubewatch/service-account.yaml
new file mode 100644
index 0000000..f0748ba
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/kubewatch/service-account.yaml
@@ -0,0 +1,30 @@
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+ name: kubewatch
+ namespace: monitoring
+rules:
+- apiGroups: ["", "extensions", "apps", "batch", "rbac.authorization.k8s.io", ]
+ resources: ["*"]
+ verbs: ["get", "watch", "list"]
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: kubewatch
+ namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: kubewatch
+ namespace: monitoring
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: kubewatch
+subjects:
+ - kind: ServiceAccount
+ name: kubewatch
+ namespace: monitoring
diff --git a/kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml b/kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml
new file mode 100644
index 0000000..075b1b7
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml
@@ -0,0 +1,84 @@
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: node-exporter
+ namespace: monitoring
+spec:
+ updateStrategy:
+ type: RollingUpdate
+ rollingUpdate:
+ maxUnavailable: 1
+ selector:
+ matchLabels:
+ name: node-exporter
+ phase: prod
+ template:
+ metadata:
+ labels:
+ name: node-exporter
+ phase: prod
+ annotations:
+ seccomp.security.alpha.kubernetes.io/pod: 'docker/default'
+ spec:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: kubernetes.io/os
+ operator: In
+ values:
+ - linux
+ - matchExpressions:
+ - key: beta.kubernetes.io/os
+ operator: In
+ values:
+ - linux
+ securityContext:
+ runAsNonRoot: true
+ runAsUser: 65534
+ hostPID: true
+ containers:
+ - name: node-exporter
+ image: quay.io/prometheus/node-exporter:v1.2.0
+ args:
+ - --path.procfs=/host/proc
+ - --path.sysfs=/host/sys
+ - --path.rootfs=/host/root
+ - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker|var/lib/containerd|var/lib/containers/.+)($|/)
+ - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
+ ports:
+ - name: metrics
+ containerPort: 9100
+ securityContext:
+ readOnlyRootFilesystem: true
+ resources:
+ requests:
+ cpu: 10m
+ memory: 24Mi
+ limits:
+ cpu: 200m
+ memory: 100Mi
+ volumeMounts:
+ - name: proc
+ mountPath: /host/proc
+ readOnly: true
+ - name: sys
+ mountPath: /host/sys
+ readOnly: true
+ - name: root
+ mountPath: /host/root
+ readOnly: true
+ tolerations:
+ - effect: NoSchedule
+ operator: Exists
+ volumes:
+ - name: proc
+ hostPath:
+ path: /proc
+ - name: sys
+ hostPath:
+ path: /sys
+ - name: root
+ hostPath:
+ path: /
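A quick way to spot-check that the DaemonSet is exporting metrics, assuming local kubectl access (the pod name is looked up rather than hard-coded):

    POD=$(kubectl -n monitoring get pods -l name=node-exporter -o jsonpath='{.items[0].metadata.name}')
    kubectl -n monitoring port-forward "$POD" 9100:9100 &
    curl -s http://localhost:9100/metrics | head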
diff --git a/kubernetes/namespaces/monitoring/node_exporter/service.yaml b/kubernetes/namespaces/monitoring/node_exporter/service.yaml
new file mode 100644
index 0000000..b6be8d5
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/node_exporter/service.yaml
@@ -0,0 +1,18 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: node-exporter
+ namespace: monitoring
+ annotations:
+ prometheus.io/scrape: 'true'
+spec:
+ type: ClusterIP
+ clusterIP: None
+ selector:
+ name: node-exporter
+ phase: prod
+ ports:
+ - name: metrics
+ protocol: TCP
+ port: 80
+ targetPort: 9100
diff --git a/kubernetes/namespaces/monitoring/prometheus/deployment.yaml b/kubernetes/namespaces/monitoring/prometheus/deployment.yaml
new file mode 100644
index 0000000..5a806ff
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/prometheus/deployment.yaml
@@ -0,0 +1,58 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: prometheus
+ namespace: monitoring
+spec:
+ strategy:
+ type: Recreate
+ selector:
+ matchLabels:
+ app: prometheus
+ template:
+ metadata:
+ labels:
+ app: prometheus
+ spec:
+ serviceAccountName: prometheus
+ containers:
+ - image: prom/prometheus:latest
+ imagePullPolicy: Always
+ args: [
+ "--storage.tsdb.path", "/opt/prometheus/data",
+ "--config.file", "/etc/prometheus/prometheus.yaml",
+ "--web.external-url", "https://prometheus.pythondiscord.com",
+ "--web.enable-lifecycle",
+ "--web.enable-admin-api",
+ "--web.page-title", "Python Discord Prometheus",
+ "--storage.tsdb.retention.size", "28GB",
+ "--storage.tsdb.retention.time", "100d"
+ ]
+ name: prometheus
+ ports:
+ - name: prometheus
+ containerPort: 9090
+ securityContext:
+ readOnlyRootFilesystem: true
+ volumeMounts:
+ - name: prometheus-data
+ mountPath: /opt/prometheus/data
+ - name: prometheus-config
+ mountPath: /etc/prometheus
+ - name: prometheus-alerts
+ mountPath: /opt/pydis/prometheus/alerts.d
+ restartPolicy: Always
+ securityContext:
+ fsGroup: 2000
+ runAsUser: 1000
+ runAsNonRoot: true
+ volumes:
+ - name: prometheus-data
+ persistentVolumeClaim:
+ claimName: prometheus-storage
+ - name: prometheus-config
+ configMap:
+ name: prometheus-config
+ - name: prometheus-alerts
+ configMap:
+ name: prometheus-alert-rules
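Because the server runs with --web.enable-lifecycle, configuration and rule changes (for example after `make alerts` updates the prometheus-alert-rules ConfigMap) can be picked up without restarting the pod; a sketch, assuming local kubectl access:

    # Expose the Prometheus port locally, then ask the server to reload its config and rules
    kubectl -n monitoring port-forward deploy/prometheus 9090:9090 &
    curl -X POST http://localhost:9090/-/reload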
diff --git a/kubernetes/namespaces/monitoring/prometheus/ingress.yaml b/kubernetes/namespaces/monitoring/prometheus/ingress.yaml
new file mode 100644
index 0000000..69e240a
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/prometheus/ingress.yaml
@@ -0,0 +1,24 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+ annotations:
+ nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
+ nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle"
+ nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
+ name: prometheus
+ namespace: monitoring
+spec:
+ tls:
+ - hosts:
+ - "*.pythondiscord.com"
+ rules:
+ - host: prometheus.pythondiscord.com
+ http:
+ paths:
+ - path: /
+ pathType: Prefix
+ backend:
+ service:
+ name: prometheus
+ port:
+ number: 9090
diff --git a/kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml b/kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml
new file mode 100644
index 0000000..7ad047c
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml
@@ -0,0 +1,267 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: prometheus-config
+ namespace: monitoring
+data:
+ prometheus.yaml: |-
+ # Global config
+ global:
+ scrape_interval: 15s
+
+ rule_files:
+ - /opt/pydis/prometheus/alerts.d/*.yaml
+
+ alerting:
+ alertmanagers:
+ - scheme: http
+ dns_sd_configs:
+ - names:
+ - alertmanager-sd.monitoring.svc.cluster.local
+ type: A
+ port: 9093
+
+ # Scrape configs for running Prometheus on a Kubernetes cluster.
+ # This uses separate scrape configs for cluster components (i.e. API server, node)
+ # and services to allow each to use different authentication configs.
+ #
+ # Kubernetes labels will be added as Prometheus labels on metrics via the
+ # `labelmap` relabeling action.
+ scrape_configs:
+
+ # Scrape config for API servers.
+ #
+ # Kubernetes exposes API servers as endpoints to the default/kubernetes
+ # service so this uses `endpoints` role and uses relabelling to only keep
+ # the endpoints associated with the default/kubernetes service using the
+ # default named port `https`. This works for single API server deployments as
+ # well as HA API server deployments.
+ - job_name: 'kubernetes-apiservers'
+ kubernetes_sd_configs:
+ - role: endpoints
+
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ # Using endpoints to discover kube-apiserver targets finds the pod IP
+ # (host IP since apiserver uses host network) which is not used in
+ # the server certificate.
+ insecure_skip_verify: true
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ # Keep only the default/kubernetes service endpoints for the https port. This
+    # will add targets for each API server for which Kubernetes adds an endpoint to
+ # the default/kubernetes service.
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
+ action: keep
+ regex: default;kubernetes;https
+ - replacement: apiserver
+ action: replace
+ target_label: job
+
+ # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore
+ # metrics from a node by scraping kubelet (127.0.0.1:10250/metrics).
+ - job_name: 'kubelet'
+ kubernetes_sd_configs:
+ - role: node
+
+ scheme: https
+ tls_config:
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ # Kubelet certs don't have any fixed IP SANs
+ insecure_skip_verify: true
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+ - replacement: 'monitoring'
+ target_label: kubernetes_namespace
+
+ metric_relabel_configs:
+ - source_labels:
+ - namespace
+ action: replace
+ regex: (.+)
+ target_label: kubernetes_namespace
+
+ # Scrape config for Kubelet cAdvisor. Explore metrics from a node by
+ # scraping kubelet (127.0.0.1:10250/metrics/cadvisor).
+ - job_name: 'kubernetes-cadvisor'
+ kubernetes_sd_configs:
+ - role: node
+
+ scheme: https
+ metrics_path: /metrics/cadvisor
+ tls_config:
+ # Kubelet certs don't have any fixed IP SANs
+ insecure_skip_verify: true
+ ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
+ bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
+
+ relabel_configs:
+ - action: labelmap
+ regex: __meta_kubernetes_node_label_(.+)
+ metric_relabel_configs:
+ - source_labels:
+ - namespace
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels:
+ - pod
+ regex: (.*)
+ replacement: $1
+ action: replace
+ target_label: pod_name
+ - source_labels:
+ - container
+ regex: (.*)
+ replacement: $1
+ action: replace
+ target_label: container_name
+
+    # Scrape etcd metrics from masters via etcd-scraper-proxy
+ - job_name: 'etcd'
+ kubernetes_sd_configs:
+ - role: pod
+ scheme: http
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_namespace]
+ action: keep
+ regex: 'kube-system'
+ - source_labels: [__meta_kubernetes_pod_label_component]
+ action: keep
+ regex: 'etcd-scraper-proxy'
+ - action: labelmap
+ regex: __meta_kubernetes_pod_label_(.+)
+
+ # Scrape config for service endpoints.
+ #
+ # The relabeling allows the actual service scrape endpoint to be configured
+ # via the following annotations:
+ #
+ # * `prometheus.io/scrape`: Only scrape services that have a value of `true`
+ # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need
+ # to set this to `https` & most likely set the `tls_config` of the scrape config.
+ # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+ # * `prometheus.io/port`: If the metrics are exposed on a different port to the
+ # service then set this appropriately.
+ - job_name: 'kubernetes-service-endpoints'
+
+ kubernetes_sd_configs:
+ - role: endpoints
+
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape]
+ action: keep
+ regex: true
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
+ action: replace
+ target_label: __scheme__
+ regex: (https?)
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
+ action: replace
+ target_label: __address__
+ regex: ([^:]+)(?::\d+)?;(\d+)
+ replacement: $1:$2
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels: [__meta_kubernetes_service_name]
+ action: replace
+ target_label: job
+ - action: replace
+ source_labels:
+ - __meta_kubernetes_pod_node_name
+ target_label: kubernetes_node
+ - source_labels: [__meta_kubernetes_namespace]
+ action: replace
+ target_label: kubernetes_namespace
+ metric_relabel_configs:
+ - source_labels:
+ - namespace
+ action: replace
+ regex: (.+)
+ target_label: kubernetes_namespace
+
+ # Example scrape config for probing services via the Blackbox Exporter.
+ #
+ # The relabeling allows the actual service scrape endpoint to be configured
+ # via the following annotations:
+ #
+ # * `prometheus.io/probe`: Only probe services that have a value of `true`
+ - job_name: 'kubernetes-services'
+
+ metrics_path: /probe
+ params:
+ module: [http_2xx]
+
+ kubernetes_sd_configs:
+ - role: service
+
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
+ action: keep
+ regex: true
+ - source_labels: [__address__]
+ target_label: __param_target
+ - target_label: __address__
+ replacement: blackbox
+ - source_labels: [__param_target]
+ target_label: instance
+ - action: labelmap
+ regex: __meta_kubernetes_service_label_(.+)
+ - source_labels: [__meta_kubernetes_service_name]
+ target_label: job
+ metric_relabel_configs:
+ - source_labels:
+ - namespace
+ action: replace
+ regex: (.+)
+ target_label: kubernetes_namespace
+
+ # Example scrape config for pods
+ #
+ # The relabeling allows the actual pod scrape endpoint to be configured via the
+ # following annotations:
+ #
+ # * `prometheus.io/scrape`: Only scrape pods that have a value of `true`
+ # * `prometheus.io/path`: If the metrics path is not `/metrics` override this.
+ # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the
+ # pod's declared ports (default is a port-free target if none are declared).
+ - job_name: 'kubernetes-pods'
+
+ kubernetes_sd_configs:
+ - role: pod
+
+ relabel_configs:
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
+ action: keep
+ regex: true
+ - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
+ action: replace
+ target_label: __metrics_path__
+ regex: (.+)
+ - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
+ action: replace
+ regex: ([^:]+)(?::\d+)?;(\d+)
+ replacement: $1:$2
+ target_label: __address__
+ - action: labelmap
+ regex: __meta_kubernetes_pod_label_(.+)
+ - source_labels: [__meta_kubernetes_namespace]
+ action: replace
+ target_label: kubernetes_namespace
+ - source_labels: [__meta_kubernetes_pod_name]
+ action: replace
+ target_label: kubernetes_pod_name
+ metric_relabel_configs:
+ - source_labels:
+ - namespace
+ action: replace
+ regex: (.+)
+ target_label: kubernetes_namespace
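The `kubernetes-service-endpoints` job above only scrapes services that carry the prometheus.io annotations; one way to opt a service in (the service name, namespace, and port here are hypothetical):

    kubectl -n default annotate service example-api \
      prometheus.io/scrape=true \
      prometheus.io/port=8080 \
      prometheus.io/path=/metrics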
diff --git a/kubernetes/namespaces/monitoring/prometheus/service-account.yaml b/kubernetes/namespaces/monitoring/prometheus/service-account.yaml
new file mode 100644
index 0000000..00cf0c2
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/prometheus/service-account.yaml
@@ -0,0 +1,32 @@
+---
+kind: ClusterRole
+apiVersion: rbac.authorization.k8s.io/v1
+metadata:
+ name: prometheus
+rules:
+- apiGroups: ["*"]
+ resources: ["*"]
+ verbs: ["get", "list", "watch"]
+- nonResourceURLs:
+ - "/metrics"
+ verbs:
+ - get
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+ name: prometheus
+ namespace: monitoring
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+ name: prometheus
+roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: ClusterRole
+ name: prometheus
+subjects:
+ - kind: ServiceAccount
+ name: prometheus
+ namespace: monitoring
diff --git a/kubernetes/namespaces/monitoring/prometheus/service.yaml b/kubernetes/namespaces/monitoring/prometheus/service.yaml
new file mode 100644
index 0000000..5ec3a21
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/prometheus/service.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: Service
+metadata:
+ name: prometheus
+ namespace: monitoring
+spec:
+ selector:
+ app: prometheus
+ ports:
+ - port: 9090
+ targetPort: 9090
diff --git a/kubernetes/namespaces/monitoring/prometheus/volume.yaml b/kubernetes/namespaces/monitoring/prometheus/volume.yaml
new file mode 100644
index 0000000..4468a20
--- /dev/null
+++ b/kubernetes/namespaces/monitoring/prometheus/volume.yaml
@@ -0,0 +1,14 @@
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+ name: prometheus-storage
+ namespace: monitoring
+ labels:
+ app: prometheus
+spec:
+ storageClassName: linode-block-storage-retain
+ accessModes:
+ - ReadWriteOnce
+ resources:
+ requests:
+ storage: 30Gi