Diffstat (limited to 'kubernetes/namespaces/monitoring')
44 files changed, 1441 insertions, 0 deletions
diff --git a/kubernetes/namespaces/monitoring/alerts/Makefile b/kubernetes/namespaces/monitoring/alerts/Makefile new file mode 100644 index 0000000..c599ee6 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/Makefile @@ -0,0 +1,11 @@ +.PHONY: alerts alertmanager + +all: alerts alertmanager + +# Upload the alerting rules to the Kubernetes cluster +alerts: + kubectl create configmap -n monitoring prometheus-alert-rules --from-file=alerts.d/ -o yaml --dry-run=client | kubectl apply -f - + +# Upload the alertmanager configuration to the Kubernetes cluster +alertmanager: + kubectl create configmap -n monitoring alertmanager-config --from-file=alertmanager.yaml=alertmanager.yaml -o yaml --dry-run=client | kubectl apply -f - diff --git a/kubernetes/namespaces/monitoring/alerts/README.md b/kubernetes/namespaces/monitoring/alerts/README.md new file mode 100644 index 0000000..75f70ac --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/README.md @@ -0,0 +1,5 @@ +# Alerts + +This directory contains alerting rules and routing configuration for production. + +To build and upload this configuration, see the annotated `Makefile` in this directory. diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml new file mode 100644 index 0000000..bef166a --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml @@ -0,0 +1,24 @@ +route: + group_by: ['alertname', 'cluster', 'service'] + + group_wait: 15s + + group_interval: 1m + + receiver: devops-team + +receivers: +- name: devops-team + slack_configs: + - api_url_file: "/opt/pydis/alertmanager/webhooks/DEVOPS_HOOK" + send_resolved: true + title: '{{ if eq .Status "firing" }}[FIRING]{{ else }}[RESOLVED]{{ end }}' + text: | + {{ if eq .Status "firing" }}{{ range .Alerts }} + **{{ .Annotations.summary }}:** + {{ .Annotations.description }} [(Link)]({{.GeneratorURL}}) + + {{ end }}{{ else }}Alert has resolved.{{ end }} + fields: + - title: Alert + value: "{{ .GroupLabels.alertname }}" diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml new file mode 100644 index 0000000..4f1c322 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml @@ -0,0 +1,92 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alertmanager + namespace: monitoring +spec: + replicas: 3 + selector: + matchLabels: + app: alertmanager + template: + metadata: + labels: + app: alertmanager + spec: + serviceAccountName: prometheus + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - alertmanager + namespaces: + - monitoring + topologyKey: kubernetes.io/hostname + weight: 100 + initContainers: + - image: debian:bullseye-slim + imagePullPolicy: Always + name: alertmanager-peering-setup + command: [ + '/opt/pydis/alertmanager/init.d/find-pods.sh' + ] + volumeMounts: + - name: alertmanager-init + mountPath: /opt/pydis/alertmanager/init.d + - name: alertmanager-tmp + mountPath: /tmp + securityContext: + runAsUser: 0 + containers: + - image: prom/alertmanager:latest + imagePullPolicy: Always + name: alertmanager + command: + - /bin/sh + - -c + - | + exec /bin/alertmanager \ + --config.file=/opt/pydis/alertmanager/config.d/alertmanager.yaml \ + --web.external-url=https://alertmanager.pythondiscord.com \ + 
--storage.path=/data/alertmanager \ + $(cat /tmp/peers) + ports: + - name: am + containerPort: 9093 + - name: am-peering + containerPort: 9094 + volumeMounts: + - name: alertmanager-config + mountPath: /opt/pydis/alertmanager/config.d + - name: alertmanager-webhooks + mountPath: /opt/pydis/alertmanager/webhooks + - name: alertmanager-tmp-data + mountPath: /data + - name: alertmanager-tmp + mountPath: /tmp + securityContext: + readOnlyRootFilesystem: true + restartPolicy: Always + volumes: + - name: alertmanager-config + configMap: + name: alertmanager-config + - name: alertmanager-webhooks + secret: + secretName: alert-manager-hook + - name: alertmanager-tmp-data + emptyDir: {} + - name: alertmanager-tmp + emptyDir: {} + - name: alertmanager-init + configMap: + name: alertmanager-init + defaultMode: 0777 + securityContext: + fsGroup: 1000 + runAsUser: 1000 diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml new file mode 100644 index 0000000..fc99e52 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: alertmanager + namespace: monitoring +spec: + tls: + - hosts: + - "*.pythondiscord.com" + rules: + - host: alertmanager.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: alertmanager + port: + number: 9093 diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml new file mode 100644 index 0000000..f1f36e2 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: alertmanager-init + namespace: monitoring +data: + find-pods.sh: | + #!/bin/sh + + # Install curl and jq for JSON parsing + apt update && apt install -y curl jq + + # Find the template hash + echo Finding template hash... + TEMPLATE_HASH=$(echo $HOSTNAME | cut -d- -f2) + + # Query kubernetes API for all matching pods + echo Querying Kubernetes API for pods... + PODS=$(curl \ + -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \ + https://kubernetes.default/api/v1/namespaces/monitoring/pods\?labelSelector=pod-template-hash=$TEMPLATE_HASH\&pretty=false -sk -o /tmp/peers.json) + + echo Finding Alertmanager IPs... + AM_IPS=$(jq '.items[].status.podIP' /tmp/peers.json -r) + + echo Generating CLI flags for Alertmanager... + PEER_ARGS=$(echo $AM_IPS | sed 's/ /\n/g' | awk '{ print "--cluster.peer="$1":9094" }') + + echo Writing CLI flags to /tmp/peers... 
+ echo $PEER_ARGS > /tmp/peers diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml new file mode 100644 index 0000000..8ec901a --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager-sd + namespace: monitoring +spec: + selector: + app: alertmanager + clusterIP: None + ports: + - port: 9093 + targetPort: 9093 + name: am + - port: 9094 + targetPort: 9094 + name: am-peering diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml Binary files differ new file mode 100644 index 0000000..7cc1d95 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml new file mode 100644 index 0000000..3f26311 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml @@ -0,0 +1,28 @@ +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: alertmanager +rules: +- apiGroups: [""] + resources: ["pods", "endpoints"] + verbs: ["get", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: alertmanager + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: alertmanager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: alertmanager +subjects: + - kind: ServiceAccount + name: alertmanager + namespace: monitoring diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml new file mode 100644 index 0000000..145b1e2 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager + namespace: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9093" +spec: + selector: + app: alertmanager + ports: + - port: 9093 + targetPort: 9093 diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml new file mode 100644 index 0000000..b3fcad9 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml @@ -0,0 +1,21 @@ +groups: +- name: alertmanager + rules: + + - alert: AlertManagerClusterFailedPeers + expr: alertmanager_cluster_failed_peers > 0 + for: 1m + labels: + severity: warning + annotations: + summary: "An Alertmanager node is reporting failed peers" + description: "AM {{ $labels.instance }} is reporting that {{ $value }} of its peers have failed."
+ + - alert: AlertManagerHealthScore + expr: alertmanager_cluster_health_score > 0 + for: 1m + labels: + severity: warning + annotations: + summary: "An Alertmanager node is reporting an unhealthy cluster" + description: "AM {{ $labels.instance }} is reporting that the cluster has a health score of {{ $value }} (where 0 is healthy)." diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml new file mode 100644 index 0000000..10eb3dd --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml @@ -0,0 +1,13 @@ +groups: +- name: certificates + interval: 1d + rules: + + - alert: CertificateExpiringSoon + expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7 + for: 0m + labels: + severity: warning + annotations: + summary: "Certificate is expiring in < 7 days" + description: "The certificate named {{ $labels.name }} is due for expiry in {{ $value | humanize }} days." diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml new file mode 100644 index 0000000..9daa660 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml @@ -0,0 +1,20 @@ +groups: +- name: coredns + rules: + + - alert: CoreDNSPanics + expr: increase(coredns_panics_total[1m]) > 0 + for: 0m + labels: + severity: page + annotations: + summary: "CoreDNS is experiencing panics" + description: "Number of CoreDNS panics encountered: {{ $value }}" + + - alert: CoreDNSCacheMisses + expr: rate(coredns_cache_misses_total{}[10m]) / rate(coredns_cache_misses_total{}[10m] offset 10m) > 5.00 + labels: + severity: page + annotations: + summary: "High rate of CoreDNS cache misses in the last 10 minutes" + description: "This can sometimes be an indication of networking trouble; the cache miss rate is currently {{ $value | humanizePercentage }} of the rate over the previous 10 minutes."
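The rule files in alerts.d/ and the Alertmanager routing config can be checked locally before the Makefile above packages them into ConfigMaps. A minimal sketch, assuming promtool (shipped with Prometheus) and amtool (shipped with Alertmanager) are on the PATH and the commands are run from kubernetes/namespaces/monitoring/alerts:

# Validate the alerting rule syntax and expressions
promtool check rules alerts.d/*.yaml

# Validate the routing tree and receivers
amtool check-config alertmanager.yaml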
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml new file mode 100644 index 0000000..5e8868e --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml @@ -0,0 +1,21 @@ +groups: +- name: cpu + rules: + + - alert: HighCPUThrottling + expr: rate(container_cpu_cfs_throttled_seconds_total{pod=~".+", container_name!="POD", image!=""}[5m]) > 1 + for: 5m + labels: + severity: page + annotations: + summary: "Container {{ $labels.container_name }} in {{ $labels.pod }} is being heavily CPU throttled" + description: "{{ $labels.container_name }} inside {{ $labels.pod }} is spending {{ $value }} seconds per second throttled" + + - alert: HighNodeCPU + expr: 100 - (avg by (kubernetes_node) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: page + annotations: + summary: "Node {{ $labels.kubernetes_node }} has had CPU usage over 80% for the last 5 minutes" + description: "CPU usage on {{ $labels.kubernetes_node }} is averaging {{ $value }}%" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml new file mode 100644 index 0000000..723d267 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml @@ -0,0 +1,20 @@ +groups: +- name: jobs + rules: + - alert: KubernetesCronjobSuspended + expr: kube_cronjob_spec_suspend != 0 + for: 0m + labels: + severity: page + annotations: + summary: "Kubernetes CronJob suspended: {{ $labels.cronjob }}" + description: "CronJob {{ $labels.kubernetes_namespace }}/{{ $labels.cronjob }} is suspended" + + - alert: KubernetesJobFailed + expr: kube_job_status_failed > 0 + for: 0m + labels: + severity: page + annotations: + summary: "Kubernetes Job failed: {{ $labels.job_name }}" + description: "Job {{$labels.kubernetes_namespace}}/{{$labels.job_name}} failed to complete" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml new file mode 100644 index 0000000..d53da5e --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml @@ -0,0 +1,12 @@ +groups: +- name: memory + rules: + + - alert: NodeHighMemoryUsage + expr: node_memory_Active_bytes / node_memory_MemTotal_bytes > 0.8 + for: 30s + labels: + severity: page + annotations: + summary: "Node {{ $labels.kubernetes_node }} has had RAM usage above 80% for 30 seconds" + description: 'RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}' diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml new file mode 100644 index 0000000..441f7df --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml @@ -0,0 +1,30 @@ +groups: +- name: nginx + rules: + + - alert: NGINX4XXRequests + expr: sum by(service) (rate(nginx_ingress_controller_requests{status=~"^4..", status!="404", service!="pixels"}[1m])) / sum by(service) (rate(nginx_ingress_controller_requests[1m])) > 0.5 + for: 1m + labels: + severity: page + annotations: + summary: "High rate of 4XX responses for inbound requests" + description: "Rate of 4XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`" + + - alert: NGINX5XXRequests + expr: sum(rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])) by (service) / sum(rate(nginx_ingress_controller_requests{}[1m])) by (service) > 0.5 + for: 1m + labels: + severity: page + annotations:
+ summary: "High rate of 5XX requests for inbound requests" + description: "Rate of 5XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`" + + - alert: NGINXP99Timing + expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase|prestashop-svc)", host!="pydis-api.default.svc.cluster.local"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10 + for: 5m + labels: + severity: page + annotations: + summary: "Request timing P99 has been over 3 seconds for 5 minutes" + description: "Requests to service {{ $labels.host }} (to service {{ $labels.service }}) have taken over 3 seconds (P99) to complete." diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml new file mode 100644 index 0000000..6bfa6d1 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml @@ -0,0 +1,49 @@ +groups: +- name: nodes + rules: + + - alert: KubernetesNodeDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + for: 1m + labels: + severity: page + annotations: + summary: Node {{ $labels.kubernetes_node }} is experiencing disk pressure + description: "{{ $labels.kubernetes_node }} does not have adequate space to work with." + + - alert: KubernetesNodeMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 + for: 15s + labels: + severity: page + annotations: + summary: Node {{ $labels.kubernetes_node }} is experiencing memory pressure + description: "{{ $labels.kubernetes_node }} does not have adequate RAM to work with." + + - alert: KubernetesNodeNetworkUnavailable + expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1 + for: 15s + labels: + severity: page + annotations: + summary: Node {{ $labels.kubernetes_node }} is experiencing network problems + description: "{{ $labels.kubernetes_node }} is experiencing trouble with inbound and outbound connections" + + + - alert: KubernetesNodePIDPressure + expr: kube_node_status_condition{condition="PIDPressure",status="true"} == 1 + for: 15s + labels: + severity: page + annotations: + summary: Node {{ $labels.kubernetes_node }} is experiencing PID exhaustion + description: "{{ $labels.kubernetes_node }} does not have enough PIDs to work with." + + - alert: KubernetesNodeReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 5m + labels: + severity: page + annotations: + summary: Kubernetes node ({{ $labels.kubernetes_node }} ) is marked as unready + description: "Node {{ $labels.kubernetes_node }} has been unready for a long time" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml new file mode 100644 index 0000000..9efdffa --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml @@ -0,0 +1,20 @@ +groups: +- name: pods + rules: + - alert: KubernetesPodNotHealthy + expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[3m:1m]) > 0 + for: 3m + labels: + severity: page + annotations: + summary: "Kubernetes Pod not healthy: {{ $labels.namespace }}/{{ $labels.pod }}" + description: "Pod has been in a non-ready state for longer than 3 minutes." 
+ + - alert: KubernetesPodCrashLooping + expr: increase(kube_pod_container_status_restarts_total[5m]) > 3 + for: 1m + labels: + severity: warning + annotations: + summary: "Kubernetes pod crash looping: {{ $labels.kubernetes_namespace }}/{{ $labels.pod }}" + description: "Pod {{ $labels.pod }} is crash looping (more than 3 restarts in the last 5 minutes)" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml new file mode 100644 index 0000000..399a84b --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml @@ -0,0 +1,29 @@ +groups: +- name: postgres + rules: + - alert: PostgresUp + expr: pg_up == 0 + for: 0m + labels: + severity: page + annotations: + summary: "PostgreSQL is offline" + description: "Postgres Exporter cannot connect to PostgreSQL." + + - alert: PostgresTooManyConnections + expr: (sum(pg_stat_activity_count) by (instance)) / on (instance) pg_settings_max_connections * 100 > 80 + for: 1m + labels: + severity: page + annotations: + summary: PostgreSQL connections near max_connections setting + description: "PostgreSQL instance is near the maximum connection limit, currently at {{ $value }}% of max_connections" + + - alert: PostgresDeadlockedTable + expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 3 + for: 1m + labels: + severity: page + annotations: + summary: Too many deadlocked tables + description: "PostgreSQL has deadlocks, value: {{ $value }}" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml new file mode 100644 index 0000000..25e555d --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml @@ -0,0 +1,13 @@ +groups: +- name: prometheus + rules: + + # Alert for any instance that is unreachable for >5 minutes. + - alert: InstanceDown + expr: up == 0 + for: 5m + labels: + severity: page + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml new file mode 100644 index 0000000..6b946f6 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml @@ -0,0 +1,20 @@ +groups: +- name: redis + rules: + - alert: RedisDown + expr: redis_up == 0 + for: 1m + labels: + severity: page + annotations: + summary: "Redis is offline" + description: "Redis Exporter cannot connect to Redis." + + - alert: RedisOutOfMemory + expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9 + for: 0m + labels: + severity: page + annotations: + summary: "Redis is approaching its memory limit" + description: "Redis is currently using {{ $value | humanizePercentage }} of configured memory."
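Once edited, the rule files and routing config are pushed with the Makefile at the top of this diff and Prometheus is asked to re-read them. A rough sketch, assuming kubectl is pointed at the production cluster and the commands run from kubernetes/namespaces/monitoring/alerts; the reload call works because the Prometheus deployment later in this diff passes --web.enable-lifecycle:

# Rebuild and apply the prometheus-alert-rules and alertmanager-config ConfigMaps
make alerts alertmanager

# Once the ConfigMap volume has refreshed inside the pod, trigger a rule reload
kubectl -n monitoring port-forward svc/prometheus 9090:9090 &
curl -X POST http://localhost:9090/-/reload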
diff --git a/kubernetes/namespaces/monitoring/calico-metrics-svc.yaml b/kubernetes/namespaces/monitoring/calico-metrics-svc.yaml new file mode 100644 index 0000000..5690881 --- /dev/null +++ b/kubernetes/namespaces/monitoring/calico-metrics-svc.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: felix-metrics-svc + namespace: kube-system + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9091" +spec: + selector: + k8s-app: calico-node + ports: + - port: 9091 + targetPort: 9091 diff --git a/kubernetes/namespaces/monitoring/exporters/README.md b/kubernetes/namespaces/monitoring/exporters/README.md new file mode 100644 index 0000000..6ed79f5 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/README.md @@ -0,0 +1,8 @@ +# Exporters +This directory contains Prometheus exporters for various services running on our cluster. + +If an exporter requires any secrets, they live in a `secrets.yaml` file next to its deployment. + +Below is a list of the exporters: +- [postgres_exporter](https://github.com/wrouesnel/postgres_exporter) +- [redis_exporter](https://github.com/oliver006/redis_exporter) diff --git a/kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml b/kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml new file mode 100644 index 0000000..5542d74 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml @@ -0,0 +1,65 @@ +# Exporter for taking statistics on our PostgreSQL instance +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres-exporter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app: postgres-exporter + template: + metadata: + labels: + app: postgres-exporter + spec: + containers: + - name: postgres-exporter + image: quay.io/prometheuscommunity/postgres-exporter:latest + imagePullPolicy: Always + resources: + requests: + cpu: 5m + memory: 20Mi + limits: + cpu: 20m + memory: 50Mi + ports: + - containerPort: 9187 + env: + - name: PG_EXPORTER_EXTEND_QUERY_PATH + value: /opt/python-discord/queries/queries.yaml + envFrom: + - secretRef: + name: postgres-exporter-env + securityContext: + readOnlyRootFilesystem: true + volumeMounts: + - mountPath: /opt/python-discord/queries + name: queries + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + volumes: + - configMap: + defaultMode: 420 + name: postgres-exporter-queries + name: queries +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres-exporter + namespace: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9187" +spec: + selector: + app: postgres-exporter + ports: + - protocol: TCP + port: 9187 + targetPort: 9187 diff --git a/kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml b/kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml Binary files differ new file mode 100644 index 0000000..bec9067 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml diff --git a/kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml b/kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml new file mode 100644 index 0000000..28a8489 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml @@ -0,0 +1,54 @@ +# Exporter for taking statistics on our Redis instance +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis-exporter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: +
app: redis-exporter + template: + metadata: + labels: + app: redis-exporter + spec: + containers: + - name: redis-exporter + image: oliver006/redis_exporter:latest + imagePullPolicy: Always + resources: + requests: + cpu: 5m + memory: 20Mi + limits: + cpu: 20m + memory: 50Mi + ports: + - containerPort: 9187 + envFrom: + - secretRef: + name: redis-exporter-env + securityContext: + readOnlyRootFilesystem: true + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true +--- +apiVersion: v1 +kind: Service +metadata: + name: redis-exporter + namespace: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9121" +spec: + selector: + app: redis-exporter + ports: + - protocol: TCP + port: 9121 + targetPort: 9121 diff --git a/kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml b/kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml Binary files differnew file mode 100644 index 0000000..f6ce9d0 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml diff --git a/kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml b/kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml new file mode 100644 index 0000000..5b5c2e7 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml @@ -0,0 +1,30 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring +spec: + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + containers: + - image: ghcr.io/python-discord/kube-state-metrics:v2.1.0 + imagePullPolicy: Always + args: + - --metric-labels-allowlist=pods=[*] + name: kube-state-metrics + securityContext: + readOnlyRootFilesystem: true + imagePullSecrets: + - name: ghcr-pull-secret + restartPolicy: Always + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml b/kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml new file mode 100644 index 0000000..17b56cb --- /dev/null +++ b/kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml @@ -0,0 +1,136 @@ +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: kube-state-metrics + namespace: monitoring +rules: + - apiGroups: + - "" + resources: + - configmaps + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - list + - watch + - apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + - ingresses + verbs: + - list + - watch + - apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch + - apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + - apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch + - apiGroups: + - certificates.k8s.io + resources: + - certificatesigningrequests + verbs: + - list + - watch + - apiGroups: + - 
storage.k8s.io + resources: + - storageclasses + - volumeattachments + verbs: + - list + - watch + - apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - list + - watch + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - list + - watch +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml b/kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml new file mode 100644 index 0000000..7faa2c1 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" +spec: + selector: + app: kube-state-metrics + ports: + - protocol: TCP + port: 8080 + targetPort: 8080 diff --git a/kubernetes/namespaces/monitoring/kubewatch/README.md b/kubernetes/namespaces/monitoring/kubewatch/README.md new file mode 100644 index 0000000..294c666 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/README.md @@ -0,0 +1,3 @@ +# Kubewatch + +> **kubewatch** is a Kubernetes watcher that currently publishes notification to available collaboration hubs/notification channels. Run it in your k8s cluster, and you will get event notifications through webhooks. 
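Before relying on kubewatch notifications, it is worth confirming that it loaded the mounted config and that the ClusterRole granted below is sufficient. An illustrative sketch, assuming kubectl access to the cluster:

# Watch kubewatch start up and begin streaming resource events
kubectl -n monitoring logs deploy/kubewatch --tail=50

# Spot-check the RBAC defined in service-account.yaml below by impersonating the ServiceAccount
kubectl auth can-i list deployments --as=system:serviceaccount:monitoring:kubewatch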
diff --git a/kubernetes/namespaces/monitoring/kubewatch/configmap.yaml b/kubernetes/namespaces/monitoring/kubewatch/configmap.yaml new file mode 100644 index 0000000..902cfbc --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/configmap.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: kubewatch-config + namespace: monitoring +data: + .kubewatch.yaml: | + namespace: "" + handler: + discord: + webhook: "" + ignores: + - pixels-discord-channel + - cert-manager-cainjector-leader-election + - cert-manager-controller + - ingress-controller-leader-nginx + - cluster-autoscaler-status + - ingress-controller-leader + resource: + deployment: true + replicationcontroller: true + replicaset: true + daemonset: true + services: true + pod: true + job: true + node: false + clusterrole: true + serviceaccount: true + persistentvolume: true + namespace: true + secret: true + configmap: true + ingress: true diff --git a/kubernetes/namespaces/monitoring/kubewatch/deployment.yaml b/kubernetes/namespaces/monitoring/kubewatch/deployment.yaml new file mode 100644 index 0000000..a674648 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/deployment.yaml @@ -0,0 +1,32 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kubewatch + namespace: monitoring +spec: + selector: + matchLabels: + app: kubewatch + template: + metadata: + labels: + app: kubewatch + spec: + serviceAccountName: kubewatch + containers: + - image: ghcr.io/python-discord/kubewatch:latest + imagePullPolicy: Always + name: kubewatch + volumeMounts: + - name: config-volume + mountPath: /root + envFrom: + - secretRef: + name: kubewatch-secrets + securityContext: + readOnlyRootFilesystem: true + restartPolicy: Always + volumes: + - name: config-volume + configMap: + name: kubewatch-config diff --git a/kubernetes/namespaces/monitoring/kubewatch/secrets.yaml b/kubernetes/namespaces/monitoring/kubewatch/secrets.yaml Binary files differnew file mode 100644 index 0000000..7427da2 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/secrets.yaml diff --git a/kubernetes/namespaces/monitoring/kubewatch/service-account.yaml b/kubernetes/namespaces/monitoring/kubewatch/service-account.yaml new file mode 100644 index 0000000..f0748ba --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/service-account.yaml @@ -0,0 +1,30 @@ +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: kubewatch + namespace: monitoring +rules: +- apiGroups: ["", "extensions", "apps", "batch", "rbac.authorization.k8s.io", ] + resources: ["*"] + verbs: ["get", "watch", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kubewatch + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kubewatch + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kubewatch +subjects: + - kind: ServiceAccount + name: kubewatch + namespace: monitoring diff --git a/kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml b/kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml new file mode 100644 index 0000000..075b1b7 --- /dev/null +++ b/kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml @@ -0,0 +1,84 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter + namespace: monitoring +spec: + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + selector: + matchLabels: + name: node-exporter + phase: prod + template: + 
metadata: + labels: + name: node-exporter + phase: prod + annotations: + seccomp.security.alpha.kubernetes.io/pod: 'docker/default' + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + - matchExpressions: + - key: beta.kubernetes.io/os + operator: In + values: + - linux + securityContext: + runAsNonRoot: true + runAsUser: 65534 + hostPID: true + containers: + - name: node-exporter + image: quay.io/prometheus/node-exporter:v1.2.0 + args: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker|var/lib/containerd|var/lib/containers/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ + ports: + - name: metrics + containerPort: 9100 + securityContext: + readOnlyRootFilesystem: true + resources: + requests: + cpu: 10m + memory: 24Mi + limits: + cpu: 200m + memory: 100Mi + volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + - name: root + mountPath: /host/root + readOnly: true + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + - name: root + hostPath: + path: / diff --git a/kubernetes/namespaces/monitoring/node_exporter/service.yaml b/kubernetes/namespaces/monitoring/node_exporter/service.yaml new file mode 100644 index 0000000..b6be8d5 --- /dev/null +++ b/kubernetes/namespaces/monitoring/node_exporter/service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: node-exporter + namespace: monitoring + annotations: + prometheus.io/scrape: 'true' +spec: + type: ClusterIP + clusterIP: None + selector: + name: node-exporter + phase: prod + ports: + - name: metrics + protocol: TCP + port: 80 + targetPort: 9100 diff --git a/kubernetes/namespaces/monitoring/prometheus/deployment.yaml b/kubernetes/namespaces/monitoring/prometheus/deployment.yaml new file mode 100644 index 0000000..5a806ff --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/deployment.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: monitoring +spec: + strategy: + type: Recreate + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + serviceAccountName: prometheus + containers: + - image: prom/prometheus:latest + imagePullPolicy: Always + args: [ + "--storage.tsdb.path", "/opt/prometheus/data", + "--config.file", "/etc/prometheus/prometheus.yaml", + "--web.external-url", "https://prometheus.pythondiscord.com", + "--web.enable-lifecycle", + "--web.enable-admin-api", + "--web.page-title", "Python Discord Prometheus", + "--storage.tsdb.retention.size", "28GB", + "--storage.tsdb.retention.time", "100d" + ] + name: prometheus + ports: + - name: prometheus + containerPort: 9090 + securityContext: + readOnlyRootFilesystem: true + volumeMounts: + - name: prometheus-data + mountPath: /opt/prometheus/data + - name: prometheus-config + mountPath: /etc/prometheus + - name: prometheus-alerts + mountPath: /opt/pydis/prometheus/alerts.d + restartPolicy: Always + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + volumes: + - name: 
prometheus-data + persistentVolumeClaim: + claimName: prometheus-storage + - name: prometheus-config + configMap: + name: prometheus-config + - name: prometheus-alerts + configMap: + name: prometheus-alert-rules diff --git a/kubernetes/namespaces/monitoring/prometheus/ingress.yaml b/kubernetes/namespaces/monitoring/prometheus/ingress.yaml new file mode 100644 index 0000000..69e240a --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: prometheus + namespace: monitoring +spec: + tls: + - hosts: + - "*.pythondiscord.com" + rules: + - host: prometheus.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + number: 9090 diff --git a/kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml b/kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml new file mode 100644 index 0000000..7ad047c --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml @@ -0,0 +1,267 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: monitoring +data: + prometheus.yaml: |- + # Global config + global: + scrape_interval: 15s + + rule_files: + - /opt/pydis/prometheus/alerts.d/*.yaml + + alerting: + alertmanagers: + - scheme: http + dns_sd_configs: + - names: + - alertmanager-sd.monitoring.svc.cluster.local + type: A + port: 9093 + + # Scrape configs for running Prometheus on a Kubernetes cluster. + # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + scrape_configs: + + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # Using endpoints to discover kube-apiserver targets finds the pod IP + # (host IP since apiserver uses host network) which is not used in + # the server certificate. + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + - replacement: apiserver + action: replace + target_label: job + + # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore + # metrics from a node by scraping kubelet (127.0.0.1:10250/metrics). 
+ - job_name: 'kubelet' + kubernetes_sd_configs: + - role: node + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # Kubelet certs don't have any fixed IP SANs + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: 'monitoring' + target_label: kubernetes_namespace + + metric_relabel_configs: + - source_labels: + - namespace + action: replace + regex: (.+) + target_label: kubernetes_namespace + + # Scrape config for Kubelet cAdvisor. Explore metrics from a node by + # scraping kubelet (127.0.0.1:10250/metrics/cadvisor). + - job_name: 'kubernetes-cadvisor' + kubernetes_sd_configs: + - role: node + + scheme: https + metrics_path: /metrics/cadvisor + tls_config: + # Kubelet certs don't have any fixed IP SANs + insecure_skip_verify: true + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + metric_relabel_configs: + - source_labels: + - namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - pod + regex: (.*) + replacement: $1 + action: replace + target_label: pod_name + - source_labels: + - container + regex: (.*) + replacement: $1 + action: replace + target_label: container_name + + # Scrap etcd metrics from masters via etcd-scraper-proxy + - job_name: 'etcd' + kubernetes_sd_configs: + - role: pod + scheme: http + relabel_configs: + - source_labels: [__meta_kubernetes_namespace] + action: keep + regex: 'kube-system' + - source_labels: [__meta_kubernetes_pod_label_component] + action: keep + regex: 'etcd-scraper-proxy' + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + - job_name: 'kubernetes-service-endpoints' + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: job + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: kubernetes_node + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + metric_relabel_configs: + - source_labels: + - namespace + action: replace + regex: (.+) + target_label: kubernetes_namespace + + # Example scrape config for probing services via the Blackbox Exporter. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe services that have a value of `true` + - job_name: 'kubernetes-services' + + metrics_path: /probe + params: + module: [http_2xx] + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_service_name] + target_label: job + metric_relabel_configs: + - source_labels: + - namespace + action: replace + regex: (.+) + target_label: kubernetes_namespace + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the + # pod's declared ports (default is a port-free target if none are declared). 
+ - job_name: 'kubernetes-pods' + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + metric_relabel_configs: + - source_labels: + - namespace + action: replace + regex: (.+) + target_label: kubernetes_namespace diff --git a/kubernetes/namespaces/monitoring/prometheus/service-account.yaml b/kubernetes/namespaces/monitoring/prometheus/service-account.yaml new file mode 100644 index 0000000..00cf0c2 --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/service-account.yaml @@ -0,0 +1,32 @@ +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: prometheus +rules: +- apiGroups: ["*"] + resources: ["*"] + verbs: ["get", "list", "watch"] +- nonResourceURLs: + - "/metrics" + verbs: + - get +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: monitoring diff --git a/kubernetes/namespaces/monitoring/prometheus/service.yaml b/kubernetes/namespaces/monitoring/prometheus/service.yaml new file mode 100644 index 0000000..5ec3a21 --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: monitoring +spec: + selector: + app: prometheus + ports: + - port: 9090 + targetPort: 9090 diff --git a/kubernetes/namespaces/monitoring/prometheus/volume.yaml b/kubernetes/namespaces/monitoring/prometheus/volume.yaml new file mode 100644 index 0000000..4468a20 --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/volume.yaml @@ -0,0 +1,14 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: prometheus-storage + namespace: monitoring + labels: + app: prometheus +spec: + storageClassName: linode-block-storage-retain + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 30Gi |
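A rough post-deploy check of the stack defined in this diff, assuming a kubeconfig pointing at the cluster; the status queries use the standard Alertmanager v2 and Prometheus v1 HTTP APIs on their service ports:

# Everything in the namespace should settle into Running
kubectl -n monitoring get pods -o wide

# The Alertmanager replicas should have found each other via the init script's --cluster.peer flags
kubectl -n monitoring port-forward svc/alertmanager 9093:9093 &
curl -s http://localhost:9093/api/v2/status | jq '.cluster.peers | length'   # expect one entry per replica

# Prometheus should be scraping the annotated services (exporters, kube-state-metrics, Alertmanager)
kubectl -n monitoring port-forward svc/prometheus 9090:9090 &
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets | length'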