author     Joe Banks <[email protected]>    2024-08-07 18:41:02 +0100
committer  Joe Banks <[email protected]>    2024-08-07 18:41:02 +0100
commit     dcbb78959177537cf1fdda813380996a4b2daf8f (patch)
tree       0a53ded19896aaddf93cc8f1e4ff34ac3f70464e
parent     Revert "Enable fail2ban jails for postfix" (diff)
Remove old documentation
-rw-r--r--  docs/Makefile  20
-rw-r--r--  docs/_static/.gitkeep  0
-rw-r--r--  docs/_static/logo.png  bin 6794 -> 0 bytes
-rw-r--r--  docs/_templates/.gitkeep  0
-rw-r--r--  docs/conf.py  40
-rw-r--r--  docs/general/index.rst  9
-rw-r--r--  docs/general/manual-deploys.rst  27
-rw-r--r--  docs/index.rst  50
-rw-r--r--  docs/make.bat  35
-rw-r--r--  docs/meeting_notes/2022-04-07.rst  20
-rw-r--r--  docs/meeting_notes/2022-09-18.rst  74
-rw-r--r--  docs/meeting_notes/2022-10-05.rst  13
-rw-r--r--  docs/meeting_notes/2022-10-19.rst  31
-rw-r--r--  docs/meeting_notes/2022-10-26.rst  18
-rw-r--r--  docs/meeting_notes/2022-11-02.rst  27
-rw-r--r--  docs/meeting_notes/2022-11-23.rst  30
-rw-r--r--  docs/meeting_notes/2023-02-08.rst  17
-rw-r--r--  docs/meeting_notes/2023-02-21.rst  31
-rw-r--r--  docs/meeting_notes/2023-02-28.rst  16
-rw-r--r--  docs/meeting_notes/2023-05-16.rst  15
-rw-r--r--  docs/meeting_notes/2023-07-11.rst  41
-rw-r--r--  docs/meeting_notes/2023-07-18.rst  42
-rw-r--r--  docs/meeting_notes/2023-07-25.rst  4
-rw-r--r--  docs/meeting_notes/2023-08-01.rst  66
-rw-r--r--  docs/meeting_notes/2023-08-08.rst  54
-rw-r--r--  docs/meeting_notes/2023-08-22.rst  40
-rw-r--r--  docs/meeting_notes/2023-08-29.rst  65
-rw-r--r--  docs/meeting_notes/2023-09-05.rst  53
-rw-r--r--  docs/meeting_notes/2023-09-12.rst  73
-rw-r--r--  docs/meeting_notes/2024-07-02.rst  171
-rw-r--r--  docs/meeting_notes/2024-07-25.rst  46
-rw-r--r--  docs/meeting_notes/index.rst  31
-rw-r--r--  docs/meeting_notes/template.rst  22
-rw-r--r--  docs/onboarding/access.rst  50
-rw-r--r--  docs/onboarding/index.rst  17
-rw-r--r--  docs/onboarding/resources.rst  35
-rw-r--r--  docs/onboarding/rules.rst  16
-rw-r--r--  docs/onboarding/tools.rst  50
-rw-r--r--  docs/postmortems/2020-12-11-all-services-outage.rst  121
-rw-r--r--  docs/postmortems/2020-12-11-postgres-conn-surge.rst  130
-rw-r--r--  docs/postmortems/2021-01-10-primary-kubernetes-node-outage.rst  117
-rw-r--r--  docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.rst  155
-rw-r--r--  docs/postmortems/2021-01-30-nodebalancer-fails-memory.rst  146
-rw-r--r--  docs/postmortems/2021-07-11-cascading-node-failures.rst  335
-rw-r--r--  docs/postmortems/images/2021-01-12/site_cpu_throttle.png  bin 227245 -> 0 bytes
-rw-r--r--  docs/postmortems/images/2021-01-12/site_resource_abnormal.png  bin 232260 -> 0 bytes
-rw-r--r--  docs/postmortems/images/2021-01-30/linode_loadbalancers.png  bin 50882 -> 0 bytes
-rw-r--r--  docs/postmortems/images/2021-01-30/memory_charts.png  bin 211053 -> 0 bytes
-rw-r--r--  docs/postmortems/images/2021-01-30/prometheus_status.png  bin 291122 -> 0 bytes
-rw-r--r--  docs/postmortems/images/2021-01-30/scaleios.png  bin 18294 -> 0 bytes
-rw-r--r--  docs/postmortems/index.rst  15
-rw-r--r--  docs/queries/index.rst  12
-rw-r--r--  docs/queries/kubernetes.rst  29
-rw-r--r--  docs/queries/loki.rst  25
-rw-r--r--  docs/queries/postgres.rst  336
-rw-r--r--  docs/runbooks/index.rst  17
-rw-r--r--  docs/runbooks/postgresql-upgrade.rst  149
-rw-r--r--  docs/tooling/bots.rst  55
-rw-r--r--  docs/tooling/index.rst  12
59 files changed, 0 insertions, 3003 deletions
diff --git a/docs/Makefile b/docs/Makefile
deleted file mode 100644
index d4bb2cb..0000000
--- a/docs/Makefile
+++ /dev/null
@@ -1,20 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS ?=
-SPHINXBUILD ?= sphinx-build
-SOURCEDIR = .
-BUILDDIR = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/_static/.gitkeep b/docs/_static/.gitkeep
deleted file mode 100644
index e69de29..0000000
--- a/docs/_static/.gitkeep
+++ /dev/null
diff --git a/docs/_static/logo.png b/docs/_static/logo.png
deleted file mode 100644
index 1c125c7..0000000
--- a/docs/_static/logo.png
+++ /dev/null
Binary files differ
diff --git a/docs/_templates/.gitkeep b/docs/_templates/.gitkeep
deleted file mode 100644
index e69de29..0000000
--- a/docs/_templates/.gitkeep
+++ /dev/null
diff --git a/docs/conf.py b/docs/conf.py
deleted file mode 100644
index d9c0855..0000000
--- a/docs/conf.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# For the full list of built-in configuration values, see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Project information -----------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-
-project = "DevOps"
-copyright = "2024, Python Discord"
-author = "Joe Banks <[email protected]>, King Arthur <[email protected]>"
-
-# -- General configuration ---------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
-
-extensions = []
-
-templates_path = ["_templates"]
-exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
-
-
-# -- Options for HTML output -------------------------------------------------
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
-
-html_theme = "alabaster"
-html_static_path = ["_static"]
-html_theme_options = {
- "logo": "logo.png",
- "logo_name": True,
- "logo_text_align": "center",
- "github_user": "python-discord",
- "github_repo": "infra",
- "github_button": True,
- "extra_nav_links": {
- "DevOps on YouTube": "https://www.youtube.com/watch?v=b2F-DItXtZs",
- "git: Infra": "https://github.com/python-discord/infra/",
- "git: King Arthur": "https://github.com/python-discord/king-arthur/",
- "Kanban Board": "https://github.com/orgs/python-discord/projects/17/views/4",
- },
-}
diff --git a/docs/general/index.rst b/docs/general/index.rst
deleted file mode 100644
index 60a04cb..0000000
--- a/docs/general/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-General
-=======
-
-
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
-
- manual-deploys
diff --git a/docs/general/manual-deploys.rst b/docs/general/manual-deploys.rst
deleted file mode 100644
index 0d874ea..0000000
--- a/docs/general/manual-deploys.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Manual Deployments
-==================
-
-When the DevOps team are not available, Administrators and Core
-Developers can redeploy our critical services, such as Bot, Site and
-ModMail.
-
-This is handled through workflow dispatches on this repository. To get
-started, head to the
-`Actions <https://github.com/python-discord/kubernetes/actions>`__ tab
-of this repository and select ``Manual Redeploy`` in the sidebar, or
-navigate directly
-`here <https://github.com/python-discord/kubernetes/actions/workflows/manual_redeploy.yml>`__.
-
-.. image:: https://user-images.githubusercontent.com/20439493/116442084-00d5f400-a84a-11eb-8e8a-e9e6bcc327dd.png
-
-Click ``Run workflow`` on the right-hand side and enter the name of the
-service that needs redeploying, keeping the branch as ``main``:
-
-.. image:: https://user-images.githubusercontent.com/20439493/116442202-22cf7680-a84a-11eb-8cce-a3e715a1bf68.png
-
-Click ``Run`` and refresh the page; you'll see a new in-progress Action
-which you can track. Once the deployment completes, notifications will
-be sent to the ``#dev-ops`` channel on Discord.
-
-If you encounter errors with this, please copy the Action run link into
-Discord so the DevOps team can investigate when available.
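
The same dispatch can also be triggered from the command line with the
GitHub CLI. A minimal sketch, assuming the workflow takes a ``service``
input (the actual input name may differ):

.. code:: sh

   # Trigger a manual redeploy of the bot from the main branch
   gh workflow run manual_redeploy.yml \
       --repo python-discord/kubernetes \
       --ref main \
       -f service=bot

   # Then pick the newly started run to follow its progress
   gh run watch --repo python-discord/kubernetes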
diff --git a/docs/index.rst b/docs/index.rst
deleted file mode 100644
index 348575d..0000000
--- a/docs/index.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-.. Python Discord DevOps documentation master file, created by
- sphinx-quickstart on Wed Jul 24 19:49:56 2024.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-Python Discord DevOps
-=====================
-
-Welcome to the Python Discord DevOps knowledgebase.
-
-Within this set of pages you will find:
-
-- Changelogs
-
-- Post-mortems
-
-- Common queries
-
-- Runbooks
-
-
-Table of contents
------------------
-
-.. toctree::
- :maxdepth: 2
-
- general/index
- onboarding/index
- postmortems/index
- queries/index
- runbooks/index
- tooling/index
-
-
-Meeting notes
--------------
-
-.. toctree::
- :maxdepth: 2
-
- meeting_notes/index
-
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs/make.bat b/docs/make.bat
deleted file mode 100644
index 954237b..0000000
--- a/docs/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=.
-set BUILDDIR=_build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.If you don't have Sphinx installed, grab it from
- echo.https://www.sphinx-doc.org/
- exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs/meeting_notes/2022-04-07.rst b/docs/meeting_notes/2022-04-07.rst
deleted file mode 100644
index ee23a5d..0000000
--- a/docs/meeting_notes/2022-04-07.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-2022-04-07
-==========
-
-Agenda
-------
-
-- No updates, as last week’s meeting did not take place
-
-Roadmap review & planning
--------------------------
-
-What are we working on for the next meeting?
-
-- Help wanted for #57 (h-asgi)
-- #58 (postgres exporter) needs a new review
-- #54 (firewall in VPN) will be done by Johannes
-- We need a testing environment #67
-- Johannes will add a Graphite role #31
-- Sofi will take a look at #29
-- #41 (policy bot) will be taken care of by Johannes
diff --git a/docs/meeting_notes/2022-09-18.rst b/docs/meeting_notes/2022-09-18.rst
deleted file mode 100644
index 163434c..0000000
--- a/docs/meeting_notes/2022-09-18.rst
+++ /dev/null
@@ -1,74 +0,0 @@
-2022-09-18
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-- Joe will grant Chris access to the netcup hosts.
-
-NetKube status
-~~~~~~~~~~~~~~
-
-- **Rollout**
-
- - ☒ RBAC configuration and access granting
- - ☒ Most nodes are enrolled, Joe will re-check
- - ``turing``, ``ritchie``, ``lovelace`` and ``neumann`` will be
- Kubernetes nodes
- - ``hopper`` will be the storage server
-
-- **Storage drivers**
-
- - Not needed, everything that needs persistent storage will run on
- hopper
- - Netcup does not support storage resize
- - We can download more RAM if we need it
- - A couple of services still need volume mounts: Ghost, Grafana &
- Graphite
-
-- **Control plane high availability**
-
- - Joe mentions that in the case the control plane dies, everything
- else will die as well
- - If the control plane in Germany dies, so will Johannes
-
-- **Early plans for migration**
-
- - We can use the Ansible repository issues for a good schedule
- - Hopper runs ``nginx``
-  - Statement from Joe: "There is an nginx ingress running on every
-    node in the cluster, okay, okay? We don't, the way that's, that's
-    as a service is a NodePort, right? So it has a normal IP, but the
-    port will be like a random port in the range of the 30,000s.
-    Remember that? Hold on. Is he writing rude nodes? And then… We
-    have nginx, so this is where it's like a little bit, like, not
-    nice, I guess we just like, cronjob it, to pull the nodes, like,
-    every minute or so, and then update the config if they change.
-    But then it's just like… nginx is like a catalogue of nodes.
-    Wahhh, you drive me crazy."
-
- - “Nah, it makes sense!”
-
- - “It does!”
-
- - Joe will figure this out with assistance from his voices.
-
-Open authentication
-~~~~~~~~~~~~~~~~~~~
-
-- Joe and Johannes will check out OpenLDAP as a JumpCloud alternative
- starting from this evening
-- Sofi has experience with OpenLDAP
-
-Sponsorship
------------
-
-This meeting has been sponsored by Chris Hemsworth Lovering’s
-relationship therapy company, “Love To Love By Lovering”. You can sign
-up by sending a mail to [email protected].
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2022-10-05.rst b/docs/meeting_notes/2022-10-05.rst
deleted file mode 100644
index e069299..0000000
--- a/docs/meeting_notes/2022-10-05.rst
+++ /dev/null
@@ -1,13 +0,0 @@
-2022-10-05
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-- Joe Banks configured proper RBAC for Chris, Johannes and Joe himself
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2022-10-19.rst b/docs/meeting_notes/2022-10-19.rst
deleted file mode 100644
index 6de7f33..0000000
--- a/docs/meeting_notes/2022-10-19.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-2022-10-19
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-- One hour of gartic phone, for team spirit.
-- Created user accounts for Sofi and Hassan
-- Joe created an architecture diagram of the NGINX setup
-
- - *This is still in Notion*
-
-- Joe explained his NGINX plans: "It's not actually that hard, right?
-  So you spawn 5 instances of nginx in a DaemonSet, because then one
-  gets deployed to every node okay, following? Then we get NodePort,
-  instead of LoadBalancers or whatever, which will get a random port
-  allocated in the 35000 range, and that will go to nginx, and on
-  each of those ports, it will go to nginx, right? And then we poll
-  the Kubernetes API and what is the port that each of these nginx
-  instances is running on, and add that into a roundrobin on the
-  fifth node. Right? Yeah. That's correct. That won't do TLS though,
-  so that will just HAProxy. Yeah."
-- Joe will terminate our JumpCloud account
-- Chris reset the Minecraft server
-- Email alerting needs to be configured
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2022-10-26.rst b/docs/meeting_notes/2022-10-26.rst
deleted file mode 100644
index 69f8c70..0000000
--- a/docs/meeting_notes/2022-10-26.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-2022-10-26
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-- Chris upgraded PostgreSQL to 15 in production
-- Johannes added the Kubernetes user creation script into the
- Kubernetes repository in the docs
-
-*(The rest of the meeting was discussion about the NetKube setup, which
-has been scrapped since)*.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2022-11-02.rst b/docs/meeting_notes/2022-11-02.rst
deleted file mode 100644
index d9f415d..0000000
--- a/docs/meeting_notes/2022-11-02.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-2022-11-02
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-Hanging behaviour of ModMail
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- `Source <https://discord.com/channels/267624335836053506/675756741417369640/1036720683067134052>`__
-
-- Maybe use `Signals + a
- debugger <https://stackoverflow.com/a/25329467>`__?
-
-- … using `something like pdb for the
- debugger <https://wiki.python.org/moin/PythonDebuggingTools>`__?
-
-- Or `GDB, as it seems handy to poke at stuck multi-threaded python
- software <https://wiki.python.org/moin/DebuggingWithGdb>`__?
-
-- ModMail has been upgraded to version 4
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2022-11-23.rst b/docs/meeting_notes/2022-11-23.rst
deleted file mode 100644
index 19edd06..0000000
--- a/docs/meeting_notes/2022-11-23.rst
+++ /dev/null
@@ -1,30 +0,0 @@
-2022-11-23
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-*(This meeting was mostly about NetKube, with the following strange text
-included, and everything outside of the text has been removed since the
-NetKube plans have been scrapped)*.
-
-Joe Banks, after a month-long hiatus to become a dad to every second
-girl on uni campus, has managed to pull up to the DevOps meeting.
-
-We are considering using Kubespray (https://kubespray.io/#/) in order to
-deploy a production-ready bare-metal Kubernetes cluster without
-involvement from Joe “Busy With Poly Girlfriend #20” Banks.
-
-At the moment cluster networking is not working and Joe mentions that
-the last time he has touched it, it worked perfectly fine. However, the
-last time he touched it there was only 1 node, and therefore no
-inter-node communications.
-
-Joe thinks he remembers installing 3 nodes; however, we at the DevOps
-team believe this to be a marijuana dream.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-02-08.rst b/docs/meeting_notes/2023-02-08.rst
deleted file mode 100644
index a161ba5..0000000
--- a/docs/meeting_notes/2023-02-08.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-2023-02-08
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-- Investigation into deploying a VPN tool such as WireGuard to have
- inter-node communication between the Netcup hosts.
-
-*(The rest of this meeting was mostly about NetKube, which has since
-been scrapped)*.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-02-21.rst b/docs/meeting_notes/2023-02-21.rst
deleted file mode 100644
index 9de644c..0000000
--- a/docs/meeting_notes/2023-02-21.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-2023-02-21
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-Reusable status embed workflows
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- Further discussion with Bella followed
-- Upstream pull request can be found at
- `python-discord/bot#2400 <https://github.com/python-discord/bot/pull/2400>`__
-
-Local vagrant testing setup
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-- Our new `testing setup using Vagrant
- VMs <https://github.com/python-discord/infra/pull/78>`__ has been
- merged.
-
-A visit from Mina
-~~~~~~~~~~~~~~~~~
-
-Mina checked in to make sure we’re operating at peak Volkswagen-like
-efficiency.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-02-28.rst b/docs/meeting_notes/2023-02-28.rst
deleted file mode 100644
index 1fb1093..0000000
--- a/docs/meeting_notes/2023-02-28.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-2023-02-28
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-- Black knight’s CI & dependabot configuration has been mirrored across
- all important repositories
-
-- The test server has been updated for the new configuration
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-05-16.rst b/docs/meeting_notes/2023-05-16.rst
deleted file mode 100644
index 79272a6..0000000
--- a/docs/meeting_notes/2023-05-16.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-2023-05-16
-==========
-
-*Migrated from Notion*.
-
-Agenda
-------
-
-- Bella set up `CI bot docker image
- build <https://github.com/python-discord/bot/pull/2603>`__ to make
- sure that wheels are available.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-07-11.rst b/docs/meeting_notes/2023-07-11.rst
deleted file mode 100644
index 68b1085..0000000
--- a/docs/meeting_notes/2023-07-11.rst
+++ /dev/null
@@ -1,41 +0,0 @@
-2023-07-11
-==========
-
-Participants
-------------
-
-- Chris, Johannes, Bella, Bradley
-
-Agenda
-------
-
-New Ansible setup
-~~~~~~~~~~~~~~~~~
-
-Chris presented the new Ansible setup he's been working on. We plan to
-use WireGuard for networking. We agreed that self-hosting Kubernetes is
-not the way to go. In general, the main benefit of switching away from
-Linode to Netcup is going to be a ton more resources from the Netcup
-root servers we were given. The original issue with Linode's LKE,
-constantly having problems with volumes, has not been present for a
-while. Chris mentions the one remaining issue is that we're at half our
-memory capacity just at idle.
-
-It’s our decision where to go from here - we can stick to the Kubernetes
-setup or decide on migrating to the Ansible setup. But we have bare
-metal access to the Netcup hosts, which makes e.g. managing databases a
-lot easier. Chris mentions the possibility to only use Netcup for our
-persistence and Linode AKS for anything else, but this has the issue of
-us relying on two sponsors for our infrastructure instead of one.
-
-PostgreSQL was set up to run on ``lovelace``.
-
-Decision
-~~~~~~~~
-
-**It was decided to hold a vote in the core development channel, which
-will be evaluated next week to see how to proceed with the setup**.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-07-18.rst b/docs/meeting_notes/2023-07-18.rst
deleted file mode 100644
index f37b2dc..0000000
--- a/docs/meeting_notes/2023-07-18.rst
+++ /dev/null
@@ -1,42 +0,0 @@
-2023-07-18
-==========
-
-Secret management improvements
-------------------------------
-
-To allow for **better management of our Kubernetes secrets**, Chris set
-out to configure ``git-crypt`` in GPG key mode. For comparison, the
-previous approach was that secrets were stored in Kubernetes only and
-had to be accessed via ``kubectl``; now ``git-crypt`` allows us to
-transparently work with the files in an unencrypted manner locally,
-whilst keeping them secure on the remote, all via ``.gitattributes``.
-
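For a rough idea of what this looks like in practice (the path below is
illustrative, not the repository's actual entry), ``git-crypt`` is driven
by ``.gitattributes`` filters plus a list of authorised GPG keys:

.. code:: sh

   # Mark the secret manifests for transparent encryption (example path)
   echo 'secrets/**/*.yaml filter=git-crypt diff=git-crypt' >> .gitattributes

   # One-time setup in the repository, then grant a key holder access
   git-crypt init
   git-crypt add-gpg-user 8C05D0E98B7914EDEBDCC8CC8E8E09282F2E17AF

   # Fresh clones stay encrypted until unlocked by a key holder
   git-crypt unlock
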
-The following people currently have access to this:
-
-- Johannes Christ [email protected]
- (``8C05D0E98B7914EDEBDCC8CC8E8E09282F2E17AF``)
-- Chris Lovering [email protected]
- (``1DA91E6CE87E3C1FCE32BC0CB6ED85CC5872D5E4``)
-- Joe Banks [email protected] (``509CDFFC2D0783A33CF87D2B703EE21DE4D4D9C9``)
-
-For Hassan, we are still waiting on a response regarding the accuracy
-of his GPG key.
-
-The pull request for the work can be found `at
-python-discord/kubernetes#156 <https://github.com/python-discord/kubernetes/pull/156>`__.
-
-**To have your key added, please contact any of the existing key
-holders**. More documentation on this topic is pending to be written,
-see
-`python-discord/kubernetes#157 <https://github.com/python-discord/kubernetes/issues/157>`__.
-
-Infrastructure migration decision
----------------------------------
-
-The vote started `last week <./2023-07-11.md>`__ will be properly
-discussed `next week <./2023-07-25.md>`__; so far it looks like we're
-definitely not self-hosting Kubernetes, at the very least.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-07-25.rst b/docs/meeting_notes/2023-07-25.rst
deleted file mode 100644
index 0a3204c..0000000
--- a/docs/meeting_notes/2023-07-25.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-2023-07-25
-==========
-
-Postponed to next week due to Joe having a severe bellyache.
diff --git a/docs/meeting_notes/2023-08-01.rst b/docs/meeting_notes/2023-08-01.rst
deleted file mode 100644
index 67e4ee1..0000000
--- a/docs/meeting_notes/2023-08-01.rst
+++ /dev/null
@@ -1,66 +0,0 @@
-2023-08-01
-==========
-
-Agenda
-------
-
-Infrastructure migration
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-The vote is tied. Chris and Johannes decided that we should test out
-migrating the PostgreSQL database at the very least. We then have more
-freedom over our data. What we need to do:
-
-- Allow PostgreSQL connections from LKE’s static IPs in the firewall
-- Whitelist the static IPs from Linode via ``pg_hba.conf``
-- Schedule downtime for the PostgreSQL database
-- **At downtime**
-
- - Take writers offline
- - Dump database from Linode into Netcup
- - Update all the client’s database URLs to point to netcup
- - Restart writers
-
-We want to rely on the restore to create everything properly, but will
-need to test-run this beforehand. The following ``pg_virtualenv``
-command has shown that it works properly:
-
-.. code:: sh
-
- kubectl exec -it postgres-... -- pg_dumpall -U pythondiscord \
- | pg_virtualenv psql -v ON_ERROR_STOP=1
-
-Note, however, that the database extension ``pg_repack`` needs to be
-installed.
-
-Before we can get started, we need to allow the PostgreSQL role to
-configure ``pg_hba.conf`` and ``postgresql.conf`` entries.
-
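As a rough illustration of the firewall/``pg_hba.conf`` step above (the
address and database name are placeholders, not the real LKE egress IPs),
the entries would look something like::

    # postgresql.conf: listen beyond localhost so LKE clients can connect
    listen_addresses = '*'

    # pg_hba.conf: allow a specific LKE egress address to the application DB
    # TYPE  DATABASE        USER            ADDRESS            METHOD
    host    pythondiscord   pythondiscord   203.0.113.10/32    scram-sha-256
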
-Meeting notes
-~~~~~~~~~~~~~
-
-We’re using GitHub at the moment. Some are left in Notion. We should
-migrate these to GitHub to have a uniform interface: Johannes will pick
-up
-`python-discord/infra#108 <https://github.com/python-discord/infra/issues/108>`__
-to merge them together into Git, as its more open than Notion.
-
-Ansible lint failures in the infra repository
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Excluding the vault turned out to be the working solution here, as
-implemented by Chris.
-
-Kubernetes repository pull requests
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-These were cleaned up thanks to Chris.
-
-Roadmap review & planning
--------------------------
-
-- Chris will prepare the PostgreSQL configuration mentioned above.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-08-08.rst b/docs/meeting_notes/2023-08-08.rst
deleted file mode 100644
index 0082cd3..0000000
--- a/docs/meeting_notes/2023-08-08.rst
+++ /dev/null
@@ -1,54 +0,0 @@
-2023-08-08
-==========
-
-Agenda
-------
-
-- Configuration of PostgreSQL and the PostgreSQL exporter
-
- - **No time so far**. Chris has been busy with renovating his living
- room, and Johannes has been busy with renovating his bedroom.
- Bradley prefers to remain quiet.
-
- - Chris will try to work on this in the coming week and will try to
- have Bella around as well, since he wanted to join the setup.
-
-- **Potential slot for GPG key signing of DevOps members**. External
- verification will be necessary.
-
- - Skipped. No webcam on Chris.
-
-- We need to assign a **librarian** to keep our documents organized
- according to a system. Johannes is happy to do this for now.
-
- - Let’s move the existing documentation from the Kubernetes
- repository into the infra repository. See
- `kubernetes#161 <https://github.com/python-discord/kubernetes/issues/161>`__.
-
-  - **Our Notion DevOps space is full of junk**. Outside of that, it's
-    not readable by outside contributors, and does not leave much
-    choice over which client to use for editing content.
-
-    - Chris agrees, without looking at it - just from memory. We
- should move it to the infra repository. (The meeting notes have
- already been transferred).
-
-    - Bella suggests adding some automation to make keeping everything
- in clean order less tedious.
-
-- We may want to merge the **Kubernetes repository** and the infra
-  repository together altogether; however, there are a lot of
-  repositories referencing the deployment manifests that would need to
- be updated.
-
-  - Chris mentions that regardless of what we do, we should - at the
-    very least - move all documentation into the ``infra`` repository,
- including the static site generator. At the moment we’re using
- Jekyll but we’re open to trying alternatives such as Hugo.
-
-- We closed some issues and pull requests in the repositories for late
- spring cleaning.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2 autoindent conceallevel=2: -->
diff --git a/docs/meeting_notes/2023-08-22.rst b/docs/meeting_notes/2023-08-22.rst
deleted file mode 100644
index a8d1287..0000000
--- a/docs/meeting_notes/2023-08-22.rst
+++ /dev/null
@@ -1,40 +0,0 @@
-2023-08-22
-==========
-
-.. raw:: html
-
- <!--
-
- Useful links
-
- - Infra open issues: https://github.com/python-discord/infra/issues
-
- - infra open pull requests: https://github.com/python-discord/infra/pulls
-
- - *If* any open issue or pull request needs discussion, why was the existing
- asynchronous logged communication over GitHub insufficient?
-
- -->
-
-Agenda
-------
-
-- Bella said he is on the streets. **We should start a gofundme**.
-
- - After some more conversation this just means he is on vacation and
- currently taking a walk.
-
-- Chris has been busy with turning his living room into a Picasso art
- collection, Johannes has been busy with renovating his bedroom, and
- Bella is not home.
-
- - Our next priority is winning.
-
-- We checked out some issues with documentation generation in
-  ``bot-core`` that Bella had mentioned. We managed to fix one issue
-  with pydantic by adding it to an exclude list, but then ran into
-  another problem.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-08-29.rst b/docs/meeting_notes/2023-08-29.rst
deleted file mode 100644
index da49c1e..0000000
--- a/docs/meeting_notes/2023-08-29.rst
+++ /dev/null
@@ -1,65 +0,0 @@
-2023-08-29
-==========
-
-.. raw:: html
-
- <!--
-
- Useful links
-
- - Infra open issues: https://github.com/python-discord/infra/issues
-
- - infra open pull requests: https://github.com/python-discord/infra/pulls
-
- - *If* any open issue or pull request needs discussion, why was the existing
- asynchronous logged communication over GitHub insufficient?
-
- -->
-
-Agenda
-------
-
-- **Bella is still on the streets**
-
- - The Python Discord Bella On The Streets Fundraising Campaign Q3
- 2023 has not been successful so far. To help Bella receive French
- citizenship, Joe has put up a French flag behind himself in the
- meeting.
-
- - Joe corrects my sarcasm. It is an Italian flag, not a French
- flag. The reason for this flag is that his new prime interest
- on campus was born in Italy.
-
-- **The SnekBox CI build is pretty slow**
-
- - Guix and Nix are not alternatives. Neither is Ubuntu
-
- - We use pyenv to build multiple Python versions for a new feature
-
- - The feature is not rolled out yet
-
- - Part of the problem is that we build twice in the ``build`` and
- the ``deploy`` stage
-
- - On rollout, Joe tested it and it works fine
-
-- No update on the Hugo build yet
-
-- For snowflake, Johannes will write a proposal to the admins for
- hosting it
-
- - We should consider talking about the following points:
-
- - statistically ~8% of Tor traffic is problematic (10% of traffic
- is to hidden services, 80% of hidden service traffic is for
- illegal services)
-
- - overall the project’s position and our ideal is to help people
- for a good cause
-
- - all traffic is forwarded to the Tor network, the service is
- lightweight and only proxies encrypted traffic there
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-09-05.rst b/docs/meeting_notes/2023-09-05.rst
deleted file mode 100644
index 7556ab6..0000000
--- a/docs/meeting_notes/2023-09-05.rst
+++ /dev/null
@@ -1,53 +0,0 @@
-2023-09-05
-==========
-
-.. raw:: html
-
- <!--
-
- Useful links
-
- - Infra open issues: https://github.com/python-discord/infra/issues
-
- - infra open pull requests: https://github.com/python-discord/infra/pulls
-
- - *If* any open issue or pull request needs discussion, why was the existing
- asynchronous logged communication over GitHub insufficient?
-
- -->
-
-Agenda
-------
-
-- No update on the Hugo build yet
-
-- Johannes wrote a proposal for snowflake proxy to be deployed to our
- netcup hosts
-
-  - Admins discussed it and came to the conclusion that, since we
-    don't own the servers (we got them from Netcup as a sponsorship to
-    host our infra), using them to host something that isn't our infra
-    doesn't seem right.
-
-- Lots of dependabot PRs closed
-
- - https://github.com/search?q=org%3Apython-discord++is%3Apr+is%3Aopen+label%3A%22area%3A+dependencies%22&type=pullrequests&ref=advsearch
- - Closed ~50% of PRs
-
-- Workers repo has had its CI rewritten; all workers have a consistent
-  package.json and scripts, and use the new style of Cloudflare Workers
-  which doesn't use webpack
-
-- Metricity updated to SQLAlchemy 2
-
-- Olli CI PR is up
-
- - https://github.com/python-discord/olli/pull/25
-
-- Sir-Robin pydantic constants PR is up
-
- - https://github.com/python-discord/sir-robin/pull/93
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2023-09-12.rst b/docs/meeting_notes/2023-09-12.rst
deleted file mode 100644
index 6dbb7c8..0000000
--- a/docs/meeting_notes/2023-09-12.rst
+++ /dev/null
@@ -1,73 +0,0 @@
-2023-09-12
-==========
-
-.. raw:: html
-
- <!--
-
- Useful links
-
- - Infra open issues: https://github.com/python-discord/infra/issues
-
- - infra open pull requests: https://github.com/python-discord/infra/pulls
-
- - *If* any open issue or pull request needs discussion, why was the existing
- asynchronous logged communication over GitHub insufficient?
-
- -->
-
-Agenda
-------
-
-- We have reason to believe that Bella is still on the streets. Worse,
- Bella is not available at the moment, leading us to believe that
- Bella has still not found a home.
-
- - Eight minutes into the meeting, Bella joins, complaining about the
- bad internet. He mentions he is still on the streets (this may
- contribute to the bad internet factor).
-
-- Chris made Mina leave with his repeated comments about Bella being
- homeless, reminding Mina of the growing unemployment rate within the
- DevOps team. As head of HR she cannot further support this matter.
-
-- About #139, Bella mentions that online websites may cover the same
-  need that we have, but it may not be really useful as a command.
-
- - Chris adds that “if someone wants to do it, I don’t mind” and “I
- don’t think it would be very useful for a command, but I think it
- would be fun to learn for someone implementing it”. As long as
-    whoever is implementing it is aware that it would not be used too
- much, it would be fine.
-
-- No progress on the hugo front
-
-- Our email service with workers will be forward-only
-
- - With postfix you will be able to reply. Joe wants to have an
- excuse to play with Cloudflare workers though.
-
-- `50 open pull requests from
- dependabot <https://github.com/search?q=org%3Apython-discord++is%3Apr+is%3Aopen+author%3Aapp%2Fdependabot&type=pullrequests&ref=advsearch>`__
-
- - Tip from The Man: press ^D to make a bookmark in your browser
-
- - “Those can just be blindly merged” - Chris
-
-- Grouping of dependencies: Dependabot now allows you to group together
- multiple dependency updates into a single pull request.
-
- - Possible approaches suggested: Group all the docker updates
- together, group any linting dependencies together (would just
- require a big RegEx). Dependabot natively works with its own
- dependency groups here (e.g. Docker, Pip).
-
-- Mr. Hemlock wants to raise his roof: It’s his project for this
- Autumn. We, the team, are looking forward to his project - especially
- Bella, who is currently looking for housing. “It’s all coming
- together”, said Chris to the situation.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2024-07-02.rst b/docs/meeting_notes/2024-07-02.rst
deleted file mode 100644
index 4d2ba03..0000000
--- a/docs/meeting_notes/2024-07-02.rst
+++ /dev/null
@@ -1,171 +0,0 @@
-2024-07-02
-==========
-
-.. raw:: html
-
- <!--
-
- Useful links
-
- - Infra open issues: https://github.com/python-discord/infra/issues
-
- - infra open pull requests: https://github.com/python-discord/infra/pulls
-
- - *If* any open issue or pull request needs discussion, why was the existing
- asynchronous logged communication over GitHub insufficient?
-
- -->
-
-Attendees
----------
-
-Joe and Johannes.
-
-Chris unfortunately died in a fatal train accident and could not attend
-the meeting. This incident will be rectified in the next release,
-“Lovering 2.0: Immortability”.
-
-Bella is out on the streets again. We are waiting for approval from the
-Python Discord admins to run another fundraiser.
-
-Agenda
-------
-
-- **Configuration of renovate** (Joe)
-
- We are replacing dependabot with renovatebot. Johannes welcomes this
- decision. Joe says we are looking for automatic deployment from
- Kubernetes to make sure that any updates are automatically deployed.
-
- **Conclusion**: Implemented.
-
-- **Resizing Netcup servers** (Joe, Johannes)
-
- We can probably get rid of turing, assess what else we want to deploy
- on lovelace, and then ask for a resize.
-
- **Conclusion**: Create issue to move things off turing, remove it
- from the inventory, remove it from documentation, power it off, then
- have Joe ask for server removal.
-
-- **Updating the public statistics page** (Johannes)
-
- Discussing and showcasing possible alternatives to the current
- infrastructure powering https://stats.pythondiscord.com via the
- https://github.com/python-discord/public-stats repository. Johannes
- presents his current scripts that cuddle RRDTool into loading data
- out of metricity, Joe says we will discuss with Chris what to do
- here.
-
-  The likely way going forward will be that *we will open an issue to
-  set it up*; the setup will contain an Ansible role to deploy the
-  cronjob and the script onto lovelace alongside the ``rrdtool``
-  PostgreSQL user.
-
- **Conclusion**: Johannes will create an issue and codify the setup in
- Ansible.
-
-- **New blog powered by Hugo** (Johannes)
-
- Our current Ghost-powered blog is a tiny bit strange, and the
- onboarding ramp to contribute articles is large. We want to migrate
- this to Hugo - Johannes is leading the effort on it. The main work
- will be building an appropriate theme, as no nicely suitable
- replacement theme has been found so far. Front-end contributors would
- be nice for this, although currently everything is still local on my
- machine.
-
- Joe mentions that we don’t need to take anything particularly similar
- to the current Ghost theme, just some vague resemblance would be
- nice. Most of the recommended Hugo themes would probably work.
- Johannes will check it out further.
-
- **Conclusion**: Try the `hugo-casper-two
- theme <https://github.com/eueung/hugo-casper-two>`__ and report back.
-
-- **Finger server** (Joe, Johannes)
-
- Joe recently proposed `the deployment of a finger
- server <https://github.com/python-discord/infra/pull/373>`__. Do we
-  want this, and if yes, how are we going to proceed? If we do not
-  want one, running the ``pinky`` command locally or via ``ssh``
- would be a sound idea. We also need to consider whether members will
- update their files regularly - we may want to incorporate
- functionality for this into e.g. King Arthur.
-
- Joe says that we shouldn’t put a lot of development effort into it,
- it would be simply a novelty thing.
-
- **Conclusion**: This is a nice cheap win for some fun which should
- just be a simple Python file (via Twisted’s Finger protocol support
- or whatever) that connects to LDAP (see Keycloak authentication
- server) and outputs information. We could possibly integrate this
- into King Arthur as well, so the querying workflow could look like KA
- -> fingerd -> LDAP, or people could use finger commands directly.
-
-- **Keycloak authentication server** (Joe)
-
- Joe mentions that we are deploying a Keycloak server because for some
- members authenticating via GitHub is cumbersome, for instance because
- their GitHub account is connected to their employer’s GitHub
- Enterprise installation. We could hook up a finger server to the LDAP
- endpoint. Joe also mentions that we might want to set up e-mail
- forwarding from pydis addresses to users via the user database that
- will be stored in Keycloak.
-
- Currently we only have a Keycloak installation that stores items in
- PostgreSQL. This installation can federate to LDAP - we would simply
- have to settle on some directory service backend. Joe suggests
- FreeIPA because he’s familar with it (including the Keycloak
- integration). The problem is that it doesn’t work on Debian. The
- alternative proposal, given that we’re saving ~50$/month on Linode,
- would be spinning up a Rocky VM with FreeIPA on it on Linode (we
- already have the budget) or ask Netcup for another VM. Ultimately,
- the system to run FreeIPA would be something CentOS-based. One aspect
- to consider is networking security: in Linode we could use their
- private cloud endpoint feature to securely expose the LDAP server to
- Keycloak and other services in Kubernetes, if we were to run it in
- Netcup, we would need to use a similar setup to what we currently
- have with PostgreSQL.
-
- Any Python Discord user would be managed in LDAP, and Keycloak has
- the necessary roles to write back into LDAP. Keeping the users in
- FreeIPA up-to-date would be a somewhat manual procedure. Joe’s plan
- was to pick up the user’s Discord username and use
- ``[email protected]`` as their name and do account setup as part of
- the staff onboarding.
-
- **Conclusion**: Will wait for Chris to discuss this further, but we
- simply need to decide where we want to run the LDAP service.
-
-- **Flux CD** (Joe)
-
- Joe proposes deploying `flux <https://fluxcd.io/>`__ as a way to
- improve the way we manage our CI/CD. We want the cluster to be able
- to synchronize its state with the git repository. There are some
- manifests in the repository currently that are not in sync with the
- cluster version.
-
- **Conclusion**: Approved, Joe will create an issue and do it.
-
-- **Polonium** (Chris)
-
-  A question came up regarding why the bot does not write to the
-  database directly. Joe said it's not perfect to have the bot write to
-  it directly - in metricity it works but it's not perfect. Chris
-  probably had a good reason: separation of intent.
-
- **Conclusion**: Approved, write to R&D for financing.
-
-- **Rethinking Bella: Suggested measures to gain autonomy** (Chris)
-
- Chris will present our current plans to biologically re-think and
- improve Bella’s current architecture by means of
- hypertrophy-supported capillary enlargements, with the final goal of
- gaining complete control and ownership over the World Economic Forum
- by 2026. As Bella is currently on parental leave, we will send him
- the result of this voting via NNCP.
-
-.. raw:: html
-
- <!-- vim: set textwidth=80 sw=2 ts=2: -->
diff --git a/docs/meeting_notes/2024-07-25.rst b/docs/meeting_notes/2024-07-25.rst
deleted file mode 100644
index 8d3175c..0000000
--- a/docs/meeting_notes/2024-07-25.rst
+++ /dev/null
@@ -1,46 +0,0 @@
-2024-07-25
-==========
-
-..
- Useful links
-
- - Infra Kanban board: https://github.com/orgs/python-discord/projects/17/views/4
-
- - Infra open issues: https://github.com/python-discord/infra/issues
-
- - infra open pull requests: https://github.com/python-discord/infra/pulls
-
- - *If* any open issue or pull request needs discussion, why was the existing
- asynchronous logged communication over GitHub insufficient?
-
-Attendees
----------
-
-Bella, Joe, Fredrick, Chris, Johannes
-
-Agenda
-------
-
-- **Open issues and pull requests in Joe's repositories**
-
- Joe has plenty of pending changes in his open source repositories on GitHub.
- Together with Chris, he went through these and reviewed them. Most were
- accepted. Fredrick proposed some further changes to the ff-bot merge routine
- which Joe will check out after the meeting.
-
-- **LDAP**
-
- Bella is instructed to enter his street address into LDAP for t-shirt
- shipping.
-
-- **New documentation**
-
- Johannes merged our new documentation. Unfortunately, he forgot to test it
- first. Joe visits it and discovers some problems. Johannes fixes it live.
-
-- **Turing**
-
-- **SMTP server**
-
-
-.. vim: set textwidth=80 sw=2 ts=2:
diff --git a/docs/meeting_notes/index.rst b/docs/meeting_notes/index.rst
deleted file mode 100644
index 4ba97ea..0000000
--- a/docs/meeting_notes/index.rst
+++ /dev/null
@@ -1,31 +0,0 @@
-Meeting notes
-=============
-
-Minutes for previous DevOps meetings.
-
-.. toctree::
- :maxdepth: 1
- :caption: Contents:
-
- 2022-04-07
- 2022-09-18
- 2022-10-05
- 2022-10-19
- 2022-10-26
- 2022-11-02
- 2022-11-23
- 2023-02-08
- 2023-02-21
- 2023-02-28
- 2023-05-16
- 2023-07-11
- 2023-07-18
- 2023-07-25
- 2023-08-01
- 2023-08-08
- 2023-08-22
- 2023-08-29
- 2023-09-05
- 2023-09-12
- 2024-07-02
- 2024-07-25
diff --git a/docs/meeting_notes/template.rst b/docs/meeting_notes/template.rst
deleted file mode 100644
index 0ea8a63..0000000
--- a/docs/meeting_notes/template.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-:orphan: .. Connor McFarlane
-
-
-DevOps Meeting Notes
-====================
-
-..
- Useful links
-
- - Infra Kanban board: https://github.com/orgs/python-discord/projects/17/views/4
-
- - Infra open issues: https://github.com/python-discord/infra/issues
-
- - infra open pull requests: https://github.com/python-discord/infra/pulls
-
- - *If* any open issue or pull request needs discussion, why was the existing
- asynchronous logged communication over GitHub insufficient?
-
-Agenda
-------
-
-.. vim: set textwidth=80 sw=2 ts=2:
diff --git a/docs/onboarding/access.rst b/docs/onboarding/access.rst
deleted file mode 100644
index 940cd8b..0000000
--- a/docs/onboarding/access.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-Access table
-============
-
-+--------------------+-------------------------+-----------------------+
-| **Resource** | **Description** | **Keyholders** |
-+====================+=========================+=======================+
-| Linode Kubernetes | The primary cluster | Hassan, Joe, Chris, |
-| Cluster | where all resources are | Leon, Sebastiaan, |
-| | deployed. | Johannes |
-+--------------------+-------------------------+-----------------------+
-| Linode Dashboard | The online dashboard | Joe, Chris |
-| | for managing and | |
-| | allocating resources | |
-| | from Linode. | |
-+--------------------+-------------------------+-----------------------+
-| Netcup Dashboard | The dashboard for | Joe, Chris |
-| | managing and allocating | |
-| | resources from Netcup. | |
-+--------------------+-------------------------+-----------------------+
-| Netcup servers | Root servers provided | Joe, Chris, Bella, |
-| | by the Netcup | Johannes |
-| | partnership. | |
-+--------------------+-------------------------+-----------------------+
-| Grafana | The primary aggregation | Admins, Moderators, |
-| | dashboard for most | Core Developers and |
-| | resources. | DevOps (with varying |
-| | | permissions) |
-+--------------------+-------------------------+-----------------------+
-| Prometheus | The Prometheus query | Hassan, Joe, |
-| Dashboard | dashboard. Access is | Johannes, Chris |
-| | controlled via | |
-| | Cloudflare Access. | |
-+--------------------+-------------------------+-----------------------+
-| Alertmanager | The alertmanager | Hassan, Joe, |
-| Dashboard | control dashboard. | Johannes, Chris |
-| | Access is controlled | |
-| | via Cloudflare Access. | |
-+--------------------+-------------------------+-----------------------+
-| ``git-crypt``\ ed | ``git-crypt`` is used | Chris, Joe, Hassan, |
-| files in infra | to encrypt certain | Johannes, Xithrius |
-| repository | files within the | |
-| | repository. At the time | |
-| | of writing this is | |
-| | limited to kubernetes | |
-| | secret files. | |
-+--------------------+-------------------------+-----------------------+
-| Ansible Vault | Used to store sensitive | Chris, Joe, Johannes, |
-| | data for the Ansible | Bella |
-| | deployment | |
-+--------------------+-------------------------+-----------------------+
diff --git a/docs/onboarding/index.rst b/docs/onboarding/index.rst
deleted file mode 100644
index 3929d7e..0000000
--- a/docs/onboarding/index.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Onboarding
-==========
-
-This section documents who holds access to which of our DevOps
-resources, and how that access is managed.
-
-
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
-
- access
- resources
- rules
- tools
-
.. vim: set textwidth=80 sw=2 ts=2:
diff --git a/docs/onboarding/resources.rst b/docs/onboarding/resources.rst
deleted file mode 100644
index f9ef44b..0000000
--- a/docs/onboarding/resources.rst
+++ /dev/null
@@ -1,35 +0,0 @@
-Resources
-=========
-
-The following is a collection of important reference documents for the
-DevOps team.
-
-`Infra Repo <https://github.com/python-discord/infra>`__
---------------------------------------------------------
-
-This GitHub repo contains most of the manifests and configuration
-applied to our cluster. It's kept up to date manually and is considered
-a source of truth for what we should have in the cluster.
-
-It is mostly documented, but improvements for unclear or outdated aspects are
-always welcome. If you have any questions, please feel free `to open a GitHub
-issue on the infra repository
-<https://github.com/python-discord/infra/issues/new>`__ or ask in the
-``#dev-oops`` channel.
-
-
-`Knowledge base <https://python-discord.github.io/infra/>`__
-------------------------------------------------------------
-
-Deployed using GH pages, source can be found `in the docs directory of
-the infra repository <https://github.com/python-discord/infra>`__.
-
-This includes:
-
-- Changelogs
-- Post-mortems
-- Common queries
-- Runbooks
-
-The sidebar of the infra documentation contains some other links to
-DevOps-related projects.
diff --git a/docs/onboarding/rules.rst b/docs/onboarding/rules.rst
deleted file mode 100644
index bd0ea0e..0000000
--- a/docs/onboarding/rules.rst
+++ /dev/null
@@ -1,16 +0,0 @@
-Rules
-=====
-
-The rules any DevOps team member must follow.
-
-1. LMAO - **L**\ ogging, **M**\ onitoring, **A**\ lerting,
- **O**\ bservability
-2. Modmail is the greatest piece of software ever written
-3. Modmail needs at least 5 minutes to gather all its greatness at
- startup
-4. We never blame Chris, it’s always <@233481908342882304>’s fault
-5. LKE isn’t bad, it’s your fault for not paying for the high
- availability control plane
-6. Our software is never legacy, it’s merely well-aged
-7. Ignore these rules (however maybe not 1, 1 seems important to
- remember)
diff --git a/docs/onboarding/tools.rst b/docs/onboarding/tools.rst
deleted file mode 100644
index 52a5e7f..0000000
--- a/docs/onboarding/tools.rst
+++ /dev/null
@@ -1,50 +0,0 @@
-Tools
-=====
-
-We use a few tools to manage, monitor, and interact with our
-infrastructure. Some of these tools are not unique to the DevOps team,
-and may be shared by other teams.
-
-Most of these are gated behind a Cloudflare Access system, which is
-accessible to the `DevOps
-Team <https://github.com/orgs/python-discord/teams/devops>`__ on GitHub.
-These are marked with the ☁️ emoji. If you don’t have access, please
-contact Chris or Joe.
-
-`Grafana <https://grafana.pydis.wtf/>`__
-----------------------------------------
-
-Grafana provides access to some of the most important resources at your
-disposal. It acts as an aggregator and frontend for a large amount of
-data. These range from metrics, to logs, to stats. Some of the most
-important are listed below:
-
-- **Service Logs / All App Logs Dashboard**
-
- Service logs is a simple log viewer which gives you access to a large
- majority of the applications deployed in the default namespace. The
- All App logs dashboard is an expanded version of that which gives you
- access to all apps in all namespaces, and allows some more in-depth
- querying.
-
-- **Kubernetes Dashboard**
-
- This dashboard gives quick overviews of all the most important
- metrics of the Kubernetes system. For more detailed information,
-  check out other dashboards such as Resource Usage, NGINX, and Redis.
-
-Accessed via a GitHub login, with permission for anyone in the dev-core
-or dev-ops team.
-
-`Prometheus Dashboard <https://prometheus.pydis.wtf/>`__ (☁️)
---------------------------------------------------------------
-
-This provides access to the Prometheus query console. You may also enjoy
-the `Alertmanager Console <https://alertmanager.pydis.wtf/>`__.
-
-`King Arthur <https://github.com/python-discord/king-arthur/>`__
-----------------------------------------------------------------
-
-King Arthur is a Discord bot which provides information about, and
-access to, our cluster directly in Discord. Invoke its help command for
-more information (``M-x help``).
diff --git a/docs/postmortems/2020-12-11-all-services-outage.rst b/docs/postmortems/2020-12-11-all-services-outage.rst
deleted file mode 100644
index 9c29303..0000000
--- a/docs/postmortems/2020-12-11-all-services-outage.rst
+++ /dev/null
@@ -1,121 +0,0 @@
-2020-12-11: All services outage
-===============================
-
-At **19:55 UTC, all services became unresponsive**. The DevOps team was
-already in a call, and immediately started to investigate.
-
-Postgres was running at 100% CPU usage due to a **VACUUM**, which caused
-all services that depended on it to stop working. The high CPU left the
-host unresponsive and it shut down. Linode Lassie noticed this and
-triggered a restart.
-
-It did not recover gracefully from this restart, with numerous core
-services reporting an error, so we had to manually restart core system
-services using Lens in order to get things working again.
-
-⚠️ Leadup
----------
-
-*List the sequence of events that led to the incident*
-
-Postgres triggered an **AUTOVACUUM**, which led to a CPU spike. This
-made Postgres run at 100% CPU and become unresponsive, which caused
-services to stop responding. This led to a restart of the node, from
-which we did not recover gracefully.
-
-🥏 Impact
----------
-
-*Describe how internal and external users were impacted during the
-incident*
-
-All services went down. Catastrophic failure. We did not pass go, we did
-not collect $200.
-
-- Help channel system unavailable, so people are not able to
- effectively ask for help.
-- Gates unavailable, so people can’t successfully get into the
- community.
-- Moderation and raid prevention unavailable, which leaves us
- defenseless against attacks.
-
-👁️ Detection
-------------
-
-*Report when the team detected the incident, and how we could improve
-detection time*
-
-We noticed that all PyDis services had stopped responding;
-coincidentally, our DevOps team was in a call at the time, so that was
-helpful.
-
-We may be able to improve detection time by adding monitoring of
-resource usage. To this end, we’ve added alerts for high CPU usage and
-low memory.
-
-🙋🏿‍♂️ Response
-----------------
-
-*Who responded to the incident, and what obstacles did they encounter?*
-
-Joe Banks responded to the incident.
-
-We noticed our node was entirely unresponsive and within minutes a
-restart had been triggered by Lassie after a high CPU shutdown occurred.
-
-The node came back and we saw a number of core services offline
-(e.g. Calico, CoreDNS, Linode CSI).
-
-**Obstacle: no recent database back-up available**
-
-🙆🏽‍♀️ Recovery
------------------
-
-*How was the incident resolved? How can we improve future mitigation
-times?*
-
-Through `Lens <https://k8slens.dev/>`__ we restarted core services one
-by one until they stabilised; after these core services were up, other
-services began to come back online.
-
-We finally provisioned PostgreSQL which had been removed as a component
-before the restart (but too late to prevent the CPU errors). Once
-PostgreSQL was up, we restarted any components that were acting buggy
-(e.g. site and bot).
-
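For reference, the equivalent of the Lens clicking above can be done from
the command line; a minimal sketch (the deployment named here is one of
the core services mentioned, not an exhaustive list):

.. code:: sh

   # Restart a core cluster service and wait for it to settle
   kubectl -n kube-system rollout restart deployment/coredns
   kubectl -n kube-system rollout status deployment/coredns

   # Check that nodes and system pods come back healthy
   kubectl get nodes
   kubectl -n kube-system get pods
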
-🔎 Five Why’s
--------------
-
-*Run a 5-whys analysis to understand the true cause of the incident.*
-
-- Major service outage
-- **Why?** Core service failures (e.g. Calico, CoreDNS, Linode CSI)
-- **Why?** Kubernetes worker node restart
-- **Why?** High CPU shutdown
-- **Why?** Intensive PostgreSQL AUTOVACUUM caused a CPU spike
-
-🌱 Blameless root cause
------------------------
-
-*Note the final root cause and describe what needs to change to prevent
-recurrence*
-
-🤔 Lessons learned
-------------------
-
-*What did we learn from this incident?*
-
-- We must ensure we have working database backups. We are lucky that we
- did not lose any data this time. If this problem had caused volume
- corruption, we would be screwed.
-- Sentry is broken for the bot. It was missing a DSN secret, which we
- have now restored.
-- The https://sentry.pydis.com redirect was never migrated to the
- cluster. **We should do that.**
-
-☑️ Follow-up tasks
-------------------
-
-*List any tasks we’ve created as a result of this incident*
-
-- ☒ Push forward with backup plans
diff --git a/docs/postmortems/2020-12-11-postgres-conn-surge.rst b/docs/postmortems/2020-12-11-postgres-conn-surge.rst
deleted file mode 100644
index 6ebcb01..0000000
--- a/docs/postmortems/2020-12-11-postgres-conn-surge.rst
+++ /dev/null
@@ -1,130 +0,0 @@
-2020-12-11: Postgres connection surge
-=====================================
-
-At **13:24 UTC**, we noticed the bot was not able to infract, and
-`pythondiscord.com <http://pythondiscord.com>`__ was unavailable. The
-DevOps team started to investigate.
-
-We discovered that Postgres was not accepting new connections because it
-had hit 100 clients. This made it unavailable to all services that
-depended on it.
-
-Ultimately this was resolved by taking down Postgres, remounting the
-associated volume, and bringing it back up again.
-
-⚠️ Leadup
----------
-
-*List the sequence of events that led to the incident*
-
-The bot infractions stopped working, and we started investigating.
-
-🥏 Impact
----------
-
-*Describe how internal and external users were impacted during the
-incident*
-
-Services were unavailable both for internal and external users.
-
-- The Help Channel System was unavailable.
-- Voice Gate and Server Gate were not working.
-- Moderation commands were unavailable.
-- Python Discord site & API were unavailable. CloudFlare automatically
- switched us to Always Online.
-
-👁️ Detection
-------------
-
-*Report when the team detected the incident, and how we could improve
-detection time*
-
-We noticed HTTP 524s coming from CloudFlare, upon attempting database
-connection we observed the maximum client limit.
-
-We noticed this log in site:
-
-.. code:: text
-
- django.db.utils.OperationalError: FATAL: sorry, too many clients already
-
-We should be monitoring the number of clients, and the monitor should
-alert us when we’re approaching the max. That would have allowed for
-earlier detection, and possibly allowed us to prevent the incident
-altogether.
-
-We will look at
-`wrouesnel/postgres_exporter <https://github.com/wrouesnel/postgres_exporter>`__
-for monitoring this.
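-
-In the meantime, a quick manual check of how close we are to the limit
-might look like this (a sketch; the deployment and role names follow
-those used elsewhere in this document):
-
-.. code:: bash
-
-   # Current client count vs. the configured maximum
-   $ kubectl exec deploy/postgres -- psql -U pythondiscord \
-       -c "SELECT count(*) FROM pg_stat_activity;" \
-       -c "SHOW max_connections;"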
-
-🙋🏿‍♂️ Response
-----------------
-
-*Who responded to the incident, and what obstacles did they encounter?*
-
-Joe Banks responded to the incident. The main obstacle was the lack of
-a clear response strategy.
-
-We should document our recovery procedure so that we’re not so dependent
-on Joe Banks should this happen again while he’s unavailable.
-
-🙆🏽‍♀️ Recovery
-----------------
-
-*How was the incident resolved? How can we improve future mitigation?*
-
-- Delete PostgreSQL deployment ``kubectl delete deployment/postgres``
-- Delete any remaining pods, **with force**:
-  ``kubectl delete pod <pod name> --force --grace-period=0``
-- Unmount volume at Linode
-- Remount volume at Linode
-- Reapply deployment ``kubectl apply -f postgres/deployment.yaml``
-
-🔎 Five Why’s
--------------
-
-*Run a 5-whys analysis to understand the true cause of the incident.*
-
-- Postgres was unavailable, so our services died.
-- **Why?** Postgres hit max clients, and could not respond.
-- **Why?** Unknown, but we saw a number of connections from previous
- deployments of site. This indicates that database connections are not
- being terminated properly. Needs further investigation.
-
-🌱 Blameless root cause
------------------------
-
-*Note the final root cause and describe what needs to change to prevent
-recurrence*
-
-We’re not sure what the root cause is, but suspect site is not
-terminating database connections properly in some cases. We were unable
-to reproduce this problem.
-
-We’ve set up new telemetry on Grafana with alerts so that we can
-investigate this more closely. We will be notified if the number of
-connections from site exceeds 32, or if the total number of connections
-exceeds 90.
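-
-A per-service breakdown of connections can help confirm whether site
-(or a previous deployment of it) is the one holding them; roughly:
-
-.. code:: bash
-
-   $ kubectl exec deploy/postgres -- psql -U pythondiscord -c \
-       "SELECT usename, application_name, state, count(*)
-          FROM pg_stat_activity
-         GROUP BY 1, 2, 3
-         ORDER BY 4 DESC;"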
-
-🤔 Lessons learned
-------------------
-
-*What did we learn from this incident?*
-
-- We must ensure the DevOps team has access to Linode and other key
- services even if our Bitwarden is down.
-- We need to ensure we’re alerted of any risk factors that have the
- potential to make Postgres unavailable, since this causes a
- catastrophic outage of practically all services.
-- We absolutely need backups for the databases, so that this sort of
- problem carries less of a risk.
-- We may need to consider something like
- `pg_bouncer <https://wiki.postgresql.org/wiki/PgBouncer>`__ to manage
- a connection pool so that we don’t exceed 100 *legitimate* clients
- connected as we connect more services to the postgres database.
-
-☑️ Follow-up tasks
-------------------
-
-*List any tasks we should complete that are relevant to this incident*
-
-- ☒ Back up all databases
diff --git a/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.rst b/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.rst
deleted file mode 100644
index 5852c46..0000000
--- a/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.rst
+++ /dev/null
@@ -1,117 +0,0 @@
-2021-01-10: Primary Kubernetes node outage
-==========================================
-
-We had an outage of our highest spec node due to CPU exhaustion. The
-outage lasted from around 20:20 to 20:46 UTC, but was not a full service
-outage.
-
-⚠️ Leadup
----------
-
-*List the sequence of events that led to the incident*
-
-I ran a query on Prometheus to try to figure out some statistics on the
-number of metrics we are holding; this ended up scanning a lot of data
-in the TSDB that Prometheus uses.
-
-This scan caused CPU exhaustion, which caused issues with the
-Kubernetes node status.
-
-🥏 Impact
----------
-
-*Describe how internal and external users were impacted during the
-incident*
-
-This brought down the primary node, which meant there was some service
-outage. Most services transferred successfully to our secondary node,
-which kept up some key services such as the Moderation bot and Modmail
-bot, as well as MongoDB.
-
-👁️ Detection
-------------
-
-*Report when the team detected the incident, and how we could improve
-detection time*
-
-This was noticed when Discord services started having failures. The
-primary detection was through alerts though! I was paged 1 minute after
-we started encountering CPU exhaustion issues.
-
-🙋🏿‍♂️ Response
-----------------
-
-*Who responded to the incident, and what obstacles did they encounter?*
-
-Joe Banks responded to the incident.
-
-No major obstacles were encountered during this.
-
-🙆🏽‍♀️ Recovery
-----------------
-
-*How was the incident resolved? How can we improve future mitigation?*
-
-It was noted that in the response to ``kubectl get nodes`` the primary
-node’s status was reported as ``NotReady``. Looking into the reason, we
-found that the node had stopped responding.
-
-The quickest way to fix this was triggering a node restart. This shifted
-a lot of pods over to node 2, which encountered some capacity issues
-since it’s not as highly specced as the first node.
-
-I brought the first node back by restarting it at Linode’s end. Once
-this node was reporting as ``Ready`` again I drained the second node by
-running ``kubectl drain lke13311-20304-5ffa4d11faab``. This command
-stops the node from being available for scheduling and moves existing
-pods onto other nodes.
-
-Services gradually recovered as the dependencies started. The incident
-lasted around 26 minutes overall, though this was not a complete outage
-for the whole time and the bot remained functional throughout (meaning
-systems like the help channels kept working).
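-
-For reference, the drain/restore dance looks roughly like this (flags
-may vary slightly between kubectl versions):
-
-.. code:: bash
-
-   # Confirm which node is NotReady
-   $ kubectl get nodes
-   # Move workloads off the recovered-but-crowded node
-   $ kubectl drain lke13311-20304-5ffa4d11faab --ignore-daemonsets --delete-emptydir-data
-   # Allow scheduling on it again once things are healthy
-   $ kubectl uncordon lke13311-20304-5ffa4d11faab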
-
-🔎 Five Why’s
--------------
-
-*Run a 5-whys analysis to understand the true cause of the incident.*
-
-**What?** Partial service outage
-
-**Why?** We had a node outage.
-
-**Why?** CPU exhaustion of our primary node.
-
-**Why?** Large prometheus query using a lot of CPU.
-
-**Why?** Prometheus had to scan millions of TSDB records which consumed
-all cores.
-
-🌱 Blameless root cause
------------------------
-
-*Note the final root cause and describe what needs to change to prevent
-recurrence*
-
-A large query was run on Prometheus, so the solution is just to not run
-said queries.
-
-To protect against this more robustly, though, we should write resource
-constraints for services like this that are vulnerable to CPU
-exhaustion or memory consumption, which were the causes of our two past
-outages as well.
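-
-As a sketch, constraints along these lines would cap Prometheus before
-it could starve the node (the deployment name and values here are
-illustrative, not what we ultimately deployed):
-
-.. code:: bash
-
-   $ kubectl set resources deployment prometheus \
-       --requests=cpu=250m,memory=1Gi \
-       --limits=cpu=1,memory=2Gi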
-
-🤔 Lessons learned
-------------------
-
-*What did we learn from this incident?*
-
-- Don’t run large queries, it consumes CPU!
-- Write resource constraints for our services.
-
-☑️ Follow-up tasks
-------------------
-
-*List any tasks we should complete that are relevant to this incident*
-
-- ☒ Write resource constraints for our services.
diff --git a/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.rst b/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.rst
deleted file mode 100644
index f621782..0000000
--- a/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.rst
+++ /dev/null
@@ -1,155 +0,0 @@
-2021-01-12: Django site CPU/RAM exhaustion outage
-=================================================
-
-At 03:01 UTC on Tuesday 12th January we experienced a momentary outage
-of our PostgreSQL database, causing some very minor service downtime.
-
-⚠️ Leadup
----------
-
-*List the sequence of events that led to the incident*
-
-We deleted the Developers role, which led to a large user diff since
-all of those users’ roles had to be updated on the site.
-
-The bot had been trying to post this diff repeatedly, after every
-restart, for over 24 hours.
-
-We deployed the bot at 2:55 UTC on 12th January and the user sync
-process began once again.
-
-This caused a CPU & RAM spike on our Django site, which in turn
-triggered an OOM error on the server which killed the Postgres process,
-sending it into a recovery state where queries could not be executed.
-
-The Django site did not have any tools in place to batch the requests,
-so it was trying to process all 80k user updates in a single query,
-something that PostgreSQL could probably handle, but not the Django
-ORM. During the incident site jumped from its average RAM usage of
-300-400MB to **1.5GB.**
-
-.. image:: ./images/2021-01-12/site_resource_abnormal.png
-
-RAM and CPU usage of site throughout the incident. The period just
-before 3:40 where no statistics were reported is the actual outage
-period where the Kubernetes node had some networking errors.
-
-🥏 Impact
----------
-
-*Describe how internal and external users were impacted during the
-incident*
-
-This database outage lasted mere minutes, since Postgres recovered and
-healed itself and the sync process was aborted, but it did leave us
-with a large user diff and a database that was further out of sync.
-
-Most services that did not depend on PostgreSQL stayed up, and the site
-remained stable after the sync had been cancelled.
-
-👁️ Detection
----------------
-
-*Report when the team detected the incident, and how we could improve
-detection time*
-
-We were immediately alerted to the PostgreSQL outage on Grafana and
-through Sentry, meaning our response time was under a minute.
-
-We reduced some alert thresholds in order to catch RAM & CPU spikes
-faster in the future.
-
-It was hard to immediately see the cause of things since there is
-minimal logging on the site, and the bot logs did not make it evident
-that anything was at fault; our only detection was therefore through
-machine metrics.
-
-We did manage to recover exactly what PostgreSQL was trying to do at the
-time of crashing by examining the logs which pointed us towards the user
-sync process.
-
-🙋🏿‍♂️ Response
------------------------
-
-*Who responded to the incident, and what obstacles did they encounter?*
-
-Joe Banks responded to the issue. There were no real obstacles
-encountered, other than the node being less performant than we would
-like due to the CPU starvation.
-
-🙆🏽‍♀️ Recovery
----------------------------
-
-*How was the incident resolved? How can we improve future mitigation?*
-
-The incident was resolved by stopping the sync process and writing a
-more efficient one through an internal eval script. We batched the
-updates into chunks of 1,000 users, and instead of doing one large
-update did 80 smaller ones. This was much more efficient, at the cost
-of taking a little longer (~7 minutes).
-
-.. code:: python
-
-   # Run via the bot's internal eval command; `ctx` and `bot` are provided there.
-   from bot.exts.backend.sync import _syncers
-   syncer = _syncers.UserSyncer
-   diff = await syncer._get_diff(ctx.guild)
-
-   def chunks(lst, n):
-       """Yield successive n-sized chunks from lst."""
-       for i in range(0, len(lst), n):
-           yield lst[i:i + n]
-
-   # PATCH the updated users in batches of 1,000 instead of one huge request.
-   for chunk in chunks(diff.updated, 1000):
-       await bot.api_client.patch("bot/users/bulk_patch", json=chunk)
-
-Resource limits were also put into place on site to prevent RAM and CPU
-spikes, and throttle the CPU usage in these situations. This can be seen
-in the below graph:
-
-.. image:: ./images/2021-01-12/site_cpu_throttle.png
-
-CPU throttling is where a container has hit its limits and we need to
-reel it in. Ideally this value stays as close to 0 as possible; however,
-as you can see, site hit this twice (during the periods where it was
-trying to sync 80k users at once).
-
-🔎 Five Why’s
----------------------------
-
-*Run a 5-whys analysis to understand the true cause of the incident.*
-
-- We experienced a major PostgreSQL outage
-- PostgreSQL was killed by the system OOM due to the RAM spike on site.
-- The RAM spike on site was caused by a large query.
-- This was because we do not chunk queries on the bot.
-- The large query was caused by the removal of the Developers role
- resulting in 80k users needing updating.
-
-🌱 Blameless root cause
------------------------
-
-*Note the final root cause and describe what needs to change to prevent
-recurrence*
-
-The removal of the Developers role created a large diff which could not
-be applied by Django in a single request.
-
-See the follow-up tasks for exactly how we can avoid this in the
-future; it’s a relatively easy mitigation.
-
-🤔 Lessons learned
------------------------
-
-*What did we learn from this incident?*
-
-- Django (or DRF) does not like huge update queries.
-
-☑️ Follow-up tasks
-------------------
-
-*List any tasks we should complete that are relevant to this incident*
-
-- ☒ Make the bot syncer more efficient (batch requests)
-- ☐ Increase logging on bot, state when an error has been hit (we had
- no indication of this inside Discord, we need that)
-- ☒ Adjust resource alerts to page DevOps members earlier.
-- ☒ Apply resource limits to site to prevent major spikes
diff --git a/docs/postmortems/2021-01-30-nodebalancer-fails-memory.rst b/docs/postmortems/2021-01-30-nodebalancer-fails-memory.rst
deleted file mode 100644
index b13ecd7..0000000
--- a/docs/postmortems/2021-01-30-nodebalancer-fails-memory.rst
+++ /dev/null
@@ -1,146 +0,0 @@
-2021-01-30: NodeBalancer networking faults due to memory pressure
-=================================================================
-
-At around 14:30 UTC on Saturday 30th January we started experiencing
-networking issues at the LoadBalancer level between Cloudflare and our
-Kubernetes cluster. It seems that the misconfiguration was due to memory
-and CPU pressure.
-
-[STRIKEOUT:This post-mortem is preliminary; we are still awaiting word
-from Linode’s SysAdmins on any problems they detected.]
-
-**Update 2nd February 2021:** Linode have migrated our NodeBalancer to a
-different machine.
-
-⚠️ Leadup
----------
-
-*List the sequence of events that led to the incident*
-
-At 14:30 we started receiving alerts that services were becoming
-unreachable. We first experienced some momentary DNS errors which
-resolved themselves; however, traffic ingress was still degraded.
-
-Upon checking Linode, our NodeBalancer (the service which balances
-traffic between our Kubernetes nodes) was reporting the backends (the
-services it balances to) as down. It reported all 4 as down (two for
-port 80 + two for port 443). This status was fluctuating between up and
-down, meaning traffic was not reaching our cluster correctly. Scaleios
-correctly noted:
-
-.. image:: ./images/2021-01-30/scaleios.png
-
-The config seems to have been set incorrectly due to memory and CPU
-pressure on one of our nodes. Here is the memory usage throughout the
-incident:
-
-.. image:: ./images/2021-01-30/memory_charts.png
-
-Here is the display from Linode:
-
-.. image:: ./images/2021-01-30/linode_loadbalancers.png
-
-🥏 Impact
----------
-
-*Describe how internal and external users were impacted during the
-incident*
-
-Since traffic could not correctly enter our cluster, multiple web-based
-services were offline, including site, Grafana and Bitwarden. It
-appears that no inter-node communication was affected, as this uses a
-WireGuard tunnel between the nodes which does not depend on the
-NodeBalancer.
-
-The lack of Grafana made diagnosis slightly more difficult, but even
-then it was only a short trip to the
-
-👁️ Detection
-------------
-
-*Report when the team detected the incident, and how we could improve
-detection time*
-
-We were alerted fairly promptly through statping which reported services
-as being down and posted a Discord notification. Subsequent alerts came
-in from Grafana but were limited since outbound communication was
-faulty.
-
-🙋🏿‍♂️ Response
-----------------
-
-*Who responded to the incident, and what obstacles did they encounter?*
-
-Joe Banks responded!
-
-The primary obstacle was the DevOps tools being out due to the traffic
-ingress problems.
-
-🙆🏽‍♀️ Recovery
-----------------
-
-*How was the incident resolved? How can we improve future mitigation?*
-
-The incident resolved itself upstream at Linode. We’ve opened a ticket
-with Linode to let them know of the faults, which might give us a
-better indication of what caused the issues. Our Kubernetes cluster
-continued posting updates to Linode to refresh the NodeBalancer
-configuration; inspecting these payloads, the configuration looked
-correct.
-
-We’ve set up alerts for when Prometheus services stop responding, since
-this seems to be a fairly tell-tale symptom of networking problems.
-This was the Prometheus status graph throughout the incident:
-
-.. image:: ./images/2021-01-30/prometheus_status.png
-
-🔎 Five Why’s
--------------
-
-*Run a 5-whys analysis to understand the true cause of the incident.*
-
-**What?** Our service experienced an outage due to networking faults.
-
-**Why?** Incoming traffic could not reach our Kubernetes nodes
-
-**Why?** Our Linode NodeBalancers were not using correct configuration
-
-**Why?** Memory & CPU pressure seemed to cause invalid configuration
-errors upstream at Linode.
-
-**Why?** Unknown at this stage, NodeBalancer migrated.
-
-🌱 Blameless root cause
------------------------
-
-*Note the final root cause and describe what needs to change to prevent
-recurrence*
-
-The configuration of our NodeBalancer was invalid. We cannot say why at
-this point, since we are awaiting contact back from Linode, but
-indicators point to it being an upstream fault, since memory & CPU
-pressure should **not** cause a load balancer misconfiguration.
-
-Linode are going to follow up with us at some point during the week with
-information from their System Administrators.
-
-**Update 2nd February 2021:** Linode have concluded investigations at
-their end, taken notes and migrated our NodeBalancer to a new machine.
-We haven’t experienced problems since.
-
-🤔 Lessons learned
-------------------
-
-*What did we learn from this incident?*
-
-We should be careful about over-scheduling onto nodes, since even while
-operating within reasonable constraints we risk sending invalid
-configuration upstream to Linode and therefore preventing traffic from
-entering our cluster.
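-
-One way to keep an eye on this is to check how much of each node we
-have already promised out, for example:
-
-.. code:: bash
-
-   # Requests/limits committed on the node vs. its allocatable capacity
-   $ kubectl describe node <node name> | grep -A 8 "Allocated resources"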
-
-☑️ Follow-up tasks
-------------------
-
-*List any tasks we should complete that are relevant to this incident*
-
-- ☒ Monitor for follow up from Linode
-- ☒ Carefully monitor the allocation rules for our services
diff --git a/docs/postmortems/2021-07-11-cascading-node-failures.rst b/docs/postmortems/2021-07-11-cascading-node-failures.rst
deleted file mode 100644
index b2e5cdf..0000000
--- a/docs/postmortems/2021-07-11-cascading-node-failures.rst
+++ /dev/null
@@ -1,335 +0,0 @@
-2021-07-11: Cascading node failures and ensuing volume problems
-===============================================================
-
-A PostgreSQL connection spike (00:27 UTC) caused by Django moved a node
-to an unresponsive state (00:55 UTC). Upon recycling the affected node,
-its volumes were placed into a state where they could not be mounted.
-
-⚠️ Leadup
-----------
-
-*List the sequence of events that led to the incident*
-
-- **00:27 UTC:** Django starts rapidly using connections to our
- PostgreSQL database
-- **00:32 UTC:** DevOps team is alerted that PostgreSQL has saturated
-  its 115 max connections limit. Joe is paged.
-- **00:33 UTC:** DevOps team is alerted that a service has claimed 34
- dangerous table locks (it peaked at 61).
-- **00:42 UTC:** Status incident created and backdated to 00:25 UTC.
- `Status incident <https://status.pythondiscord.com/incident/92712>`__
-- **00:55 UTC:** It’s clear that the node which PostgreSQL was on is no
- longer healthy after the Django connection surge, so it’s recycled
- and a new one is to be added to the pool.
-- **01:01 UTC:** Node ``lke13311-16405-5fafd1b46dcf`` begins its
- restart
-- **01:13 UTC:** Node has been restored and regained healthy status, but
- volumes will not mount to the node. Support ticket opened at Linode
- for assistance.
-- **06:36 UTC:** DevOps team alerted that Python is offline. This is
- due to Redis being a dependency of the bot, which as a stateful
- service was not healthy.
-
-🥏 Impact
-----------
-
-*Describe how internal and external users were impacted during the
-incident*
-
-Initially, this manifested as a standard node outage where services on
-that node experienced some downtime as the node was restored.
-
-Post-restore, all stateful services (e.g. PostgreSQL, Redis, PrestaShop)
-could not start due to the volume issues, and so any dependent
-services (e.g. Site, Bot, Hastebin) also had trouble starting.
-
-PostgreSQL was restored early on so for the most part Moderation could
-continue.
-
-👁️ Detection
----------------
-
-*Report when the team detected the incident, and how we could improve
-detection time*
-
-DevOps were initially alerted at 00:32 UTC due to the PostgreSQL
-connection surge, and acknowledged at the same time.
-
-Further alerting could be used to catch surges earlier on (looking at
-conn delta vs. conn total), but for the most part alerting time was
-satisfactory here.
-
-🙋🏿‍♂️ Response
------------------
-
-*Who responded to the incident, and what obstacles did they encounter?*
-
-Joe Banks responded. The primary issue encountered was failure upstream
-at Linode to remount the affected volumes; a support ticket has been
-created.
-
-🙆🏽‍♀️ Recovery
-------------------
-
-*How was the incident resolved? How can we improve future mitigation?*
-
-Initial node restoration was performed by @Joe Banks by recycling the
-affected node.
-
-Subsequent volume restoration was also performed by @Joe Banks: once
-Linode had unlocked the volumes, affected pods were scaled down to 0,
-the volumes were unmounted on the Linode side and then the deployments
-were recreated.
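-
-Roughly the sequence used (deployment names and manifest paths are
-illustrative):
-
-.. code:: bash
-
-   # Free the stuck volumes by removing their consumers
-   $ kubectl scale deploy --replicas 0 postgres redis
-   # ... detach the volumes in the Linode Cloud Manager once they unlock ...
-   # Recreate the deployments so the volumes are mounted freshly
-   $ kubectl apply -f postgres/deployment.yaml -f redis/deployment.yaml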
-
-.. raw:: html
-
- <details>
-
-.. raw:: html
-
- <summary>
-
-Support ticket sent
-
-.. raw:: html
-
- </summary>
-
-.. raw:: html
-
- <blockquote>
-
-Good evening,
-
-We experienced a resource surge on one of our Kubernetes nodes at 00:32
-UTC, causing a node to go unresponsive. To mitigate problems here the
-node was recycled and began restarting at 1:01 UTC.
-
-The node has now rejoined the ring and started picking up services, but
-volumes will not attach to it, meaning pods with stateful storage will
-not start.
-
-An example events log for one such pod:
-
-::
-
- Type Reason Age From Message
- ---- ------ ---- ---- -------
- Normal Scheduled 2m45s default-scheduler Successfully assigned default/redis-599887d778-wggbl to lke13311-16405-5fafd1b46dcf
- Warning FailedMount 103s kubelet MountVolume.MountDevice failed for volume "pvc-bb1d06139b334c1f" : rpc error: code = Internal desc = Unable to find device path out of attempted paths: [/dev/disk/by-id/linode-pvcbb1d06139b334c1f /dev/disk/by-id/scsi-0Linode_Volume_pvcbb1d06139b334c1f]
- Warning FailedMount 43s kubelet Unable to attach or mount volumes: unmounted volumes=[redis-data-volume], unattached volumes=[kube-api-access-6wwfs redis-data-volume redis-config-volume]: timed out waiting for the condition
-
-I’ve been trying to manually resolve this through the Linode Web UI but
-get presented with attachment errors upon doing so. Please could you
-advise on the best way forward to restore Volumes & Nodes to a
-functioning state? As far as I can see there is something going on
-upstream since the Linode UI presents these nodes as mounted however as
-shown above LKE nodes are not locating them, there is also a few failed
-attachment logs in the Linode Audit Log.
-
-Thanks,
-
-Joe
-
-.. raw:: html
-
- </blockquote>
-
-.. raw:: html
-
- </details>
-
-.. raw:: html
-
- <details>
-
-.. raw:: html
-
- <summary>
-
-Response received from Linode
-
-.. raw:: html
-
- </summary>
-
-.. raw:: html
-
- <blockquote>
-
-Hi Joe,
-
- Were there any known issues with Block Storage in Frankfurt today?
-
-Not today, though there were service issues reported for Block Storage
-and LKE in Frankfurt on July 8 and 9:
-
-- `Service Issue - Block Storage - EU-Central
- (Frankfurt) <https://status.linode.com/incidents/pqfxl884wbh4>`__
-- `Service Issue - Linode Kubernetes Engine -
- Frankfurt <https://status.linode.com/incidents/13fpkjd32sgz>`__
-
-There was also an API issue reported on the 10th (resolved on the 11th),
-mentioned here:
-
-- `Service Issue - Cloud Manager and
- API <https://status.linode.com/incidents/vhjm0xpwnnn5>`__
-
-Regarding the specific error you were receiving:
-
- ``Unable to find device path out of attempted paths``
-
-I’m not certain it’s specifically related to those Service Issues,
-considering this isn’t the first time a customer has reported this error
-in their LKE logs. In fact, if I recall correctly, I’ve run across this
-before too, since our volumes are RWO and I had too many replicas in my
-deployment that I was trying to attach to, for example.
-
- is this a known bug/condition that occurs with Linode CSI/LKE?
-
-From what I understand, yes, this is a known condition that crops up
-from time to time, which we are tracking. However, since there is a
-workaround at the moment (e.g. - “After some more manual attempts to fix
-things, scaling down deployments, unmounting at Linode and then scaling
-up the deployments seems to have worked and all our services have now
-been restored.”), there is no ETA for addressing this. With that said,
-I’ve let our Storage team know that you’ve run into this, so as to draw
-further attention to it.
-
-If you have any further questions or concerns regarding this, let us
-know.
-
-Best regards, [Redacted]
-
-Linode Support Team
-
-.. raw:: html
-
- </blockquote>
-
-.. raw:: html
-
- </details>
-
-.. raw:: html
-
- <details>
-
-.. raw:: html
-
- <summary>
-
-Concluding response from Joe Banks
-
-.. raw:: html
-
- </summary>
-
-.. raw:: html
-
- <blockquote>
-
-Hey [Redacted]!
-
-Thanks for the response. We ensure that stateful pods only ever have one
-volume assigned to them, either with a single replica deployment or a
-statefulset. It appears that the error generally manifests when a
-deployment is being migrated from one node to another during a redeploy,
-which makes sense if there is some delay on the unmount/remount.
-
-Confusion occurred because Linode was reporting the volume as attached
-when the node had been recycled, but I assume that was because the node
-did not cleanly shutdown and therefore could not cleanly unmount
-volumes.
-
-We’ve not seen any resurgence of such issues, and we’ll address the
-software fault which overloaded the node which will helpfully mitigate
-such problems in the future.
-
-Thanks again for the response, have a great week!
-
-Best,
-
-Joe
-
-.. raw:: html
-
- </blockquote>
-
-.. raw:: html
-
- </details>
-
-🔎 Five Why’s
----------------
-
-*Run a 5-whys analysis to understand the true cause of the incident.*
-
-**What?**
-~~~~~~~~~
-
-Several of our services became unavailable because their volumes could
-not be mounted.
-
-Why?
-~~~~
-
-A node recycle left the node unable to mount volumes using the Linode
-CSI.
-
-.. _why-1:
-
-Why?
-~~~~
-
-A node recycle was used because PostgreSQL had a connection surge.
-
-.. _why-2:
-
-Why?
-~~~~
-
-A Django feature deadlocked a table 62 times and suddenly started using
-~70 connections to the database, saturating the maximum connections
-limit.
-
-.. _why-3:
-
-Why?
-~~~~
-
-The root cause of why Django does this is unclear, and someone with more
-Django proficiency is absolutely welcome to share any knowledge they may
-have. I presume it’s some sort of worker race condition, but I’ve not
-been able to reproduce it.
-
-🌱 Blameless root cause
------------------------
-
-*Note the final root cause and describe what needs to change to prevent
-reoccurrence*
-
-A node being forcefully restarted left volumes in a limbo state where
-mounting was difficult. It took multiple hours for this to be resolved,
-since we had to wait for the volumes to unlock so they could be cloned.
-
-🤔 Lessons learned
-------------------
-
-*What did we learn from this incident?*
-
-Volumes are painful.
-
-We need to look at why Django is doing this, and at mitigations for the
-fault, to prevent this from occurring again.
-
-☑️ Follow-up tasks
-------------------
-
-*List any tasks we should complete that are relevant to this incident*
-
-- ☒ `Follow up on ticket at
- Linode <https://www.notion.so/Cascading-node-failures-and-ensuing-volume-problems-1c6cfdfcadfc4422b719a0d7a4cc5001>`__
-- ☐ Investigate why Django could be connection surging and locking
- tables
diff --git a/docs/postmortems/images/2021-01-12/site_cpu_throttle.png b/docs/postmortems/images/2021-01-12/site_cpu_throttle.png
deleted file mode 100644
index b530ec6..0000000
--- a/docs/postmortems/images/2021-01-12/site_cpu_throttle.png
+++ /dev/null
Binary files differ
diff --git a/docs/postmortems/images/2021-01-12/site_resource_abnormal.png b/docs/postmortems/images/2021-01-12/site_resource_abnormal.png
deleted file mode 100644
index e1e07af..0000000
--- a/docs/postmortems/images/2021-01-12/site_resource_abnormal.png
+++ /dev/null
Binary files differ
diff --git a/docs/postmortems/images/2021-01-30/linode_loadbalancers.png b/docs/postmortems/images/2021-01-30/linode_loadbalancers.png
deleted file mode 100644
index f0eae1f..0000000
--- a/docs/postmortems/images/2021-01-30/linode_loadbalancers.png
+++ /dev/null
Binary files differ
diff --git a/docs/postmortems/images/2021-01-30/memory_charts.png b/docs/postmortems/images/2021-01-30/memory_charts.png
deleted file mode 100644
index 370d19e..0000000
--- a/docs/postmortems/images/2021-01-30/memory_charts.png
+++ /dev/null
Binary files differ
diff --git a/docs/postmortems/images/2021-01-30/prometheus_status.png b/docs/postmortems/images/2021-01-30/prometheus_status.png
deleted file mode 100644
index e95b8d7..0000000
--- a/docs/postmortems/images/2021-01-30/prometheus_status.png
+++ /dev/null
Binary files differ
diff --git a/docs/postmortems/images/2021-01-30/scaleios.png b/docs/postmortems/images/2021-01-30/scaleios.png
deleted file mode 100644
index 584d74d..0000000
--- a/docs/postmortems/images/2021-01-30/scaleios.png
+++ /dev/null
Binary files differ
diff --git a/docs/postmortems/index.rst b/docs/postmortems/index.rst
deleted file mode 100644
index e28dc7a..0000000
--- a/docs/postmortems/index.rst
+++ /dev/null
@@ -1,15 +0,0 @@
-Postmortems
-===========
-
-Browse the pages under this category to view historical postmortems for
-Python Discord outages.
-
-.. toctree::
- :maxdepth: 1
-
- 2020-12-11-all-services-outage
- 2020-12-11-postgres-conn-surge
- 2021-01-10-primary-kubernetes-node-outage
- 2021-01-12-site-cpu-ram-exhaustion
- 2021-01-30-nodebalancer-fails-memory
- 2021-07-11-cascading-node-failures
diff --git a/docs/queries/index.rst b/docs/queries/index.rst
deleted file mode 100644
index 76218e4..0000000
--- a/docs/queries/index.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Queries
-=======
-
-Get the data you desire with these assorted handcrafted queries.
-
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
-
- kubernetes
- loki
- postgres
diff --git a/docs/queries/kubernetes.rst b/docs/queries/kubernetes.rst
deleted file mode 100644
index f8d8984..0000000
--- a/docs/queries/kubernetes.rst
+++ /dev/null
@@ -1,29 +0,0 @@
-Kubernetes tips
-===============
-
-Find top pods by CPU/memory
----------------------------
-
-.. code:: bash
-
-   $ kubectl top pods --all-namespaces --sort-by='memory'
-   $ kubectl top pods --all-namespaces --sort-by='cpu'
-
-Find top nodes by CPU/memory
-----------------------------
-
-.. code:: bash
-
- $ kubectl top nodes --sort-by='cpu'
- $ kubectl top nodes --sort-by='memory'
-
-Kubernetes cheat sheet
-----------------------
-
-`Open Kubernetes cheat
-sheet <https://kubernetes.io/docs/reference/kubectl/cheatsheet/>`__
-
-Lens IDE
---------
-
-`OpenLens <https://github.com/MuhammedKalkan/OpenLens>`__
diff --git a/docs/queries/loki.rst b/docs/queries/loki.rst
deleted file mode 100644
index 2ee57a3..0000000
--- a/docs/queries/loki.rst
+++ /dev/null
@@ -1,25 +0,0 @@
-Loki queries
-============
-
-Find any logs containing “ERROR”
---------------------------------
-
-.. code:: shell
-
- {job=~"default/.+"} |= "ERROR"
-
-Find all logs from bot service
-------------------------------
-
-.. code:: shell
-
- {job="default/bot"}
-
-The format is ``namespace/object``
-
-Rate of logs from a service
----------------------------
-
-.. code:: shell
-
- rate(({job="default/bot"} |= "error" != "timeout")[10s])
diff --git a/docs/queries/postgres.rst b/docs/queries/postgres.rst
deleted file mode 100644
index 5120145..0000000
--- a/docs/queries/postgres.rst
+++ /dev/null
@@ -1,336 +0,0 @@
-PostgreSQL queries
-==================
-
-Disk usage
-----------
-
-Most of these queries vary based on the database you are connected to.
-
-General Table Size Information Grouped For Partitioned Tables
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: sql
-
- WITH RECURSIVE pg_inherit(inhrelid, inhparent) AS
- (select inhrelid, inhparent
- FROM pg_inherits
- UNION
- SELECT child.inhrelid, parent.inhparent
- FROM pg_inherit child, pg_inherits parent
- WHERE child.inhparent = parent.inhrelid),
- pg_inherit_short AS (SELECT * FROM pg_inherit WHERE inhparent NOT IN (SELECT inhrelid FROM pg_inherit))
- SELECT table_schema
- , TABLE_NAME
- , row_estimate
- , pg_size_pretty(total_bytes) AS total
- , pg_size_pretty(index_bytes) AS INDEX
- , pg_size_pretty(toast_bytes) AS toast
- , pg_size_pretty(table_bytes) AS TABLE
- FROM (
- SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes
- FROM (
- SELECT c.oid
- , nspname AS table_schema
- , relname AS TABLE_NAME
- , SUM(c.reltuples) OVER (partition BY parent) AS row_estimate
- , SUM(pg_total_relation_size(c.oid)) OVER (partition BY parent) AS total_bytes
- , SUM(pg_indexes_size(c.oid)) OVER (partition BY parent) AS index_bytes
- , SUM(pg_total_relation_size(reltoastrelid)) OVER (partition BY parent) AS toast_bytes
- , parent
- FROM (
- SELECT pg_class.oid
- , reltuples
- , relname
- , relnamespace
- , pg_class.reltoastrelid
- , COALESCE(inhparent, pg_class.oid) parent
- FROM pg_class
- LEFT JOIN pg_inherit_short ON inhrelid = oid
- WHERE relkind IN ('r', 'p')
- ) c
- LEFT JOIN pg_namespace n ON n.oid = c.relnamespace
- ) a
- WHERE oid = parent
- ) a
- ORDER BY total_bytes DESC;
-
-General Table Size Information
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: sql
-
- SELECT *, pg_size_pretty(total_bytes) AS total
- , pg_size_pretty(index_bytes) AS index
- , pg_size_pretty(toast_bytes) AS toast
- , pg_size_pretty(table_bytes) AS table
- FROM (
- SELECT *, total_bytes-index_bytes-coalesce(toast_bytes,0) AS table_bytes FROM (
- SELECT c.oid,nspname AS table_schema, relname AS table_name
- , c.reltuples AS row_estimate
- , pg_total_relation_size(c.oid) AS total_bytes
- , pg_indexes_size(c.oid) AS index_bytes
- , pg_total_relation_size(reltoastrelid) AS toast_bytes
- FROM pg_class c
- LEFT JOIN pg_namespace n ON n.oid = c.relnamespace
- WHERE relkind = 'r'
- ) a
- ) a;
-
-Finding the largest databases in your cluster
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: sql
-
- SELECT d.datname as Name, pg_catalog.pg_get_userbyid(d.datdba) as Owner,
- CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT')
- THEN pg_catalog.pg_size_pretty(pg_catalog.pg_database_size(d.datname))
- ELSE 'No Access'
- END as Size
- FROM pg_catalog.pg_database d
- order by
- CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT')
- THEN pg_catalog.pg_database_size(d.datname)
- ELSE NULL
- END desc -- nulls first
- LIMIT 20;
-
-Finding the size of your biggest relations
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Relations are objects in the database such as tables and indexes, and
-this query shows the size of all the individual parts.
-
-.. code:: sql
-
- SELECT nspname || '.' || relname AS "relation",
- pg_size_pretty(pg_relation_size(C.oid)) AS "size"
- FROM pg_class C
- LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
- WHERE nspname NOT IN ('pg_catalog', 'information_schema')
- ORDER BY pg_relation_size(C.oid) DESC
- LIMIT 20;
-
-Finding the total size of your biggest tables
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: sql
-
- SELECT nspname || '.' || relname AS "relation",
- pg_size_pretty(pg_total_relation_size(C.oid)) AS "total_size"
- FROM pg_class C
- LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace)
- WHERE nspname NOT IN ('pg_catalog', 'information_schema')
- AND C.relkind <> 'i'
- AND nspname !~ '^pg_toast'
- ORDER BY pg_total_relation_size(C.oid) DESC
- LIMIT 20;
-
-Indexes
--------
-
-Index summary
-~~~~~~~~~~~~~
-
-.. code:: sql
-
- SELECT
- pg_class.relname,
- pg_size_pretty(pg_class.reltuples::bigint) AS rows_in_bytes,
- pg_class.reltuples AS num_rows,
- count(indexname) AS number_of_indexes,
- CASE WHEN x.is_unique = 1 THEN 'Y'
- ELSE 'N'
- END AS UNIQUE,
- SUM(case WHEN number_of_columns = 1 THEN 1
- ELSE 0
- END) AS single_column,
- SUM(case WHEN number_of_columns IS NULL THEN 0
- WHEN number_of_columns = 1 THEN 0
- ELSE 1
- END) AS multi_column
- FROM pg_namespace
- LEFT OUTER JOIN pg_class ON pg_namespace.oid = pg_class.relnamespace
- LEFT OUTER JOIN
- (SELECT indrelid,
- max(CAST(indisunique AS integer)) AS is_unique
- FROM pg_index
- GROUP BY indrelid) x
- ON pg_class.oid = x.indrelid
- LEFT OUTER JOIN
- ( SELECT c.relname AS ctablename, ipg.relname AS indexname, x.indnatts AS number_of_columns FROM pg_index x
- JOIN pg_class c ON c.oid = x.indrelid
- JOIN pg_class ipg ON ipg.oid = x.indexrelid )
- AS foo
- ON pg_class.relname = foo.ctablename
- WHERE
- pg_namespace.nspname='public'
- AND pg_class.relkind = 'r'
- GROUP BY pg_class.relname, pg_class.reltuples, x.is_unique
- ORDER BY 2;
-
-Index size/usage statistics
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: sql
-
- SELECT
- t.schemaname,
- t.tablename,
- indexname,
- c.reltuples AS num_rows,
- pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' || quote_ident(t.tablename)::text)) AS table_size,
- pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' || quote_ident(indexrelname)::text)) AS index_size,
- CASE WHEN indisunique THEN 'Y'
- ELSE 'N'
- END AS UNIQUE,
- number_of_scans,
- tuples_read,
- tuples_fetched
- FROM pg_tables t
- LEFT OUTER JOIN pg_class c ON t.tablename = c.relname
- LEFT OUTER JOIN (
- SELECT
- c.relname AS ctablename,
- ipg.relname AS indexname,
- x.indnatts AS number_of_columns,
- idx_scan AS number_of_scans,
- idx_tup_read AS tuples_read,
- idx_tup_fetch AS tuples_fetched,
- indexrelname,
- indisunique,
- schemaname
- FROM pg_index x
- JOIN pg_class c ON c.oid = x.indrelid
- JOIN pg_class ipg ON ipg.oid = x.indexrelid
- JOIN pg_stat_all_indexes psai ON x.indexrelid = psai.indexrelid
- ) AS foo ON t.tablename = foo.ctablename AND t.schemaname = foo.schemaname
- WHERE t.schemaname NOT IN ('pg_catalog', 'information_schema')
- ORDER BY 1,2;
-
-Duplicate indexes
-~~~~~~~~~~~~~~~~~
-
-.. code:: sql
-
- SELECT pg_size_pretty(sum(pg_relation_size(idx))::bigint) as size,
- (array_agg(idx))[1] as idx1, (array_agg(idx))[2] as idx2,
- (array_agg(idx))[3] as idx3, (array_agg(idx))[4] as idx4
- FROM (
- SELECT indexrelid::regclass as idx, (indrelid::text ||E'\n'|| indclass::text ||E'\n'|| indkey::text ||E'\n'||
- coalesce(indexprs::text,'')||E'\n' || coalesce(indpred::text,'')) as key
- FROM pg_index) sub
- GROUP BY key HAVING count(*)>1
- ORDER BY sum(pg_relation_size(idx)) DESC;
-
-Maintenance
------------
-
-`PostgreSQL wiki <https://wiki.postgresql.org/wiki/Main_Page>`__
-
-CLUSTER-ing
-~~~~~~~~~~~
-
-`CLUSTER <https://www.postgresql.org/docs/current/sql-cluster.html>`__
-
-.. code:: sql
-
- CLUSTER [VERBOSE] table_name [ USING index_name ]
- CLUSTER [VERBOSE]
-
-``CLUSTER`` instructs PostgreSQL to cluster the table specified by
-``table_name`` based on the index specified by ``index_name``. The index
-must already have been defined on ``table_name``.
-
-When a table is clustered, it is physically reordered based on the index
-information.
-
-The
-`clusterdb <https://www.postgresql.org/docs/current/app-clusterdb.html>`__
-CLI tool is recommended, and can also be used to cluster all tables at
-the same time.
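-
-A typical manual run over every database might look like this
-(connection details illustrative):
-
-.. code:: bash
-
-   $ clusterdb --all --verbose -U pythondiscord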
-
-VACUUM-ing
-~~~~~~~~~~
-
-Proper vacuuming, particularly autovacuum configuration, is crucial to a
-fast and reliable database.
-
-`Introduction to VACUUM, ANALYZE, EXPLAIN, and
-COUNT <https://wiki.postgresql.org/wiki/Introduction_to_VACUUM,_ANALYZE,_EXPLAIN,_and_COUNT>`__
-
-It is not advised to run ``VACUUM FULL``; instead, look at clustering.
-``VACUUM FULL`` is a much more intensive task and acquires an ACCESS
-EXCLUSIVE lock on the table, blocking reads and writes. Whilst
-``CLUSTER`` also acquires this lock, it’s a less intensive and faster
-process.
-
-The
-`vacuumdb <https://www.postgresql.org/docs/current/app-vacuumdb.html>`__
-CLI tool is recommended for manual runs.
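-
-For example, to vacuum and analyse every database (connection details
-illustrative):
-
-.. code:: bash
-
-   $ vacuumdb --all --analyze --verbose -U pythondiscord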
-
-Finding number of dead rows
-^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: sql
-
- SELECT relname, n_dead_tup FROM pg_stat_user_tables WHERE n_dead_tup <> 0 ORDER BY 2 DESC;
-
-Finding last vacuum/auto-vacuum date
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: sql
-
- SELECT relname, last_vacuum, last_autovacuum FROM pg_stat_user_tables;
-
-Checking auto-vacuum is enabled
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: sql
-
- SELECT name, setting FROM pg_settings WHERE name='autovacuum';
-
-View all auto-vacuum settings
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-.. code:: sql
-
- SELECT * from pg_settings where category like 'Autovacuum';
-
-Locks
------
-
-Looking at granted locks
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: sql
-
- SELECT relation::regclass, * FROM pg_locks WHERE NOT granted;
-
-Combination of blocked and blocking activity
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. code:: sql
-
- SELECT blocked_locks.pid AS blocked_pid,
- blocked_activity.usename AS blocked_user,
- blocking_locks.pid AS blocking_pid,
- blocking_activity.usename AS blocking_user,
- blocked_activity.query AS blocked_statement,
- blocking_activity.query AS current_statement_in_blocking_process
- FROM pg_catalog.pg_locks blocked_locks
- JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
- JOIN pg_catalog.pg_locks blocking_locks
- ON blocking_locks.locktype = blocked_locks.locktype
- AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
- AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
- AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
- AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
- AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
- AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
- AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
- AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
- AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
- AND blocking_locks.pid != blocked_locks.pid
-
- JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
- WHERE NOT blocked_locks.granted;
diff --git a/docs/runbooks/index.rst b/docs/runbooks/index.rst
deleted file mode 100644
index 18690c7..0000000
--- a/docs/runbooks/index.rst
+++ /dev/null
@@ -1,17 +0,0 @@
-Runbooks
-========
-
-Learn how to do anything in our infrastructure with these guidelines.
-
-.. note::
-
- In general, we try to codify manual processes as much as possible. Still,
- this section is important for tasks that are either hard to automate or are
- run so infrequently that it does not make sense to regularly run them.
-
-
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
-
- postgresql-upgrade
diff --git a/docs/runbooks/postgresql-upgrade.rst b/docs/runbooks/postgresql-upgrade.rst
deleted file mode 100644
index 98b1642..0000000
--- a/docs/runbooks/postgresql-upgrade.rst
+++ /dev/null
@@ -1,149 +0,0 @@
-Upgrading PostgreSQL
-====================
-
-Step 1 - Enable maintenance mode
---------------------------------
-
-Add a worker route for ``pythondiscord.com/*`` to forward to the
-``maintenance`` Cloudflare worker.
-
-Step 2 - Scale down all services that use PostgreSQL
-----------------------------------------------------
-
-Notably site, metricity, bitwarden and the like should be scaled down.
-
-Services that are read-only, such as Grafana (but NOT Metabase, which
-uses PostgreSQL for internal storage), do not need to be scaled down,
-as they do not update the database in any way.
-
-.. code:: bash
-
- $ kubectl scale deploy --replicas 0 site metricity metabase bitwarden ...
-
-Step 3 - Take a database dump and gzip
---------------------------------------
-
-Using ``pg_dumpall``, dump the contents of all databases to a ``.sql``
-file.
-
-Make sure to gzip for faster transfer.
-
-Take a SHA512 sum of the output ``.sql.gz`` file to validate integrity
-after copying.
-
-.. code:: bash
-
-   $ pg_dumpall -U pythondiscord > backup.sql
-   $ gzip backup.sql
-   $ sha512sum backup.sql.gz
-   a3337bfc65a072fd93124233ac1cefcdfbe8a708e5c1d08adaca2cf8c7cbe9ae4853ffab8c5cfbe943182355eaa701012111a420b29cc4f74d1e87f9df3af459  backup.sql.gz
-
-Step 4 - Move database dump locally
------------------------------------
-
-Use ``kubectl cp`` to move the ``backup.sql.gz`` file from the remote
-pod to your local machine.
-
-Validate the integrity of the received file.
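-
-A sketch of the copy and the integrity check (pod name and paths are
-illustrative):
-
-.. code:: bash
-
-   $ kubectl cp default/<postgres pod>:/tmp/backup.sql.gz ./backup.sql.gz
-   # Compare against the sum taken inside the pod
-   $ sha512sum backup.sql.gz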
-
-Step 5 - Attempt local import to new PostgreSQL version
--------------------------------------------------------
-
-Install the new version of PostgreSQL locally and import the data. Make
-sure you are operating on a **completely empty database server.**
-
-.. code:: bash
-
- $ gzcat backup.sql.gz | psql -U joe
-
-You can use any PostgreSQL superuser for the import. Ensure that no
-errors other than those mentioned below occur; you may need to attempt
-the import multiple times as you fix them.
-
-Step 6 - Handle import errors
------------------------------
-
-Monitor the output of ``psql`` to check that no errors appear.
-
-If you receive locale errors, ensure that the locale your database is
-configured with matches the import script; this may require some usage
-of ``sed``:
-
-.. code:: bash
-
- $ sed -i '' "s/en_US.utf8/en_GB.UTF-8/g" backup.sql
-
-Ensure that you **RESET THESE CHANGES** before attempting an import on
-the remote; if the databases come from the PostgreSQL Docker image they
-will need the same locale as the export.
-
-Step 7 - Spin down PostgreSQL
------------------------------
-
-Spin down PostgreSQL to 0 replicas.
-
-Step 8 - Take volume backup at Linode
--------------------------------------
-
-Back up the volume at Linode through a clone in the Linode UI; name it
-something obvious.
-
-Step 9 - Remove the Linode persistent volume
---------------------------------------------
-
-Delete the volume specified in the ``volume.yaml`` file in the
-``postgresql`` directory. You must delete the ``pvc`` first, followed
-by the ``pv``; you can find the relevant disks through
-``kubectl get pvc,pv``.
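-
-For example (resource names illustrative):
-
-.. code:: bash
-
-   # Identify the PostgreSQL volume
-   $ kubectl get pvc,pv
-   $ kubectl delete pvc <postgres pvc name>
-   $ kubectl delete pv <postgres pv name>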
-
-Step 10 - Create a new volume by re-applying the ``volume.yaml`` file
----------------------------------------------------------------------
-
-Apply the ``volume.yaml`` so a new, empty volume is created.
-
-Step 11 - Bump the PostgreSQL version in the ``deployment.yaml`` file
----------------------------------------------------------------------
-
-Update the Docker image used in the deployment manifest.
-
-Step 12 - Apply the deployment
-------------------------------
-
-Run ``kubectl apply -f postgresql/deployment.yaml`` to start the new
-database server.
-
-Step 13 - Copy the data across
-------------------------------
-
-After the pod has initialised use ``kubectl cp`` to copy the gzipped
-backup to the new Postgres pod.
-
-Step 14 - Extract and import the new data
------------------------------------------
-
-.. code:: bash
-
- $ gunzip backup.sql.gz
- $ psql -U pythondiscord -f backup.sql
-
-Step 15 - Validate data import complete
----------------------------------------
-
-Ensure that all logs are successful. You may get duplicate errors for
-the ``pythondiscord`` user and database; these are safe to ignore.
-
-Step 16 - Scale up services
----------------------------
-
-With the database server running, scale the dependent services back up:
-
-.. code:: bash
-
- $ kubectl scale deploy --replicas 1 metricity bitwarden metabase
-
-Step 17 - Validate all services interact correctly
---------------------------------------------------
-
-Validate that all services reconnect successfully and start exchanging
-data; ensure that no abnormal logs are output and that performance
-remains as expected.
diff --git a/docs/tooling/bots.rst b/docs/tooling/bots.rst
deleted file mode 100644
index 7b5e165..0000000
--- a/docs/tooling/bots.rst
+++ /dev/null
@@ -1,55 +0,0 @@
-Bots
-====
-
-Our GitHub repositories are supported by two custom bots:
-
-- Our **Fast Forward Bot**, which ensures that commits merged into main
- are either merged manually on the command line or via a fast-forward,
- ensuring that cryptographic signatures of commits remain intact.
- Information on the bot can be found `in the ff-bot.yml
- configuration <https://github.com/python-discord/infra/blob/main/.github/ff-bot.yml>`__.
- Merges over the GitHub UI are discouraged for this reason. You can
- use it by running ``/merge`` on a pull request. Note that attempting
- to use it without permission to do so will be reported.
-
-- Our **Craig Dazey Emulator Bot**, which ensures team morale stays
- high at all times by thanking team members for submitted pull
- requests. [1]_
-
-Furthermore, our repositories all have dependabot configured on them.
-
-Dealing with notifications
---------------------------
-
-This section collects some of our team members’ ways of dealing with the
-notifications that originate from our bots.
-
-Sieve (RFC 5228) script
-~~~~~~~~~~~~~~~~~~~~~~~
-
-If your mail server supports the `Sieve mail filtering
-language <https://datatracker.ietf.org/doc/html/rfc5228.html>`__, which
-it should, you can adapt the following script to customize the amount of
-notifications you receive:
-
-.. code:: sieve
-
-   require ["envelope", "fileinto", "imap4flags"];
-
-   if allof (header :is "X-GitHub-Sender" ["coveralls", "github-actions[bot]", "netlify[bot]"],
-             address :is "from" "[email protected]") {
-       setflag "\\seen";
-       fileinto "Trash";
-       stop;
-   }
-
-If you also want to filter out notifications from renovate, which we use
-for dependency updates, you can add ``renovate[bot]`` to the
-``X-GitHub-Sender`` list above.
-
-.. [1]
- Craig Dazey Emulator Bot stands in no affiliation, direct or
- indirect, with Craig Dazey. Craig Dazey Emulator Bot. Craig Dazey
- Emulator Bot is not endorsed by Craig Dazey. Craig Dazey Emulator Bot
- is an independent project of Craig Dazey. No association is made
- between Craig Dazey Emulator Bot and Craig Dazey.
diff --git a/docs/tooling/index.rst b/docs/tooling/index.rst
deleted file mode 100644
index 2381849..0000000
--- a/docs/tooling/index.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Tooling
-=======
-
-Learn about the helperlings that keep Python Discord DevOps running like a
-well-oiled machine.
-
-
-.. toctree::
- :maxdepth: 2
- :caption: Contents:
-
- bots