113 files changed, 2926 insertions, 2312 deletions
diff --git a/.github/workflows/docs.yaml b/.github/workflows/docs.yaml index 8569ea1..d5d7cd6 100644 --- a/.github/workflows/docs.yaml +++ b/.github/workflows/docs.yaml @@ -32,11 +32,6 @@ jobs: build: runs-on: ubuntu-latest steps: - - name: Setup Hugo - uses: peaceiris/actions-hugo@v3 - with: - hugo-version: '0.124.0' - extended: true - name: Checkout uses: actions/checkout@v4 with: @@ -45,21 +40,13 @@ jobs: - name: Setup Pages id: pages uses: actions/configure-pages@v5 - - name: Build with Hugo - env: - # For maximum backward compatibility with Hugo modules - HUGO_ENVIRONMENT: production - HUGO_ENV: production - TZ: UTC - run: | - cd docs && hugo \ - --gc \ - --minify \ - --baseURL "${{ steps.pages.outputs.base_url }}/" + - uses: ammaraskar/sphinx-action@master + with: + docs-folder: "docs/" - name: Upload artifact uses: actions/upload-pages-artifact@v3 with: - path: ./docs/public + path: ./docs/_build/html # Deployment job deploy: diff --git a/.github/workflows/lint-docs.yaml b/.github/workflows/lint-docs.yaml index c616e9a..5b8e978 100644 --- a/.github/workflows/lint-docs.yaml +++ b/.github/workflows/lint-docs.yaml @@ -10,12 +10,6 @@ jobs: with: submodules: recursive fetch-depth: 0 - - name: Setup Hugo - uses: peaceiris/actions-hugo@v3 + - uses: ammaraskar/sphinx-action@master with: - hugo-version: '0.124.0' - extended: true - - name: Run hugo - run: | - cd docs - hugo --panicOnWarning --renderToMemory + docs-folder: "docs/" @@ -1,8 +1,7 @@ /ansible/.ansible/ vault_passwords -/docs/.hugo_build.lock -/docs/public/ -/docs/resources/ +/docs/_build/ +!/docs/_build/.gitkeep venv .venv .cache/ diff --git a/.gitmodules b/.gitmodules index 6ef09dd..d5e7cf2 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,5 +1,5 @@ [submodule "hugodocs/themes/hugo-book"] - path = docs/themes/hugo-book + path = olddocs/themes/hugo-book url = https://github.com/alex-shpak/hugo-book [submodule "ansible/roles/nftables"] path = ansible/roles/nftables diff --git a/docs/Makefile b/docs/Makefile new file mode 100644 index 0000000..d4bb2cb --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index eaef124..0000000 --- a/docs/README.md +++ /dev/null @@ -1,9 +0,0 @@ -# DevOps documentation - -This directory houses our DevOps documentation, written using -[Hugo](https://gohugo.io). - -To test changes to it locally, run `hugo serve` and follow the link displayed -on your Terminal. To generate all files, just run `hugo`. 
- -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/.hugo_build.lock b/docs/_static/.gitkeep index e69de29..e69de29 100644 --- a/docs/.hugo_build.lock +++ b/docs/_static/.gitkeep diff --git a/docs/_templates/.gitkeep b/docs/_templates/.gitkeep new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/docs/_templates/.gitkeep diff --git a/docs/archetypes/default.md b/docs/archetypes/default.md deleted file mode 100644 index d7f68d3..0000000 --- a/docs/archetypes/default.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: '{{ replace .Name "-" " " | title }}' -date: "{{ .Date }}" -draft: true ---- diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..ff45466 --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,26 @@ +# Configuration file for the Sphinx documentation builder. +# +# For the full list of built-in configuration values, see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Project information ----------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information + +project = "Python Discord DevOps" +copyright = "2024, Python Discord" +author = "Joe Banks <[email protected]>, King Arthur <[email protected]>" + +# -- General configuration --------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration + +extensions = [] + +templates_path = ["_templates"] +exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] + + +# -- Options for HTML output ------------------------------------------------- +# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output + +html_theme = "alabaster" +html_static_path = ["_static"] diff --git a/docs/config.toml b/docs/config.toml deleted file mode 100644 index 1c87b1c..0000000 --- a/docs/config.toml +++ /dev/null @@ -1,16 +0,0 @@ -baseURL = 'http://python-discord.github.io/infra/' -languageCode = 'en-us' -title = 'PyDis DevOps' -theme = "hugo-book" - -# From the theme: -# https://github.com/alex-shpak/hugo-book?tab=readme-ov-file#site-configuration -enableGitInfo = true -[params] - BookTheme = "dark" - BookRepo = "https://github.com/python-discord/infra" - BookServiceWorker = false # frontend nonsense - -[markup] - [markup.highlight] - lineNos = true diff --git a/docs/content/_index.md b/docs/content/_index.md deleted file mode 100644 index dbf457b..0000000 --- a/docs/content/_index.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Home ---- - -# Python Discord DevOps - -Welcome to the Python Discord DevOps knowledgebase. - -Within this set of pages you will find: -- Changelogs -- Post-mortems -- Common queries -- Runbooks diff --git a/docs/content/docs/general/_index.md b/docs/content/docs/general/_index.md deleted file mode 100644 index acaa19d..0000000 --- a/docs/content/docs/general/_index.md +++ /dev/null @@ -1,5 +0,0 @@ ---- -title: General -weight: -10 -bookSearchExclude: true ---- diff --git a/docs/content/docs/general/manual-deploys.md b/docs/content/docs/general/manual-deploys.md deleted file mode 100644 index 39e816e..0000000 --- a/docs/content/docs/general/manual-deploys.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -title: Manual Deploys -layout: default ---- - -# Manual Deployments - -When the DevOps team are not available, Administrators and Core Developers can redeploy our critical services, such as Bot, Site and ModMail. - -This is handled through workflow dispatches on this repository. 
To get started, head to the [Actions](https://github.com/python-discord/kubernetes/actions) tab of this repository and select `Manual Redeploy` in the sidebar, alternatively navigate [here](https://github.com/python-discord/kubernetes/actions/workflows/manual_redeploy.yml). - - - -Click `Run workflow` on the right hand side and enter the service name that needs redeploying, keep the branch as `main`: - - - -Click `Run` and refresh the page, you'll see a new in progress Action which you can track. Once the deployment completes notifications will be sent to the `#dev-ops` channel on Discord. - -If you encounter errors with this please copy the Action run link to Discord so the DevOps team can investigate when available. diff --git a/docs/content/docs/meeting_notes/2022-04-07.md b/docs/content/docs/meeting_notes/2022-04-07.md deleted file mode 100644 index fb54214..0000000 --- a/docs/content/docs/meeting_notes/2022-04-07.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -title: "2022-04-07: Devops Meeting" ---- - -# DevOps Meeting Notes - -## Agenda - -- No updates, as last week's meeting did not take place - -## Roadmap review & planning - -What are we working on for the next meeting? - -- Help wanted for #57 (h-asgi) -- #58 (postgres exporter) needs a new review -- #54 (firewall in VPN) will be done by Johannes -- We need a testing environment #67 -- Johannes will add a Graphite role #31 -- Sofi will take a look at #29 -- #41 (policy bot) will be taken care of by Johannes diff --git a/docs/content/docs/meeting_notes/2022-09-18.md b/docs/content/docs/meeting_notes/2022-09-18.md deleted file mode 100644 index 9fa4176..0000000 --- a/docs/content/docs/meeting_notes/2022-09-18.md +++ /dev/null @@ -1,59 +0,0 @@ ---- -title: "2022-09-18: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. - -## Agenda - -- Joe will grant Chris access to the netcup hosts. - -### NetKube status - -- **Rollout** - - [x] RBAC configuration and access granting - - [x] Most nodes are enrolled, Joe will re-check - - `turing`, `ritchie`, `lovelace` and `neumann` will be Kubernetes nodes - - `hopper` will be the storage server -- **Storage drivers** - - Not needed, everything that needs persistent storage will run on hopper - - Netcup does not support storage resize - - We can download more RAM if we need it - - A couple of services still need volume mounts: Ghost, Grafana & Graphite -- **Control plane high availability** - - Joe mentions that in the case the control plane dies, everything else will - die as well - - If the control plane in Germany dies, so will Johannes -- **Early plans for migration** - - We can use the Ansible repository issues for a good schedule - - Hopper runs `nginx` - - Statement from Joe: - > “There is an nginx ingress running on every node in the cluster, okay, - > okay? We don’t, the way that’s, that’s as a service is a NodePort, right? - > So it has a normal IP, but the port will be like a random port in the range - > of the 30,000s. Remember that? Hold on. Is he writing rude nodes? And then… - > We have nginx, so this is where it’s like a little bit, like, not nice, I - > guess we just like, cronjob it, to pull the nodes, like, every minute or - > so, and then update the config if they change. But then it’s just like… - > nginx is like a catalogue of nodes. Wahhh, you drive me crazy.” - - "Nah, it makes sense!" - - "It does!" - - Joe will figure this out with assistance from his voices. 
- -### Open authentication - -- Joe and Johannes will check out OpenLDAP as a JumpCloud alternative starting - from this evening -- Sofi has experience with OpenLDAP - - -## Sponsorship - -This meeting has been sponsored by Chris Hemsworth Lovering's relationship -therapy company, "Love To Love By Lovering". You can sign up by sending a mail -to [email protected]. - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2022-10-05.md b/docs/content/docs/meeting_notes/2022-10-05.md deleted file mode 100644 index ee1ee4f..0000000 --- a/docs/content/docs/meeting_notes/2022-10-05.md +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: "2022-10-05: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. - -## Agenda - -- Joe Banks configured proper RBAC for Chris, Johannes and Joe himself - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2022-10-19.md b/docs/content/docs/meeting_notes/2022-10-19.md deleted file mode 100644 index 37102bb..0000000 --- a/docs/content/docs/meeting_notes/2022-10-19.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: "2022-10-19: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. - -## Agenda - -- One hour of gartic phone, for team spirit. -- Created user accounts for Sofi and Hassan -- Joe created an architecture diagram of the NGINX setup - - _This is still in Notion_ -- Joe explained his NGINX plans: - > “It’s not actually that hard, right? So you spawn 5 instances of nginx in a - > DaemonSet, because then one gets deployed to every node okay, following? - > Then we get NodePort, instead of LoadBalancers or whatever, which will get - > a random port allocatead in the 35000 range, and that will go to nginx, and - > on each of those ports, it will go to nginx, right? And then we poll the - > Kubernetes API and what is the port that each of these nginx instances is - > running on, and add that into a roundrobin on the fifth node. Right? Yeah. - > That’s correct. That won’t do TLS though, so that will just HAProxy. Yeah.” -- Joe will terminate our JumpCloud account -- Chris reset the Minecraft server -- Email alerting needs to be configured - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2022-10-26.md b/docs/content/docs/meeting_notes/2022-10-26.md deleted file mode 100644 index 636f3da..0000000 --- a/docs/content/docs/meeting_notes/2022-10-26.md +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: "2022-10-26: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. - -## Agenda - -- Chris upgraded PostgreSQL to 15 in production -- Johannes added the Kubernetes user creation script into the Kubernetes - repository in the docs - -*(The rest of the meeting was discussion about the NetKube setup, which has -been scrapped since)*. - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2022-11-02.md b/docs/content/docs/meeting_notes/2022-11-02.md deleted file mode 100644 index a36e2f9..0000000 --- a/docs/content/docs/meeting_notes/2022-11-02.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "2022-11-02: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. - -## Agenda - -### Hanging behaviour of ModMail - -- [Source](https://discord.com/channels/267624335836053506/675756741417369640/1036720683067134052) -- Maybe use [Signals + a debugger](https://stackoverflow.com/a/25329467)? -- ... 
using [something like pdb for the - debugger](https://wiki.python.org/moin/PythonDebuggingTools)? -- Or [GDB, as it seems handy to poke at stuck multi-threaded python - software](https://wiki.python.org/moin/DebuggingWithGdb)? - -- ModMail has been upgraded to version 4 - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2022-11-23.md b/docs/content/docs/meeting_notes/2022-11-23.md deleted file mode 100644 index 459c595..0000000 --- a/docs/content/docs/meeting_notes/2022-11-23.md +++ /dev/null @@ -1,30 +0,0 @@ ---- -title: "2022-11-23: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. - -## Agenda - -*(This meeting was mostly about NetKube, with the following strange text -included, and everything outside of the text has been removed since the NetKube -plans have been scrapped)*. - -Joe Banks, after a month-long hiatus to become a dad to every second girl on -uni campus, has managed to pull up to the DevOps meeting. - -We are considering using Kubespray (https://kubespray.io/#/) in order to deploy -a production-ready bare-metal Kubernetes cluster without involvement from Joe -“Busy With Poly Girlfriend #20” Banks. - -At the moment cluster networking is not working and Joe mentions that the last -time he has touched it, it worked perfectly fine. However, the last time he -touched it there was only 1 node, and therefore no inter-node communications. - -Joe thinks he remembers installing 3 nodes, however, we at the DevOps team -believe this to be a marijuana dream - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-02-08.md b/docs/content/docs/meeting_notes/2023-02-08.md deleted file mode 100644 index 6ce23f6..0000000 --- a/docs/content/docs/meeting_notes/2023-02-08.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -title: "2023-02-08: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. - -## Agenda - -- Investigation into deploying a VPN tool such as WireGuard to have inter-node - communication between the Netcup hosts. - -*(The rest of this meeting was mostly about NetKube, which has since been -scrapped)*. - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-02-21.md b/docs/content/docs/meeting_notes/2023-02-21.md deleted file mode 100644 index 6e4079e..0000000 --- a/docs/content/docs/meeting_notes/2023-02-21.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: "2023-02-21: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. - -## Agenda - -### Reusable status embed workflows - -- Further discussion with Bella followed -- Upstream pull request can be found at - [python-discord/bot#2400](https://github.com/python-discord/bot/pull/2400) - -### Local vagrant testing setup - -- Our new [testing setup using Vagrant - VMs](https://github.com/python-discord/infra/pull/78) has been merged. - -### A visit from Mina - -Mina checked in to make sure we're operating at peak Volkswagen-like -efficiency. - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-02-28.md b/docs/content/docs/meeting_notes/2023-02-28.md deleted file mode 100644 index 9d6f9a8..0000000 --- a/docs/content/docs/meeting_notes/2023-02-28.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -title: "2023-02-28: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. 
- -## Agenda - -- Black knight's CI & dependabot configuration has been mirrored across all - important repositories - -- The test server has been updated for the new configuration - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-05-16.md b/docs/content/docs/meeting_notes/2023-05-16.md deleted file mode 100644 index 7f633a5..0000000 --- a/docs/content/docs/meeting_notes/2023-05-16.md +++ /dev/null @@ -1,15 +0,0 @@ ---- -title: "2023-05-16: Devops Meeting" ---- - -# DevOps Meeting Notes - -*Migrated from Notion*. - -## Agenda - -- Bella set up [CI bot docker image - build](https://github.com/python-discord/bot/pull/2603) to make sure that - wheels are available. - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-07-11.md b/docs/content/docs/meeting_notes/2023-07-11.md deleted file mode 100644 index 9467033..0000000 --- a/docs/content/docs/meeting_notes/2023-07-11.md +++ /dev/null @@ -1,38 +0,0 @@ ---- -title: "2023-07-11: Devops Meeting" ---- - -# DevOps Meeting Notes - -## Participants - -- Chris, Johannes, Bella, Bradley - -## Agenda - -### New Ansible setup - -Chris presented the new Ansible setup he's been working on. We plan to use -WireGuard for networking. We agreed that selfhosting Kubernetes is not the way -to go. In general, the main benefit from switching away to Linode to Netcup is -going to be a ton more resources from the Netcup root servers we were given. The -original issue with Linode's AKS of constantly having problems with volumes has -not been present for a while. Chris mentions the one remaining issue is that -we're at half our memory capacity just at idle. - -It's our decision where to go from here - we can stick to the Kubernetes setup -or decide on migrating to the Ansible setup. But we have bare metal access to -the Netcup hosts, which makes e.g. managing databases a lot easier. Chris -mentions the possibility to only use Netcup for our persistence and Linode AKS -for anything else, but this has the issue of us relying on two sponsors for our -infrastructure instead of one. - -PostgreSQL was set up to run on ``lovelace``. - -### Decision - -**It was decided to hold a vote on the core development channel, which will be -evaluated next week to see how to proceed with the setup**. - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-07-18.md b/docs/content/docs/meeting_notes/2023-07-18.md deleted file mode 100644 index b529bf8..0000000 --- a/docs/content/docs/meeting_notes/2023-07-18.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: "2023-07-18: Devops Meeting" ---- - -# DevOps Meeting Notes - -## Secret management improvements - -To allow for **better management of our Kubernetes secrets**, Chris set out to -configure `git-crypt` in GPG key mode. For comparison, the previous approach was -that secrets were stored in Kubernetes only and had to be accessed via -`kubectl`, and now `git-crypt` allows us to transparently work with the files in -unencrypted manner locally, whilst having them secure on the remote, all via -`.gitattributes`. - -The following people currently have access to this: - -- Johannes Christ <[email protected]> (`8C05D0E98B7914EDEBDCC8CC8E8E09282F2E17AF`) -- Chris Lovering <[email protected]> (`1DA91E6CE87E3C1FCE32BC0CB6ED85CC5872D5E4`) -- Joe Banks <[email protected]> (`509CDFFC2D0783A33CF87D2B703EE21DE4D4D9C9`) - -For Hassan, we are still waiting on response regarding his GPG key accuracy. 
- -The pull request for the work can be found [at -python-discord/kubernetes#156](https://github.com/python-discord/kubernetes/pull/156). - -**To have your key added, please contact any of the existing key holders**. More -documentation on this topic is pending to be written, see -[python-discord/kubernetes#157](https://github.com/python-discord/kubernetes/issues/157). - - -## Infrastructure migration decision - -The voting started [last week](./2023-07-11.md) will be properly talked about -[next week](./2023-07-25.md), so far it looks like we're definitely not -selfhosting Kubernetes at the very least. - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-07-25.md b/docs/content/docs/meeting_notes/2023-07-25.md deleted file mode 100644 index c8214b0..0000000 --- a/docs/content/docs/meeting_notes/2023-07-25.md +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: "2023-07-25: Devops Meeting" ---- - -# DevOps Meeting Notes - -Postponed to next week due to absence. diff --git a/docs/content/docs/meeting_notes/2023-08-01.md b/docs/content/docs/meeting_notes/2023-08-01.md deleted file mode 100644 index 1e6c71d..0000000 --- a/docs/content/docs/meeting_notes/2023-08-01.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -title: "2023-08-01: Devops Meeting" ---- - -# DevOps Meeting Notes - -## Agenda - -### Infrastructure migration - -The vote is tied. Chris and Johannes decided that we should test out migrating -the PostgreSQL database at the very least. We then have more freedom about our -data. What we need to do: - -- Allow PostgreSQL connections from LKE's static IPs in the firewall -- Whitelist the static IPs from Linode via `pg_hba.conf` -- Schedule downtime for the PostgreSQL database -- **At downtime** - - Take writers offline - - Dump database from Linode into Netcup - - Update all the client's database URLs to point to netcup - - Restart writers - -We want to rely on the restore to create everything properly, but will need to -test run this beforehand. The following `pg_virtualenv` command has showcased -that it works properly: - -```sh -kubectl exec -it postgres-... -- pg_dumpall -U pythondiscord \ -| pg_virtualenv psql -v ON_ERROR_STOP=1 -``` - -Note however that the database extension `pg_repack` needs to be installed. - -Before we can get started, we need to allow the PostgreSQL role to configure -`pg_hba.conf` and `postgresql.conf` entries. - - -### Meeting notes - -We're using GitHub at the moment. Some are left in Notion. We should migrate -these to GitHub to have a uniform interface: Johannes will pick up -[python-discord/infra#108](https://github.com/python-discord/infra/issues/108) -to merge them together into Git, as its more open than Notion. - - -### Ansible lint failures in the infra repository - -Excluding the vault was found as the working solution here, as implemented by -Chris. - -### Kubernetes repository pull requests - -These were cleaned up thanks to Chris. - - -## Roadmap review & planning - -- Chris will prepare the PostgreSQL configuration mentioned above. - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-08-08.md b/docs/content/docs/meeting_notes/2023-08-08.md deleted file mode 100644 index 9a7f0ec..0000000 --- a/docs/content/docs/meeting_notes/2023-08-08.md +++ /dev/null @@ -1,54 +0,0 @@ ---- -title: "2023-08-08: Devops Meeting" ---- - -# DevOps Meeting Notes - -## Agenda - -- Configuration of PostgreSQL and the PostgreSQL exporter - - - **No time so far**. 
Chris has been busy with renovating his living room, - and Johannes has been busy with renovating his bedroom. Bradley prefers to - remain quiet. - - - Chris will try to work on this in the coming week and will try to have Bella - around as well, since he wanted to join the setup. - -- **Potential slot for GPG key signing of DevOps members**. External - verification will be necessary. - - - Skipped. No webcam on Chris. - -- We need to assign a **librarian** to keep our documents organized according to - a system. Johannes is happy to do this for now. - - - Let's move the existing documentation from the Kubernetes repository into - the infra repository. See - [kubernetes#161](https://github.com/python-discord/kubernetes/issues/161). - - - **Our Notion DevOps space is full of junk**. Outside of that, it's not open - to read for outside contributors, and does not leave much choice over which - client to use for editing content. - - - Chris agrees, without looking at it - just from memory. We should move it - to the infra repository. (The meeting notes have already been - transferred). - - - Bella suggests adding some automation to make keeping everything in clean - order less tedious. - -- We may want to integrate the **Kubernetes repository** and the infra - repository together; however, there are a lot of repositories - referencing the deployment manifests that would need to be updated. - - - Chris mentions that regardless of what we do, we should - at the very least - move all documentation into the `infra` repository, including the static - site generator. At the moment we're using Jekyll but we're open to trying - alternatives such as Hugo. - -- We closed some issues and pull requests in the repositories for late spring - cleaning. - - -<!-- vim: set textwidth=80 sw=2 ts=2 autoindent conceallevel=2: --> diff --git a/docs/content/docs/meeting_notes/2023-08-22.md b/docs/content/docs/meeting_notes/2023-08-22.md deleted file mode 100644 index cf23dc3..0000000 --- a/docs/content/docs/meeting_notes/2023-08-22.md +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: "2023-08-22: Devops meeting notes" ---- - -# DevOps Meeting Notes - - -<!-- - -Useful links - -- Infra open issues: https://github.com/python-discord/infra/issues - -- infra open pull requests: https://github.com/python-discord/infra/pulls - -- *If* any open issue or pull request needs discussion, why was the existing - asynchronous logged communication over GitHub insufficient? - ---> - - -## Agenda - -- Bella said he is on the streets. **We should start a gofundme**. - - - After some more conversation, this just means he is on vacation and currently - taking a walk. - -- Chris has been busy with turning his living room into a Picasso art - collection, Johannes has been busy with renovating his bedroom, and Bella is - not home. - - - Our next priority is winning. - -- We checked out some issues with documentation generation in `bot-core` that - Bella has mentioned. We managed to fix one issue with pydantic by adding it to - an exclude list but ran into another problem next.
- - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-08-29.md b/docs/content/docs/meeting_notes/2023-08-29.md deleted file mode 100644 index 78b4c2a..0000000 --- a/docs/content/docs/meeting_notes/2023-08-29.md +++ /dev/null @@ -1,65 +0,0 @@ ---- -title: "2023-08-29: Devops meeting notes" ---- - -# DevOps Meeting Notes - - -<!-- - -Useful links - -- Infra open issues: https://github.com/python-discord/infra/issues - -- infra open pull requests: https://github.com/python-discord/infra/pulls - -- *If* any open issue or pull request needs discussion, why was the existing - asynchronous logged communication over GitHub insufficient? - ---> - - -## Agenda - -- **Bella is still on the streets** - - - The Python Discord Bella On The Streets Fundraising Campaign Q3 2023 has not - been successful so far. To help Bella receive French citizenship, Joe has - put up a French flag behind himself in the meeting. - - - Joe corrects my sarcasm. It is an Italian flag, not a French flag. The - reason for this flag is that his new prime interest on campus was born in - Italy. - -- **The SnekBox CI build is pretty slow** - - - Guix and Nix are not alternatives. Neither is Ubuntu - - - We use pyenv to build multiple Python versions for a new feature - - - The feature is not rolled out yet - - - Part of the problem is that we build twice in the `build` and the `deploy` - stage - - - On rollout, Joe tested it and it works fine - -- No update on the Hugo build yet - -- For snowflake, Johannes will write a proposal to the admins for hosting it - - - We should consider talking about the following points: - - - statistically ~8% of Tor traffic is problematic (10% of traffic is to - hidden services, 80% of hidden service traffic is for illegal services) - - - overall the project's position and our ideal is to help people for a good - cause - - - all traffic is forwarded to the Tor network, the service is lightweight - and only proxies encrypted traffic there - - - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-09-05.md b/docs/content/docs/meeting_notes/2023-09-05.md deleted file mode 100644 index 8fde824..0000000 --- a/docs/content/docs/meeting_notes/2023-09-05.md +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: "2023-09-05: Devops meeting notes" ---- - -# DevOps Meeting Notes - - -<!-- - -Useful links - -- Infra open issues: https://github.com/python-discord/infra/issues - -- infra open pull requests: https://github.com/python-discord/infra/pulls - -- *If* any open issue or pull request needs discussion, why was the existing - asynchronous logged communication over GitHub insufficient? - ---> - - -## Agenda - -- No update on the Hugo build yet - -- Johannes wrote a proposal for snowflake proxy to be deployed to our netcup hosts - - Admins discussed and came to the conclusion that since we don't own the servers, - we got the servers from netcup as a sponsorship to host our infra, so using them - to host something that isn't our infra doesn't seem right. 
- -- Lots of dependabot PRs closed - - https://github.com/search?q=org%3Apython-discord++is%3Apr+is%3Aopen+label%3A%22area%3A+dependencies%22&type=pullrequests&ref=advsearch - - Closed ~50% of PRs - -- Workers repo has had its CI rewritten, all workers have consistent package.json, - scripts, and using the new style of cloudflare workers which don't use webpack - -- Metricity updated to SQLAlchemy 2 - -- Olli CI PR is up - - https://github.com/python-discord/olli/pull/25 - -- Sir-Robin pydantic constants PR is up - - https://github.com/python-discord/sir-robin/pull/93 - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2023-09-12.md b/docs/content/docs/meeting_notes/2023-09-12.md deleted file mode 100644 index 8f57ea9..0000000 --- a/docs/content/docs/meeting_notes/2023-09-12.md +++ /dev/null @@ -1,71 +0,0 @@ ---- -title: "2023-09-12: Devops meeting template" ---- - -# DevOps Meeting Notes - - -<!-- - -Useful links - -- Infra open issues: https://github.com/python-discord/infra/issues - -- infra open pull requests: https://github.com/python-discord/infra/pulls - -- *If* any open issue or pull request needs discussion, why was the existing - asynchronous logged communication over GitHub insufficient? - ---> - - -## Agenda - -- We have reason to believe that Bella is still on the streets. Worse, Bella is - not available at the moment, leading us to believe that Bella has still not - found a home. - - - Eight minutes into the meeting, Bella joins, complaining about the bad - internet. He mentions he is still on the streets (this may contribute to the - bad internet factor). - -- Chris made Mina leave with his repeated comments about Bella being homeless, - reminding Mina of the growing unemployment rate within the DevOps team. As - head of HR she cannot further support this matter. - -- About #139, Bella mentions that online websites may cover the same need that - we have, but it may not be really useful for having it as a command. - - - Chris adds that "if someone wants to do it, I don't mind" and "I don't think - it would be very useful for a command, but I think it would be fun to learn - for someone implementing it". As long as whoever is implementing is is aware - that it would not be used too much, it would be fine. - -- No progress on the hugo front - -- Our email service with workers will be forward only - - - With postfix you will be able to reply. Joe wants to have an excuse to play - with Cloudflare workers though. - -- [50 open pull requests from - dependabot](https://github.com/search?q=org%3Apython-discord++is%3Apr+is%3Aopen+author%3Aapp%2Fdependabot&type=pullrequests&ref=advsearch) - - - Tip from The Man: press ^D to make a bookmark in your browser - - - > "Those can just be blindly merged" - Chris - -- Grouping of dependencies: Dependabot now allows you to group together multiple - dependency updates into a single pull request. - - - Possible approaches suggested: Group all the docker updates together, group - any linting dependencies together (would just require a big RegEx). - Dependabot natively works with its own dependency groups here (e.g. Docker, - Pip). - -- Mr. Hemlock wants to raise his roof: It's his project for this Autumn. We, the - team, are looking forward to his project - especially Bella, who is currently - looking for housing. "It's all coming together", said Chris to the situation. 
- - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/2024-07-02.md b/docs/content/docs/meeting_notes/2024-07-02.md deleted file mode 100644 index d84e2c2..0000000 --- a/docs/content/docs/meeting_notes/2024-07-02.md +++ /dev/null @@ -1,159 +0,0 @@ ---- -title: "2024-07-02: Devops meeting" ---- - -# DevOps Meeting Notes - - -<!-- - -Useful links - -- Infra open issues: https://github.com/python-discord/infra/issues - -- infra open pull requests: https://github.com/python-discord/infra/pulls - -- *If* any open issue or pull request needs discussion, why was the existing - asynchronous logged communication over GitHub insufficient? - ---> - -## Attendees - -Joe and Johannes. - -Chris unfortunately died in a fatal train accident and could not attend the -meeting. This incident will be rectified in the next release, "Lovering 2.0: -Immortability". - -Bella is out on the streets again. We are waiting for approval from the Python -Discord admins to run another fundraiser. - -## Agenda - -- **Configuration of renovate** (Joe) - - We are replacing dependabot with renovatebot. Johannes welcomes this decision. - Joe says we are looking for automatic deployment from Kubernetes to make sure - that any updates are automatically deployed. - - **Conclusion**: Implemented. - -- **Resizing Netcup servers** (Joe, Johannes) - - We can probably get rid of turing, assess what else we want to deploy on - lovelace, and then ask for a resize. - - **Conclusion**: Create issue to move things off turing, remove it from the - inventory, remove it from documentation, power it off, then have Joe ask for - server removal. - -- **Updating the public statistics page** (Johannes) - - Discussing and showcasing possible alternatives to the current infrastructure - powering https://stats.pythondiscord.com via the - https://github.com/python-discord/public-stats repository. Johannes presents - his current scripts that cuddle RRDTool into loading data out of metricity, - Joe says we will discuss with Chris what to do here. - - The likely way going forward will be that *we will open an issue to set it - up*, the setup will contain an Ansible role to deploy the cronjob and the - script onto lovelace alongside with the `rrdtool` PostgreSQL user. - - **Conclusion**: Johannes will create an issue and codify the setup in Ansible. - -- **New blog powered by Hugo** (Johannes) - - Our current Ghost-powered blog is a tiny bit strange, and the onboarding ramp - to contribute articles is large. We want to migrate this to Hugo - Johannes is - leading the effort on it. The main work will be building an appropriate theme, - as no nicely suitable replacement theme has been found so far. Front-end - contributors would be nice for this, although currently everything is still - local on my machine. - - Joe mentions that we don't need to take anything particularly similar to the - current Ghost theme, just some vague resemblance would be nice. Most of the - recommended Hugo themes would probably work. Johannes will check it out - further. - - **Conclusion**: Try the [hugo-casper-two - theme](https://github.com/eueung/hugo-casper-two) and report back. - -- **Finger server** (Joe, Johannes) - - Joe recently proposed [the deployment of a finger - server](https://github.com/python-discord/infra/pull/373). Do we want this and - if yes, how are we going to proceed with this? If we do not want any, running - the `pinky` command locally or via `ssh` would be a sound idea. 
We also need - to consider whether members will update their files regularly - we may want to - incorporate functionality for this into e.g. King Arthur. - - Joe says that we shouldn't put a lot of development effort into it, it would - be simply a novelty thing. - - **Conclusion**: This is a nice cheap win for some fun which should just be a - simple Python file (via Twisted's Finger protocol support or whatever) that - connects to LDAP (see Keycloak authentication server) and outputs information. - We could possibly integrate this into King Arthur as well, so the querying - workflow could look like KA -> fingerd -> LDAP, or people could use finger - commands directly. - -- **Keycloak authentication server** (Joe) - - Joe mentions that we are deploying a Keycloak server because for some members - authenticating via GitHub is cumbersome, for instance because their GitHub - account is connected to their employer's GitHub Enterprise installation. We - could hook up a finger server to the LDAP endpoint. Joe also mentions that we - might want to set up e-mail forwarding from pydis addresses to users via the - user database that will be stored in Keycloak. - - Currently we only have a Keycloak installation that stores items in - PostgreSQL. This installation can federate to LDAP - we would simply have to - settle on some directory service backend. Joe suggests FreeIPA because he's - familar with it (including the Keycloak integration). The problem is that it - doesn't work on Debian. The alternative proposal, given that we're saving - ~50$/month on Linode, would be spinning up a Rocky VM with FreeIPA on it on - Linode (we already have the budget) or ask Netcup for another VM. Ultimately, - the system to run FreeIPA would be something CentOS-based. One aspect to - consider is networking security: in Linode we could use their private cloud - endpoint feature to securely expose the LDAP server to Keycloak and other - services in Kubernetes, if we were to run it in Netcup, we would need to use a - similar setup to what we currently have with PostgreSQL. - - Any Python Discord user would be managed in LDAP, and Keycloak has the - necessary roles to write back into LDAP. Keeping the users in FreeIPA - up-to-date would be a somewhat manual procedure. Joe's plan was to pick up the - user's Discord username and use `[email protected]` as their name and do - account setup as part of the staff onboarding. - - **Conclusion**: Will wait for Chris to discuss this further, but we simply - need to decide where we want to run the LDAP service. - -- **Flux CD** (Joe) - - Joe proposes deploying [flux](https://fluxcd.io/) as a way to improve the way - we manage our CI/CD. We want the cluster to be able to synchronize its state - with the git repository. There are some manifests in the repository currently - that are not in sync with the cluster version. - - **Conclusion**: Approved, Joe will create an issue and do it. - -- **Polonium** (Chris) - - Question came up regarding why the bot does not write to the database - directly. Joe said it's not perfect to have the bot write to it directly - in - metricity it works but it's not perfect. Chris probably had good reason: - separation of intent. - - **Conclusion**: Approved, write to R&D for financing. 
- -- **Rethinking Bella: Suggested measures to gain autonomy** (Chris) - - Chris will present our current plans to biologically re-think and improve - Bella's current architecture by means of hypertrophy-supported capillary - enlargements, with the final goal of gaining complete control and ownership - over the World Economic Forum by 2026. As Bella is currently on parental - leave, we will send him the result of this voting via NNCP. - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/meeting_notes/_index.md b/docs/content/docs/meeting_notes/_index.md deleted file mode 100644 index 4f4dab7..0000000 --- a/docs/content/docs/meeting_notes/_index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Meeting notes -bookCollapseSection: true ---- - -# Meeting notes - -Minutes for previous Devops meetings. diff --git a/docs/content/docs/meeting_notes/_template.md b/docs/content/docs/meeting_notes/_template.md deleted file mode 100644 index bfad597..0000000 --- a/docs/content/docs/meeting_notes/_template.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: "YYYY-MM-DD: Devops meeting template" ---- - -# DevOps Meeting Notes - - -<!-- - -Useful links - -- DevOps Kanban Board: https://github.com/orgs/python-discord/projects/17/views/4 - -- Infra open issues: https://github.com/python-discord/infra/issues - -- infra open pull requests: https://github.com/python-discord/infra/pulls - -- *If* any open issue or pull request needs discussion, why was the existing - asynchronous logged communication over GitHub insufficient? - ---> - - -## Agenda - - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/onboarding/_index.md b/docs/content/docs/onboarding/_index.md deleted file mode 100644 index d12acdd..0000000 --- a/docs/content/docs/onboarding/_index.md +++ /dev/null @@ -1,11 +0,0 @@ ---- -title: Onboarding ---- - -# Onboarding - -This section documents who manages which access to our DevOps resources, and -how access is managed. - - -<!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/content/docs/onboarding/access.md b/docs/content/docs/onboarding/access.md deleted file mode 100644 index 2063267..0000000 --- a/docs/content/docs/onboarding/access.md +++ /dev/null @@ -1,21 +0,0 @@ ---- -title: Access table -date: 2022-09-18 -description: | - Who has access to what. ---- - -# Access table - - -| **Resource** | **Description** | **Keyholders** | -|:------------:|:---------------:|:--------------:| -| Linode Kubernetes Cluster | The primary cluster where all resources are deployed. | Hassan, Joe, Chris, Leon, Sebastiaan, Johannes | -| Linode Dashboard | The online dashboard for managing and allocating resources from Linode. | Joe, Chris | -| Netcup Dashboard | The dashboard for managing and allocating resources from Netcup. | Joe, Chris | -| Netcup servers | Root servers provided by the Netcup partnership. | Joe, Chris, Bella, Johannes | -| Grafana | The primary aggregation dashboard for most resources. | Admins, Moderators, Core Developers and DevOps (with varying permissions) | -| Prometheus Dashboard | The Prometheus query dashboard. Access is controlled via Cloudflare Access. | Hassan, Joe, Johannes, Chris | -| Alertmanager Dashboard | The alertmanager control dashboard. Access is controlled via Cloudflare Access. | Hassan, Joe, Johannes, Chris | -| `git-crypt`ed files in infra repository | `git-crypt` is used to encrypt certain files within the repository. At the time of writing this is limited to kubernetes secret files. 
| Chris, Joe, Hassan, Johannes, Xithrius | -| Ansible Vault | Used to store sensitive data for the Ansible deployment | Chris, Joe, Johannes, Bella | diff --git a/docs/content/docs/onboarding/resources.md b/docs/content/docs/onboarding/resources.md deleted file mode 100644 index 91dd76e..0000000 --- a/docs/content/docs/onboarding/resources.md +++ /dev/null @@ -1,32 +0,0 @@ ---- -title: Resources -date: 2022-09-18 -description: | - Important reference documents for the team. ---- - -# Resources - -The following is a collection of important reference documents for the DevOps -team. - -## [Infra Repo](https://github.com/python-discord/infra) - -This GitHub repo contains most of the manifests and configuration applied to -our cluster. It’s kept up to date manually and is considered a source of truth -for what we should have in the cluster. - -It is mostly documented, but improvements to unclear or outdated aspects are -always welcome. - -## [Knowledge base](https://python-discord.github.io/infra/) - -Deployed using GH Pages; the source can be found in the docs directory of the k8s -repo. - -This includes: - -- Changelogs -- Post-mortems -- Common queries -- Runbooks diff --git a/docs/content/docs/onboarding/rules.md b/docs/content/docs/onboarding/rules.md deleted file mode 100644 index 0389818..0000000 --- a/docs/content/docs/onboarding/rules.md +++ /dev/null @@ -1,18 +0,0 @@ ---- -title: Rules -date: 2022-09-18 -description: | - The rules any DevOps team member must follow. ---- - -# Rules - -The rules any DevOps team member must follow. - -1. LMAO - **L**ogging, **M**onitoring, **A**lerting, **O**bservability -2. Modmail is the greatest piece of software ever written -3. Modmail needs at least 5 minutes to gather all its greatness at startup -4. We never blame Chris, it's always <@233481908342882304>'s fault -5. LKE isn’t bad, it’s your fault for not paying for the high availability control plane -6. Our software is never legacy, it's merely well-aged -7. Ignore these rules (however maybe not 1, 1 seems important to remember) diff --git a/docs/content/docs/onboarding/tools.md b/docs/content/docs/onboarding/tools.md deleted file mode 100644 index d771cb4..0000000 --- a/docs/content/docs/onboarding/tools.md +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: Tools -date: 2022-09-18 -description: | - The tools that DevOps uses to run their shop. ---- - -# Tools - -We use a few tools to manage, monitor, and interact with our infrastructure. -Some of these tools are not unique to the DevOps team, and may be shared by -other teams. - -Most of these are gated behind a Cloudflare Access system, which is accessible -to the [DevOps Team](https://github.com/orgs/python-discord/teams/devops) on -GitHub. These are marked with the ☁️ emoji. If you don’t have access, please -contact Chris or Joe. - -## [Grafana](https://grafana.pydis.wtf/) - -Grafana provides access to some of the most important resources at your -disposal. It acts as an aggregator and frontend for a large amount of data. -These range from metrics to logs to stats. Some of the most important are -listed below: - -- Service Logs/All App Logs Dashboard - - Service Logs is a simple log viewer which gives you access to a large - majority of the applications deployed in the default namespace. The All App - logs dashboard is an expanded version of that, which gives you access to all - apps in all namespaces, and allows some more in-depth querying.
- -- Kubernetes Dashboard - - This dashboard gives quick overviews of all the most important metrics of - the Kubernetes system. For more detailed information, check out other - dashboards such as Resource Usage, NGINX, and Redis. - - -Accessed via a GitHub login, with permission for anyone in the dev-core or -dev-ops team. - -## [Prometheus Dashboard](https://prometheus.pydis.wtf/) (☁️) - -This provides access to the Prometheus query console. You may also enjoy the -[Alertmanager Console](https://alertmanager.pydis.wtf/). - -## [King Arthur](https://github.com/python-discord/king-arthur/) - -King Arthur is a Discord bot which provides information about, and access to, -our cluster directly in Discord. Invoke its help command for more information -(`M-x help`). diff --git a/docs/content/docs/postmortems/2020-12-11-all-services-outage.md b/docs/content/docs/postmortems/2020-12-11-all-services-outage.md deleted file mode 100644 index d581f0c..0000000 --- a/docs/content/docs/postmortems/2020-12-11-all-services-outage.md +++ /dev/null @@ -1,83 +0,0 @@ ---- -title: "2020-12-11: All services outage" ---- - -# 2020-12-11: All services outage - -At **19:55 UTC, all services became unresponsive**. The DevOps team were already in a call, and immediately started to investigate. - -Postgres was running at 100% CPU usage due to a **VACUUM**, which caused all services that depended on it to stop working. The high CPU left the host unresponsive and it shut down. Linode Lassie noticed this and triggered a restart. - -It did not recover gracefully from this restart, with numerous core services reporting an error, so we had to manually restart core system services using Lens in order to get things working again. - -## ⚠️ Leadup - -*List the sequence of events that led to the incident* - -Postgres triggered an **AUTOVACUUM**, which led to a CPU spike. This made Postgres run at 100% CPU and become unresponsive, which caused services to stop responding. This led to a restart of the node, from which we did not recover gracefully. - -## 🥏 Impact - -*Describe how internal and external users were impacted during the incident* - -All services went down. Catastrophic failure. We did not pass go, we did not collect $200. - -- Help channel system unavailable, so people are not able to effectively ask for help. -- Gates unavailable, so people can't successfully get into the community. -- Moderation and raid prevention unavailable, which leaves us defenseless against attacks. - -## 👁️ Detection - -*Report when the team detected the incident, and how we could improve detection time* - -We noticed that all PyDis services had stopped responding; coincidentally, our DevOps team were in a call at the time, so that was helpful. - -We may be able to improve detection time by adding monitoring of resource usage. To this end, we've added alerts for high CPU usage and low memory. - -## 🙋🏿♂️ Response - -*Who responded to the incident, and what obstacles did they encounter?* - -Joe Banks responded to the incident. - -We noticed our node was entirely unresponsive and within minutes a restart had been triggered by Lassie after a high CPU shutdown occurred. - -The node came back and we saw a number of core services offline (e.g. Calico, CoreDNS, Linode CSI). - -**Obstacle: no recent database back-up available** - -## 🙆🏽♀️ Recovery - -*How was the incident resolved?
How can we improve future mitigation times?* - -Through [Lens](https://k8slens.dev/) we restarted core services one by one until they stabilised, after these core services were up other services began to come back online. - -We finally provisioned PostgreSQL which had been removed as a component before the restart (but too late to prevent the CPU errors). Once PostgreSQL was up we restarted any components that were acting buggy (e.g. site and bot). - -## 🔎 Five Why's - -*Run a 5-whys analysis to understand the true cause of the incident.* - -- Major service outage -- **Why?** Core service failures (e.g. Calico, CoreDNS, Linode CSI) -- **Why?** Kubernetes worker node restart -- **Why?** High CPU shutdown -- **Why?** Intensive PostgreSQL AUTOVACUUM caused a CPU spike - -## 🌱 Blameless root cause - -*Note the final root cause and describe what needs to change to prevent reoccurrance* - -## 🤔 Lessons learned - -*What did we learn from this incident?* - -- We must ensure we have working database backups. We are lucky that we did not lose any data this time. If this problem had caused volume corruption, we would be screwed. -- Sentry is broken for the bot. It was missing a DSN secret, which we have now restored. -- The [https://sentry.pydis.com](https://sentry.pydis.com) redirect was never migrated to the cluster. **We should do that.** - -## ☑️ Follow-up tasks - -*List any tasks we've created as a result of this incident* - -- [x] Push forward with backup plans diff --git a/docs/content/docs/postmortems/2020-12-11-postgres-conn-surge.md b/docs/content/docs/postmortems/2020-12-11-postgres-conn-surge.md deleted file mode 100644 index 505ed84..0000000 --- a/docs/content/docs/postmortems/2020-12-11-postgres-conn-surge.md +++ /dev/null @@ -1,93 +0,0 @@ ---- -title: "2020-12-11: Postgres connection surge" ---- - -# 2020-12-11: Postgres connection surge - -At **13:24 UTC,** we noticed the bot was not able to infract, and [pythondiscord.com](http://pythondiscord.com) was unavailable. The DevOps team started to investigate. - -We discovered that Postgres was not accepting new connections because it had hit 100 clients. This made it unavailable to all services that depended on it. - -Ultimately this was resolved by taking down Postgres, remounting the associated volume, and bringing it back up again. - -## ⚠️ Leadup - -*List the sequence of events that led to the incident* - -The bot infractions stopped working, and we started investigating. - -## 🥏 Impact - -*Describe how internal and external users were impacted during the incident* - -Services were unavailable both for internal and external users. - -- The Help Channel System was unavailable. -- Voice Gate and Server Gate were not working. -- Moderation commands were unavailable. -- Python Discord site & API were unavailable. CloudFlare automatically switched us to Always Online. - -## 👁️ Detection - -*Report when the team detected the incident, and how we could improve detection time* - -We noticed HTTP 524s coming from CloudFlare, upon attempting database connection we observed the maximum client limit. - -We noticed this log in site: - -```yaml -django.db.utils.OperationalError: FATAL: sorry, too many clients already -``` - -We should be monitoring number of clients, and the monitor should alert us when we're approaching the max. That would have allowed for earlier detection, and possibly allowed us to prevent the incident altogether. 
- -We will look at [wrouesnel/postgres_exporter](https://github.com/wrouesnel/postgres_exporter) for monitoring this. - -## 🙋🏿♂️ Response - -*Who responded to the incident, and what obstacles did they encounter?* - -Joe Banks responded to the incident. The obstacles were mostly a lack of a clear response strategy. - -We should document our recovery procedure so that we're not so dependent on Joe Banks should this happen again while he's unavailable. - -## 🙆🏽♀️ Recovery - -*How was the incident resolved? How can we improve future mitigation?* - -- Delete PostgreSQL deployment `kubectl delete deployment/postgres` -- Delete any remaining pods, WITH force. `kubectl delete <pod name> --force --grace-period=0` -- Unmount volume at Linode -- Remount volume at Linode -- Reapply deployment `kubectl apply -f postgres/deployment.yaml` - -## 🔎 Five Why's - -*Run a 5-whys analysis to understand the true cause of the incident.* - -- Postgres was unavailable, so our services died. -- **Why?** Postgres hit max clients, and could not respond. -- **Why?** Unknown, but we saw a number of connections from previous deployments of site. This indicates that database connections are not being terminated properly. Needs further investigation. - -## 🌱 Blameless root cause - -*Note the final root cause and describe what needs to change to prevent reoccurrance* - -We're not sure what the root cause is, but suspect site is not terminating database connections properly in some cases. We were unable to reproduce this problem. - -We've set up new telemetry on Grafana with alerts so that we can investigate this more closely. We will be let know if the number of connections from site exceeds 32, or if the total number of connections exceeds 90. - -## 🤔 Lessons learned - -*What did we learn from this incident?* - -- We must ensure the DevOps team has access to Linode and other key services even if our Bitwarden is down. -- We need to ensure we're alerted of any risk factors that have the potential to make Postgres unavailable, since this causes a catastrophic outage of practically all services. -- We absolutely need backups for the databases, so that this sort of problem carries less of a risk. -- We may need to consider something like [pg_bouncer](https://wiki.postgresql.org/wiki/PgBouncer) to manage a connection pool so that we don't exceed 100 *legitimate* clients connected as we connect more services to the postgres database. - -## ☑️ Follow-up tasks - -*List any tasks we should complete that are relevant to this incident* - -- [x] All database backup diff --git a/docs/content/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.md b/docs/content/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.md deleted file mode 100644 index c6775d2..0000000 --- a/docs/content/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.md +++ /dev/null @@ -1,83 +0,0 @@ ---- -title: "2021-01-10: Primary Kubernetes node outage" ---- - -# 2021-01-10: Primary Kubernetes node outage - - -We had an outage of our highest spec node due to CPU exhaustion. The outage lasted from around 20:20 to 20:46 UTC, but was not a full service outage. - -## ⚠️ Leadup - -*List the sequence of events that led to the incident* - -I ran a query on Prometheus to try figure out some statistics on the number of metrics we are holding, this ended up scanning a lot of data in the TSDB database that Prometheus uses. - -This scan caused a CPU exhaustion which caused issues with the Kubernetes node status. 
- -## 🥏 Impact - -*Describe how internal and external users were impacted during the incident* - -This brought down the primary node which meant there was some service outage. Most services transferred successfully to our secondary node which kept up some key services such as the Moderation bot and Modmail bot, as well as MongoDB. - -## 👁️ Detection - -*Report when the team detected the incident, and how we could improve detection time* - -This was noticed when Discord services started having failures. The primary detection was through alerts though! I was paged 1 minute after we started encountering CPU exhaustion issues. - -## 🙋🏿♂️ Response - -*Who responded to the incident, and what obstacles did they encounter?* - -Joe Banks responded to the incident. - -No major obstacles were encountered during this. - -## 🙆🏽♀️ Recovery - -*How was the incident resolved? How can we improve future mitigation?* - -It was noted that in the response to `kubectl get nodes` the primary node's status was reported as `NotReady`. Looking into the reason it was because the node had stopped responding. - -The quickest way to fix this was triggering a node restart. This shifted a lot of pods over to node 2 which encountered some capacity issues since it's not as highly specified as the first node. - -I brought this back the first node by restarting it at Linode's end. Once this node was reporting as `Ready` again I drained the second node by running `kubectl drain lke13311-20304-5ffa4d11faab`. This command stops the node from being available for scheduling and moves existing pods onto other nodes. - -Services gradually recovered as the dependencies started. The incident lasted overall around 26 minutes, though this was not a complete outage for the whole time and the bot remained functional throughout (meaning systems like the help channels were still functional). - -## 🔎 Five Why's - -*Run a 5-whys analysis to understand the true cause of the incident.* - -**Why?** Partial service outage - -**Why?** We had a node outage. - -**Why?** CPU exhaustion of our primary node. - -**Why?** Large prometheus query using a lot of CPU. - -**Why?** Prometheus had to scan millions of TSDB records which consumed all cores. - -## 🌱 Blameless root cause - -*Note the final root cause and describe what needs to change to prevent reoccurrance* - -A large query was run on Prometheus, so the solution is just to not run said queries. - -To protect against this more precisely though we should write resource constraints for services like this that are vulnerable to CPU exhaustion or memory consumption, which are the causes of our two past outages as well. - -## 🤔 Lessons learned - -*What did we learn from this incident?* - -- Don't run large queries, it consumes CPU! -- Write resource constraints for our services. - -## ☑️ Follow-up tasks - -*List any tasks we should complete that are relevant to this incident* - -- [x] Write resource constraints for our services. diff --git a/docs/content/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.md b/docs/content/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.md deleted file mode 100644 index e5f87a8..0000000 --- a/docs/content/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.md +++ /dev/null @@ -1,109 +0,0 @@ ---- -title: "2021-01-12: Django site CPU/RAM exhaustion outage" ---- - -# 2021-01-12: Django site CPU/RAM exhaustion outage - -At 03:01 UTC on Tuesday 12th January we experienced a momentary outage of our PostgreSQL database, causing some very minor service downtime. 
- -# ⚠️ Leadup - -*List the sequence of events that led to the incident* - -We deleted the Developers role which led to a large user diff for all the users where we had to update their roles on the site. - -The bot was trying to post this for over 24 hours repeatedly after every restart. - -We deployed the bot at 2:55 UTC on 12th January and the user sync process began once again. - -This caused a CPU & RAM spike on our Django site, which in turn triggered an OOM error on the server which killed the Postgres process, sending it into a recovery state where queries could not be executed. - -Django site did not have any tools in place to batch the requests so was trying to process all 80k user updates in a single query, something that PostgreSQL probably could handle, but not the Django ORM. During the incident site jumped from it's average RAM usage of 300-400MB to **1.5GB.** - - - -RAM and CPU usage of site throughout the incident. The period just before 3:40 where no statistics were reported is the actual outage period where the Kubernetes node had some networking errors. - -# 🥏 Impact - -*Describe how internal and external users were impacted during the incident* - -This database outage lasted mere minutes, since Postgres recovered and healed itself and the sync process was aborted, but it did leave us with a large user diff and our database becoming further out of sync. - -Most services stayed up that did not depend on PostgreSQL, and the site remained stable after the sync had been cancelled. - -# 👁️ Detection - -*Report when the team detected the incident, and how we could improve detection time* - -We were immediately alerted to the PostgreSQL outage on Grafana and through Sentry, meaning our response time was under a minute. - -We reduced some alert thresholds in order to catch RAM & CPU spikes faster in the future. - -It was hard to immediately see the cause of things since there is minimal logging on the site and the bot logs were not evident that anything was at fault, therefore our only detection was through machine metrics. - -We did manage to recover exactly what PostgreSQL was trying to do at the time of crashing by examining the logs which pointed us towards the user sync process. - -# 🙋🏿♂️ Response - -*Who responded to the incident, and what obstacles did they encounter?* - -Joe Banks responded to the issue, there were no real obstacles encountered other than the node being less performant than we would like due to the CPU starvation. - -# 🙆🏽♀️ Recovery - -*How was the incident resolved? How can we improve future mitigation?* - -The incident was resolved by stopping the sync process and writing a more efficient one through an internal eval script. We batched the updates into 1,000 users and instead of doing one large one did 80 smaller updates. This led to much higher efficiency with a cost of taking a little longer (~7 minutes). - -```python -from bot.exts.backend.sync import _syncers -syncer = _syncers.UserSyncer -diff = await syncer._get_diff(ctx.guild) - -def chunks(lst, n): - for i in range(0, len(lst), n): - yield lst[i:i + n] - -for chunk in chunks(diff.updated, 1000): - await bot.api_client.patch("bot/users/bulk_patch", json=chunk) -``` - -Resource limits were also put into place on site to prevent RAM and CPU spikes, and throttle the CPU usage in these situations. This can be seen in the below graph: - - - -CPU throttling is where a container has hit the limits and we need to reel it in. 
Ideally this value stays as closes to 0 as possible, however as you can see site hit this twice (during the periods where it was trying to sync 80k users at once) - -# 🔎 Five Why's - -*Run a 5-whys analysis to understand the true cause of the incident.* - -- We experienced a major PostgreSQL outage -- PostgreSQL was killed by the system OOM due to the RAM spike on site. -- The RAM spike on site was caused by a large query. -- This was because we do not chunk queries on the bot. -- The large query was caused by the removal of the Developers role resulting in 80k users needing updating. - -# 🌱 Blameless root cause - -*Note the final root cause and describe what needs to change to prevent reoccurrance* - -The removal of the Developers role created a large diff which could not be applied by Django in a single request. - -See the follow up tasks on exactly how we can avoid this in future, it's a relatively easy mitigation. - -# 🤔 Lessons learned - -*What did we learn from this incident?* - -- Django (or DRF) does not like huge update queries. - -# ☑️ Follow-up tasks - -*List any tasks we should complete that are relevant to this incident* - -- [x] Make the bot syncer more efficient (batch requests) -- [ ] Increase logging on bot, state when an error has been hit (we had no indication of this inside Discord, we need that) -- [x] Adjust resource alerts to page DevOps members earlier. -- [x] Apply resource limits to site to prevent major spikes diff --git a/docs/content/docs/postmortems/2021-01-30-nodebalancer-fails-memory.md b/docs/content/docs/postmortems/2021-01-30-nodebalancer-fails-memory.md deleted file mode 100644 index a31fe74..0000000 --- a/docs/content/docs/postmortems/2021-01-30-nodebalancer-fails-memory.md +++ /dev/null @@ -1,98 +0,0 @@ ---- -title: "2021-01-30: NodeBalancer networking faults due to memory pressure" ---- - -# 2021-01-30: NodeBalancer networking faults due to memory pressure - -At around 14:30 UTC on Saturday 30th January we started experiencing networking issues at the LoadBalancer level between Cloudflare and our Kubernetes cluster. It seems that the misconfiguration was due to memory and CPU pressure. - -~~This post-mortem is preliminary, we are still awaiting word from Linode's SysAdmins on any problems they detected.~~ - -**Update 2nd February 2021:** Linode have migrated our NodeBalancer to a different machine. - -## ⚠️ Leadup - -*List the sequence of events that led to the incident* - -At 14:30 we started receiving alerts that services were becoming unreachable. We first experienced some momentary DNS errors which resolved themselves, however traffic ingress was still degraded. - -Upon checking Linode our NodeBalancer, the service which balances traffic between our Kubernetes nodes was reporting the backends (the services it balances to) as down. It reported all 4 as down (two for port 80 + two for port 443). This status was fluctuating between up and down, meaning traffic was not reaching our cluster correctly. Scaleios correctly noted: - - - -The config seems to have been set incorrectly due to memory and CPU pressure on one of our nodes. Here is the memory throughout the incident: - - - -Here is the display from Linode: - - - -## 🥏 Impact - -*Describe how internal and external users were impacted during the incident* - -Since traffic could not correctly enter our cluster multiple services which were web based were offline, including services such as site, grafana and bitwarden. 
It appears that no inter-node communication was affected as this uses a WireGuard tunnel between the nodes which was not affected by the NodeBalancer. - -The lack of Grafana made diagnosis slightly more difficult, but even then it was only a short trip to the - -## 👁️ Detection - -*Report when the team detected the incident, and how we could improve detection time* - -We were alerted fairly promptly through statping which reported services as being down and posted a Discord notification. Subsequent alerts came in from Grafana but were limited since outbound communication was faulty. - -## 🙋🏿♂️ Response - -*Who responded to the incident, and what obstacles did they encounter?* - -Joe Banks responded! - -Primary obstacle was the DevOps tools being out due to the traffic ingress problems. - -## 🙆🏽♀️ Recovery - -*How was the incident resolved? How can we improve future mitigation?* - -The incident resolved itself upstream at Linode, we've opened a ticket with Linode to let them know of the faults, this might give us a better indication of what caused the issues. Our Kubernetes cluster continued posting updates to Linode to refresh the NodeBalancer configuration, inspecting these payloads the configuration looked correct. - -We've set up alerts for when Prometheus services stop responding since this seems to be a fairly tell-tale symptom of networking problems, this was the Prometheus status graph throughout the incident: - - - -## 🔎 Five Why's - -*Run a 5-whys analysis to understand the true cause of the incident.* - -**What?** Our service experienced an outage due to networking faults. - -**Why?** Incoming traffic could not reach our Kubernetes nodes - -**Why?** Our Linode NodeBalancers were not using correct configuration - -**Why?** Memory & CPU pressure seemed to cause invalid configuration errors upstream at Linode. - -**Why?** Unknown at this stage, NodeBalancer migrated. - -## 🌱 Blameless root cause - -*Note the final root cause and describe what needs to change to prevent reoccurrance* - -The configuration of our NodeBalancer was invalid, we cannot say why at this point since we are awaiting contact back from Linode, but indicators point to it being an upstream fault since memory & CPU pressure should **not** cause a load balancer misconfiguration. - -Linode are going to follow up with us at some point during the week with information from their System Administrators. - -**Update 2nd February 2021:** Linode have concluded investigations at their end, taken notes and migrated our NodeBalancer to a new machine. We haven't experienced problems since. - -## 🤔 Lessons learned - -*What did we learn from this incident?* - -We should be careful over-scheduling onto nodes since even while operating within reasonable constraints we risk sending invalid configuration upstream to Linode and therefore preventing traffic from entering our cluster. 
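A quick way to see how heavily committed a node already is, before scheduling more onto it, is to compare what has been requested against its capacity (a sketch; `<node-name>` is a placeholder):

```bash
# Requests and limits already allocated on the node, next to its allocatable capacity.
kubectl describe node <node-name> | grep -A 8 "Allocated resources"

# Live usage, to spot nodes running close to their limits.
kubectl top nodes
```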
- -## ☑️ Follow-up tasks - -*List any tasks we should complete that are relevant to this incident* - -- [x] Monitor for follow up from Linode -- [x] Carefully monitor the allocation rules for our services diff --git a/docs/content/docs/postmortems/2021-07-11-cascading-node-failures.md b/docs/content/docs/postmortems/2021-07-11-cascading-node-failures.md deleted file mode 100644 index 831bfc2..0000000 --- a/docs/content/docs/postmortems/2021-07-11-cascading-node-failures.md +++ /dev/null @@ -1,182 +0,0 @@ ---- -title: "2021-07-11: Cascading node failures and ensuing volume problems" ---- - -# 2021-07-11: Cascading node failures and ensuing volume problems - -A PostgreSQL connection spike (00:27 UTC) caused by Django moved a node to an unresponsive state (00:55 UTC), upon performing a recycle of the affected node volumes were placed into a state where they could not be mounted. - -# ⚠️ Leadup - -*List the sequence of events that led to the incident* - -- **00:27 UTC:** Django starts rapidly using connections to our PostgreSQL database -- **00:32 UTC:** DevOps team is alerted that PostgreSQL has saturated it's 115 max connections limit. Joe is paged. -- **00:33 UTC:** DevOps team is alerted that a service has claimed 34 dangerous table locks (it peaked at 61). -- **00:42 UTC:** Status incident created and backdated to 00:25 UTC. [Status incident](https://status.pythondiscord.com/incident/92712) -- **00:55 UTC:** It's clear that the node which PostgreSQL was on is no longer healthy after the Django connection surge, so it's recycled and a new one is to be added to the pool. -- **01:01 UTC:** Node `lke13311-16405-5fafd1b46dcf` begins it's restart -- **01:13 UTC:** Node has restored and regained healthy status, but volumes will not mount to the node. Support ticket opened at Linode for assistance. -- **06:36 UTC:** DevOps team alerted that Python is offline. This is due to Redis being a dependency of the bot, which as a stateful service was not healthy. - -# 🥏 Impact - -*Describe how internal and external users were impacted during the incident* - -Initially, this manifested as a standard node outage where services on that node experienced some downtime as the node was restored. - -Post-restore, all stateful services (e.g. PostgreSQL, Redis, PrestaShop) were unexecutable due to the volume issues, and so any dependent services (e.g. Site, Bot, Hastebin) also had trouble starting. - -PostgreSQL was restored early on so for the most part Moderation could continue. - -# 👁️ Detection - -*Report when the team detected the incident, and how we could improve detection time* - -DevOps were initially alerted at 00:32 UTC due to the PostgreSQL connection surge, and acknowledged at the same time. - -Further alerting could be used to catch surges earlier on (looking at conn delta vs. conn total), but for the most part alerting time was satisfactory here. - -# 🙋🏿♂️ Response - -*Who responded to the incident, and what obstacles did they encounter?* - -Joe Banks responded. The primary issue encountered was failure upstream at Linode to remount the affected volumes, a support ticket has been created. - -# 🙆🏽♀️ Recovery - -*How was the incident resolved? How can we improve future mitigation?* - -Initial node restoration was performed by @Joe Banks by recycling the affected node. - -Subsequent volume restoration was also @Joe Banks and once Linode had unlocked the volumes affected pods were scaled down to 0, the volumes were unmounted at the Linode side and then the deployments were recreated. 
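The per-service sequence looked roughly like the following (a sketch using `redis` as the example; the manifest path is illustrative):

```bash
# Scale the stateful service down so nothing keeps the volume attached.
kubectl scale deploy/redis --replicas 0

# Detach the volume in the Linode Cloud Manager once it unlocks, then
# recreate the deployment so the volume is mounted cleanly again.
kubectl apply -f redis/deployment.yaml
```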
- -<details markdown="block"> -<summary>Support ticket sent</summary> - -<blockquote markdown="block"> -Good evening, - -We experienced a resource surge on one of our Kubernetes nodes at 00:32 UTC, causing a node to go unresponsive. To mitigate problems here the node was recycled and began restarting at 1:01 UTC. - -The node has now rejoined the ring and started picking up services, but volumes will not attach to it, meaning pods with stateful storage will not start. - -An example events log for one such pod: - -``` - Type Reason Age From Message - ---- ------ ---- ---- ------- - Normal Scheduled 2m45s default-scheduler Successfully assigned default/redis-599887d778-wggbl to lke13311-16405-5fafd1b46dcf - Warning FailedMount 103s kubelet MountVolume.MountDevice failed for volume "pvc-bb1d06139b334c1f" : rpc error: code = Internal desc = Unable to find device path out of attempted paths: [/dev/disk/by-id/linode-pvcbb1d06139b334c1f /dev/disk/by-id/scsi-0Linode_Volume_pvcbb1d06139b334c1f] - Warning FailedMount 43s kubelet Unable to attach or mount volumes: unmounted volumes=[redis-data-volume], unattached volumes=[kube-api-access-6wwfs redis-data-volume redis-config-volume]: timed out waiting for the condition - -``` - -I've been trying to manually resolve this through the Linode Web UI but get presented with attachment errors upon doing so. Please could you advise on the best way forward to restore Volumes & Nodes to a functioning state? As far as I can see there is something going on upstream since the Linode UI presents these nodes as mounted however as shown above LKE nodes are not locating them, there is also a few failed attachment logs in the Linode Audit Log. - -Thanks, - -Joe -</blockquote> -</details> - -<details markdown="block"> -<summary>Response received from Linode</summary> - -<blockquote markdown="block"> -Hi Joe, - -> Were there any known issues with Block Storage in Frankfurt today? - -Not today, though there were service issues reported for Block Storage and LKE in Frankfurt on July 8 and 9: - -- [Service Issue - Block Storage - EU-Central (Frankfurt)](https://status.linode.com/incidents/pqfxl884wbh4) -- [Service Issue - Linode Kubernetes Engine - Frankfurt](https://status.linode.com/incidents/13fpkjd32sgz) - -There was also an API issue reported on the 10th (resolved on the 11th), mentioned here: - -- [Service Issue - Cloud Manager and API](https://status.linode.com/incidents/vhjm0xpwnnn5) - -Regarding the specific error you were receiving: - -> `Unable to find device path out of attempted paths` - -I'm not certain it's specifically related to those Service Issues, considering this isn't the first time a customer has reported this error in their LKE logs. In fact, if I recall correctly, I've run across this before too, since our volumes are RWO and I had too many replicas in my deployment that I was trying to attach to, for example. - -> is this a known bug/condition that occurs with Linode CSI/LKE? - -From what I understand, yes, this is a known condition that crops up from time to time, which we are tracking. However, since there is a workaround at the moment (e.g. - "After some more manual attempts to fix things, scaling down deployments, unmounting at Linode and then scaling up the deployments seems to have worked and all our services have now been restored."), there is no ETA for addressing this. With that said, I've let our Storage team know that you've run into this, so as to draw further attention to it. 
- -If you have any further questions or concerns regarding this, let us know. - -Best regards, -[Redacted] - -Linode Support Team -</blockquote> -</details> - -<details markdown="block"> -<summary>Concluding response from Joe Banks</summary> - -<blockquote markdown="block"> -Hey [Redacted]! - -Thanks for the response. We ensure that stateful pods only ever have one volume assigned to them, either with a single replica deployment or a statefulset. It appears that the error generally manifests when a deployment is being migrated from one node to another during a redeploy, which makes sense if there is some delay on the unmount/remount. - -Confusion occurred because Linode was reporting the volume as attached when the node had been recycled, but I assume that was because the node did not cleanly shutdown and therefore could not cleanly unmount volumes. - -We've not seen any resurgence of such issues, and we'll address the software fault which overloaded the node which will helpfully mitigate such problems in the future. - -Thanks again for the response, have a great week! - -Best, - -Joe -</blockquote> -</details> - -# 🔎 Five Why's - -*Run a 5-whys analysis to understand the true cause of the incident.* - -### **What?** - -Several of our services became unavailable because their volumes could not be mounted. - -### Why? - -A node recycle left the node unable to mount volumes using the Linode CSI. - -### Why? - -A node recycle was used because PostgreSQL had a connection surge. - -### Why? - -A Django feature deadlocked a table 62 times and suddenly started using ~70 connections to the database, saturating the maximum connections limit. - -### Why? - -The root cause of why Django does this is unclear, and someone with more Django proficiency is absolutely welcome to share any knowledge they may have. I presume it's some sort of worker race condition, but I've not been able to reproduce it. - -# 🌱 Blameless root cause - -*Note the final root cause and describe what needs to change to prevent reoccurrence* - -A node being forcefully restarted left volumes in a limbo state where mounting was difficult, it took multiple hours for this to be resolved since we had to wait for the volumes to unlock so they could be cloned. - -# 🤔 Lessons learned - -*What did we learn from this incident?* - -Volumes are painful. - -We need to look at why Django is doing this and mitigations of the fault to prevent this from occurring again. - -# ☑️ Follow-up tasks - -*List any tasks we should complete that are relevant to this incident* - -- [x] [Follow up on ticket at Linode](https://www.notion.so/Cascading-node-failures-and-ensuing-volume-problems-1c6cfdfcadfc4422b719a0d7a4cc5001) -- [ ] Investigate why Django could be connection surging and locking tables diff --git a/docs/content/docs/postmortems/_index.md b/docs/content/docs/postmortems/_index.md deleted file mode 100644 index 92cb5b0..0000000 --- a/docs/content/docs/postmortems/_index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Postmortems -bookCollapseSection: true ---- - -# Postmortems - -Browse the pages under this category to view historical postmortems for Python Discord outages. 
diff --git a/docs/content/docs/queries/_index.md b/docs/content/docs/queries/_index.md deleted file mode 100644 index c556021..0000000 --- a/docs/content/docs/queries/_index.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Queries ---- diff --git a/docs/content/docs/queries/kubernetes.md b/docs/content/docs/queries/kubernetes.md deleted file mode 100644 index 0948d7e..0000000 --- a/docs/content/docs/queries/kubernetes.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: Kubernetes ---- - -# Kubernetes tips - -## Find top pods by CPU/memory - -```bash -$ kubectl top pods --all-namespaces --sort-by='memory' -$ top pods --all-namespaces --sort-by='cpu' -``` - -## Find top nodes by CPU/memory - -```bash -$ kubectl top nodes --sort-by='cpu' -$ kubectl top nodes --sort-by='memory' -``` - -## Kubernetes cheat sheet - -[Open Kubernetes cheat sheet](https://kubernetes.io/docs/reference/kubectl/cheatsheet/) - -## Lens IDE - -[OpenLens](https://github.com/MuhammedKalkan/OpenLens) diff --git a/docs/content/docs/queries/loki.md b/docs/content/docs/queries/loki.md deleted file mode 100644 index 921bfd1..0000000 --- a/docs/content/docs/queries/loki.md +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: Loki ---- - -# Loki queries - -## Find any logs containing "ERROR" - -```sql -{job=~"default/.+"} |= "ERROR" -``` - -## Find all logs from bot service - -```sql -{job="default/bot"} -``` - -The format is `namespace/object` - -## Rate of logs from a service - -```sql -rate(({job="default/bot"} |= "error" != "timeout")[10s]) -``` diff --git a/docs/content/docs/queries/postgres.md b/docs/content/docs/queries/postgres.md deleted file mode 100644 index 522c5e0..0000000 --- a/docs/content/docs/queries/postgres.md +++ /dev/null @@ -1,304 +0,0 @@ ---- -title: PostgreSQL ---- - -# PostgreSQL queries - -## Disk usage - -Most of these queries vary based on the database you are connected to. 
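To run them in production, open a `psql` session inside the cluster and connect to the database you are interested in (a sketch; the deployment name and user follow our usual setup, `<database>` is a placeholder):

```bash
# List databases with their sizes, then open a session against a specific one.
kubectl exec -it deploy/postgres -- psql -U pythondiscord -c '\l+'
kubectl exec -it deploy/postgres -- psql -U pythondiscord -d <database>
```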
- -### General Table Size Information Grouped For Partitioned Tables - -```sql -WITH RECURSIVE pg_inherit(inhrelid, inhparent) AS - (select inhrelid, inhparent - FROM pg_inherits - UNION - SELECT child.inhrelid, parent.inhparent - FROM pg_inherit child, pg_inherits parent - WHERE child.inhparent = parent.inhrelid), -pg_inherit_short AS (SELECT * FROM pg_inherit WHERE inhparent NOT IN (SELECT inhrelid FROM pg_inherit)) -SELECT table_schema - , TABLE_NAME - , row_estimate - , pg_size_pretty(total_bytes) AS total - , pg_size_pretty(index_bytes) AS INDEX - , pg_size_pretty(toast_bytes) AS toast - , pg_size_pretty(table_bytes) AS TABLE - FROM ( - SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes - FROM ( - SELECT c.oid - , nspname AS table_schema - , relname AS TABLE_NAME - , SUM(c.reltuples) OVER (partition BY parent) AS row_estimate - , SUM(pg_total_relation_size(c.oid)) OVER (partition BY parent) AS total_bytes - , SUM(pg_indexes_size(c.oid)) OVER (partition BY parent) AS index_bytes - , SUM(pg_total_relation_size(reltoastrelid)) OVER (partition BY parent) AS toast_bytes - , parent - FROM ( - SELECT pg_class.oid - , reltuples - , relname - , relnamespace - , pg_class.reltoastrelid - , COALESCE(inhparent, pg_class.oid) parent - FROM pg_class - LEFT JOIN pg_inherit_short ON inhrelid = oid - WHERE relkind IN ('r', 'p') - ) c - LEFT JOIN pg_namespace n ON n.oid = c.relnamespace - ) a - WHERE oid = parent -) a -ORDER BY total_bytes DESC; -``` - -### General Table Size Information - -```sql -SELECT *, pg_size_pretty(total_bytes) AS total - , pg_size_pretty(index_bytes) AS index - , pg_size_pretty(toast_bytes) AS toast - , pg_size_pretty(table_bytes) AS table - FROM ( - SELECT *, total_bytes-index_bytes-coalesce(toast_bytes,0) AS table_bytes FROM ( - SELECT c.oid,nspname AS table_schema, relname AS table_name - , c.reltuples AS row_estimate - , pg_total_relation_size(c.oid) AS total_bytes - , pg_indexes_size(c.oid) AS index_bytes - , pg_total_relation_size(reltoastrelid) AS toast_bytes - FROM pg_class c - LEFT JOIN pg_namespace n ON n.oid = c.relnamespace - WHERE relkind = 'r' - ) a -) a; -``` - -### Finding the largest databases in your cluster - -```sql -SELECT d.datname as Name, pg_catalog.pg_get_userbyid(d.datdba) as Owner, - CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT') - THEN pg_catalog.pg_size_pretty(pg_catalog.pg_database_size(d.datname)) - ELSE 'No Access' - END as Size -FROM pg_catalog.pg_database d - order by - CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT') - THEN pg_catalog.pg_database_size(d.datname) - ELSE NULL - END desc -- nulls first - LIMIT 20; -``` - -### Finding the size of your biggest relations - -Relations are objects in the database such as tables and indexes, and this query shows the size of all the individual parts. - -```sql -SELECT nspname || '.' || relname AS "relation", - pg_size_pretty(pg_relation_size(C.oid)) AS "size" - FROM pg_class C - LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE nspname NOT IN ('pg_catalog', 'information_schema') - ORDER BY pg_relation_size(C.oid) DESC - LIMIT 20; -``` - -### Finding the total size of your biggest tables - -```sql -SELECT nspname || '.' 
|| relname AS "relation", - pg_size_pretty(pg_total_relation_size(C.oid)) AS "total_size" - FROM pg_class C - LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) - WHERE nspname NOT IN ('pg_catalog', 'information_schema') - AND C.relkind <> 'i' - AND nspname !~ '^pg_toast' - ORDER BY pg_total_relation_size(C.oid) DESC - LIMIT 20; -``` - -## Indexes - -### Index summary - -```sql -SELECT - pg_class.relname, - pg_size_pretty(pg_class.reltuples::bigint) AS rows_in_bytes, - pg_class.reltuples AS num_rows, - count(indexname) AS number_of_indexes, - CASE WHEN x.is_unique = 1 THEN 'Y' - ELSE 'N' - END AS UNIQUE, - SUM(case WHEN number_of_columns = 1 THEN 1 - ELSE 0 - END) AS single_column, - SUM(case WHEN number_of_columns IS NULL THEN 0 - WHEN number_of_columns = 1 THEN 0 - ELSE 1 - END) AS multi_column -FROM pg_namespace -LEFT OUTER JOIN pg_class ON pg_namespace.oid = pg_class.relnamespace -LEFT OUTER JOIN - (SELECT indrelid, - max(CAST(indisunique AS integer)) AS is_unique - FROM pg_index - GROUP BY indrelid) x - ON pg_class.oid = x.indrelid -LEFT OUTER JOIN - ( SELECT c.relname AS ctablename, ipg.relname AS indexname, x.indnatts AS number_of_columns FROM pg_index x - JOIN pg_class c ON c.oid = x.indrelid - JOIN pg_class ipg ON ipg.oid = x.indexrelid ) - AS foo - ON pg_class.relname = foo.ctablename -WHERE - pg_namespace.nspname='public' -AND pg_class.relkind = 'r' -GROUP BY pg_class.relname, pg_class.reltuples, x.is_unique -ORDER BY 2; -``` - -### Index size/usage statistics - -```sql -SELECT - t.schemaname, - t.tablename, - indexname, - c.reltuples AS num_rows, - pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' || quote_ident(t.tablename)::text)) AS table_size, - pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' || quote_ident(indexrelname)::text)) AS index_size, - CASE WHEN indisunique THEN 'Y' - ELSE 'N' - END AS UNIQUE, - number_of_scans, - tuples_read, - tuples_fetched -FROM pg_tables t -LEFT OUTER JOIN pg_class c ON t.tablename = c.relname -LEFT OUTER JOIN ( - SELECT - c.relname AS ctablename, - ipg.relname AS indexname, - x.indnatts AS number_of_columns, - idx_scan AS number_of_scans, - idx_tup_read AS tuples_read, - idx_tup_fetch AS tuples_fetched, - indexrelname, - indisunique, - schemaname - FROM pg_index x - JOIN pg_class c ON c.oid = x.indrelid - JOIN pg_class ipg ON ipg.oid = x.indexrelid - JOIN pg_stat_all_indexes psai ON x.indexrelid = psai.indexrelid -) AS foo ON t.tablename = foo.ctablename AND t.schemaname = foo.schemaname -WHERE t.schemaname NOT IN ('pg_catalog', 'information_schema') -ORDER BY 1,2; -``` - -### Duplicate indexes - -```sql -SELECT pg_size_pretty(sum(pg_relation_size(idx))::bigint) as size, - (array_agg(idx))[1] as idx1, (array_agg(idx))[2] as idx2, - (array_agg(idx))[3] as idx3, (array_agg(idx))[4] as idx4 -FROM ( - SELECT indexrelid::regclass as idx, (indrelid::text ||E'\n'|| indclass::text ||E'\n'|| indkey::text ||E'\n'|| - coalesce(indexprs::text,'')||E'\n' || coalesce(indpred::text,'')) as key - FROM pg_index) sub -GROUP BY key HAVING count(*)>1 -ORDER BY sum(pg_relation_size(idx)) DESC; -``` - -## Maintenance - -[PostgreSQL wiki](https://wiki.postgresql.org/wiki/Main_Page) - -### CLUSTER-ing - -[CLUSTER](https://www.postgresql.org/docs/current/sql-cluster.html) - -```sql -CLUSTER [VERBOSE] table_name [ USING index_name ] -CLUSTER [VERBOSE] -``` - -`CLUSTER` instructs PostgreSQL to cluster the table specified by `table_name` based on the index specified by `index_name`. 
The index must already have been defined on `table_name`. - -When a table is clustered, it is physically reordered based on the index information. - -The [clusterdb](https://www.postgresql.org/docs/current/app-clusterdb.html) CLI tool is recommended, and can also be used to cluster all tables at the same time. - -### VACUUM-ing - -Proper vacuuming, particularly autovacuum configuration, is crucial to a fast and reliable database. - -[Introduction to VACUUM, ANALYZE, EXPLAIN, and COUNT](https://wiki.postgresql.org/wiki/Introduction_to_VACUUM,_ANALYZE,_EXPLAIN,_and_COUNT) - -It is not advised to run `VACUUM FULL`, instead look at clustering. VACUUM FULL is a much more intensive task and acquires an ACCESS EXCLUSIVE lock on the table, blocking reads and writes. Whilst `CLUSTER` also does acquire this lock it's a less intensive and faster process. - -The [vacuumdb](https://www.postgresql.org/docs/current/app-vacuumdb.html) CLI tool is recommended for manual runs. - -#### Finding number of dead rows - -```sql -SELECT relname, n_dead_tup FROM pg_stat_user_tables WHERE n_dead_tup <> 0 ORDER BY 2 DESC; -``` - -#### Finding last vacuum/auto-vacuum date - -```sql -SELECT relname, last_vacuum, last_autovacuum FROM pg_stat_user_tables; -``` - -#### Checking auto-vacuum is enabled - -```sql -SELECT name, setting FROM pg_settings WHERE name='autovacuum'; -``` - -#### View all auto-vacuum setting - -```sql -SELECT * from pg_settings where category like 'Autovacuum'; -``` - -## Locks - -### Looking at granted locks - -```sql -SELECT relation::regclass, * FROM pg_locks WHERE NOT granted; -``` - -### Сombination of blocked and blocking activity - -```sql -SELECT blocked_locks.pid AS blocked_pid, - blocked_activity.usename AS blocked_user, - blocking_locks.pid AS blocking_pid, - blocking_activity.usename AS blocking_user, - blocked_activity.query AS blocked_statement, - blocking_activity.query AS current_statement_in_blocking_process - FROM pg_catalog.pg_locks blocked_locks - JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid - JOIN pg_catalog.pg_locks blocking_locks - ON blocking_locks.locktype = blocked_locks.locktype - AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database - AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation - AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page - AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple - AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid - AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid - AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid - AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid - AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid - AND blocking_locks.pid != blocked_locks.pid - - JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid - WHERE NOT blocked_locks.granted; -``` diff --git a/docs/content/docs/runbooks/_index.md b/docs/content/docs/runbooks/_index.md deleted file mode 100644 index 1feb61a..0000000 --- a/docs/content/docs/runbooks/_index.md +++ /dev/null @@ -1,3 +0,0 @@ ---- -title: Runbooks ---- diff --git a/docs/content/docs/runbooks/postgresql-upgrade.md b/docs/content/docs/runbooks/postgresql-upgrade.md deleted file mode 100644 index fe9422b..0000000 --- a/docs/content/docs/runbooks/postgresql-upgrade.md +++ /dev/null @@ -1,113 +0,0 @@ ---- -title: PostgreSQL Upgrade ---- - -# 
Upgrading PostgreSQL - -## Step 1 - Enable maintenance mode - -Add a worker route for `pythondiscord.com/*` to forward to the `maintenance` Cloudflare worker. - -## Step 2 - Scale down all services that use PostgreSQL - -Notably site, metricity, bitwarden and the like should be scaled down. - -Services that are read only such as Grafana (but NOT Metabase, Metabase uses PostgreSQL for internal storage) do not need to be scaled down, as they do not update the database in any way. - -```bash -$ kubectl scale deploy --replicas 0 site metricity metabase bitwarden ... -``` - -## Step 3 - Take a database dump and gzip - -Using `pg_dumpall`, dump the contents of all databases to a `.sql` file. - -Make sure to gzip for faster transfer. - -Take a SHA512 sum of the output `.sql.gz` file to validate integrity after copying. - -```bash -$ pg_dumpall -U pythondiscord > backup.sql -$ gzip backup.sql -$ sha512sum backup.sql -a3337bfc65a072fd93124233ac1cefcdfbe8a708e5c1d08adaca2cf8c7cbe9ae4853ffab8c5cfbe943182355eaa701012111a420b29cc4f74d1e87f9df3af459 backup.sql -``` - -## Step 4 - Move database dump locally - -Use `kubectl cp` to move the `backup.sql.gz` file from the remote pod to your local machine. - -Validate the integrity of the received file. - -## Step 5 - Attempt local import to new PostgreSQL version - -Install the new version of PostgreSQL locally and import the data. Make sure you are operating on a **completely empty database server.** - -```bash -$ gzcat backup.sql.gz | psql -U joe -``` - -You can use any PostgreSQL superuser for the import. Ensure that no errors other than those mentioned below occur, you may need to attempt multiple times to fix errors listed below. - -### Handle import errors - -Monitor the output of `psql` to check that no errors appear. - -If you receive locale errors ensure that the locale your database is configured with matches the import script, this may require some usage of `sed`: - -```bash -$ sed -i '' "s/en_US.utf8/en_GB.UTF-8/g" backup.sql -``` - -Ensure that you **RESET THESE CHANGES** before attempting an import on the remote, if they come from the PostgreSQL Docker image they will need the same locale as the export. - -## Step 7 - Spin down PostgreSQL - -Spin down PostgreSQL to 0 replicas. - -## Step 8 - Take volume backup at Linode - -Backup the volume at Linode through a clone in the Linode UI, name it something obvious. - -## Step 9 - Remove the Linode persistent volume - -Delete the volume specified in the `volume.yaml` file in the `postgresql` directory, you must delete the `pvc` first followed by the `pv`, you can find the relevant disks through `kubectl get pv/pvc` - -## Step 10 - Create a new volume by re-applying the `volume.yaml` file - -Apply the `volume.yaml` so a new, empty, volume is created. - -## Step 11 - Bump the PostgreSQL version in the `deployment.yaml` file - -Update the Docker image used in the deployment manifest. - -## Step 12 - Apply the deployment - -Run `kubectl apply -f postgresql/deployment.yaml` to start the new database server. - -## Step 13 - Copy the data across - -After the pod has initialised use `kubectl cp` to copy the gzipped backup to the new Postgres pod. - -## Step 14 - Extract and import the new data - -```bash -$ gunzip backup.sql.gz -$ psql -U pythondiscord -f backup.sql -``` - -## Step 15 - Validate data import complete - -Ensure that all logs are successful, you may get duplicate errors for the `pythondiscord` user and database, these are safe to ignore. 
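A couple of quick sanity checks, run inside the new pod, help confirm the import before anything is scaled back up (illustrative only):

```bash
$ psql -U pythondiscord -c "SELECT version();"
$ psql -U pythondiscord -l
```

The first command should report the new major version; the second should list the same databases (at roughly the same sizes) as before the upgrade.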
- -## Step 16 - Scale up services - -Restart the database server - -```bash -$ kubectl scale deploy --replicas 1 metricity bitwarden metabase -``` - -## Step 17 - Validate all services interact correctly - -Validate that all services reconnect successfully and start exchanging data, ensure that no abnormal logs are outputted and performance remains as expected. diff --git a/docs/content/docs/tooling/_index.md b/docs/content/docs/tooling/_index.md deleted file mode 100644 index c811ae3..0000000 --- a/docs/content/docs/tooling/_index.md +++ /dev/null @@ -1,4 +0,0 @@ ---- -title: Tooling -description: Information about our DevOps tooling. ---- diff --git a/docs/content/docs/tooling/bots.md b/docs/content/docs/tooling/bots.md deleted file mode 100644 index 79ae7dd..0000000 --- a/docs/content/docs/tooling/bots.md +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: GitHub Bots -description: | - Information on robots that keep our GitHub repositories running at full steam. ---- - -Our GitHub repositories are supported by two custom bots: - -- Our **Fast Forward Bot**, which ensures that commits merged into main are - either merged manually on the command line or via a fast-forward, ensuring - that cryptographic signatures of commits remain intact. Information on the - bot can be found [in the `ff-bot.yml` - configuration](https://github.com/python-discord/infra/blob/main/.github/ff-bot.yml). - Merges over the GitHub UI are discouraged for this reason. You can use it by - running `/merge` on a pull request. Note that attempting to use it without - permission to will be reported. - -- Our **Craig Dazey Emulator Bot**, which ensures team morale stays high at all - times by thanking team members for submitted pull - requests.[^craig-dazey-legal-team-threats] - -Furthermore, our repositories all have dependabot configured on them. - - -## Dealing with notifications - -This section collects some of our team members' ways of dealing with the -notifications that originate from our bots. - -### Sieve (RFC 5228) script - -If your mail server supports the [Sieve mail filtering -language](https://datatracker.ietf.org/doc/html/rfc5228.html), which it should, -you can adapt the following script to customize the amount of notifications you -receive: - -```sieve -require ["envelope", "fileinto", "imap4flags"]; - -if allof (header :is "X-GitHub-Sender" ["coveralls", "github-actions[bot]", "netlify[bot]"], - address :is "from" "[email protected]") { - setflag "\\seen"; - fileinto "Trash"; - stop; -} -``` - -If you also want to filter out notifications from renovate, which we use for -dependency updates, you can add `renovate[bot]` to the `X-GitHub-Sender` list -above. - - - -[^craig-dazey-legal-team-threats]: Craig Dazey Emulator Bot stands in no - affiliation, direct or indirect, with Craig Dazey. Craig Dazey Emulator - Bot. Craig Dazey Emulator Bot is not endorsed by Craig Dazey. Craig Dazey - Emulator Bot is an independent project of Craig Dazey. No association is - made between Craig Dazey Emulator Bot and Craig Dazey. diff --git a/docs/general/index.rst b/docs/general/index.rst new file mode 100644 index 0000000..e791730 --- /dev/null +++ b/docs/general/index.rst @@ -0,0 +1,9 @@ +General documentation +===================== + + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + manual-deploys diff --git a/docs/general/manual-deploys.rst b/docs/general/manual-deploys.rst new file mode 100644 index 0000000..0d874ea --- /dev/null +++ b/docs/general/manual-deploys.rst @@ -0,0 +1,27 @@ +Manual Deployments +================== + +When the DevOps team are not available, Administrators and Core +Developers can redeploy our critical services, such as Bot, Site and +ModMail. + +This is handled through workflow dispatches on this repository. To get +started, head to the +`Actions <https://github.com/python-discord/kubernetes/actions>`__ tab +of this repository and select ``Manual Redeploy`` in the sidebar, +alternatively navigate +`here <https://github.com/python-discord/kubernetes/actions/workflows/manual_redeploy.yml>`__. + +.. image:: https://user-images.githubusercontent.com/20439493/116442084-00d5f400-a84a-11eb-8e8a-e9e6bcc327dd.png + +Click ``Run workflow`` on the right hand side and enter the service name +that needs redeploying, keep the branch as ``main``: + +.. image:: https://user-images.githubusercontent.com/20439493/116442202-22cf7680-a84a-11eb-8cce-a3e715a1bf68.png + +Click ``Run`` and refresh the page, you’ll see a new in progress Action +which you can track. Once the deployment completes notifications will be +sent to the ``#dev-ops`` channel on Discord. + +If you encounter errors with this please copy the Action run link to +Discord so the DevOps team can investigate when available. diff --git a/docs/host-allocation.pdf b/docs/host-allocation.pdf Binary files differdeleted file mode 100644 index 0880db6..0000000 --- a/docs/host-allocation.pdf +++ /dev/null diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..3df0cae --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,35 @@ +.. Python Discord DevOps documentation master file, created by + sphinx-quickstart on Wed Jul 24 19:49:56 2024. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Python Discord DevOps +===================== + +Welcome to the Python Discord DevOps knowledgebase. + +Within this set of pages you will find: +- Changelogs +- Post-mortems +- Common queries +- Runbooks + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + general/index + meeting_notes/index + onboarding/index + postmortems/index + queries/index + runbooks/index + tooling/index + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` diff --git a/docs/make.bat b/docs/make.bat new file mode 100644 index 0000000..954237b --- /dev/null +++ b/docs/make.bat @@ -0,0 +1,35 @@ +@ECHO OFF + +pushd %~dp0 + +REM Command file for Sphinx documentation + +if "%SPHINXBUILD%" == "" ( + set SPHINXBUILD=sphinx-build +) +set SOURCEDIR=. +set BUILDDIR=_build + +%SPHINXBUILD% >NUL 2>NUL +if errorlevel 9009 ( + echo. + echo.The 'sphinx-build' command was not found. Make sure you have Sphinx + echo.installed, then set the SPHINXBUILD environment variable to point + echo.to the full path of the 'sphinx-build' executable. Alternatively you + echo.may add the Sphinx directory to PATH. + echo. 
+ echo.If you don't have Sphinx installed, grab it from + echo.https://www.sphinx-doc.org/ + exit /b 1 +) + +if "%1" == "" goto help + +%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% +goto end + +:help +%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% + +:end +popd diff --git a/docs/meeting_notes/2022-04-07.rst b/docs/meeting_notes/2022-04-07.rst new file mode 100644 index 0000000..21642d3 --- /dev/null +++ b/docs/meeting_notes/2022-04-07.rst @@ -0,0 +1,20 @@ +DevOps Meeting Notes +==================== + +Agenda +------ + +- No updates, as last week’s meeting did not take place + +Roadmap review & planning +------------------------- + +What are we working on for the next meeting? + +- Help wanted for #57 (h-asgi) +- #58 (postgres exporter) needs a new review +- #54 (firewall in VPN) will be done by Johannes +- We need a testing environment #67 +- Johannes will add a Graphite role #31 +- Sofi will take a look at #29 +- #41 (policy bot) will be taken care of by Johannes diff --git a/docs/meeting_notes/2022-09-18.rst b/docs/meeting_notes/2022-09-18.rst new file mode 100644 index 0000000..f6b56c2 --- /dev/null +++ b/docs/meeting_notes/2022-09-18.rst @@ -0,0 +1,74 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. + +Agenda +------ + +- Joe will grant Chris access to the netcup hosts. + +NetKube status +~~~~~~~~~~~~~~ + +- **Rollout** + + - ☒ RBAC configuration and access granting + - ☒ Most nodes are enrolled, Joe will re-check + - ``turing``, ``ritchie``, ``lovelace`` and ``neumann`` will be + Kubernetes nodes + - ``hopper`` will be the storage server + +- **Storage drivers** + + - Not needed, everything that needs persistent storage will run on + hopper + - Netcup does not support storage resize + - We can download more RAM if we need it + - A couple of services still need volume mounts: Ghost, Grafana & + Graphite + +- **Control plane high availability** + + - Joe mentions that in the case the control plane dies, everything + else will die as well + - If the control plane in Germany dies, so will Johannes + +- **Early plans for migration** + + - We can use the Ansible repository issues for a good schedule + - Hopper runs ``nginx`` + - Statement from Joe: > “There is an nginx ingress running on every + node in the cluster, okay, > okay? We don’t, the way that’s, + that’s as a service is a NodePort, right? > So it has a normal IP, + but the port will be like a random port in the range > of the + 30,000s. Remember that? Hold on. Is he writing rude nodes? And + then… > We have nginx, so this is where it’s like a little bit, + like, not nice, I > guess we just like, cronjob it, to pull the + nodes, like, every minute or > so, and then update the config if + they change. But then it’s just like… > nginx is like a catalogue + of nodes. Wahhh, you drive me crazy.” + + - “Nah, it makes sense!” + + - “It does!” + + - Joe will figure this out with assistance from his voices. + +Open authentication +~~~~~~~~~~~~~~~~~~~ + +- Joe and Johannes will check out OpenLDAP as a JumpCloud alternative + starting from this evening +- Sofi has experience with OpenLDAP + +Sponsorship +----------- + +This meeting has been sponsored by Chris Hemsworth Lovering’s +relationship therapy company, “Love To Love By Lovering”. You can sign +up by sending a mail to [email protected]. + +.. 
raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2022-10-05.rst b/docs/meeting_notes/2022-10-05.rst new file mode 100644 index 0000000..c405e01 --- /dev/null +++ b/docs/meeting_notes/2022-10-05.rst @@ -0,0 +1,13 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. + +Agenda +------ + +- Joe Banks configured proper RBAC for Chris, Johannes and Joe himself + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2022-10-19.rst b/docs/meeting_notes/2022-10-19.rst new file mode 100644 index 0000000..fa51d32 --- /dev/null +++ b/docs/meeting_notes/2022-10-19.rst @@ -0,0 +1,31 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. + +Agenda +------ + +- One hour of gartic phone, for team spirit. +- Created user accounts for Sofi and Hassan +- Joe created an architecture diagram of the NGINX setup + + - *This is still in Notion* + +- Joe explained his NGINX plans: > “It’s not actually that hard, right? + So you spawn 5 instances of nginx in a > DaemonSet, because then one + gets deployed to every node okay, following? > Then we get NodePort, + instead of LoadBalancers or whatever, which will get > a random port + allocatead in the 35000 range, and that will go to nginx, and > on + each of those ports, it will go to nginx, right? And then we poll the + > Kubernetes API and what is the port that each of these nginx + instances is > running on, and add that into a roundrobin on the + fifth node. Right? Yeah. > That’s correct. That won’t do TLS though, + so that will just HAProxy. Yeah.” +- Joe will terminate our JumpCloud account +- Chris reset the Minecraft server +- Email alerting needs to be configured + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2022-10-26.rst b/docs/meeting_notes/2022-10-26.rst new file mode 100644 index 0000000..5684d7f --- /dev/null +++ b/docs/meeting_notes/2022-10-26.rst @@ -0,0 +1,18 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. + +Agenda +------ + +- Chris upgraded PostgreSQL to 15 in production +- Johannes added the Kubernetes user creation script into the + Kubernetes repository in the docs + +*(The rest of the meeting was discussion about the NetKube setup, which +has been scrapped since)*. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2022-11-02.rst b/docs/meeting_notes/2022-11-02.rst new file mode 100644 index 0000000..010b8f0 --- /dev/null +++ b/docs/meeting_notes/2022-11-02.rst @@ -0,0 +1,27 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. + +Agenda +------ + +Hanging behaviour of ModMail +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- `Source <https://discord.com/channels/267624335836053506/675756741417369640/1036720683067134052>`__ + +- Maybe use `Signals + a + debugger <https://stackoverflow.com/a/25329467>`__? + +- … using `something like pdb for the + debugger <https://wiki.python.org/moin/PythonDebuggingTools>`__? + +- Or `GDB, as it seems handy to poke at stuck multi-threaded python + software <https://wiki.python.org/moin/DebuggingWithGdb>`__? + +- ModMail has been upgraded to version 4 + +.. 
raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2022-11-23.rst b/docs/meeting_notes/2022-11-23.rst new file mode 100644 index 0000000..5f74fc6 --- /dev/null +++ b/docs/meeting_notes/2022-11-23.rst @@ -0,0 +1,30 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. + +Agenda +------ + +*(This meeting was mostly about NetKube, with the following strange text +included, and everything outside of the text has been removed since the +NetKube plans have been scrapped)*. + +Joe Banks, after a month-long hiatus to become a dad to every second +girl on uni campus, has managed to pull up to the DevOps meeting. + +We are considering using Kubespray (https://kubespray.io/#/) in order to +deploy a production-ready bare-metal Kubernetes cluster without +involvement from Joe “Busy With Poly Girlfriend #20” Banks. + +At the moment cluster networking is not working and Joe mentions that +the last time he has touched it, it worked perfectly fine. However, the +last time he touched it there was only 1 node, and therefore no +inter-node communications. + +Joe thinks he remembers installing 3 nodes, however, we at the DevOps +team believe this to be a marijuana dream + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-02-08.rst b/docs/meeting_notes/2023-02-08.rst new file mode 100644 index 0000000..c65193c --- /dev/null +++ b/docs/meeting_notes/2023-02-08.rst @@ -0,0 +1,17 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. + +Agenda +------ + +- Investigation into deploying a VPN tool such as WireGuard to have + inter-node communication between the Netcup hosts. + +*(The rest of this meeting was mostly about NetKube, which has since +been scrapped)*. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-02-21.rst b/docs/meeting_notes/2023-02-21.rst new file mode 100644 index 0000000..c30c133 --- /dev/null +++ b/docs/meeting_notes/2023-02-21.rst @@ -0,0 +1,31 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. + +Agenda +------ + +Reusable status embed workflows +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Further discussion with Bella followed +- Upstream pull request can be found at + `python-discord/bot#2400 <https://github.com/python-discord/bot/pull/2400>`__ + +Local vagrant testing setup +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Our new `testing setup using Vagrant + VMs <https://github.com/python-discord/infra/pull/78>`__ has been + merged. + +A visit from Mina +~~~~~~~~~~~~~~~~~ + +Mina checked in to make sure we’re operating at peak Volkswagen-like +efficiency. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-02-28.rst b/docs/meeting_notes/2023-02-28.rst new file mode 100644 index 0000000..fe7dc47 --- /dev/null +++ b/docs/meeting_notes/2023-02-28.rst @@ -0,0 +1,16 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. + +Agenda +------ + +- Black knight’s CI & dependabot configuration has been mirrored across + all important repositories + +- The test server has been updated for the new configuration + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-05-16.rst b/docs/meeting_notes/2023-05-16.rst new file mode 100644 index 0000000..bafa941 --- /dev/null +++ b/docs/meeting_notes/2023-05-16.rst @@ -0,0 +1,15 @@ +DevOps Meeting Notes +==================== + +*Migrated from Notion*. 
+ +Agenda +------ + +- Bella set up `CI bot docker image + build <https://github.com/python-discord/bot/pull/2603>`__ to make + sure that wheels are available. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-07-11.rst b/docs/meeting_notes/2023-07-11.rst new file mode 100644 index 0000000..6c51f1c --- /dev/null +++ b/docs/meeting_notes/2023-07-11.rst @@ -0,0 +1,41 @@ +DevOps Meeting Notes +==================== + +Participants +------------ + +- Chris, Johannes, Bella, Bradley + +Agenda +------ + +New Ansible setup +~~~~~~~~~~~~~~~~~ + +Chris presented the new Ansible setup he’s been working on. We plan to +use WireGuard for networking. We agreed that selfhosting Kubernetes is +not the way to go. In general, the main benefit from switching away to +Linode to Netcup is going to be a ton more resources from the Netcup +root servers we were given. The original issue with Linode’s AKS of +constantly having problems with volumes has not been present for a +while. Chris mentions the one remaining issue is that we’re at half our +memory capacity just at idle. + +It’s our decision where to go from here - we can stick to the Kubernetes +setup or decide on migrating to the Ansible setup. But we have bare +metal access to the Netcup hosts, which makes e.g. managing databases a +lot easier. Chris mentions the possibility to only use Netcup for our +persistence and Linode AKS for anything else, but this has the issue of +us relying on two sponsors for our infrastructure instead of one. + +PostgreSQL was set up to run on ``lovelace``. + +Decision +~~~~~~~~ + +**It was decided to hold a vote on the core development channel, which +will be evaluated next week to see how to proceed with the setup**. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-07-18.rst b/docs/meeting_notes/2023-07-18.rst new file mode 100644 index 0000000..28f6c88 --- /dev/null +++ b/docs/meeting_notes/2023-07-18.rst @@ -0,0 +1,42 @@ +DevOps Meeting Notes +==================== + +Secret management improvements +------------------------------ + +To allow for **better management of our Kubernetes secrets**, Chris set +out to configure ``git-crypt`` in GPG key mode. For comparison, the +previous approach was that secrets were stored in Kubernetes only and +had to be accessed via ``kubectl``, and now ``git-crypt`` allows us to +transparently work with the files in unencrypted manner locally, whilst +having them secure on the remote, all via ``.gitattributes``. + +The following people currently have access to this: + +- Johannes Christ [email protected] + (``8C05D0E98B7914EDEBDCC8CC8E8E09282F2E17AF``) +- Chris Lovering [email protected] + (``1DA91E6CE87E3C1FCE32BC0CB6ED85CC5872D5E4``) +- Joe Banks [email protected] (``509CDFFC2D0783A33CF87D2B703EE21DE4D4D9C9``) + +For Hassan, we are still waiting on response regarding his GPG key +accuracy. + +The pull request for the work can be found `at +python-discord/kubernetes#156 <https://github.com/python-discord/kubernetes/pull/156>`__. + +**To have your key added, please contact any of the existing key +holders**. More documentation on this topic is pending to be written, +see +`python-discord/kubernetes#157 <https://github.com/python-discord/kubernetes/issues/157>`__. 
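For reference, adding a key boils down to an existing key holder importing and
verifying the new member's public key, then re-encrypting the repository key
for them. A rough sketch (the key file and ``<KEYID>`` are placeholders):

.. code:: sh

   # Import and verify the new member's public key.
   gpg --import new-member.asc
   gpg --fingerprint <KEYID>

   # Grant access; git-crypt commits a copy of the repository key
   # encrypted to the new GPG key.
   git-crypt add-gpg-user <KEYID>

   # The new member can then decrypt their working tree locally.
   git-crypt unlock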
+ +Infrastructure migration decision +--------------------------------- + +The voting started `last week <./2023-07-11.md>`__ will be properly +talked about `next week <./2023-07-25.md>`__, so far it looks like we’re +definitely not selfhosting Kubernetes at the very least. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-07-25.rst b/docs/meeting_notes/2023-07-25.rst new file mode 100644 index 0000000..be4d20c --- /dev/null +++ b/docs/meeting_notes/2023-07-25.rst @@ -0,0 +1,4 @@ +DevOps Meeting Notes +==================== + +Postponed to next week due to absence. diff --git a/docs/meeting_notes/2023-08-01.rst b/docs/meeting_notes/2023-08-01.rst new file mode 100644 index 0000000..925417a --- /dev/null +++ b/docs/meeting_notes/2023-08-01.rst @@ -0,0 +1,66 @@ +DevOps Meeting Notes +==================== + +Agenda +------ + +Infrastructure migration +~~~~~~~~~~~~~~~~~~~~~~~~ + +The vote is tied. Chris and Johannes decided that we should test out +migrating the PostgreSQL database at the very least. We then have more +freedom about our data. What we need to do: + +- Allow PostgreSQL connections from LKE’s static IPs in the firewall +- Whitelist the static IPs from Linode via ``pg_hba.conf`` +- Schedule downtime for the PostgreSQL database +- **At downtime** + + - Take writers offline + - Dump database from Linode into Netcup + - Update all the client’s database URLs to point to netcup + - Restart writers + +We want to rely on the restore to create everything properly, but will +need to test run this beforehand. The following ``pg_virtualenv`` +command has showcased that it works properly: + +.. code:: sh + + kubectl exec -it postgres-... -- pg_dumpall -U pythondiscord \ + | pg_virtualenv psql -v ON_ERROR_STOP=1 + +Note however that the database extension ``pg_repack`` needs to be +installed. + +Before we can get started, we need to allow the PostgreSQL role to +configure ``pg_hba.conf`` and ``postgresql.conf`` entries. + +Meeting notes +~~~~~~~~~~~~~ + +We’re using GitHub at the moment. Some are left in Notion. We should +migrate these to GitHub to have a uniform interface: Johannes will pick +up +`python-discord/infra#108 <https://github.com/python-discord/infra/issues/108>`__ +to merge them together into Git, as its more open than Notion. + +Ansible lint failures in the infra repository +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Excluding the vault was found as the working solution here, as +implemented by Chris. + +Kubernetes repository pull requests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +These were cleaned up thanks to Chris. + +Roadmap review & planning +------------------------- + +- Chris will prepare the PostgreSQL configuration mentioned above. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-08-08.rst b/docs/meeting_notes/2023-08-08.rst new file mode 100644 index 0000000..4b06d5e --- /dev/null +++ b/docs/meeting_notes/2023-08-08.rst @@ -0,0 +1,54 @@ +DevOps Meeting Notes +==================== + +Agenda +------ + +- Configuration of PostgreSQL and the PostgreSQL exporter + + - **No time so far**. Chris has been busy with renovating his living + room, and Johannes has been busy with renovating his bedroom. + Bradley prefers to remain quiet. + + - Chris will try to work on this in the coming week and will try to + have Bella around as well, since he wanted to join the setup. + +- **Potential slot for GPG key signing of DevOps members**. External + verification will be necessary. 
+ + - Skipped. No webcam on Chris. + +- We need to assign a **librarian** to keep our documents organized + according to a system. Johannes is happy to do this for now. + + - Let’s move the existing documentation from the Kubernetes + repository into the infra repository. See + `kubernetes#161 <https://github.com/python-discord/kubernetes/issues/161>`__. + + - **Our Notion DevOps space is full of junk**. Outside of that, it’s + not open to read for outside contributors, and does not leave much + choice over which client to use for editing content. + + - Chris agrees, without looking on it - just from memory. We + should move it to the infra repository. (The meeting notes have + already been transferred). + + - Bella suggests to add some automation to make keeping everything + in clean order less tedious. + +- We may want to integrate the **Kubernetes repository** and the infra + repository together altogether, however there are a lot of + repositories referencing the deployment manifests that would need to + be updated. + + - Chris mentions that regardless of what we do, we should - at the + very least move all documentation into the ``infra`` repository, + including the static site generator. At the moment we’re using + Jekyll but we’re open to trying alternatives such as Hugo. + +- We closed some issues and pull requests in the repositories for late + spring cleaning. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2 autoindent conceallevel=2: --> diff --git a/docs/meeting_notes/2023-08-22.rst b/docs/meeting_notes/2023-08-22.rst new file mode 100644 index 0000000..67f53e9 --- /dev/null +++ b/docs/meeting_notes/2023-08-22.rst @@ -0,0 +1,40 @@ +DevOps Meeting Notes +==================== + +.. raw:: html + + <!-- + + Useful links + + - Infra open issues: https://github.com/python-discord/infra/issues + + - infra open pull requests: https://github.com/python-discord/infra/pulls + + - *If* any open issue or pull request needs discussion, why was the existing + asynchronous logged communication over GitHub insufficient? + + --> + +Agenda +------ + +- Bella said he is on the streets. **We should start a gofundme**. + + - After some more conversation this just means he is on vacation and + currently taking a walk. + +- Chris has been busy with turning his living room into a picasso art + collection, Johannes has been busy with renovating his bedroom, and + Bella is not home. + + - Our next priority is winning. + +- We checked out some issues with documentation generation in + ``bot-core`` that Bella has mentioned. We managed to fix one issue + with pydantic by adding it to an exclude list but ran into another + problem next. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-08-29.rst b/docs/meeting_notes/2023-08-29.rst new file mode 100644 index 0000000..8e0a7d4 --- /dev/null +++ b/docs/meeting_notes/2023-08-29.rst @@ -0,0 +1,65 @@ +DevOps Meeting Notes +==================== + +.. raw:: html + + <!-- + + Useful links + + - Infra open issues: https://github.com/python-discord/infra/issues + + - infra open pull requests: https://github.com/python-discord/infra/pulls + + - *If* any open issue or pull request needs discussion, why was the existing + asynchronous logged communication over GitHub insufficient? + + --> + +Agenda +------ + +- **Bella is still on the streets** + + - The Python Discord Bella On The Streets Fundraising Campaign Q3 + 2023 has not been successful so far. 
To help Bella receive French + citizenship, Joe has put up a French flag behind himself in the + meeting. + + - Joe corrects my sarcasm. It is an Italian flag, not a French + flag. The reason for this flag is that his new prime interest + on campus was born in Italy. + +- **The SnekBox CI build is pretty slow** + + - Guix and Nix are not alternatives. Neither is Ubuntu + + - We use pyenv to build multiple Python versions for a new feature + + - The feature is not rolled out yet + + - Part of the problem is that we build twice in the ``build`` and + the ``deploy`` stage + + - On rollout, Joe tested it and it works fine + +- No update on the Hugo build yet + +- For snowflake, Johannes will write a proposal to the admins for + hosting it + + - We should consider talking about the following points: + + - statistically ~8% of Tor traffic is problematic (10% of traffic + is to hidden services, 80% of hidden service traffic is for + illegal services) + + - overall the project’s position and our ideal is to help people + for a good cause + + - all traffic is forwarded to the Tor network, the service is + lightweight and only proxies encrypted traffic there + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-09-05.rst b/docs/meeting_notes/2023-09-05.rst new file mode 100644 index 0000000..2c80c2e --- /dev/null +++ b/docs/meeting_notes/2023-09-05.rst @@ -0,0 +1,53 @@ +DevOps Meeting Notes +==================== + +.. raw:: html + + <!-- + + Useful links + + - Infra open issues: https://github.com/python-discord/infra/issues + + - infra open pull requests: https://github.com/python-discord/infra/pulls + + - *If* any open issue or pull request needs discussion, why was the existing + asynchronous logged communication over GitHub insufficient? + + --> + +Agenda +------ + +- No update on the Hugo build yet + +- Johannes wrote a proposal for snowflake proxy to be deployed to our + netcup hosts + + - Admins discussed and came to the conclusion that since we don’t + own the servers, we got the servers from netcup as a sponsorship + to host our infra, so using them to host something that isn’t our + infra doesn’t seem right. + +- Lots of dependabot PRs closed + + - https://github.com/search?q=org%3Apython-discord++is%3Apr+is%3Aopen+label%3A%22area%3A+dependencies%22&type=pullrequests&ref=advsearch + - Closed ~50% of PRs + +- Workers repo has had its CI rewritten, all workers have consistent + package.json, scripts, and using the new style of cloudflare workers + which don’t use webpack + +- Metricity updated to SQLAlchemy 2 + +- Olli CI PR is up + + - https://github.com/python-discord/olli/pull/25 + +- Sir-Robin pydantic constants PR is up + + - https://github.com/python-discord/sir-robin/pull/93 + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2023-09-12.rst b/docs/meeting_notes/2023-09-12.rst new file mode 100644 index 0000000..7bfcd1a --- /dev/null +++ b/docs/meeting_notes/2023-09-12.rst @@ -0,0 +1,73 @@ +DevOps Meeting Notes +==================== + +.. raw:: html + + <!-- + + Useful links + + - Infra open issues: https://github.com/python-discord/infra/issues + + - infra open pull requests: https://github.com/python-discord/infra/pulls + + - *If* any open issue or pull request needs discussion, why was the existing + asynchronous logged communication over GitHub insufficient? + + --> + +Agenda +------ + +- We have reason to believe that Bella is still on the streets. 
Worse, + Bella is not available at the moment, leading us to believe that + Bella has still not found a home. + + - Eight minutes into the meeting, Bella joins, complaining about the + bad internet. He mentions he is still on the streets (this may + contribute to the bad internet factor). + +- Chris made Mina leave with his repeated comments about Bella being + homeless, reminding Mina of the growing unemployment rate within the + DevOps team. As head of HR she cannot further support this matter. + +- About #139, Bella mentions that online websites may cover the same + need that we have, but it may not be really useful for having it as a + command. + + - Chris adds that “if someone wants to do it, I don’t mind” and “I + don’t think it would be very useful for a command, but I think it + would be fun to learn for someone implementing it”. As long as + whoever is implementing is is aware that it would not be used too + much, it would be fine. + +- No progress on the hugo front + +- Our email service with workers will be forward only + + - With postfix you will be able to reply. Joe wants to have an + excuse to play with Cloudflare workers though. + +- `50 open pull requests from + dependabot <https://github.com/search?q=org%3Apython-discord++is%3Apr+is%3Aopen+author%3Aapp%2Fdependabot&type=pullrequests&ref=advsearch>`__ + + - Tip from The Man: press ^D to make a bookmark in your browser + + - “Those can just be blindly merged” - Chris + +- Grouping of dependencies: Dependabot now allows you to group together + multiple dependency updates into a single pull request. + + - Possible approaches suggested: Group all the docker updates + together, group any linting dependencies together (would just + require a big RegEx). Dependabot natively works with its own + dependency groups here (e.g. Docker, Pip). + +- Mr. Hemlock wants to raise his roof: It’s his project for this + Autumn. We, the team, are looking forward to his project - especially + Bella, who is currently looking for housing. “It’s all coming + together”, said Chris to the situation. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/2024-07-02.rst b/docs/meeting_notes/2024-07-02.rst new file mode 100644 index 0000000..029d53e --- /dev/null +++ b/docs/meeting_notes/2024-07-02.rst @@ -0,0 +1,171 @@ +DevOps Meeting Notes +==================== + +.. raw:: html + + <!-- + + Useful links + + - Infra open issues: https://github.com/python-discord/infra/issues + + - infra open pull requests: https://github.com/python-discord/infra/pulls + + - *If* any open issue or pull request needs discussion, why was the existing + asynchronous logged communication over GitHub insufficient? + + --> + +Attendees +--------- + +Joe and Johannes. + +Chris unfortunately died in a fatal train accident and could not attend +the meeting. This incident will be rectified in the next release, +“Lovering 2.0: Immortability”. + +Bella is out on the streets again. We are waiting for approval from the +Python Discord admins to run another fundraiser. + +Agenda +------ + +- **Configuration of renovate** (Joe) + + We are replacing dependabot with renovatebot. Johannes welcomes this + decision. Joe says we are looking for automatic deployment from + Kubernetes to make sure that any updates are automatically deployed. + + **Conclusion**: Implemented. + +- **Resizing Netcup servers** (Joe, Johannes) + + We can probably get rid of turing, assess what else we want to deploy + on lovelace, and then ask for a resize. 
+ + **Conclusion**: Create issue to move things off turing, remove it + from the inventory, remove it from documentation, power it off, then + have Joe ask for server removal. + +- **Updating the public statistics page** (Johannes) + + Discussing and showcasing possible alternatives to the current + infrastructure powering https://stats.pythondiscord.com via the + https://github.com/python-discord/public-stats repository. Johannes + presents his current scripts that cuddle RRDTool into loading data + out of metricity, Joe says we will discuss with Chris what to do + here. + + The likely way going forward will be that *we will open an issue to + set it up*, the setup will contain an Ansible role to deploy the + cronjob and the script onto lovelace alongside with the ``rrdtool`` + PostgreSQL user. + + **Conclusion**: Johannes will create an issue and codify the setup in + Ansible. + +- **New blog powered by Hugo** (Johannes) + + Our current Ghost-powered blog is a tiny bit strange, and the + onboarding ramp to contribute articles is large. We want to migrate + this to Hugo - Johannes is leading the effort on it. The main work + will be building an appropriate theme, as no nicely suitable + replacement theme has been found so far. Front-end contributors would + be nice for this, although currently everything is still local on my + machine. + + Joe mentions that we don’t need to take anything particularly similar + to the current Ghost theme, just some vague resemblance would be + nice. Most of the recommended Hugo themes would probably work. + Johannes will check it out further. + + **Conclusion**: Try the `hugo-casper-two + theme <https://github.com/eueung/hugo-casper-two>`__ and report back. + +- **Finger server** (Joe, Johannes) + + Joe recently proposed `the deployment of a finger + server <https://github.com/python-discord/infra/pull/373>`__. Do we + want this and if yes, how are we going to proceed with this? If we do + not want any, running the ``pinky`` command locally or via ``ssh`` + would be a sound idea. We also need to consider whether members will + update their files regularly - we may want to incorporate + functionality for this into e.g. King Arthur. + + Joe says that we shouldn’t put a lot of development effort into it, + it would be simply a novelty thing. + + **Conclusion**: This is a nice cheap win for some fun which should + just be a simple Python file (via Twisted’s Finger protocol support + or whatever) that connects to LDAP (see Keycloak authentication + server) and outputs information. We could possibly integrate this + into King Arthur as well, so the querying workflow could look like KA + -> fingerd -> LDAP, or people could use finger commands directly. + +- **Keycloak authentication server** (Joe) + + Joe mentions that we are deploying a Keycloak server because for some + members authenticating via GitHub is cumbersome, for instance because + their GitHub account is connected to their employer’s GitHub + Enterprise installation. We could hook up a finger server to the LDAP + endpoint. Joe also mentions that we might want to set up e-mail + forwarding from pydis addresses to users via the user database that + will be stored in Keycloak. + + Currently we only have a Keycloak installation that stores items in + PostgreSQL. This installation can federate to LDAP - we would simply + have to settle on some directory service backend. Joe suggests + FreeIPA because he’s familar with it (including the Keycloak + integration). The problem is that it doesn’t work on Debian. 
The + alternative proposal, given that we’re saving ~50$/month on Linode, + would be spinning up a Rocky VM with FreeIPA on it on Linode (we + already have the budget) or ask Netcup for another VM. Ultimately, + the system to run FreeIPA would be something CentOS-based. One aspect + to consider is networking security: in Linode we could use their + private cloud endpoint feature to securely expose the LDAP server to + Keycloak and other services in Kubernetes, if we were to run it in + Netcup, we would need to use a similar setup to what we currently + have with PostgreSQL. + + Any Python Discord user would be managed in LDAP, and Keycloak has + the necessary roles to write back into LDAP. Keeping the users in + FreeIPA up-to-date would be a somewhat manual procedure. Joe’s plan + was to pick up the user’s Discord username and use + ``[email protected]`` as their name and do account setup as part of + the staff onboarding. + + **Conclusion**: Will wait for Chris to discuss this further, but we + simply need to decide where we want to run the LDAP service. + +- **Flux CD** (Joe) + + Joe proposes deploying `flux <https://fluxcd.io/>`__ as a way to + improve the way we manage our CI/CD. We want the cluster to be able + to synchronize its state with the git repository. There are some + manifests in the repository currently that are not in sync with the + cluster version. + + **Conclusion**: Approved, Joe will create an issue and do it. + +- **Polonium** (Chris) + + Question came up regarding why the bot does not write to the database + directly. Joe said it’s not perfect to have the bot write to it + directly - in metricity it works but it’s not perfect. Chris probably + had good reason: separation of intent. + + **Conclusion**: Approved, write to R&D for financing. + +- **Rethinking Bella: Suggested measures to gain autonomy** (Chris) + + Chris will present our current plans to biologically re-think and + improve Bella’s current architecture by means of + hypertrophy-supported capillary enlargements, with the final goal of + gaining complete control and ownership over the World Economic Forum + by 2026. As Bella is currently on parental leave, we will send him + the result of this voting via NNCP. + +.. raw:: html + + <!-- vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/meeting_notes/index.rst b/docs/meeting_notes/index.rst new file mode 100644 index 0000000..cf7bb14 --- /dev/null +++ b/docs/meeting_notes/index.rst @@ -0,0 +1,31 @@ +Meeting notes +============= + +Minutes for previous Devops meetings. + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + template + 2022-04-07 + 2022-09-18 + 2022-10-05 + 2022-10-19 + 2022-10-26 + 2022-11-02 + 2022-11-23 + 2023-02-08 + 2023-02-21 + 2023-02-28 + 2023-05-16 + 2023-07-11 + 2023-07-18 + 2023-07-25 + 2023-08-01 + 2023-08-08 + 2023-08-22 + 2023-08-29 + 2023-09-05 + 2023-09-12 + 2024-07-02 diff --git a/docs/meeting_notes/template.rst b/docs/meeting_notes/template.rst new file mode 100644 index 0000000..e8bc719 --- /dev/null +++ b/docs/meeting_notes/template.rst @@ -0,0 +1,19 @@ +DevOps Meeting Notes +==================== + +.. + Useful links + + - Infra Kanban board: https://github.com/orgs/python-discord/projects/17/views/4 + + - Infra open issues: https://github.com/python-discord/infra/issues + + - infra open pull requests: https://github.com/python-discord/infra/pulls + + - *If* any open issue or pull request needs discussion, why was the existing + asynchronous logged communication over GitHub insufficient? + +Agenda +------ + +.. 
vim: set textwidth=80 sw=2 ts=2: diff --git a/docs/onboarding/access.rst b/docs/onboarding/access.rst new file mode 100644 index 0000000..940cd8b --- /dev/null +++ b/docs/onboarding/access.rst @@ -0,0 +1,50 @@ +Access table +============ + ++--------------------+-------------------------+-----------------------+ +| **Resource** | **Description** | **Keyholders** | ++====================+=========================+=======================+ +| Linode Kubernetes | The primary cluster | Hassan, Joe, Chris, | +| Cluster | where all resources are | Leon, Sebastiaan, | +| | deployed. | Johannes | ++--------------------+-------------------------+-----------------------+ +| Linode Dashboard | The online dashboard | Joe, Chris | +| | for managing and | | +| | allocating resources | | +| | from Linode. | | ++--------------------+-------------------------+-----------------------+ +| Netcup Dashboard | The dashboard for | Joe, Chris | +| | managing and allocating | | +| | resources from Netcup. | | ++--------------------+-------------------------+-----------------------+ +| Netcup servers | Root servers provided | Joe, Chris, Bella, | +| | by the Netcup | Johannes | +| | partnership. | | ++--------------------+-------------------------+-----------------------+ +| Grafana | The primary aggregation | Admins, Moderators, | +| | dashboard for most | Core Developers and | +| | resources. | DevOps (with varying | +| | | permissions) | ++--------------------+-------------------------+-----------------------+ +| Prometheus | The Prometheus query | Hassan, Joe, | +| Dashboard | dashboard. Access is | Johannes, Chris | +| | controlled via | | +| | Cloudflare Access. | | ++--------------------+-------------------------+-----------------------+ +| Alertmanager | The alertmanager | Hassan, Joe, | +| Dashboard | control dashboard. | Johannes, Chris | +| | Access is controlled | | +| | via Cloudflare Access. | | ++--------------------+-------------------------+-----------------------+ +| ``git-crypt``\ ed | ``git-crypt`` is used | Chris, Joe, Hassan, | +| files in infra | to encrypt certain | Johannes, Xithrius | +| repository | files within the | | +| | repository. At the time | | +| | of writing this is | | +| | limited to kubernetes | | +| | secret files. | | ++--------------------+-------------------------+-----------------------+ +| Ansible Vault | Used to store sensitive | Chris, Joe, Johannes, | +| | data for the Ansible | Bella | +| | deployment | | ++--------------------+-------------------------+-----------------------+ diff --git a/docs/onboarding/index.rst b/docs/onboarding/index.rst new file mode 100644 index 0000000..3929d7e --- /dev/null +++ b/docs/onboarding/index.rst @@ -0,0 +1,17 @@ +Onboarding +========== + +This section documents who manages which access to our DevOps resources, +and how access is managed. + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + access + resources + rules + tools + +.. vim: set textwidth=80 sw=2 ts=2: --> diff --git a/docs/onboarding/resources.rst b/docs/onboarding/resources.rst new file mode 100644 index 0000000..0ec846b --- /dev/null +++ b/docs/onboarding/resources.rst @@ -0,0 +1,28 @@ +Resources +========= + +The following is a collection of important reference documents for the +DevOps team. + +`Infra Repo <https://github.com/python-discord/infra>`__ +-------------------------------------------------------- + +This GitHub repo contains most of the manifests and configuration +applies to our cluster. 
It’s kept up to date manually and is considered +a source of truth for what we should have in the cluster. + +It is mostly documented, but improvements for unclear or outdated +aspects is always welcome. + +`Knowledge base <https://python-discord.github.io/infra/>`__ +------------------------------------------------------------ + +Deployed using GH pages, source can be found in the docs directory of +the k8s repo. + +This includes: + +- Changelogs +- Post-mortems +- Common queries +- Runbooks diff --git a/docs/onboarding/rules.rst b/docs/onboarding/rules.rst new file mode 100644 index 0000000..bd0ea0e --- /dev/null +++ b/docs/onboarding/rules.rst @@ -0,0 +1,16 @@ +Rules +===== + +The rules any DevOps team member must follow. + +1. LMAO - **L**\ ogging, **M**\ onitoring, **A**\ lerting, + **O**\ bservability +2. Modmail is the greatest piece of software ever written +3. Modmail needs at least 5 minutes to gather all its greatness at + startup +4. We never blame Chris, it’s always <@233481908342882304>’s fault +5. LKE isn’t bad, it’s your fault for not paying for the high + availability control plane +6. Our software is never legacy, it’s merely well-aged +7. Ignore these rules (however maybe not 1, 1 seems important to + remember) diff --git a/docs/onboarding/tools.rst b/docs/onboarding/tools.rst new file mode 100644 index 0000000..811f1ad --- /dev/null +++ b/docs/onboarding/tools.rst @@ -0,0 +1,50 @@ +Tools +===== + +We use a few tools to manage, monitor, and interact with our +infrastructure. Some of these tools are not unique to the DevOps team, +and may be shared by other teams. + +Most of these are gated behind a Cloudflare Access system, which is +accessible to the `DevOps +Team <https://github.com/orgs/python-discord/teams/devops>`__ on GitHub. +These are marked with the ☁️ emoji. If you don’t have access, please +contact Chris or Joe. + +`Grafana <https://grafana.pydis.wtf/>`__ +---------------------------------------- + +Grafana provides access to some of the most important resources at your +disposal. It acts as an aggregator and frontend for a large amount of +data. These range from metrics, to logs, to stats. Some of the most +important are listed below: + +- Service Logs/All App Logs Dashboard + + Service logs is a simple log viewer which gives you access to a large + majority of the applications deployed in the default namespace. The + All App logs dashboard is an expanded version of that which gives you + access to all apps in all namespaces, and allows some more in-depth + querying. + +- Kubernetes Dashboard + + This dashboard gives quick overviews of all the most important + metrics of the Kubernetes system. For more detailed information, + check out other dashboard such as Resource Usage, NGINX, and Redis. + +Accessed via a GitHub login, with permission for anyone in the dev-core +or dev-ops team. + +`Prometheus Dashboard <https://prometheus.pydis.wtf/>`__ (☁️)) +-------------------------------------------------------------- + +This provides access to the Prometheus query console. You may also enjoy +the `Alertmanager Console <https://alertmanager.pydis.wtf/>`__. + +`King Arthur <https://github.com/python-discord/king-arthur/>`__ +---------------------------------------------------------------- + +King Arthur is a discord bot which provides information about, and +access to our cluster directly in discord. Invoke its help command for +more information (``M-x help``). 
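Most of the web tools above sit behind Cloudflare Access, so it is worth
remembering that direct cluster access works as a fallback when they are
unreachable. A couple of illustrative commands (the deployment name is
only an example):

.. code:: bash

   # Tail recent logs for a deployment without going through Grafana.
   kubectl logs deployment/bot --since=1h | grep -i error

   # Quick resource overview when the dashboards are down.
   kubectl top pods --all-namespaces --sort-by='memory'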
diff --git a/docs/postmortems/2020-12-11-all-services-outage.rst b/docs/postmortems/2020-12-11-all-services-outage.rst new file mode 100644 index 0000000..9c29303 --- /dev/null +++ b/docs/postmortems/2020-12-11-all-services-outage.rst @@ -0,0 +1,121 @@ +2020-12-11: All services outage +=============================== + +At **19:55 UTC, all services became unresponsive**. The DevOps were +already in a call, and immediately started to investigate. + +Postgres was running at 100% CPU usage due to a **VACUUM**, which caused +all services that depended on it to stop working. The high CPU left the +host unresponsive and it shutdown. Linode Lassie noticed this and +triggered a restart. + +It did not recover gracefully from this restart, with numerous core +services reporting an error, so we had to manually restart core system +services using Lens in order to get things working again. + +⚠️ Leadup +--------- + +*List the sequence of events that led to the incident* + +Postgres triggered a **AUTOVACUUM**, which lead to a CPU spike. This +made Postgres run at 100% CPU and was unresponsive, which caused +services to stop responding. This lead to a restart of the node, from +which we did not recover gracefully. + +🥏 Impact +--------- + +*Describe how internal and external users were impacted during the +incident* + +All services went down. Catastrophic failure. We did not pass go, we did +not collect $200. + +- Help channel system unavailable, so people are not able to + effectively ask for help. +- Gates unavailable, so people can’t successfully get into the + community. +- Moderation and raid prevention unavailable, which leaves us + defenseless against attacks. + +👁️ Detection +------------ + +*Report when the team detected the incident, and how we could improve +detection time* + +We noticed that all PyDis services had stopped responding, +coincidentally our DevOps team were in a call at the time, so that was +helpful. + +We may be able to improve detection time by adding monitoring of +resource usage. To this end, we’ve added alerts for high CPU usage and +low memory. + +🙋🏿♂️ Response +---------------- + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the incident. + +We noticed our node was entirely unresponsive and within minutes a +restart had been triggered by Lassie after a high CPU shutdown occurred. + +The node came back and we saw a number of core services offline +(e.g. Calico, CoreDNS, Linode CSI). + +**Obstacle: no recent database back-up available** + +🙆🏽♀️ Recovery +----------------- + +*How was the incident resolved? How can we improve future mitigation +times?* + +Through `Lens <https://k8slens.dev/>`__ we restarted core services one +by one until they stabilised, after these core services were up other +services began to come back online. + +We finally provisioned PostgreSQL which had been removed as a component +before the restart (but too late to prevent the CPU errors). Once +PostgreSQL was up we restarted any components that were acting buggy +(e.g. site and bot). + +🔎 Five Why’s +------------- + +*Run a 5-whys analysis to understand the true cause of the incident.* + +- Major service outage +- **Why?** Core service failures (e.g. 
Calico, CoreDNS, Linode CSI) +- **Why?** Kubernetes worker node restart +- **Why?** High CPU shutdown +- **Why?** Intensive PostgreSQL AUTOVACUUM caused a CPU spike + +🌱 Blameless root cause +----------------------- + +*Note the final root cause and describe what needs to change to prevent +reoccurrance* + +🤔 Lessons learned +------------------ + +*What did we learn from this incident?* + +- We must ensure we have working database backups. We are lucky that we + did not lose any data this time. If this problem had caused volume + corruption, we would be screwed. +- Sentry is broken for the bot. It was missing a DSN secret, which we + have now restored. +- The https://sentry.pydis.com redirect was never migrated to the + cluster. **We should do that.** + +☑️ Follow-up tasks +------------------ + +*List any tasks we’ve created as a result of this incident* + +- ☒ Push forward with backup plans diff --git a/docs/postmortems/2020-12-11-postgres-conn-surge.rst b/docs/postmortems/2020-12-11-postgres-conn-surge.rst new file mode 100644 index 0000000..6ebcb01 --- /dev/null +++ b/docs/postmortems/2020-12-11-postgres-conn-surge.rst @@ -0,0 +1,130 @@ +2020-12-11: Postgres connection surge +===================================== + +At **13:24 UTC,** we noticed the bot was not able to infract, and +`pythondiscord.com <http://pythondiscord.com>`__ was unavailable. The +DevOps team started to investigate. + +We discovered that Postgres was not accepting new connections because it +had hit 100 clients. This made it unavailable to all services that +depended on it. + +Ultimately this was resolved by taking down Postgres, remounting the +associated volume, and bringing it back up again. + +⚠️ Leadup +--------- + +*List the sequence of events that led to the incident* + +The bot infractions stopped working, and we started investigating. + +🥏 Impact +--------- + +*Describe how internal and external users were impacted during the +incident* + +Services were unavailable both for internal and external users. + +- The Help Channel System was unavailable. +- Voice Gate and Server Gate were not working. +- Moderation commands were unavailable. +- Python Discord site & API were unavailable. CloudFlare automatically + switched us to Always Online. + +👁️ Detection +------------ + +*Report when the team detected the incident, and how we could improve +detection time* + +We noticed HTTP 524s coming from CloudFlare, upon attempting database +connection we observed the maximum client limit. + +We noticed this log in site: + +.. code:: yaml + + django.db.utils.OperationalError: FATAL: sorry, too many clients already + +We should be monitoring number of clients, and the monitor should alert +us when we’re approaching the max. That would have allowed for earlier +detection, and possibly allowed us to prevent the incident altogether. + +We will look at +`wrouesnel/postgres_exporter <https://github.com/wrouesnel/postgres_exporter>`__ +for monitoring this. + +🙋🏿♂️ Response +---------------- + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the incident. The obstacles were mostly a lack of +a clear response strategy. + +We should document our recovery procedure so that we’re not so dependent +on Joe Banks should this happen again while he’s unavailable. + +🙆🏽♀️ Recovery +---------------- + +*How was the incident resolved? How can we improve future mitigation?* + +- Delete PostgreSQL deployment ``kubectl delete deployment/postgres`` +- Delete any remaining pods, WITH force. 
+ ``kubectl delete <pod name> --force --grace-period=0`` +- Unmount volume at Linode +- Remount volume at Linode +- Reapply deployment ``kubectl apply -f postgres/deployment.yaml`` + +🔎 Five Why’s +------------- + +*Run a 5-whys analysis to understand the true cause of the incident.* + +- Postgres was unavailable, so our services died. +- **Why?** Postgres hit max clients, and could not respond. +- **Why?** Unknown, but we saw a number of connections from previous + deployments of site. This indicates that database connections are not + being terminated properly. Needs further investigation. + +🌱 Blameless root cause +----------------------- + +*Note the final root cause and describe what needs to change to prevent +reoccurrance* + +We’re not sure what the root cause is, but suspect site is not +terminating database connections properly in some cases. We were unable +to reproduce this problem. + +We’ve set up new telemetry on Grafana with alerts so that we can +investigate this more closely. We will be let know if the number of +connections from site exceeds 32, or if the total number of connections +exceeds 90. + +🤔 Lessons learned +------------------ + +*What did we learn from this incident?* + +- We must ensure the DevOps team has access to Linode and other key + services even if our Bitwarden is down. +- We need to ensure we’re alerted of any risk factors that have the + potential to make Postgres unavailable, since this causes a + catastrophic outage of practically all services. +- We absolutely need backups for the databases, so that this sort of + problem carries less of a risk. +- We may need to consider something like + `pg_bouncer <https://wiki.postgresql.org/wiki/PgBouncer>`__ to manage + a connection pool so that we don’t exceed 100 *legitimate* clients + connected as we connect more services to the postgres database. + +☑️ Follow-up tasks +------------------ + +*List any tasks we should complete that are relevant to this incident* + +- ☒ All database backup diff --git a/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.rst b/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.rst new file mode 100644 index 0000000..5852c46 --- /dev/null +++ b/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.rst @@ -0,0 +1,117 @@ +2021-01-10: Primary Kubernetes node outage +========================================== + +We had an outage of our highest spec node due to CPU exhaustion. The +outage lasted from around 20:20 to 20:46 UTC, but was not a full service +outage. + +⚠️ Leadup +--------- + +*List the sequence of events that led to the incident* + +I ran a query on Prometheus to try figure out some statistics on the +number of metrics we are holding, this ended up scanning a lot of data +in the TSDB database that Prometheus uses. + +This scan caused a CPU exhaustion which caused issues with the +Kubernetes node status. + +🥏 Impact +--------- + +*Describe how internal and external users were impacted during the +incident* + +This brought down the primary node which meant there was some service +outage. Most services transferred successfully to our secondary node +which kept up some key services such as the Moderation bot and Modmail +bot, as well as MongoDB. + +👁️ Detection +------------ + +*Report when the team detected the incident, and how we could improve +detection time* + +This was noticed when Discord services started having failures. The +primary detection was through alerts though! I was paged 1 minute after +we started encountering CPU exhaustion issues. 
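The exact alerting rules live in our monitoring configuration; a minimal
sketch of the kind of rule that pages on sustained node CPU exhaustion,
assuming node-exporter metric names and a threshold picked purely for
illustration, looks roughly like this:

.. code:: bash

   # Illustrative Prometheus alerting rule; not the literal production config.
   cat > node-cpu-alert.yaml <<'EOF'
   groups:
     - name: node-resources
       rules:
         - alert: NodeHighCPU
           expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
           for: 5m
           labels:
             severity: page
           annotations:
             summary: "{{ $labels.instance }} CPU usage above 90% for 5 minutes"
   EOF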
+ +🙋🏿♂️ Response +---------------- + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the incident. + +No major obstacles were encountered during this. + +🙆🏽♀️ Recovery +---------------- + +*How was the incident resolved? How can we improve future mitigation?* + +It was noted that in the response to ``kubectl get nodes`` the primary +node’s status was reported as ``NotReady``. Looking into the reason it +was because the node had stopped responding. + +The quickest way to fix this was triggering a node restart. This shifted +a lot of pods over to node 2 which encountered some capacity issues +since it’s not as highly specified as the first node. + +I brought this back the first node by restarting it at Linode’s end. +Once this node was reporting as ``Ready`` again I drained the second +node by running ``kubectl drain lke13311-20304-5ffa4d11faab``. This +command stops the node from being available for scheduling and moves +existing pods onto other nodes. + +Services gradually recovered as the dependencies started. The incident +lasted overall around 26 minutes, though this was not a complete outage +for the whole time and the bot remained functional throughout (meaning +systems like the help channels were still functional). + +🔎 Five Why’s +------------- + +*Run a 5-whys analysis to understand the true cause of the incident.* + +**Why?** Partial service outage + +**Why?** We had a node outage. + +**Why?** CPU exhaustion of our primary node. + +**Why?** Large prometheus query using a lot of CPU. + +**Why?** Prometheus had to scan millions of TSDB records which consumed +all cores. + +🌱 Blameless root cause +----------------------- + +*Note the final root cause and describe what needs to change to prevent +reoccurrance* + +A large query was run on Prometheus, so the solution is just to not run +said queries. + +To protect against this more precisely though we should write resource +constraints for services like this that are vulnerable to CPU exhaustion +or memory consumption, which are the causes of our two past outages as +well. + +🤔 Lessons learned +------------------ + +*What did we learn from this incident?* + +- Don’t run large queries, it consumes CPU! +- Write resource constraints for our services. + +☑️ Follow-up tasks +------------------ + +*List any tasks we should complete that are relevant to this incident* + +- ☒ Write resource constraints for our services. diff --git a/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.rst b/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.rst new file mode 100644 index 0000000..57f9fd8 --- /dev/null +++ b/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.rst @@ -0,0 +1,155 @@ +2021-01-12: Django site CPU/RAM exhaustion outage +================================================= + +At 03:01 UTC on Tuesday 12th January we experienced a momentary outage +of our PostgreSQL database, causing some very minor service downtime. + +⚠️ Leadup +========= + +*List the sequence of events that led to the incident* + +We deleted the Developers role which led to a large user diff for all +the users where we had to update their roles on the site. + +The bot was trying to post this for over 24 hours repeatedly after every +restart. + +We deployed the bot at 2:55 UTC on 12th January and the user sync +process began once again. 
+ +This caused a CPU & RAM spike on our Django site, which in turn +triggered an OOM error on the server which killed the Postgres process, +sending it into a recovery state where queries could not be executed. + +Django site did not have any tools in place to batch the requests so was +trying to process all 80k user updates in a single query, something that +PostgreSQL probably could handle, but not the Django ORM. During the +incident site jumped from it’s average RAM usage of 300-400MB to +**1.5GB.** + +.. image:: ./images/2021-01-12/site_resource_abnormal.png + +RAM and CPU usage of site throughout the incident. The period just +before 3:40 where no statistics were reported is the actual outage +period where the Kubernetes node had some networking errors. + +🥏 Impact +========= + +*Describe how internal and external users were impacted during the +incident* + +This database outage lasted mere minutes, since Postgres recovered and +healed itself and the sync process was aborted, but it did leave us with +a large user diff and our database becoming further out of sync. + +Most services stayed up that did not depend on PostgreSQL, and the site +remained stable after the sync had been cancelled. + +👁️ Detection +============ + +*Report when the team detected the incident, and how we could improve +detection time* + +We were immediately alerted to the PostgreSQL outage on Grafana and +through Sentry, meaning our response time was under a minute. + +We reduced some alert thresholds in order to catch RAM & CPU spikes +faster in the future. + +It was hard to immediately see the cause of things since there is +minimal logging on the site and the bot logs were not evident that +anything was at fault, therefore our only detection was through machine +metrics. + +We did manage to recover exactly what PostgreSQL was trying to do at the +time of crashing by examining the logs which pointed us towards the user +sync process. + +🙋🏿♂️ Response +================ + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the issue, there were no real obstacles +encountered other than the node being less performant than we would like +due to the CPU starvation. + +🙆🏽♀️ Recovery +================ + +*How was the incident resolved? How can we improve future mitigation?* + +The incident was resolved by stopping the sync process and writing a +more efficient one through an internal eval script. We batched the +updates into 1,000 users and instead of doing one large one did 80 +smaller updates. This led to much higher efficiency with a cost of +taking a little longer (~7 minutes). + +.. code:: python + + from bot.exts.backend.sync import _syncers + syncer = _syncers.UserSyncer + diff = await syncer._get_diff(ctx.guild) + + def chunks(lst, n): + for i in range(0, len(lst), n): + yield lst[i:i + n] + + for chunk in chunks(diff.updated, 1000): + await bot.api_client.patch("bot/users/bulk_patch", json=chunk) + +Resource limits were also put into place on site to prevent RAM and CPU +spikes, and throttle the CPU usage in these situations. This can be seen +in the below graph: + +.. image:: ./images/2021-01-12/site_cpu_throttle.png + +CPU throttling is where a container has hit the limits and we need to +reel it in. 
Ideally this value stays as closes to 0 as possible, however +as you can see site hit this twice (during the periods where it was +trying to sync 80k users at once) + +🔎 Five Why’s +============= + +*Run a 5-whys analysis to understand the true cause of the incident.* + +- We experienced a major PostgreSQL outage +- PostgreSQL was killed by the system OOM due to the RAM spike on site. +- The RAM spike on site was caused by a large query. +- This was because we do not chunk queries on the bot. +- The large query was caused by the removal of the Developers role + resulting in 80k users needing updating. + +🌱 Blameless root cause +======================= + +*Note the final root cause and describe what needs to change to prevent +reoccurrance* + +The removal of the Developers role created a large diff which could not +be applied by Django in a single request. + +See the follow up tasks on exactly how we can avoid this in future, it’s +a relatively easy mitigation. + +🤔 Lessons learned +================== + +*What did we learn from this incident?* + +- Django (or DRF) does not like huge update queries. + +☑️ Follow-up tasks +================== + +*List any tasks we should complete that are relevant to this incident* + +- ☒ Make the bot syncer more efficient (batch requests) +- ☐ Increase logging on bot, state when an error has been hit (we had + no indication of this inside Discord, we need that) +- ☒ Adjust resource alerts to page DevOps members earlier. +- ☒ Apply resource limits to site to prevent major spikes diff --git a/docs/postmortems/2021-01-30-nodebalancer-fails-memory.rst b/docs/postmortems/2021-01-30-nodebalancer-fails-memory.rst new file mode 100644 index 0000000..b13ecd7 --- /dev/null +++ b/docs/postmortems/2021-01-30-nodebalancer-fails-memory.rst @@ -0,0 +1,146 @@ +2021-01-30: NodeBalancer networking faults due to memory pressure +================================================================= + +At around 14:30 UTC on Saturday 30th January we started experiencing +networking issues at the LoadBalancer level between Cloudflare and our +Kubernetes cluster. It seems that the misconfiguration was due to memory +and CPU pressure. + +[STRIKEOUT:This post-mortem is preliminary, we are still awaiting word +from Linode’s SysAdmins on any problems they detected.] + +**Update 2nd February 2021:** Linode have migrated our NodeBalancer to a +different machine. + +⚠️ Leadup +--------- + +*List the sequence of events that led to the incident* + +At 14:30 we started receiving alerts that services were becoming +unreachable. We first experienced some momentary DNS errors which +resolved themselves, however traffic ingress was still degraded. + +Upon checking Linode our NodeBalancer, the service which balances +traffic between our Kubernetes nodes was reporting the backends (the +services it balances to) as down. It reported all 4 as down (two for +port 80 + two for port 443). This status was fluctuating between up and +down, meaning traffic was not reaching our cluster correctly. Scaleios +correctly noted: + +.. image:: ./images/2021-01-30/scaleios.png + +The config seems to have been set incorrectly due to memory and CPU +pressure on one of our nodes. Here is the memory throughout the +incident: + +.. image:: ./images/2021-01-30/memory_charts.png + +Here is the display from Linode: + +.. 
image:: ./images/2021-01-30/linode_loadbalancers.png + +🥏 Impact +--------- + +*Describe how internal and external users were impacted during the +incident* + +Since traffic could not correctly enter our cluster multiple services +which were web based were offline, including services such as site, +grafana and bitwarden. It appears that no inter-node communication was +affected as this uses a WireGuard tunnel between the nodes which was not +affected by the NodeBalancer. + +The lack of Grafana made diagnosis slightly more difficult, but even +then it was only a short trip to the + +👁️ Detection +------------ + +*Report when the team detected the incident, and how we could improve +detection time* + +We were alerted fairly promptly through statping which reported services +as being down and posted a Discord notification. Subsequent alerts came +in from Grafana but were limited since outbound communication was +faulty. + +🙋🏿♂️ Response +---------------- + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded! + +Primary obstacle was the DevOps tools being out due to the traffic +ingress problems. + +🙆🏽♀️ Recovery +---------------- + +*How was the incident resolved? How can we improve future mitigation?* + +The incident resolved itself upstream at Linode, we’ve opened a ticket +with Linode to let them know of the faults, this might give us a better +indication of what caused the issues. Our Kubernetes cluster continued +posting updates to Linode to refresh the NodeBalancer configuration, +inspecting these payloads the configuration looked correct. + +We’ve set up alerts for when Prometheus services stop responding since +this seems to be a fairly tell-tale symptom of networking problems, this +was the Prometheus status graph throughout the incident: + +.. image:: ./images/2021-01-30/prometheus_status.png + +🔎 Five Why’s +------------- + +*Run a 5-whys analysis to understand the true cause of the incident.* + +**What?** Our service experienced an outage due to networking faults. + +**Why?** Incoming traffic could not reach our Kubernetes nodes + +**Why?** Our Linode NodeBalancers were not using correct configuration + +**Why?** Memory & CPU pressure seemed to cause invalid configuration +errors upstream at Linode. + +**Why?** Unknown at this stage, NodeBalancer migrated. + +🌱 Blameless root cause +----------------------- + +*Note the final root cause and describe what needs to change to prevent +reoccurrance* + +The configuration of our NodeBalancer was invalid, we cannot say why at +this point since we are awaiting contact back from Linode, but +indicators point to it being an upstream fault since memory & CPU +pressure should **not** cause a load balancer misconfiguration. + +Linode are going to follow up with us at some point during the week with +information from their System Administrators. + +**Update 2nd February 2021:** Linode have concluded investigations at +their end, taken notes and migrated our NodeBalancer to a new machine. +We haven’t experienced problems since. + +🤔 Lessons learned +------------------ + +*What did we learn from this incident?* + +We should be careful over-scheduling onto nodes since even while +operating within reasonable constraints we risk sending invalid +configuration upstream to Linode and therefore preventing traffic from +entering our cluster. 
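A quick way to keep an eye on this, sketched below, is to compare what
has been scheduled onto each node against its live usage (the node name
is a placeholder):

.. code:: bash

   # Live per-node CPU/memory consumption (requires metrics-server).
   kubectl top nodes

   # Requests and limits already committed to a node, to spot over-scheduling.
   kubectl describe node <node-name> | grep -A 8 "Allocated resources"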
+ +☑️ Follow-up tasks +------------------ + +*List any tasks we should complete that are relevant to this incident* + +- ☒ Monitor for follow up from Linode +- ☒ Carefully monitor the allocation rules for our services diff --git a/docs/postmortems/2021-07-11-cascading-node-failures.rst b/docs/postmortems/2021-07-11-cascading-node-failures.rst new file mode 100644 index 0000000..6cd30f3 --- /dev/null +++ b/docs/postmortems/2021-07-11-cascading-node-failures.rst @@ -0,0 +1,335 @@ +2021-07-11: Cascading node failures and ensuing volume problems +=============================================================== + +A PostgreSQL connection spike (00:27 UTC) caused by Django moved a node +to an unresponsive state (00:55 UTC), upon performing a recycle of the +affected node volumes were placed into a state where they could not be +mounted. + +⚠️ Leadup +========= + +*List the sequence of events that led to the incident* + +- **00:27 UTC:** Django starts rapidly using connections to our + PostgreSQL database +- **00:32 UTC:** DevOps team is alerted that PostgreSQL has saturated + it’s 115 max connections limit. Joe is paged. +- **00:33 UTC:** DevOps team is alerted that a service has claimed 34 + dangerous table locks (it peaked at 61). +- **00:42 UTC:** Status incident created and backdated to 00:25 UTC. + `Status incident <https://status.pythondiscord.com/incident/92712>`__ +- **00:55 UTC:** It’s clear that the node which PostgreSQL was on is no + longer healthy after the Django connection surge, so it’s recycled + and a new one is to be added to the pool. +- **01:01 UTC:** Node ``lke13311-16405-5fafd1b46dcf`` begins it’s + restart +- **01:13 UTC:** Node has restored and regained healthy status, but + volumes will not mount to the node. Support ticket opened at Linode + for assistance. +- **06:36 UTC:** DevOps team alerted that Python is offline. This is + due to Redis being a dependency of the bot, which as a stateful + service was not healthy. + +🥏 Impact +========= + +*Describe how internal and external users were impacted during the +incident* + +Initially, this manifested as a standard node outage where services on +that node experienced some downtime as the node was restored. + +Post-restore, all stateful services (e.g. PostgreSQL, Redis, PrestaShop) +were unexecutable due to the volume issues, and so any dependent +services (e.g. Site, Bot, Hastebin) also had trouble starting. + +PostgreSQL was restored early on so for the most part Moderation could +continue. + +👁️ Detection +============ + +*Report when the team detected the incident, and how we could improve +detection time* + +DevOps were initially alerted at 00:32 UTC due to the PostgreSQL +connection surge, and acknowledged at the same time. + +Further alerting could be used to catch surges earlier on (looking at +conn delta vs. conn total), but for the most part alerting time was +satisfactory here. + +🙋🏿♂️ Response +================ + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded. The primary issue encountered was failure upstream +at Linode to remount the affected volumes, a support ticket has been +created. + +🙆🏽♀️ Recovery +================ + +*How was the incident resolved? How can we improve future mitigation?* + +Initial node restoration was performed by @Joe Banks by recycling the +affected node. 
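For reference, the node-side half of that recycle looks roughly like the
sketch below; the restart itself is triggered from the Linode/LKE panel,
and the drain flags depend on what is running on the node:

.. code:: bash

   kubectl get nodes                            # spot the NotReady node
   kubectl cordon lke13311-16405-5fafd1b46dcf   # keep new pods off it
   kubectl drain lke13311-16405-5fafd1b46dcf --ignore-daemonsets
   # ...recycle the node from the Linode side and wait for it to rejoin as Ready...
   kubectl uncordon lke13311-16405-5fafd1b46dcf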
+ +Subsequent volume restoration was also @Joe Banks and once Linode had +unlocked the volumes affected pods were scaled down to 0, the volumes +were unmounted at the Linode side and then the deployments were +recreated. + +.. raw:: html + + <details> + +.. raw:: html + + <summary> + +Support ticket sent + +.. raw:: html + + </summary> + +.. raw:: html + + <blockquote> + +Good evening, + +We experienced a resource surge on one of our Kubernetes nodes at 00:32 +UTC, causing a node to go unresponsive. To mitigate problems here the +node was recycled and began restarting at 1:01 UTC. + +The node has now rejoined the ring and started picking up services, but +volumes will not attach to it, meaning pods with stateful storage will +not start. + +An example events log for one such pod: + +:: + + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal Scheduled 2m45s default-scheduler Successfully assigned default/redis-599887d778-wggbl to lke13311-16405-5fafd1b46dcf + Warning FailedMount 103s kubelet MountVolume.MountDevice failed for volume "pvc-bb1d06139b334c1f" : rpc error: code = Internal desc = Unable to find device path out of attempted paths: [/dev/disk/by-id/linode-pvcbb1d06139b334c1f /dev/disk/by-id/scsi-0Linode_Volume_pvcbb1d06139b334c1f] + Warning FailedMount 43s kubelet Unable to attach or mount volumes: unmounted volumes=[redis-data-volume], unattached volumes=[kube-api-access-6wwfs redis-data-volume redis-config-volume]: timed out waiting for the condition + +I’ve been trying to manually resolve this through the Linode Web UI but +get presented with attachment errors upon doing so. Please could you +advise on the best way forward to restore Volumes & Nodes to a +functioning state? As far as I can see there is something going on +upstream since the Linode UI presents these nodes as mounted however as +shown above LKE nodes are not locating them, there is also a few failed +attachment logs in the Linode Audit Log. + +Thanks, + +Joe + +.. raw:: html + + </blockquote> + +.. raw:: html + + </details> + +.. raw:: html + + <details> + +.. raw:: html + + <summary> + +Response received from Linode + +.. raw:: html + + </summary> + +.. raw:: html + + <blockquote> + +Hi Joe, + + Were there any known issues with Block Storage in Frankfurt today? + +Not today, though there were service issues reported for Block Storage +and LKE in Frankfurt on July 8 and 9: + +- `Service Issue - Block Storage - EU-Central + (Frankfurt) <https://status.linode.com/incidents/pqfxl884wbh4>`__ +- `Service Issue - Linode Kubernetes Engine - + Frankfurt <https://status.linode.com/incidents/13fpkjd32sgz>`__ + +There was also an API issue reported on the 10th (resolved on the 11th), +mentioned here: + +- `Service Issue - Cloud Manager and + API <https://status.linode.com/incidents/vhjm0xpwnnn5>`__ + +Regarding the specific error you were receiving: + + ``Unable to find device path out of attempted paths`` + +I’m not certain it’s specifically related to those Service Issues, +considering this isn’t the first time a customer has reported this error +in their LKE logs. In fact, if I recall correctly, I’ve run across this +before too, since our volumes are RWO and I had too many replicas in my +deployment that I was trying to attach to, for example. + + is this a known bug/condition that occurs with Linode CSI/LKE? + +From what I understand, yes, this is a known condition that crops up +from time to time, which we are tracking. However, since there is a +workaround at the moment (e.g. 
- “After some more manual attempts to fix +things, scaling down deployments, unmounting at Linode and then scaling +up the deployments seems to have worked and all our services have now +been restored.”), there is no ETA for addressing this. With that said, +I’ve let our Storage team know that you’ve run into this, so as to draw +further attention to it. + +If you have any further questions or concerns regarding this, let us +know. + +Best regards, [Redacted] + +Linode Support Team + +.. raw:: html + + </blockquote> + +.. raw:: html + + </details> + +.. raw:: html + + <details> + +.. raw:: html + + <summary> + +Concluding response from Joe Banks + +.. raw:: html + + </summary> + +.. raw:: html + + <blockquote> + +Hey [Redacted]! + +Thanks for the response. We ensure that stateful pods only ever have one +volume assigned to them, either with a single replica deployment or a +statefulset. It appears that the error generally manifests when a +deployment is being migrated from one node to another during a redeploy, +which makes sense if there is some delay on the unmount/remount. + +Confusion occurred because Linode was reporting the volume as attached +when the node had been recycled, but I assume that was because the node +did not cleanly shutdown and therefore could not cleanly unmount +volumes. + +We’ve not seen any resurgence of such issues, and we’ll address the +software fault which overloaded the node which will helpfully mitigate +such problems in the future. + +Thanks again for the response, have a great week! + +Best, + +Joe + +.. raw:: html + + </blockquote> + +.. raw:: html + + </details> + +🔎 Five Why’s +============= + +*Run a 5-whys analysis to understand the true cause of the incident.* + +**What?** +~~~~~~~~~ + +Several of our services became unavailable because their volumes could +not be mounted. + +Why? +~~~~ + +A node recycle left the node unable to mount volumes using the Linode +CSI. + +.. _why-1: + +Why? +~~~~ + +A node recycle was used because PostgreSQL had a connection surge. + +.. _why-2: + +Why? +~~~~ + +A Django feature deadlocked a table 62 times and suddenly started using +~70 connections to the database, saturating the maximum connections +limit. + +.. _why-3: + +Why? +~~~~ + +The root cause of why Django does this is unclear, and someone with more +Django proficiency is absolutely welcome to share any knowledge they may +have. I presume it’s some sort of worker race condition, but I’ve not +been able to reproduce it. + +🌱 Blameless root cause +======================= + +*Note the final root cause and describe what needs to change to prevent +reoccurrence* + +A node being forcefully restarted left volumes in a limbo state where +mounting was difficult, it took multiple hours for this to be resolved +since we had to wait for the volumes to unlock so they could be cloned. + +🤔 Lessons learned +================== + +*What did we learn from this incident?* + +Volumes are painful. + +We need to look at why Django is doing this and mitigations of the fault +to prevent this from occurring again. 
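+
+As a starting point for that investigation, connection pressure can be
+inspected directly in ``pg_stat_activity``. This is a minimal sketch to
+complement the existing alerting, not a replacement for it:
+
+.. code:: sql
+
+   -- Current connection count against the configured limit
+   SELECT count(*) AS connections,
+          current_setting('max_connections') AS max_connections
+   FROM pg_stat_activity;
+
+   -- Break the connections down by user, application and state
+   SELECT usename, application_name, state, count(*)
+   FROM pg_stat_activity
+   GROUP BY usename, application_name, state
+   ORDER BY count(*) DESC;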
+
+☑️ Follow-up tasks
+==================
+
+*List any tasks we should complete that are relevant to this incident*
+
+- ☒ `Follow up on ticket at
+  Linode <https://www.notion.so/Cascading-node-failures-and-ensuing-volume-problems-1c6cfdfcadfc4422b719a0d7a4cc5001>`__
+- ☐ Investigate why Django could be connection surging and locking
+  tables
diff --git a/docs/static/images/2021-01-12/site_cpu_throttle.png b/docs/postmortems/images/2021-01-12/site_cpu_throttle.png
Binary files differ
index b530ec6..b530ec6 100644
--- a/docs/static/images/2021-01-12/site_cpu_throttle.png
+++ b/docs/postmortems/images/2021-01-12/site_cpu_throttle.png
diff --git a/docs/static/images/2021-01-12/site_resource_abnormal.png b/docs/postmortems/images/2021-01-12/site_resource_abnormal.png
Binary files differ
index e1e07af..e1e07af 100644
--- a/docs/static/images/2021-01-12/site_resource_abnormal.png
+++ b/docs/postmortems/images/2021-01-12/site_resource_abnormal.png
diff --git a/docs/static/images/2021-01-30/linode_loadbalancers.png b/docs/postmortems/images/2021-01-30/linode_loadbalancers.png
Binary files differ
index f0eae1f..f0eae1f 100644
--- a/docs/static/images/2021-01-30/linode_loadbalancers.png
+++ b/docs/postmortems/images/2021-01-30/linode_loadbalancers.png
diff --git a/docs/static/images/2021-01-30/memory_charts.png b/docs/postmortems/images/2021-01-30/memory_charts.png
Binary files differ
index 370d19e..370d19e 100644
--- a/docs/static/images/2021-01-30/memory_charts.png
+++ b/docs/postmortems/images/2021-01-30/memory_charts.png
diff --git a/docs/static/images/2021-01-30/prometheus_status.png b/docs/postmortems/images/2021-01-30/prometheus_status.png
Binary files differ
index e95b8d7..e95b8d7 100644
--- a/docs/static/images/2021-01-30/prometheus_status.png
+++ b/docs/postmortems/images/2021-01-30/prometheus_status.png
diff --git a/docs/static/images/2021-01-30/scaleios.png b/docs/postmortems/images/2021-01-30/scaleios.png
Binary files differ
index 584d74d..584d74d 100644
--- a/docs/static/images/2021-01-30/scaleios.png
+++ b/docs/postmortems/images/2021-01-30/scaleios.png
diff --git a/docs/postmortems/index.rst b/docs/postmortems/index.rst
new file mode 100644
index 0000000..43994a2
--- /dev/null
+++ b/docs/postmortems/index.rst
@@ -0,0 +1,15 @@
+Postmortems
+===========
+
+Browse the pages under this category to view historical postmortems for
+Python Discord outages.
+
+.. toctree::
+   :maxdepth: 2
+
+   2020-12-11-all-services-outage
+   2020-12-11-postgres-conn-surge
+   2021-01-10-primary-kubernetes-node-outage
+   2021-01-12-site-cpu-ram-exhaustion
+   2021-01-30-nodebalancer-fails-memory
+   2021-07-11-cascading-node-failures
diff --git a/docs/queries/index.rst b/docs/queries/index.rst
new file mode 100644
index 0000000..76218e4
--- /dev/null
+++ b/docs/queries/index.rst
@@ -0,0 +1,12 @@
+Queries
+=======
+
+Get the data you desire with these assorted handcrafted queries.
+
+.. toctree::
+   :maxdepth: 2
+   :caption: Contents:
+
+   kubernetes
+   loki
+   postgres
diff --git a/docs/queries/kubernetes.rst b/docs/queries/kubernetes.rst
new file mode 100644
index 0000000..f8d8984
--- /dev/null
+++ b/docs/queries/kubernetes.rst
@@ -0,0 +1,29 @@
+Kubernetes tips
+===============
+
+Find top pods by CPU/memory
+---------------------------
+
+.. code:: bash
+
+   $ kubectl top pods --all-namespaces --sort-by='memory'
+   $ kubectl top pods --all-namespaces --sort-by='cpu'
+
+Find top nodes by CPU/memory
+----------------------------
+
+.. 
code:: bash + + $ kubectl top nodes --sort-by='cpu' + $ kubectl top nodes --sort-by='memory' + +Kubernetes cheat sheet +---------------------- + +`Open Kubernetes cheat +sheet <https://kubernetes.io/docs/reference/kubectl/cheatsheet/>`__ + +Lens IDE +-------- + +`OpenLens <https://github.com/MuhammedKalkan/OpenLens>`__ diff --git a/docs/queries/loki.rst b/docs/queries/loki.rst new file mode 100644 index 0000000..2ee57a3 --- /dev/null +++ b/docs/queries/loki.rst @@ -0,0 +1,25 @@ +Loki queries +============ + +Find any logs containing “ERROR” +-------------------------------- + +.. code:: shell + + {job=~"default/.+"} |= "ERROR" + +Find all logs from bot service +------------------------------ + +.. code:: shell + + {job="default/bot"} + +The format is ``namespace/object`` + +Rate of logs from a service +--------------------------- + +.. code:: shell + + rate(({job="default/bot"} |= "error" != "timeout")[10s]) diff --git a/docs/queries/postgres.rst b/docs/queries/postgres.rst new file mode 100644 index 0000000..5120145 --- /dev/null +++ b/docs/queries/postgres.rst @@ -0,0 +1,336 @@ +PostgreSQL queries +================== + +Disk usage +---------- + +Most of these queries vary based on the database you are connected to. + +General Table Size Information Grouped For Partitioned Tables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: sql + + WITH RECURSIVE pg_inherit(inhrelid, inhparent) AS + (select inhrelid, inhparent + FROM pg_inherits + UNION + SELECT child.inhrelid, parent.inhparent + FROM pg_inherit child, pg_inherits parent + WHERE child.inhparent = parent.inhrelid), + pg_inherit_short AS (SELECT * FROM pg_inherit WHERE inhparent NOT IN (SELECT inhrelid FROM pg_inherit)) + SELECT table_schema + , TABLE_NAME + , row_estimate + , pg_size_pretty(total_bytes) AS total + , pg_size_pretty(index_bytes) AS INDEX + , pg_size_pretty(toast_bytes) AS toast + , pg_size_pretty(table_bytes) AS TABLE + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes + FROM ( + SELECT c.oid + , nspname AS table_schema + , relname AS TABLE_NAME + , SUM(c.reltuples) OVER (partition BY parent) AS row_estimate + , SUM(pg_total_relation_size(c.oid)) OVER (partition BY parent) AS total_bytes + , SUM(pg_indexes_size(c.oid)) OVER (partition BY parent) AS index_bytes + , SUM(pg_total_relation_size(reltoastrelid)) OVER (partition BY parent) AS toast_bytes + , parent + FROM ( + SELECT pg_class.oid + , reltuples + , relname + , relnamespace + , pg_class.reltoastrelid + , COALESCE(inhparent, pg_class.oid) parent + FROM pg_class + LEFT JOIN pg_inherit_short ON inhrelid = oid + WHERE relkind IN ('r', 'p') + ) c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + ) a + WHERE oid = parent + ) a + ORDER BY total_bytes DESC; + +General Table Size Information +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: sql + + SELECT *, pg_size_pretty(total_bytes) AS total + , pg_size_pretty(index_bytes) AS index + , pg_size_pretty(toast_bytes) AS toast + , pg_size_pretty(table_bytes) AS table + FROM ( + SELECT *, total_bytes-index_bytes-coalesce(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS table_name + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a + ) a; + +Finding the largest databases in your cluster +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: sql + + SELECT d.datname as Name, pg_catalog.pg_get_userbyid(d.datdba) as Owner, + CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT') + THEN pg_catalog.pg_size_pretty(pg_catalog.pg_database_size(d.datname)) + ELSE 'No Access' + END as Size + FROM pg_catalog.pg_database d + order by + CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT') + THEN pg_catalog.pg_database_size(d.datname) + ELSE NULL + END desc -- nulls first + LIMIT 20; + +Finding the size of your biggest relations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Relations are objects in the database such as tables and indexes, and +this query shows the size of all the individual parts. + +.. code:: sql + + SELECT nspname || '.' || relname AS "relation", + pg_size_pretty(pg_relation_size(C.oid)) AS "size" + FROM pg_class C + LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) + WHERE nspname NOT IN ('pg_catalog', 'information_schema') + ORDER BY pg_relation_size(C.oid) DESC + LIMIT 20; + +Finding the total size of your biggest tables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: sql + + SELECT nspname || '.' || relname AS "relation", + pg_size_pretty(pg_total_relation_size(C.oid)) AS "total_size" + FROM pg_class C + LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) + WHERE nspname NOT IN ('pg_catalog', 'information_schema') + AND C.relkind <> 'i' + AND nspname !~ '^pg_toast' + ORDER BY pg_total_relation_size(C.oid) DESC + LIMIT 20; + +Indexes +------- + +Index summary +~~~~~~~~~~~~~ + +.. code:: sql + + SELECT + pg_class.relname, + pg_size_pretty(pg_class.reltuples::bigint) AS rows_in_bytes, + pg_class.reltuples AS num_rows, + count(indexname) AS number_of_indexes, + CASE WHEN x.is_unique = 1 THEN 'Y' + ELSE 'N' + END AS UNIQUE, + SUM(case WHEN number_of_columns = 1 THEN 1 + ELSE 0 + END) AS single_column, + SUM(case WHEN number_of_columns IS NULL THEN 0 + WHEN number_of_columns = 1 THEN 0 + ELSE 1 + END) AS multi_column + FROM pg_namespace + LEFT OUTER JOIN pg_class ON pg_namespace.oid = pg_class.relnamespace + LEFT OUTER JOIN + (SELECT indrelid, + max(CAST(indisunique AS integer)) AS is_unique + FROM pg_index + GROUP BY indrelid) x + ON pg_class.oid = x.indrelid + LEFT OUTER JOIN + ( SELECT c.relname AS ctablename, ipg.relname AS indexname, x.indnatts AS number_of_columns FROM pg_index x + JOIN pg_class c ON c.oid = x.indrelid + JOIN pg_class ipg ON ipg.oid = x.indexrelid ) + AS foo + ON pg_class.relname = foo.ctablename + WHERE + pg_namespace.nspname='public' + AND pg_class.relkind = 'r' + GROUP BY pg_class.relname, pg_class.reltuples, x.is_unique + ORDER BY 2; + +Index size/usage statistics +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: sql + + SELECT + t.schemaname, + t.tablename, + indexname, + c.reltuples AS num_rows, + pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' || quote_ident(t.tablename)::text)) AS table_size, + pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' || quote_ident(indexrelname)::text)) AS index_size, + CASE WHEN indisunique THEN 'Y' + ELSE 'N' + END AS UNIQUE, + number_of_scans, + tuples_read, + tuples_fetched + FROM pg_tables t + LEFT OUTER JOIN pg_class c ON t.tablename = c.relname + LEFT OUTER JOIN ( + SELECT + c.relname AS ctablename, + ipg.relname AS indexname, + x.indnatts AS number_of_columns, + idx_scan AS number_of_scans, + idx_tup_read AS tuples_read, + idx_tup_fetch AS tuples_fetched, + indexrelname, + indisunique, + schemaname + FROM pg_index x + JOIN pg_class c ON c.oid = x.indrelid + JOIN pg_class ipg ON ipg.oid = x.indexrelid + JOIN pg_stat_all_indexes psai ON x.indexrelid = psai.indexrelid + ) AS foo ON t.tablename = foo.ctablename AND t.schemaname = foo.schemaname + WHERE t.schemaname NOT IN ('pg_catalog', 'information_schema') + ORDER BY 1,2; + +Duplicate indexes +~~~~~~~~~~~~~~~~~ + +.. code:: sql + + SELECT pg_size_pretty(sum(pg_relation_size(idx))::bigint) as size, + (array_agg(idx))[1] as idx1, (array_agg(idx))[2] as idx2, + (array_agg(idx))[3] as idx3, (array_agg(idx))[4] as idx4 + FROM ( + SELECT indexrelid::regclass as idx, (indrelid::text ||E'\n'|| indclass::text ||E'\n'|| indkey::text ||E'\n'|| + coalesce(indexprs::text,'')||E'\n' || coalesce(indpred::text,'')) as key + FROM pg_index) sub + GROUP BY key HAVING count(*)>1 + ORDER BY sum(pg_relation_size(idx)) DESC; + +Maintenance +----------- + +`PostgreSQL wiki <https://wiki.postgresql.org/wiki/Main_Page>`__ + +CLUSTER-ing +~~~~~~~~~~~ + +`CLUSTER <https://www.postgresql.org/docs/current/sql-cluster.html>`__ + +.. code:: sql + + CLUSTER [VERBOSE] table_name [ USING index_name ] + CLUSTER [VERBOSE] + +``CLUSTER`` instructs PostgreSQL to cluster the table specified by +``table_name`` based on the index specified by ``index_name``. The index +must already have been defined on ``table_name``. + +When a table is clustered, it is physically reordered based on the index +information. + +The +`clusterdb <https://www.postgresql.org/docs/current/app-clusterdb.html>`__ +CLI tool is recommended, and can also be used to cluster all tables at +the same time. + +VACUUM-ing +~~~~~~~~~~ + +Proper vacuuming, particularly autovacuum configuration, is crucial to a +fast and reliable database. + +`Introduction to VACUUM, ANALYZE, EXPLAIN, and +COUNT <https://wiki.postgresql.org/wiki/Introduction_to_VACUUM,_ANALYZE,_EXPLAIN,_and_COUNT>`__ + +It is not advised to run ``VACUUM FULL``, instead look at clustering. +VACUUM FULL is a much more intensive task and acquires an ACCESS +EXCLUSIVE lock on the table, blocking reads and writes. Whilst +``CLUSTER`` also does acquire this lock it’s a less intensive and faster +process. + +The +`vacuumdb <https://www.postgresql.org/docs/current/app-vacuumdb.html>`__ +CLI tool is recommended for manual runs. + +Finding number of dead rows +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code:: sql + + SELECT relname, n_dead_tup FROM pg_stat_user_tables WHERE n_dead_tup <> 0 ORDER BY 2 DESC; + +Finding last vacuum/auto-vacuum date +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code:: sql + + SELECT relname, last_vacuum, last_autovacuum FROM pg_stat_user_tables; + +Checking auto-vacuum is enabled +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. 
code:: sql + + SELECT name, setting FROM pg_settings WHERE name='autovacuum'; + +View all auto-vacuum setting +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code:: sql + + SELECT * from pg_settings where category like 'Autovacuum'; + +Locks +----- + +Looking at granted locks +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: sql + + SELECT relation::regclass, * FROM pg_locks WHERE NOT granted; + +Сombination of blocked and blocking activity +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: sql + + SELECT blocked_locks.pid AS blocked_pid, + blocked_activity.usename AS blocked_user, + blocking_locks.pid AS blocking_pid, + blocking_activity.usename AS blocking_user, + blocked_activity.query AS blocked_statement, + blocking_activity.query AS current_statement_in_blocking_process + FROM pg_catalog.pg_locks blocked_locks + JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid + JOIN pg_catalog.pg_locks blocking_locks + ON blocking_locks.locktype = blocked_locks.locktype + AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database + AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation + AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page + AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple + AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid + AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid + AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid + AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid + AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid + AND blocking_locks.pid != blocked_locks.pid + + JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid + WHERE NOT blocked_locks.granted; diff --git a/docs/runbooks/index.rst b/docs/runbooks/index.rst new file mode 100644 index 0000000..18690c7 --- /dev/null +++ b/docs/runbooks/index.rst @@ -0,0 +1,17 @@ +Runbooks +======== + +Learn how to do anything in our infrastructure with these guidelines. + +.. note:: + + In general, we try to codify manual processes as much as possible. Still, + this section is important for tasks that are either hard to automate or are + run so infrequently that it does not make sense to regularly run them. + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + postgresql-upgrade diff --git a/docs/runbooks/postgresql-upgrade.rst b/docs/runbooks/postgresql-upgrade.rst new file mode 100644 index 0000000..98b1642 --- /dev/null +++ b/docs/runbooks/postgresql-upgrade.rst @@ -0,0 +1,149 @@ +Upgrading PostgreSQL +==================== + +Step 1 - Enable maintenance mode +-------------------------------- + +Add a worker route for ``pythondiscord.com/*`` to forward to the +``maintenance`` Cloudflare worker. + +Step 2 - Scale down all services that use PostgreSQL +---------------------------------------------------- + +Notably site, metricity, bitwarden and the like should be scaled down. + +Services that are read only such as Grafana (but NOT Metabase, Metabase +uses PostgreSQL for internal storage) do not need to be scaled down, as +they do not update the database in any way. + +.. code:: bash + + $ kubectl scale deploy --replicas 0 site metricity metabase bitwarden ... + +Step 3 - Take a database dump and gzip +-------------------------------------- + +Using ``pg_dumpall``, dump the contents of all databases to a ``.sql`` +file. + +Make sure to gzip for faster transfer. 
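+
+If disk space on the pod is tight, the dump can also be compressed in a
+single pipeline instead of the separate dump and gzip steps shown
+below; a sketch, assuming the same ``pythondiscord`` superuser:
+
+.. code:: bash
+
+   # Dump every database and compress on the fly
+   $ pg_dumpall -U pythondiscord | gzip > backup.sql.gz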
+
+Take a SHA512 sum of the output ``.sql.gz`` file to validate integrity
+after copying.
+
+.. code:: bash
+
+   $ pg_dumpall -U pythondiscord > backup.sql
+   $ gzip backup.sql
+   $ sha512sum backup.sql.gz
+   a3337bfc65a072fd93124233ac1cefcdfbe8a708e5c1d08adaca2cf8c7cbe9ae4853ffab8c5cfbe943182355eaa701012111a420b29cc4f74d1e87f9df3af459  backup.sql.gz
+
+Step 4 - Move database dump locally
+-----------------------------------
+
+Use ``kubectl cp`` to move the ``backup.sql.gz`` file from the remote
+pod to your local machine.
+
+Validate the integrity of the received file.
+
+Step 5 - Attempt local import to new PostgreSQL version
+-------------------------------------------------------
+
+Install the new version of PostgreSQL locally and import the data. Make
+sure you are operating on a **completely empty database server.**
+
+.. code:: bash
+
+   $ gzcat backup.sql.gz | psql -U joe
+
+You can use any PostgreSQL superuser for the import. Ensure that no
+errors other than those mentioned below occur; you may need to attempt
+the import multiple times to fix the errors listed below.
+
+Handle import errors
+~~~~~~~~~~~~~~~~~~~~
+
+Monitor the output of ``psql`` to check that no errors appear.
+
+If you receive locale errors, ensure that the locale your database is
+configured with matches the import script; this may require some usage
+of ``sed``:
+
+.. code:: bash
+
+   $ sed -i '' "s/en_US.utf8/en_GB.UTF-8/g" backup.sql
+
+Ensure that you **RESET THESE CHANGES** before attempting an import on
+the remote: if the databases come from the PostgreSQL Docker image they
+will need the same locale as the export.
+
+Step 7 - Spin down PostgreSQL
+-----------------------------
+
+Spin down PostgreSQL to 0 replicas.
+
+Step 8 - Take volume backup at Linode
+-------------------------------------
+
+Back up the volume at Linode through a clone in the Linode UI, and name
+it something obvious.
+
+Step 9 - Remove the Linode persistent volume
+--------------------------------------------
+
+Delete the volume specified in the ``volume.yaml`` file in the
+``postgresql`` directory. You must delete the ``pvc`` first, followed by
+the ``pv``; you can find the relevant disks through
+``kubectl get pv,pvc``.
+
+Step 10 - Create a new volume by re-applying the ``volume.yaml`` file
+---------------------------------------------------------------------
+
+Apply the ``volume.yaml`` so a new, empty volume is created.
+
+Step 11 - Bump the PostgreSQL version in the ``deployment.yaml`` file
+---------------------------------------------------------------------
+
+Update the Docker image used in the deployment manifest.
+
+Step 12 - Apply the deployment
+------------------------------
+
+Run ``kubectl apply -f postgresql/deployment.yaml`` to start the new
+database server.
+
+Step 13 - Copy the data across
+------------------------------
+
+After the pod has initialised, use ``kubectl cp`` to copy the gzipped
+backup to the new Postgres pod.
+
+Step 14 - Extract and import the new data
+-----------------------------------------
+
+.. code:: bash
+
+   $ gunzip backup.sql.gz
+   $ psql -U pythondiscord -f backup.sql
+
+Step 15 - Validate data import complete
+---------------------------------------
+
+Ensure that all logs are successful; you may get duplicate errors for
+the ``pythondiscord`` user and database, which are safe to ignore.
+
+Step 16 - Scale up services
+---------------------------
+
+With the new database server running, scale the services back up:
+
+.. 
code:: bash + + $ kubectl scale deploy --replicas 1 metricity bitwarden metabase + +Step 17 - Validate all services interact correctly +-------------------------------------------------- + +Validate that all services reconnect successfully and start exchanging +data, ensure that no abnormal logs are outputted and performance remains +as expected. diff --git a/docs/themes/hugo-book b/docs/themes/hugo-book deleted file mode 160000 -Subproject 7c26d9b8b731d556a2bf89848f59e8300eabc44 diff --git a/docs/tooling/bots.rst b/docs/tooling/bots.rst new file mode 100644 index 0000000..e38f83c --- /dev/null +++ b/docs/tooling/bots.rst @@ -0,0 +1,52 @@ +Our GitHub repositories are supported by two custom bots: + +- Our **Fast Forward Bot**, which ensures that commits merged into main + are either merged manually on the command line or via a fast-forward, + ensuring that cryptographic signatures of commits remain intact. + Information on the bot can be found `in the ``ff-bot.yml`` + configuration <https://github.com/python-discord/infra/blob/main/.github/ff-bot.yml>`__. + Merges over the GitHub UI are discouraged for this reason. You can + use it by running ``/merge`` on a pull request. Note that attempting + to use it without permission to will be reported. + +- Our **Craig Dazey Emulator Bot**, which ensures team morale stays + high at all times by thanking team members for submitted pull + requests. [1]_ + +Furthermore, our repositories all have dependabot configured on them. + +Dealing with notifications +-------------------------- + +This section collects some of our team members’ ways of dealing with the +notifications that originate from our bots. + +Sieve (RFC 5228) script +~~~~~~~~~~~~~~~~~~~~~~~ + +If your mail server supports the `Sieve mail filtering +language <https://datatracker.ietf.org/doc/html/rfc5228.html>`__, which +it should, you can adapt the following script to customize the amount of +notifications you receive: + +.. code:: sieve + + require ["envelope", "fileinto", "imap4flags"]; + + if allof (header :is "X-GitHub-Sender" ["coveralls", "github-actions[bot]", "netlify[bot]"], + address :is "from" "[email protected]") { + setflag "\\seen"; + fileinto "Trash"; + stop; + } + +If you also want to filter out notifications from renovate, which we use +for dependency updates, you can add ``renovate[bot]`` to the +``X-GitHub-Sender`` list above. + +.. [1] + Craig Dazey Emulator Bot stands in no affiliation, direct or + indirect, with Craig Dazey. Craig Dazey Emulator Bot. Craig Dazey + Emulator Bot is not endorsed by Craig Dazey. Craig Dazey Emulator Bot + is an independent project of Craig Dazey. No association is made + between Craig Dazey Emulator Bot and Craig Dazey. diff --git a/docs/tooling/index.rst b/docs/tooling/index.rst new file mode 100644 index 0000000..2381849 --- /dev/null +++ b/docs/tooling/index.rst @@ -0,0 +1,12 @@ +Tooling +======= + +Learn about the helperlings that keep Python Discord DevOps running like a +well-oiled machine. + + +.. toctree:: + :maxdepth: 2 + :caption: Contents: + + bots diff --git a/pyproject.toml b/pyproject.toml index 58e77b9..047bfa5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,6 +49,8 @@ ignore = [ "RUF005", "RUF012", "RUF015", "S311", "SIM102", "SIM108", + # Docs + "A001", "INP001", # Rules suggested to be ignored when using ruff format "COM812", "COM819", "D206", "E111", "E114", "E117", "E501", "ISC001", "Q000", "Q001", "Q002", "Q003", "W191", |