diff options
Diffstat (limited to 'kubernetes/docs')
30 files changed, 1422 insertions, 0 deletions
| diff --git a/kubernetes/docs/.gitignore b/kubernetes/docs/.gitignore new file mode 100644 index 0000000..f40fbd8 --- /dev/null +++ b/kubernetes/docs/.gitignore @@ -0,0 +1,5 @@ +_site +.sass-cache +.jekyll-cache +.jekyll-metadata +vendor diff --git a/kubernetes/docs/404.html b/kubernetes/docs/404.html new file mode 100644 index 0000000..086a5c9 --- /dev/null +++ b/kubernetes/docs/404.html @@ -0,0 +1,25 @@ +--- +permalink: /404.html +layout: default +--- + +<style type="text/css" media="screen"> +  .container { +    margin: 10px auto; +    max-width: 600px; +    text-align: center; +  } +  h1 { +    margin: 30px 0; +    font-size: 4em; +    line-height: 1; +    letter-spacing: -1px; +  } +</style> + +<div class="container"> +  <h1>404</h1> + +  <p><strong>Page not found :(</strong></p> +  <p>The requested page could not be found.</p> +</div> diff --git a/kubernetes/docs/Gemfile b/kubernetes/docs/Gemfile new file mode 100644 index 0000000..754098e --- /dev/null +++ b/kubernetes/docs/Gemfile @@ -0,0 +1,32 @@ +source "https://rubygems.org" +# Hello! This is where you manage which Jekyll version is used to run. +# When you want to use a different version, change it below, save the +# file and run `bundle install`. Run Jekyll with `bundle exec`, like so: +# +#     bundle exec jekyll serve +# +# This will help ensure the proper Jekyll version is running. +# Happy Jekylling! +gem "jekyll", "~> 4.2.0" +# This is the default theme for new Jekyll sites. You may change this to anything you like. +gem "minima", "~> 2.5" +# If you want to use GitHub Pages, remove the "gem "jekyll"" above and +# uncomment the line below. To upgrade, run `bundle update github-pages`. +# gem "github-pages", group: :jekyll_plugins +# If you have any plugins, put them here! +group :jekyll_plugins do +  gem "jekyll-feed", "~> 0.12" +end + +# Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem +# and associated library. 
+platforms :mingw, :x64_mingw, :mswin, :jruby do +  gem "tzinfo", "~> 1.2" +  gem "tzinfo-data" +end + +# Performance-booster for watching directories on Windows +gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin] + +gem "webrick", "~> 1.7" +gem "just-the-docs" diff --git a/kubernetes/docs/Gemfile.lock b/kubernetes/docs/Gemfile.lock new file mode 100644 index 0000000..e992f7d --- /dev/null +++ b/kubernetes/docs/Gemfile.lock @@ -0,0 +1,88 @@ +GEM +  remote: https://rubygems.org/ +  specs: +    addressable (2.8.0) +      public_suffix (>= 2.0.2, < 5.0) +    colorator (1.1.0) +    concurrent-ruby (1.1.9) +    em-websocket (0.5.2) +      eventmachine (>= 0.12.9) +      http_parser.rb (~> 0.6.0) +    eventmachine (1.2.7) +    ffi (1.15.4) +    forwardable-extended (2.6.0) +    http_parser.rb (0.6.0) +    i18n (1.8.10) +      concurrent-ruby (~> 1.0) +    jekyll (4.2.0) +      addressable (~> 2.4) +      colorator (~> 1.0) +      em-websocket (~> 0.5) +      i18n (~> 1.0) +      jekyll-sass-converter (~> 2.0) +      jekyll-watch (~> 2.0) +      kramdown (~> 2.3) +      kramdown-parser-gfm (~> 1.0) +      liquid (~> 4.0) +      mercenary (~> 0.4.0) +      pathutil (~> 0.9) +      rouge (~> 3.0) +      safe_yaml (~> 1.0) +      terminal-table (~> 2.0) +    jekyll-feed (0.15.1) +      jekyll (>= 3.7, < 5.0) +    jekyll-sass-converter (2.1.0) +      sassc (> 2.0.1, < 3.0) +    jekyll-seo-tag (2.7.1) +      jekyll (>= 3.8, < 5.0) +    jekyll-watch (2.2.1) +      listen (~> 3.0) +    just-the-docs (0.3.3) +      jekyll (>= 3.8.5) +      jekyll-seo-tag (~> 2.0) +      rake (>= 12.3.1, < 13.1.0) +    kramdown (2.3.1) +      rexml +    kramdown-parser-gfm (1.1.0) +      kramdown (~> 2.0) +    liquid (4.0.3) +    listen (3.7.0) +      rb-fsevent (~> 0.10, >= 0.10.3) +      rb-inotify (~> 0.9, >= 0.9.10) +    mercenary (0.4.0) +    minima (2.5.1) +      jekyll (>= 3.5, < 5.0) +      jekyll-feed (~> 0.9) +      jekyll-seo-tag (~> 2.1) +    pathutil (0.16.2) +      
forwardable-extended (~> 2.6) +    public_suffix (4.0.6) +    rake (13.0.6) +    rb-fsevent (0.11.0) +    rb-inotify (0.10.1) +      ffi (~> 1.0) +    rexml (3.2.5) +    rouge (3.26.0) +    safe_yaml (1.0.5) +    sassc (2.4.0) +      ffi (~> 1.9) +    terminal-table (2.0.0) +      unicode-display_width (~> 1.1, >= 1.1.1) +    unicode-display_width (1.7.0) +    webrick (1.7.0) + +PLATFORMS +  x86_64-linux + +DEPENDENCIES +  jekyll (~> 4.2.0) +  jekyll-feed (~> 0.12) +  just-the-docs +  minima (~> 2.5) +  tzinfo (~> 1.2) +  tzinfo-data +  wdm (~> 0.1.1) +  webrick (~> 1.7) + +BUNDLED WITH +   2.2.27 diff --git a/kubernetes/docs/README.md b/kubernetes/docs/README.md new file mode 100644 index 0000000..664a982 --- /dev/null +++ b/kubernetes/docs/README.md @@ -0,0 +1,5 @@ +# DevOps Knowledgebase + +## Local Development + +Run `jekyll serve --config _config.yml,_config_dev.yml` to start locally. diff --git a/kubernetes/docs/_config.yml b/kubernetes/docs/_config.yml new file mode 100644 index 0000000..3c4b10a --- /dev/null +++ b/kubernetes/docs/_config.yml @@ -0,0 +1,54 @@ +title: PyDis DevOps +email: [email protected] +description: >- +  Knowledgebase for all things DevOps in Python Discord. 
+baseurl: "/kubernetes" +url: "https://python-discord.github.io" +twitter_username: PythonDiscord +github_username:  python-discord + +remote_theme: just-the-docs/just-the-docs +plugins: +  - jekyll-feed + +# Compression tuning +compress_html: +  ignore: +    envs: all + +# Code block preferences +kramdown: +  syntax_highlighter_opts: +    block: +      line_numbers: true + +# Theme configuration +aux_links: +  "Python Discord": +    "https://pythondiscord.com/" +aux_links_new_tab: true + +color_scheme: dark + +# Document detection +collections: +  runbooks: +    permalink: "/:collection/:path/" +    output: true +  queries: +    permalink: "/:collection/:path" +    output: true +  general: +    permalink: "/:collection/:path" +    output: true + +just_the_docs: +  collections: +    pages: +      name: Pages +    general: +      name: General +    runbooks: +      name: Runbooks +    queries: +      name: Queries diff --git a/kubernetes/docs/_config_dev.yml b/kubernetes/docs/_config_dev.yml new file mode 100644 index 0000000..20fb1af --- /dev/null +++ b/kubernetes/docs/_config_dev.yml @@ -0,0 +1,2 @@ +baseurl: "" +url: "http://localhost:4000" diff --git a/kubernetes/docs/_general/index.md b/kubernetes/docs/_general/index.md new file mode 100644 index 0000000..1d84650 --- /dev/null +++ b/kubernetes/docs/_general/index.md @@ -0,0 +1,7 @@ +--- +title: General +has_children: true +layout: default +nav_exclude: true +search_exclude: true +--- diff --git a/kubernetes/docs/_general/manual-deploys.md b/kubernetes/docs/_general/manual-deploys.md new file mode 100644 index 0000000..092647a --- /dev/null +++ b/kubernetes/docs/_general/manual-deploys.md @@ -0,0 +1,20 @@ +--- +title: Manual Deploys +layout: default +--- + +# Manual Deployments + +When the DevOps team are not available, Administrators and Core Developers can redeploy our critical services, such as Bot, Site and ModMail. + +This is handled through workflow dispatches on this repository. 
To get started, head to the [Actions](https://github.com/python-discord/kubernetes/actions) tab of this repository and select `Manual Redeploy` in the sidebar, alternatively navigate [here](https://github.com/python-discord/kubernetes/actions/workflows/manual_redeploy.yml). + +<img width="308" alt="image" src="https://user-images.githubusercontent.com/20439493/116442084-00d5f400-a84a-11eb-8e8a-e9e6bcc327dd.png"> + +Click `Run workflow` on the right hand side and enter the service name that needs redeploying, keep the branch as `main`: + +<img width="947" alt="image" src="https://user-images.githubusercontent.com/20439493/116442202-22cf7680-a84a-11eb-8cce-a3e715a1bf68.png"> + +Click `Run` and refresh the page, you'll see a new in progress Action which you can track. Once the deployment completes notifications will be sent to the `#dev-ops` channel on Discord. + +If you encounter errors with this please copy the Action run link to Discord so the DevOps team can investigate when available. 
diff --git a/kubernetes/docs/_queries/index.md b/kubernetes/docs/_queries/index.md new file mode 100644 index 0000000..991f86d --- /dev/null +++ b/kubernetes/docs/_queries/index.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Queries +has_children: true +nav_exclude: true +search_exclude: true +--- diff --git a/kubernetes/docs/_queries/kubernetes.md b/kubernetes/docs/_queries/kubernetes.md new file mode 100644 index 0000000..032ad70 --- /dev/null +++ b/kubernetes/docs/_queries/kubernetes.md @@ -0,0 +1,28 @@ +--- +layout: page +title: Kubernetes +--- + +# Kubernetes tips + +## Find top pods by CPU/memory + +```bash +$ kubectl top pods --all-namespaces --sort-by='memory' +$ kubectl top pods --all-namespaces --sort-by='cpu' +``` + +## Find top nodes by CPU/memory + +```bash +$ kubectl top nodes --sort-by='cpu' +$ kubectl top nodes --sort-by='memory' +``` + +## Kubernetes cheat sheet + +[Open Kubernetes cheat sheet](https://kubernetes.io/docs/reference/kubectl/cheatsheet/){: .btn .btn-purple }{:target="_blank"} + +## Lens IDE + +[Open Lens IDE](https://k8slens.dev){: .btn .btn-purple }{:target="_blank"} diff --git a/kubernetes/docs/_queries/loki.md b/kubernetes/docs/_queries/loki.md new file mode 100644 index 0000000..5dee3c3 --- /dev/null +++ b/kubernetes/docs/_queries/loki.md @@ -0,0 +1,26 @@ +--- +layout: default +title: Loki +--- + +# Loki queries + +## Find any logs containing "ERROR" + +```sql +{job=~"default/.+"} |= "ERROR" +``` + +## Find all logs from bot service + +```sql +{job="default/bot"} +``` + +The format is `namespace/object` + +## Rate of logs from a service + +```sql +rate(({job="default/bot"} |= "error" != "timeout")[10s]) +``` diff --git a/kubernetes/docs/_queries/postgres.md b/kubernetes/docs/_queries/postgres.md new file mode 100644 index 0000000..13728f6 --- /dev/null +++ b/kubernetes/docs/_queries/postgres.md @@ -0,0 +1,301 @@ +--- +layout: default +title: PostgreSQL +--- + +# PostgreSQL queries + +## Disk usage + +Most of these queries vary based on 
the database you are connected to. + +### General Table Size Information Grouped For Partitioned Tables + +```sql +WITH RECURSIVE pg_inherit(inhrelid, inhparent) AS +    (select inhrelid, inhparent +    FROM pg_inherits +    UNION +    SELECT child.inhrelid, parent.inhparent +    FROM pg_inherit child, pg_inherits parent +    WHERE child.inhparent = parent.inhrelid), +pg_inherit_short AS (SELECT * FROM pg_inherit WHERE inhparent NOT IN (SELECT inhrelid FROM pg_inherit)) +SELECT table_schema +    , TABLE_NAME +    , row_estimate +    , pg_size_pretty(total_bytes) AS total +    , pg_size_pretty(index_bytes) AS INDEX +    , pg_size_pretty(toast_bytes) AS toast +    , pg_size_pretty(table_bytes) AS TABLE +  FROM ( +    SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes +    FROM ( +         SELECT c.oid +              , nspname AS table_schema +              , relname AS TABLE_NAME +              , SUM(c.reltuples) OVER (partition BY parent) AS row_estimate +              , SUM(pg_total_relation_size(c.oid)) OVER (partition BY parent) AS total_bytes +              , SUM(pg_indexes_size(c.oid)) OVER (partition BY parent) AS index_bytes +              , SUM(pg_total_relation_size(reltoastrelid)) OVER (partition BY parent) AS toast_bytes +              , parent +          FROM ( +                SELECT pg_class.oid +                    , reltuples +                    , relname +                    , relnamespace +                    , pg_class.reltoastrelid +                    , COALESCE(inhparent, pg_class.oid) parent +                FROM pg_class +                    LEFT JOIN pg_inherit_short ON inhrelid = oid +                WHERE relkind IN ('r', 'p') +             ) c +             LEFT JOIN pg_namespace n ON n.oid = c.relnamespace +  ) a +  WHERE oid = parent +) a +ORDER BY total_bytes DESC; +``` + +### General Table Size Information + +```sql +SELECT *, pg_size_pretty(total_bytes) AS total +    , pg_size_pretty(index_bytes) AS index +    
, pg_size_pretty(toast_bytes) AS toast +    , pg_size_pretty(table_bytes) AS table +  FROM ( +  SELECT *, total_bytes-index_bytes-coalesce(toast_bytes,0) AS table_bytes FROM ( +      SELECT c.oid,nspname AS table_schema, relname AS table_name +              , c.reltuples AS row_estimate +              , pg_total_relation_size(c.oid) AS total_bytes +              , pg_indexes_size(c.oid) AS index_bytes +              , pg_total_relation_size(reltoastrelid) AS toast_bytes +          FROM pg_class c +          LEFT JOIN pg_namespace n ON n.oid = c.relnamespace +          WHERE relkind = 'r' +  ) a +) a; +``` + +### Finding the largest databases in your cluster + +```sql +SELECT d.datname as Name,  pg_catalog.pg_get_userbyid(d.datdba) as Owner, +    CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT') +        THEN pg_catalog.pg_size_pretty(pg_catalog.pg_database_size(d.datname)) +        ELSE 'No Access' +    END as Size +FROM pg_catalog.pg_database d +    order by +    CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT') +        THEN pg_catalog.pg_database_size(d.datname) +        ELSE NULL +    END desc -- nulls first +    LIMIT 20; +``` + +### Finding the size of your biggest relations + +Relations are objects in the database such as tables and indexes, and this query shows the size of all the individual parts. + +```sql +SELECT nspname || '.' || relname AS "relation", +    pg_size_pretty(pg_relation_size(C.oid)) AS "size" +  FROM pg_class C +  LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) +  WHERE nspname NOT IN ('pg_catalog', 'information_schema') +  ORDER BY pg_relation_size(C.oid) DESC +  LIMIT 20; +``` + +### Finding the total size of your biggest tables + +```sql +SELECT nspname || '.' 
|| relname AS "relation", +    pg_size_pretty(pg_total_relation_size(C.oid)) AS "total_size" +  FROM pg_class C +  LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) +  WHERE nspname NOT IN ('pg_catalog', 'information_schema') +    AND C.relkind <> 'i' +    AND nspname !~ '^pg_toast' +  ORDER BY pg_total_relation_size(C.oid) DESC +  LIMIT 20; +``` + +## Indexes + +### Index summary + +```sql +SELECT +    pg_class.relname, +    pg_size_pretty(pg_class.reltuples::bigint) AS rows_in_bytes, +    pg_class.reltuples AS num_rows, +    count(indexname) AS number_of_indexes, +    CASE WHEN x.is_unique = 1 THEN 'Y' +       ELSE 'N' +    END AS UNIQUE, +    SUM(case WHEN number_of_columns = 1 THEN 1 +              ELSE 0 +            END) AS single_column, +    SUM(case WHEN number_of_columns IS NULL THEN 0 +             WHEN number_of_columns = 1 THEN 0 +             ELSE 1 +           END) AS multi_column +FROM pg_namespace +LEFT OUTER JOIN pg_class ON pg_namespace.oid = pg_class.relnamespace +LEFT OUTER JOIN +       (SELECT indrelid, +           max(CAST(indisunique AS integer)) AS is_unique +       FROM pg_index +       GROUP BY indrelid) x +       ON pg_class.oid = x.indrelid +LEFT OUTER JOIN +    ( SELECT c.relname AS ctablename, ipg.relname AS indexname, x.indnatts AS number_of_columns FROM pg_index x +           JOIN pg_class c ON c.oid = x.indrelid +           JOIN pg_class ipg ON ipg.oid = x.indexrelid  ) +    AS foo +    ON pg_class.relname = foo.ctablename +WHERE +     pg_namespace.nspname='public' +AND  pg_class.relkind = 'r' +GROUP BY pg_class.relname, pg_class.reltuples, x.is_unique +ORDER BY 2; +``` + +### Index size/usage statistics + +```sql +SELECT +    t.schemaname, +    t.tablename, +    indexname, +    c.reltuples AS num_rows, +    pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' || quote_ident(t.tablename)::text)) AS table_size, +    pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' 
|| quote_ident(indexrelname)::text)) AS index_size, +    CASE WHEN indisunique THEN 'Y' +        ELSE 'N' +    END AS UNIQUE, +    number_of_scans, +    tuples_read, +    tuples_fetched +FROM pg_tables t +LEFT OUTER JOIN pg_class c ON t.tablename = c.relname +LEFT OUTER JOIN ( +    SELECT +        c.relname AS ctablename, +        ipg.relname AS indexname, +        x.indnatts AS number_of_columns, +        idx_scan AS number_of_scans, +        idx_tup_read AS tuples_read, +        idx_tup_fetch AS tuples_fetched, +        indexrelname, +        indisunique, +        schemaname +    FROM pg_index x +    JOIN pg_class c ON c.oid = x.indrelid +    JOIN pg_class ipg ON ipg.oid = x.indexrelid +    JOIN pg_stat_all_indexes psai ON x.indexrelid = psai.indexrelid +) AS foo ON t.tablename = foo.ctablename AND t.schemaname = foo.schemaname +WHERE t.schemaname NOT IN ('pg_catalog', 'information_schema') +ORDER BY 1,2; +``` + +### Duplicate indexes + +```sql +SELECT pg_size_pretty(sum(pg_relation_size(idx))::bigint) as size, +       (array_agg(idx))[1] as idx1, (array_agg(idx))[2] as idx2, +       (array_agg(idx))[3] as idx3, (array_agg(idx))[4] as idx4 +FROM ( +    SELECT indexrelid::regclass as idx, (indrelid::text ||E'\n'|| indclass::text ||E'\n'|| indkey::text ||E'\n'|| +                                         coalesce(indexprs::text,'')||E'\n' || coalesce(indpred::text,'')) as key +    FROM pg_index) sub +GROUP BY key HAVING count(*)>1 +ORDER BY sum(pg_relation_size(idx)) DESC; +``` + +## Maintenance + +[PostgreSQL wiki](https://wiki.postgresql.org/wiki/Main_Page) + +### CLUSTER-ing + +[CLUSTER](https://www.postgresql.org/docs/current/sql-cluster.html) + +```sql +CLUSTER [VERBOSE] table_name [ USING index_name ] +CLUSTER [VERBOSE] +``` + +`CLUSTER` instructs PostgreSQL to cluster the table specified by `table_name` based on the index specified by `index_name`. The index must already have been defined on `table_name`. 
+ +When a table is clustered, it is physically reordered based on the index information. + +### VACUUM-ing + +Proper vacuuming, particularly autovacuum configuration, is crucial to a fast and reliable database. + +[Introduction to VACUUM, ANALYZE, EXPLAIN, and COUNT](https://wiki.postgresql.org/wiki/Introduction_to_VACUUM,_ANALYZE,_EXPLAIN,_and_COUNT) + +It is not advised to run `VACUUM FULL`, instead look at clustering. VACUUM FULL is a much more intensive task and acquires an ACCESS EXCLUSIVE lock on the table, blocking reads and writes. Whilst `CLUSTER` also does acquire this lock it's a less intensive and faster process. + +#### Finding number of dead rows + +```sql +SELECT relname, n_dead_tup FROM pg_stat_user_tables WHERE n_dead_tup <> 0 ORDER BY 2 DESC; +``` + +#### Finding last vacuum/auto-vacuum date + +```sql +SELECT relname, last_vacuum, last_autovacuum FROM pg_stat_user_tables; +``` + +#### Checking auto-vacuum is enabled + +```sql +SELECT name, setting FROM pg_settings WHERE name='autovacuum'; +``` + +#### View all auto-vacuum settings + +```sql +SELECT * from pg_settings where category like 'Autovacuum'; +``` + +## Locks + +### Looking at locks that have not been granted + +```sql +SELECT relation::regclass, * FROM pg_locks WHERE NOT granted; +``` + +### Combination of blocked and blocking activity + +```sql +SELECT blocked_locks.pid     AS blocked_pid, +         blocked_activity.usename  AS blocked_user, +         blocking_locks.pid     AS blocking_pid, +         blocking_activity.usename AS blocking_user, +         blocked_activity.query    AS blocked_statement, +         blocking_activity.query   AS current_statement_in_blocking_process +   FROM  pg_catalog.pg_locks         blocked_locks +    JOIN pg_catalog.pg_stat_activity blocked_activity  ON blocked_activity.pid = blocked_locks.pid +    JOIN pg_catalog.pg_locks         blocking_locks +        ON blocking_locks.locktype = blocked_locks.locktype +        AND blocking_locks.database IS NOT DISTINCT FROM 
blocked_locks.database +        AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation +        AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page +        AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple +        AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid +        AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid +        AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid +        AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid +        AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid +        AND blocking_locks.pid != blocked_locks.pid + +    JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid +   WHERE NOT blocked_locks.granted; +``` diff --git a/kubernetes/docs/_runbooks/index.md b/kubernetes/docs/_runbooks/index.md new file mode 100644 index 0000000..357431f --- /dev/null +++ b/kubernetes/docs/_runbooks/index.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Runbooks +has_children: true +nav_exclude: true +search_exclude: true +--- diff --git a/kubernetes/docs/_runbooks/postgresql-upgrade.md b/kubernetes/docs/_runbooks/postgresql-upgrade.md new file mode 100644 index 0000000..7d85de2 --- /dev/null +++ b/kubernetes/docs/_runbooks/postgresql-upgrade.md @@ -0,0 +1,123 @@ +--- +title: PostgreSQL Upgrade +layout: page +--- + +# Upgrading PostgreSQL + +<details open markdown="block"> +  <summary> +    Table of contents +  </summary> +  {: .text-delta } +1. TOC +{:toc} +</details> + +# Step 1 - Enable maintenance mode + +Add a worker route for `pythondiscord.com/*` to forward to the `maintenance` Cloudflare worker. + +# Step 2 - Scale down all services that use PostgreSQL + +Notably site, metricity, bitwarden and the like should be scaled down. 
+ +Services that are read only such as Grafana (but NOT Metabase, Metabase uses PostgreSQL for internal storage) do not need to be scaled down, as they do not update the database in any way. + +```bash +$ kubectl scale deploy --replicas 0 site metricity metabase bitwarden ... +``` + +# Step 3 - Take a database dump and gzip + +Using `pg_dumpall`, dump the contents of all databases to a `.sql` file. + +Make sure to gzip for faster transfer. + +Take a SHA512 sum of the output `.sql.gz` file to validate integrity after copying. + +```bash +$ pg_dumpall -U pythondiscord > backup.sql +$ gzip backup.sql +$ sha512sum backup.sql.gz +a3337bfc65a072fd93124233ac1cefcdfbe8a708e5c1d08adaca2cf8c7cbe9ae4853ffab8c5cfbe943182355eaa701012111a420b29cc4f74d1e87f9df3af459  backup.sql.gz +``` + +# Step 4 - Move database dump locally + +Use `kubectl cp` to move the `backup.sql.gz` file from the remote pod to your local machine. + +Validate the integrity of the received file. + +# Step 5 - Attempt local import to new PostgreSQL version + +Install the new version of PostgreSQL locally and import the data. Make sure you are operating on a **completely empty database server.** + +```bash +$ gzcat backup.sql.gz | psql -U joe +``` + +You can use any PostgreSQL superuser for the import. Ensure that no errors other than those mentioned below occur, you may need to attempt multiple times to fix errors listed below. + +## Handle import errors + +Monitor the output of `psql` to check that no errors appear. + +If you receive locale errors ensure that the locale your database is configured with matches the import script, this may require some usage of `sed`: + +```bash +$ sed -i '' "s/en_US.utf8/en_GB.UTF-8/g" backup.sql +``` + +Ensure that you **RESET THESE CHANGES** before attempting an import on the remote, if they come from the PostgreSQL Docker image they will need the same locale as the export. + +# Step 7 - Spin down PostgreSQL + +Spin down PostgreSQL to 0 replicas. 
+ +# Step 8 - Take volume backup at Linode + +Backup the volume at Linode through a clone in the Linode UI, name it something obvious. + +# Step 9 - Remove the Linode persistent volume + +Delete the volume specified in the `volume.yaml` file in the `postgresql` directory, you must delete the `pvc` first followed by the `pv`, you can find the relevant disks through `kubectl get pv/pvc`. + +# Step 10 - Create a new volume by re-applying the `volume.yaml` file + +Apply the `volume.yaml` so a new, empty, volume is created. + +# Step 11 - Bump the PostgreSQL version in the `deployment.yaml` file + +Update the Docker image used in the deployment manifest. + +# Step 12 - Apply the deployment + +Run `kubectl apply -f postgresql/deployment.yaml` to start the new database server. + +# Step 13 - Copy the data across + +After the pod has initialised use `kubectl cp` to copy the gzipped backup to the new Postgres pod. + +# Step 14 - Extract and import the new data + +```bash +$ gunzip backup.sql.gz +$ psql -U pythondiscord -f backup.sql +``` + +# Step 15 - Validate data import complete + +Ensure that all logs are successful, you may get duplicate errors for the `pythondiscord` user and database, these are safe to ignore. + +# Step 16 - Scale up services + +Scale the services that were scaled down in Step 2 back up. + +```bash +$ kubectl scale deploy --replicas 1 metricity bitwarden metabase +``` + +# Step 17 - Validate all services interact correctly + +Validate that all services reconnect successfully and start exchanging data, ensure that no abnormal logs are outputted and performance remains as expected. 
diff --git a/kubernetes/docs/favicon.ico b/kubernetes/docs/favicon.icoBinary files differ new file mode 100644 index 0000000..45ef3cf --- /dev/null +++ b/kubernetes/docs/favicon.ico diff --git a/kubernetes/docs/index.md b/kubernetes/docs/index.md new file mode 100644 index 0000000..a9c47cb --- /dev/null +++ b/kubernetes/docs/index.md @@ -0,0 +1,16 @@ +--- +layout: default +title: Home +nav_order: 1 + +--- + +# Python Discord DevOps + +Welcome to the Python Discord DevOps knowledgebase. + +Within this set of pages you will find: +- Changelogs +- Post-mortems +- Common queries +- Runbooks diff --git a/kubernetes/docs/postmortems/2020-12-11-all-services-outage.md b/kubernetes/docs/postmortems/2020-12-11-all-services-outage.md new file mode 100644 index 0000000..35c6d70 --- /dev/null +++ b/kubernetes/docs/postmortems/2020-12-11-all-services-outage.md @@ -0,0 +1,86 @@ +--- +layout: default +title: "2020-12-11: All services outage" +parent: Postmortems +nav_order: 2 +--- + +# 2020-12-11: All services outage + +At **19:55 UTC, all services became unresponsive**. The DevOps team were already in a call, and immediately started to investigate. + +Postgres was running at 100% CPU usage due to a **VACUUM**, which caused all services that depended on it to stop working. The high CPU left the host unresponsive and it shut down. Linode Lassie noticed this and triggered a restart. + +It did not recover gracefully from this restart, with numerous core services reporting an error, so we had to manually restart core system services using Lens in order to get things working again. + +## ⚠️ Leadup + +*List the sequence of events that led to the incident* + +Postgres triggered an **AUTOVACUUM**, which led to a CPU spike. This made Postgres run at 100% CPU and was unresponsive, which caused services to stop responding. This led to a restart of the node, from which we did not recover gracefully. 
+ +## 🥅 Impact + +*Describe how internal and external users were impacted during the incident* + +All services went down. Catastrophic failure. We did not pass go, we did not collect $200. + +- Help channel system unavailable, so people are not able to effectively ask for help. +- Gates unavailable, so people can't successfully get into the community. +- Moderation and raid prevention unavailable, which leaves us defenseless against attacks. + +## 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +We noticed that all PyDis services had stopped responding, coincidentally our DevOps team were in a call at the time, so that was helpful. + +We may be able to improve detection time by adding monitoring of resource usage. To this end, we've added alerts for high CPU usage and low memory. + +## 🙋🏿‍♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the incident. + +We noticed our node was entirely unresponsive and within minutes a restart had been triggered by Lassie after a high CPU shutdown occurred. + +The node came back and we saw a number of core services offline (e.g. Calico, CoreDNS, Linode CSI). + +**Obstacle: no recent database back-up available**{: .text-red-200 } + +## 🙆🏽‍♀️ Recovery + +*How was the incident resolved? How can we improve future mitigation times?* + +Through [Lens](https://k8slens.dev/) we restarted core services one by one until they stabilised, after these core services were up other services began to come back online. + +We finally provisioned PostgreSQL which had been removed as a component before the restart (but too late to prevent the CPU errors). Once PostgreSQL was up we restarted any components that were acting buggy (e.g. site and bot). + +## 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +- Major service outage +- **Why?** Core service failures (e.g. 
Calico, CoreDNS, Linode CSI) +- **Why?** Kubernetes worker node restart +- **Why?** High CPU shutdown +- **Why?** Intensive PostgreSQL AUTOVACUUM caused a CPU spike + +## 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent recurrence* + +## 🤔 Lessons learned + +*What did we learn from this incident?* + +- We must ensure we have working database backups. We are lucky that we did not lose any data this time. If this problem had caused volume corruption, we would be screwed. +- Sentry is broken for the bot. It was missing a DSN secret, which we have now restored. +- The [https://sentry.pydis.com](https://sentry.pydis.com) redirect was never migrated to the cluster. **We should do that.** + +## ☑️ Follow-up tasks + +*List any tasks we've created as a result of this incident* + +- [x] Push forward with backup plans diff --git a/kubernetes/docs/postmortems/2020-12-11-postgres-conn-surge.md b/kubernetes/docs/postmortems/2020-12-11-postgres-conn-surge.md new file mode 100644 index 0000000..3e5360c --- /dev/null +++ b/kubernetes/docs/postmortems/2020-12-11-postgres-conn-surge.md @@ -0,0 +1,96 @@ +--- +layout: default +title: "2020-12-11: Postgres connection surge" +parent: Postmortems +nav_order: 1 +--- + +# 2020-12-11: Postgres connection surge + +At **13:24 UTC,** we noticed the bot was not able to infract, and [pythondiscord.com](http://pythondiscord.com) was unavailable. The DevOps team started to investigate. + +We discovered that Postgres was not accepting new connections because it had hit 100 clients. This made it unavailable to all services that depended on it. + +Ultimately this was resolved by taking down Postgres, remounting the associated volume, and bringing it back up again. + +## ⚠️ Leadup + +*List the sequence of events that led to the incident* + +The bot infractions stopped working, and we started investigating. 
+ +## 🥅 Impact + +*Describe how internal and external users were impacted during the incident* + +Services were unavailable both for internal and external users. + +- The Help Channel System was unavailable. +- Voice Gate and Server Gate were not working. +- Moderation commands were unavailable. +- Python Discord site & API were unavailable. CloudFlare automatically switched us to Always Online. + +## 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +We noticed HTTP 524s coming from CloudFlare, upon attempting database connection we observed the maximum client limit. + +We noticed this log in site: + +```yaml +django.db.utils.OperationalError: FATAL:  sorry, too many clients already +``` + +We should be monitoring number of clients, and the monitor should alert us when we're approaching the max. That would have allowed for earlier detection, and possibly allowed us to prevent the incident altogether. + +We will look at [wrouesnel/postgres_exporter](https://github.com/wrouesnel/postgres_exporter) for monitoring this. + +## 🙋🏿‍♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the incident. The obstacles were mostly a lack of a clear response strategy. + +We should document our recovery procedure so that we're not so dependent on Joe Banks should this happen again while he's unavailable. + +## 🙆🏽‍♀️ Recovery + +*How was the incident resolved? How can we improve future mitigation?* + +- Delete PostgreSQL deployment `kubectl delete deployment/postgres` +- Delete any remaining pods, WITH force. `kubectl delete pod <pod name> --force --grace-period=0` +- Unmount volume at Linode +- Remount volume at Linode +- Reapply deployment `kubectl apply -f postgres/deployment.yaml` + +## 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +- Postgres was unavailable, so our services died. 
+- **Why?** Postgres hit max clients, and could not respond. +- **Why?** Unknown, but we saw a number of connections from previous deployments of site. This indicates that database connections are not being terminated properly. Needs further investigation. + +## 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent reoccurrence* + +We're not sure what the root cause is, but suspect site is not terminating database connections properly in some cases. We were unable to reproduce this problem. + +We've set up new telemetry on Grafana with alerts so that we can investigate this more closely. We will be alerted if the number of connections from site exceeds 32, or if the total number of connections exceeds 90. + +## 🤔 Lessons learned + +*What did we learn from this incident?* + +- We must ensure the DevOps team has access to Linode and other key services even if our Bitwarden is down. +- We need to ensure we're alerted to any risk factors that have the potential to make Postgres unavailable, since this causes a catastrophic outage of practically all services. +- We absolutely need backups for the databases, so that this sort of problem carries less of a risk. +- We may need to consider something like [pg_bouncer](https://wiki.postgresql.org/wiki/PgBouncer) to manage a connection pool so that we don't exceed 100 *legitimate* clients connected as we connect more services to the postgres database. 
+ +## ☑️ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x] All database backup diff --git a/kubernetes/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.md b/kubernetes/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.md new file mode 100644 index 0000000..a8fb815 --- /dev/null +++ b/kubernetes/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.md @@ -0,0 +1,86 @@ +--- +layout: default +title: "2021-01-10: Primary Kubernetes node outage" +parent: Postmortems +nav_order: 3 +--- + +# 2021-01-10: Primary Kubernetes node outage + + +We had an outage of our highest spec node due to CPU exhaustion. The outage lasted from around 20:20 to 20:46 UTC, but was not a full service outage. + +## ⚠️ Leadup + +*List the sequence of events that led to the incident* + +I ran a query on Prometheus to try to figure out some statistics on the number of metrics we are holding. This ended up scanning a lot of data in the TSDB database that Prometheus uses. + +This scan caused CPU exhaustion, which caused issues with the Kubernetes node status. + +## 🥅 Impact + +*Describe how internal and external users were impacted during the incident* + +This brought down the primary node which meant there was some service outage. Most services transferred successfully to our secondary node which kept up some key services such as the Moderation bot and Modmail bot, as well as MongoDB. + +## 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +This was noticed when Discord services started having failures. The primary detection was through alerts though! I was paged 1 minute after we started encountering CPU exhaustion issues. + +## 🙋🏿‍♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the incident. + +No major obstacles were encountered during this. + +## 🙆🏽‍♀️ Recovery + +*How was the incident resolved? 
How can we improve future mitigation?* + +It was noted that in the response to `kubectl get nodes` the primary node's status was reported as `NotReady`. Looking into the reason, we found that the node had stopped responding. + +The quickest way to fix this was triggering a node restart. This shifted a lot of pods over to node 2 which encountered some capacity issues since it's not as highly specified as the first node. + +I brought the first node back by restarting it at Linode's end. Once this node was reporting as `Ready` again I drained the second node by running `kubectl drain lke13311-20304-5ffa4d11faab`. This command stops the node from being available for scheduling and moves existing pods onto other nodes. + +Services gradually recovered as the dependencies started. Overall, the incident lasted around 26 minutes, though this was not a complete outage for the whole time and the bot remained functional throughout (meaning systems like the help channels were still functional). + +## 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +**Why?** Partial service outage + +**Why?** We had a node outage. + +**Why?** CPU exhaustion of our primary node. + +**Why?** Large Prometheus query using a lot of CPU. + +**Why?** Prometheus had to scan millions of TSDB records which consumed all cores. + +## 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent reoccurrence* + +A large query was run on Prometheus, so the solution is just to not run said queries. + +To protect against this more precisely though we should write resource constraints for services like this that are vulnerable to CPU exhaustion or memory consumption, which are the causes of our two past outages as well. + +## 🤔 Lessons learned + +*What did we learn from this incident?* + +- Don't run large queries, it consumes CPU! +- Write resource constraints for our services. 
+ +## ☑️ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x]  Write resource constraints for our services. diff --git a/kubernetes/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.md b/kubernetes/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.md new file mode 100644 index 0000000..6935f02 --- /dev/null +++ b/kubernetes/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.md @@ -0,0 +1,112 @@ +--- +layout: default +title: "2021-01-12: Django site CPU/RAM exhaustion outage" +parent: Postmortems +nav_order: 4 +--- + +# 2021-01-12: Django site CPU/RAM exhaustion outage + +At 03:01 UTC on Tuesday 12th January we experienced a momentary outage of our PostgreSQL database, causing some very minor service downtime. + +# ⚠️ Leadup + +*List the sequence of events that led to the incident* + +We deleted the Developers role which led to a large user diff for all the users where we had to update their roles on the site. + +The bot was trying to post this for over 24 hours repeatedly after every restart. + +We deployed the bot at 2:55 UTC on 12th January and the user sync process began once again. + +This caused a CPU & RAM spike on our Django site, which in turn triggered an OOM error on the server which killed the Postgres process, sending it into a recovery state where queries could not be executed. + +The Django site did not have any tools in place to batch the requests so was trying to process all 80k user updates in a single query, something that PostgreSQL probably could handle, but not the Django ORM. During the incident, site jumped from its average RAM usage of 300-400MB to **1.5GB.** + + + +RAM and CPU usage of site throughout the incident. The period just before 3:40 where no statistics were reported is the actual outage period where the Kubernetes node had some networking errors. 
+ +# π₯ Impact + +*Describe how internal and external users were impacted during the incident* + +This database outage lasted mere minutes, since Postgres recovered and healed itself and the sync process was aborted, but it did leave us with a large user diff and our database becoming further out of sync. + +Most services stayed up that did not depend on PostgreSQL, and the site remained stable after the sync had been cancelled. + +# ποΈ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +We were immediately alerted to the PostgreSQL outage on Grafana and through Sentry, meaning our response time was under a minute. + +We reduced some alert thresholds in order to catch RAM & CPU spikes faster in the future. + +It was hard to immediately see the cause of things since there is minimal logging on the site and the bot logs were not evident that anything was at fault, therefore our only detection was through machine metrics. + +We did manage to recover exactly what PostgreSQL was trying to do at the time of crashing by examining the logs which pointed us towards the user sync process. + +# ππΏββοΈ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the issue, there were no real obstacles encountered other than the node being less performant than we would like due to the CPU starvation. + +# ππ½ββοΈ Recovery + +*How was the incident resolved? How can we improve future mitigation?* + +The incident was resolved by stopping the sync process and writing a more efficient one through an internal eval script. We batched the updates into 1,000 users and instead of doing one large one did 80 smaller updates. This led to much higher efficiency with a cost of taking a little longer (~7 minutes). 
+ +```python +from bot.exts.backend.sync import _syncers +syncer = _syncers.UserSyncer +diff = await syncer._get_diff(ctx.guild) + +def chunks(lst, n): +    for i in range(0, len(lst), n): +        yield lst[i:i + n] + +for chunk in chunks(diff.updated, 1000): +    await bot.api_client.patch("bot/users/bulk_patch", json=chunk) +``` + +Resource limits were also put into place on site to prevent RAM and CPU spikes, and throttle the CPU usage in these situations. This can be seen in the below graph: + + + +CPU throttling is where a container has hit the limits and we need to reel it in. Ideally this value stays as close to 0 as possible; however, as you can see, site hit this twice (during the periods where it was trying to sync 80k users at once). + +# 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +- We experienced a major PostgreSQL outage +- PostgreSQL was killed by the system OOM due to the RAM spike on site. +- The RAM spike on site was caused by a large query. +- This was because we do not chunk queries on the bot. +- The large query was caused by the removal of the Developers role resulting in 80k users needing updating. + +# 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent reoccurrence* + +The removal of the Developers role created a large diff which could not be applied by Django in a single request. + +See the follow-up tasks for exactly how we can avoid this in future; it's a relatively easy mitigation. + +# 🤔 Lessons learned + +*What did we learn from this incident?* + +- Django (or DRF) does not like huge update queries. + +# ☑️ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x]  Make the bot syncer more efficient (batch requests) +- [ ]  Increase logging on bot, state when an error has been hit (we had no indication of this inside Discord, we need that) +- [x]  Adjust resource alerts to page DevOps members earlier. 
+- [x]  Apply resource limits to site to prevent major spikes diff --git a/kubernetes/docs/postmortems/2021-01-30-nodebalancer-fails-memory.md b/kubernetes/docs/postmortems/2021-01-30-nodebalancer-fails-memory.md new file mode 100644 index 0000000..dd2d624 --- /dev/null +++ b/kubernetes/docs/postmortems/2021-01-30-nodebalancer-fails-memory.md @@ -0,0 +1,101 @@ +--- +layout: default +title: "2021-01-30: NodeBalancer networking faults due to memory pressure" +parent: Postmortems +nav_order: 5 +--- + +# 2021-01-30: NodeBalancer networking faults due to memory pressure + +At around 14:30 UTC on Saturday 30th January we started experiencing networking issues at the LoadBalancer level between Cloudflare and our Kubernetes cluster. It seems that this was due to a NodeBalancer misconfiguration caused by memory and CPU pressure. + +~~This post-mortem is preliminary, we are still awaiting word from Linode's SysAdmins on any problems they detected.~~ + +**Update 2nd February 2021:** Linode have migrated our NodeBalancer to a different machine. + +## ⚠️ Leadup + +*List the sequence of events that led to the incident* + +At 14:30 we started receiving alerts that services were becoming unreachable. We first experienced some momentary DNS errors which resolved themselves; however, traffic ingress was still degraded. + +Upon checking Linode, our NodeBalancer (the service which balances traffic between our Kubernetes nodes) was reporting the backends (the services it balances to) as down. It reported all 4 as down (two for port 80 + two for port 443). This status was fluctuating between up and down, meaning traffic was not reaching our cluster correctly. Scaleios correctly noted: + + + +The config seems to have been set incorrectly due to memory and CPU pressure on one of our nodes. 
Here is the memory throughout the incident: + + + +Here is the display from Linode: + + + +## π₯ Impact + +*Describe how internal and external users were impacted during the incident* + +Since traffic could not correctly enter our cluster multiple services which were web based were offline, including services such as site, grafana and bitwarden. It appears that no inter-node communication was affected as this uses a WireGuard tunnel between the nodes which was not affected by the NodeBalancer. + +The lack of Grafana made diagnosis slightly more difficult, but even then it was only a short trip to the + +## ποΈ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +We were alerted fairly promptly through statping which reported services as being down and posted a Discord notification. Subsequent alerts came in from Grafana but were limited since outbound communication was faulty. + +## ππΏββοΈ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded! + +Primary obstacle was the DevOps tools being out due to the traffic ingress problems. + +## ππ½ββοΈ Recovery + +*How was the incident resolved? How can we improve future mitigation?* + +The incident resolved itself upstream at Linode, we've opened a ticket with Linode to let them know of the faults, this might give us a better indication of what caused the issues. Our Kubernetes cluster continued posting updates to Linode to refresh the NodeBalancer configuration, inspecting these payloads the configuration looked correct. + +We've set up alerts for when Prometheus services stop responding since this seems to be a fairly tell-tale symptom of networking problems, this was the Prometheus status graph throughout the incident: + + + +## π Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +**What?** Our service experienced an outage due to networking faults. 
+ +**Why?** Incoming traffic could not reach our Kubernetes nodes + +**Why?** Our Linode NodeBalancers were not using the correct configuration + +**Why?** Memory & CPU pressure seemed to cause invalid configuration errors upstream at Linode. + +**Why?** Unknown at this stage, NodeBalancer migrated. + +## 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent reoccurrence* + +The configuration of our NodeBalancer was invalid. We cannot say why at this point since we are awaiting contact back from Linode, but indicators point to it being an upstream fault since memory & CPU pressure should **not** cause a load balancer misconfiguration. + +Linode are going to follow up with us at some point during the week with information from their System Administrators. + +**Update 2nd February 2021:** Linode have concluded investigations at their end, taken notes and migrated our NodeBalancer to a new machine. We haven't experienced problems since. + +## 🤔 Lessons learned + +*What did we learn from this incident?* + +We should be careful about over-scheduling onto nodes since even while operating within reasonable constraints we risk sending invalid configuration upstream to Linode and therefore preventing traffic from entering our cluster. 
+ +## ☑️ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x]  Monitor for follow up from Linode +- [x]  Carefully monitor the allocation rules for our services diff --git a/kubernetes/docs/postmortems/2021-07-11-cascading-node-failures.md b/kubernetes/docs/postmortems/2021-07-11-cascading-node-failures.md new file mode 100644 index 0000000..adf0d57 --- /dev/null +++ b/kubernetes/docs/postmortems/2021-07-11-cascading-node-failures.md @@ -0,0 +1,185 @@ +--- +layout: default +title: "2021-07-11: Cascading node failures and ensuing volume problems" +parent: Postmortems +nav_order: 6 +--- + +# 2021-07-11: Cascading node failures and ensuing volume problems + +A PostgreSQL connection spike (00:27 UTC) caused by Django moved a node to an unresponsive state (00:55 UTC). Upon recycling the affected node, volumes were placed into a state where they could not be mounted. + +# ⚠️ Leadup + +*List the sequence of events that led to the incident* + +- **00:27 UTC:** Django starts rapidly using connections to our PostgreSQL database +- **00:32 UTC:** DevOps team is alerted that PostgreSQL has saturated its 115 max connections limit. Joe is paged. +- **00:33 UTC:** DevOps team is alerted that a service has claimed 34 dangerous table locks (it peaked at 61). +- **00:42 UTC:** Status incident created and backdated to 00:25 UTC. [Status incident](https://status.pythondiscord.com/incident/92712) +- **00:55 UTC:** It's clear that the node which PostgreSQL was on is no longer healthy after the Django connection surge, so it's recycled and a new one is to be added to the pool. +- **01:01 UTC:** Node `lke13311-16405-5fafd1b46dcf` begins its restart +- **01:13 UTC:** Node has been restored and regained healthy status, but volumes will not mount to the node. Support ticket opened at Linode for assistance. +- **06:36 UTC:** DevOps team alerted that Python is offline. 
This is due to Redis, a stateful service and a dependency of the bot, not being healthy. + +# 🥅 Impact + +*Describe how internal and external users were impacted during the incident* + +Initially, this manifested as a standard node outage where services on that node experienced some downtime as the node was restored. + +Post-restore, all stateful services (e.g. PostgreSQL, Redis, PrestaShop) were unable to run due to the volume issues, and so any dependent services (e.g. Site, Bot, Hastebin) also had trouble starting. + +PostgreSQL was restored early on so for the most part Moderation could continue. + +# 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +DevOps were initially alerted at 00:32 UTC due to the PostgreSQL connection surge, and acknowledged at the same time. + +Further alerting could be used to catch surges earlier on (looking at conn delta vs. conn total), but for the most part alerting time was satisfactory here. + +# 🙋🏿‍♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded. The primary issue encountered was failure upstream at Linode to remount the affected volumes; a support ticket has been created. + +# 🙆🏽‍♀️ Recovery + +*How was the incident resolved? How can we improve future mitigation?* + +Initial node restoration was performed by @Joe Banks by recycling the affected node. + +Subsequent volume restoration was also performed by @Joe Banks: once Linode had unlocked the volumes, affected pods were scaled down to 0, the volumes were unmounted at the Linode side and then the deployments were recreated. + +<details markdown="block"> +<summary>Support ticket sent</summary> + +<blockquote markdown="block"> +Good evening, + +We experienced a resource surge on one of our Kubernetes nodes at 00:32 UTC, causing a node to go unresponsive. To mitigate problems here the node was recycled and began restarting at 1:01 UTC. 
+ +The node has now rejoined the ring and started picking up services, but volumes will not attach to it, meaning pods with stateful storage will not start. + +An example events log for one such pod: + +``` +  Type     Reason       Age    From               Message +  ----     ------       ----   ----               ------- +  Normal   Scheduled    2m45s  default-scheduler  Successfully assigned default/redis-599887d778-wggbl to lke13311-16405-5fafd1b46dcf +  Warning  FailedMount  103s   kubelet            MountVolume.MountDevice failed for volume "pvc-bb1d06139b334c1f" : rpc error: code = Internal desc = Unable to find device path out of attempted paths: [/dev/disk/by-id/linode-pvcbb1d06139b334c1f /dev/disk/by-id/scsi-0Linode_Volume_pvcbb1d06139b334c1f] +  Warning  FailedMount  43s    kubelet            Unable to attach or mount volumes: unmounted volumes=[redis-data-volume], unattached volumes=[kube-api-access-6wwfs redis-data-volume redis-config-volume]: timed out waiting for the condition + +``` + +I've been trying to manually resolve this through the Linode Web UI but get presented with attachment errors upon doing so. Please could you advise on the best way forward to restore Volumes & Nodes to a functioning state? As far as I can see there is something going on upstream since the Linode UI presents these nodes as mounted however as shown above LKE nodes are not locating them, there is also a few failed attachment logs in the Linode Audit Log. + +Thanks, + +Joe +</blockquote> +</details> + +<details markdown="block"> +<summary>Response received from Linode</summary> + +<blockquote markdown="block"> +Hi Joe, + +> Were there any known issues with Block Storage in Frankfurt today? 
+ +Not today, though there were service issues reported for Block Storage and LKE in Frankfurt on July 8 and 9: + +- [Service Issue - Block Storage - EU-Central (Frankfurt)](https://status.linode.com/incidents/pqfxl884wbh4) +- [Service Issue - Linode Kubernetes Engine - Frankfurt](https://status.linode.com/incidents/13fpkjd32sgz) + +There was also an API issue reported on the 10th (resolved on the 11th), mentioned here: + +- [Service Issue - Cloud Manager and API](https://status.linode.com/incidents/vhjm0xpwnnn5) + +Regarding the specific error you were receiving: + +> `Unable to find device path out of attempted paths` + +I'm not certain it's specifically related to those Service Issues, considering this isn't the first time a customer has reported this error in their LKE logs. In fact, if I recall correctly, I've run across this before too, since our volumes are RWO and I had too many replicas in my deployment that I was trying to attach to, for example. + +> is this a known bug/condition that occurs with Linode CSI/LKE? + +From what I understand, yes, this is a known condition that crops up from time to time, which we are tracking. However, since there is a workaround at the moment (e.g. - "After some more manual attempts to fix things, scaling down deployments, unmounting at Linode and then scaling up the deployments seems to have worked and all our services have now been restored."), there is no ETA for addressing this. With that said, I've let our Storage team know that you've run into this, so as to draw further attention to it. + +If you have any further questions or concerns regarding this, let us know. + +Best regards, +[Redacted] + +Linode Support Team +</blockquote> +</details> + +<details markdown="block"> +<summary>Concluding response from Joe Banks</summary> + +<blockquote markdown="block"> +Hey [Redacted]! + +Thanks for the response. 
We ensure that stateful pods only ever have one volume assigned to them, either with a single replica deployment or a statefulset. It appears that the error generally manifests when a deployment is being migrated from one node to another during a redeploy, which makes sense if there is some delay on the unmount/remount. + +Confusion occurred because Linode was reporting the volume as attached when the node had been recycled, but I assume that was because the node did not cleanly shutdown and therefore could not cleanly unmount volumes. + +We've not seen any resurgence of such issues, and we'll address the software fault which overloaded the node which will helpfully mitigate such problems in the future. + +Thanks again for the response, have a great week! + +Best, + +Joe +</blockquote> +</details> + +# π Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +### **What?** + +Several of our services became unavailable because their volumes could not be mounted. + +### Why? + +A node recycle left the node unable to mount volumes using the Linode CSI. + +### Why? + +A node recycle was used because PostgreSQL had a connection surge. + +### Why? + +A Django feature deadlocked a table 62 times and suddenly started using ~70 connections to the database, saturating the maximum connections limit. + +### Why? + +The root cause of why Django does this is unclear, and someone with more Django proficiency is absolutely welcome to share any knowledge they may have. I presume it's some sort of worker race condition, but I've not been able to reproduce it. + +# π± Blameless root cause + +*Note the final root cause and describe what needs to change to prevent reoccurrence* + +A node being forcefully restarted left volumes in a limbo state where mounting was difficult, it took multiple hours for this to be resolved since we had to wait for the volumes to unlock so they could be cloned. 
+ +# π€ Lessons learned + +*What did we learn from this incident?* + +Volumes are painful. + +We need to look at why Django is doing this and mitigations of the fault to prevent this from occurring again. + +# βοΈ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x] [Follow up on ticket at Linode](https://www.notion.so/Cascading-node-failures-and-ensuing-volume-problems-1c6cfdfcadfc4422b719a0d7a4cc5001) +- [ ]  Investigate why Django could be connection surging and locking tables diff --git a/kubernetes/docs/postmortems/index.md b/kubernetes/docs/postmortems/index.md new file mode 100644 index 0000000..5e8b509 --- /dev/null +++ b/kubernetes/docs/postmortems/index.md @@ -0,0 +1,10 @@ +--- +title: Postmortems +layout: default +has_children: true +has_toc: false +--- + +# Postmortems + +Browse the pages under this category to view historical postmortems for Python Discord outages. diff --git a/kubernetes/docs/static/images/2021-01-12/site_cpu_throttle.png b/kubernetes/docs/static/images/2021-01-12/site_cpu_throttle.pngBinary files differ new file mode 100644 index 0000000..b530ec6 --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-12/site_cpu_throttle.png diff --git a/kubernetes/docs/static/images/2021-01-12/site_resource_abnormal.png b/kubernetes/docs/static/images/2021-01-12/site_resource_abnormal.pngBinary files differ new file mode 100644 index 0000000..e1e07af --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-12/site_resource_abnormal.png diff --git a/kubernetes/docs/static/images/2021-01-30/linode_loadbalancers.png b/kubernetes/docs/static/images/2021-01-30/linode_loadbalancers.pngBinary files differ new file mode 100644 index 0000000..f0eae1f --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-30/linode_loadbalancers.png diff --git a/kubernetes/docs/static/images/2021-01-30/memory_charts.png b/kubernetes/docs/static/images/2021-01-30/memory_charts.pngBinary files differ new file mode 100644 
index 0000000..370d19e --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-30/memory_charts.png diff --git a/kubernetes/docs/static/images/2021-01-30/prometheus_status.png b/kubernetes/docs/static/images/2021-01-30/prometheus_status.pngBinary files differ new file mode 100644 index 0000000..e95b8d7 --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-30/prometheus_status.png diff --git a/kubernetes/docs/static/images/2021-01-30/scaleios.png b/kubernetes/docs/static/images/2021-01-30/scaleios.pngBinary files differ new file mode 100644 index 0000000..584d74d --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-30/scaleios.png | 
