247 files changed, 6160 insertions, 0 deletions
diff --git a/.git-crypt/.gitattributes b/.git-crypt/.gitattributes new file mode 100644 index 0000000..665b10e --- /dev/null +++ b/.git-crypt/.gitattributes @@ -0,0 +1,4 @@ +# Do not edit this file. To specify the files to encrypt, create your own +# .gitattributes file in the directory where your files are. +* !filter !diff +*.gpg binary diff --git a/.git-crypt/keys/default/0/1DA91E6CE87E3C1FCE32BC0CB6ED85CC5872D5E4.gpg b/.git-crypt/keys/default/0/1DA91E6CE87E3C1FCE32BC0CB6ED85CC5872D5E4.gpg Binary files differnew file mode 100644 index 0000000..6953f4c --- /dev/null +++ b/.git-crypt/keys/default/0/1DA91E6CE87E3C1FCE32BC0CB6ED85CC5872D5E4.gpg diff --git a/.git-crypt/keys/default/0/509CDFFC2D0783A33CF87D2B703EE21DE4D4D9C9.gpg b/.git-crypt/keys/default/0/509CDFFC2D0783A33CF87D2B703EE21DE4D4D9C9.gpg Binary files differnew file mode 100644 index 0000000..523dea7 --- /dev/null +++ b/.git-crypt/keys/default/0/509CDFFC2D0783A33CF87D2B703EE21DE4D4D9C9.gpg
diff --git a/.git-crypt/keys/default/0/8C05D0E98B7914EDEBDCC8CC8E8E09282F2E17AF.gpg b/.git-crypt/keys/default/0/8C05D0E98B7914EDEBDCC8CC8E8E09282F2E17AF.gpg Binary files differnew file mode 100644 index 0000000..bee58e4 --- /dev/null +++ b/.git-crypt/keys/default/0/8C05D0E98B7914EDEBDCC8CC8E8E09282F2E17AF.gpg diff --git a/.git-crypt/keys/default/0/8E56193CE06E24722C7F2DEB1B5B5D1B8BB0BC18.gpg b/.git-crypt/keys/default/0/8E56193CE06E24722C7F2DEB1B5B5D1B8BB0BC18.gpg Binary files differnew file mode 100644 index 0000000..937982e --- /dev/null +++ b/.git-crypt/keys/default/0/8E56193CE06E24722C7F2DEB1B5B5D1B8BB0BC18.gpg diff --git a/.git-crypt/keys/default/0/F8413E8FA339472249D12555DF6738B80C155B71.gpg b/.git-crypt/keys/default/0/F8413E8FA339472249D12555DF6738B80C155B71.gpg Binary files differnew file mode 100644 index 0000000..a14a4b5 --- /dev/null +++ b/.git-crypt/keys/default/0/F8413E8FA339472249D12555DF6738B80C155B71.gpg diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..798448e --- /dev/null +++ b/.gitattributes @@ -0,0 +1,6 @@ +* text=auto +secrets.yaml filter=git-crypt diff=git-crypt +secrets.yml filter=git-crypt diff=git-crypt +secret.yaml filter=git-crypt diff=git-crypt +secret.yml filter=git-crypt diff=git-crypt +ghcr-pull-secrets.yaml filter=git-crypt diff=git-crypt diff --git a/kubernetes/README.md b/kubernetes/README.md new file mode 100644 index 0000000..2c96a68 --- /dev/null +++ b/kubernetes/README.md @@ -0,0 +1,16 @@ +# Kubernetes +Configuration and documentation for Python Discord's Kubernetes setup! + +## Secrets +We use [git-crypt](https://www.agwa.name/projects/git-crypt/) ([GitHub](https://github.com/AGWA/git-crypt)) to secure secrets. Using this means we can commit secrets to change control without the secrets being leaked. + +The [.gitattributes](.gitattributes) file determines which files are encrypted. See the [git-crypt](https://www.agwa.name/projects/git-crypt/) documentation for more information. + +To work with our secrets, you must have your GPG key added by a member of the DevOps team. Once that is done, you can use git-crypt as documented. + +### git-crypt tl;dr +- Get/build a git-crypt binary from [GitHub](https://github.com/AGWA/git-crypt) +- Rename the binary to `git-crypt` +- Add the binary to your PATH +- Run `git-crypt unlock` from this project's root directory. +See the [git-crypt](https://www.agwa.name/projects/git-crypt/) documentation for more information. diff --git a/kubernetes/cluster-wide-secrets/README.md b/kubernetes/cluster-wide-secrets/README.md new file mode 100644 index 0000000..2fa9205 --- /dev/null +++ b/kubernetes/cluster-wide-secrets/README.md @@ -0,0 +1,5 @@ +# Cluster-wide secrets + +These are secrets that are re-used across multiple services in the cluster. + +`ghcr-pull-secret` - Used by deployments to pull images from GHCR where the image isn't public.
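For illustration, a GHCR pull secret like `ghcr-pull-secret` is typically generated from a GitHub personal access token with the `read:packages` scope and referenced from a deployment's `imagePullSecrets`. A minimal sketch only; the username, token and namespace are placeholders, and the real manifest is kept encrypted in `ghcr-pull-secrets.yaml`:

```bash
# Generate the pull secret manifest for GHCR (placeholders: <github-username>, <ghcr-token>)
kubectl create secret docker-registry ghcr-pull-secret \
  --docker-server=ghcr.io \
  --docker-username=<github-username> \
  --docker-password=<ghcr-token> \
  --namespace default \
  --dry-run=client -o yaml > ghcr-pull-secrets.yaml

# Deployments then reference it under spec.template.spec.imagePullSecrets:
#   imagePullSecrets:
#     - name: ghcr-pull-secret
```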
diff --git a/kubernetes/cluster-wide-secrets/ghcr-pull-secrets.yaml b/kubernetes/cluster-wide-secrets/ghcr-pull-secrets.yaml Binary files differnew file mode 100644 index 0000000..f8fac4a --- /dev/null +++ b/kubernetes/cluster-wide-secrets/ghcr-pull-secrets.yaml diff --git a/kubernetes/docs/.gitignore b/kubernetes/docs/.gitignore new file mode 100644 index 0000000..f40fbd8 --- /dev/null +++ b/kubernetes/docs/.gitignore @@ -0,0 +1,5 @@ +_site +.sass-cache +.jekyll-cache +.jekyll-metadata +vendor diff --git a/kubernetes/docs/404.html b/kubernetes/docs/404.html new file mode 100644 index 0000000..086a5c9 --- /dev/null +++ b/kubernetes/docs/404.html @@ -0,0 +1,25 @@ +--- +permalink: /404.html +layout: default +--- + +<style type="text/css" media="screen"> + .container { + margin: 10px auto; + max-width: 600px; + text-align: center; + } + h1 { + margin: 30px 0; + font-size: 4em; + line-height: 1; + letter-spacing: -1px; + } +</style> + +<div class="container"> + <h1>404</h1> + + <p><strong>Page not found :(</strong></p> + <p>The requested page could not be found.</p> +</div> diff --git a/kubernetes/docs/Gemfile b/kubernetes/docs/Gemfile new file mode 100644 index 0000000..754098e --- /dev/null +++ b/kubernetes/docs/Gemfile @@ -0,0 +1,32 @@ +source "https://rubygems.org" +# Hello! This is where you manage which Jekyll version is used to run. +# When you want to use a different version, change it below, save the +# file and run `bundle install`. Run Jekyll with `bundle exec`, like so: +# +# bundle exec jekyll serve +# +# This will help ensure the proper Jekyll version is running. +# Happy Jekylling! +gem "jekyll", "~> 4.2.0" +# This is the default theme for new Jekyll sites. You may change this to anything you like. +gem "minima", "~> 2.5" +# If you want to use GitHub Pages, remove the "gem "jekyll"" above and +# uncomment the line below. To upgrade, run `bundle update github-pages`. +# gem "github-pages", group: :jekyll_plugins +# If you have any plugins, put them here! +group :jekyll_plugins do + gem "jekyll-feed", "~> 0.12" +end + +# Windows and JRuby does not include zoneinfo files, so bundle the tzinfo-data gem +# and associated library. 
+platforms :mingw, :x64_mingw, :mswin, :jruby do + gem "tzinfo", "~> 1.2" + gem "tzinfo-data" +end + +# Performance-booster for watching directories on Windows +gem "wdm", "~> 0.1.1", :platforms => [:mingw, :x64_mingw, :mswin] + +gem "webrick", "~> 1.7" +gem "just-the-docs" diff --git a/kubernetes/docs/Gemfile.lock b/kubernetes/docs/Gemfile.lock new file mode 100644 index 0000000..e992f7d --- /dev/null +++ b/kubernetes/docs/Gemfile.lock @@ -0,0 +1,88 @@ +GEM + remote: https://rubygems.org/ + specs: + addressable (2.8.0) + public_suffix (>= 2.0.2, < 5.0) + colorator (1.1.0) + concurrent-ruby (1.1.9) + em-websocket (0.5.2) + eventmachine (>= 0.12.9) + http_parser.rb (~> 0.6.0) + eventmachine (1.2.7) + ffi (1.15.4) + forwardable-extended (2.6.0) + http_parser.rb (0.6.0) + i18n (1.8.10) + concurrent-ruby (~> 1.0) + jekyll (4.2.0) + addressable (~> 2.4) + colorator (~> 1.0) + em-websocket (~> 0.5) + i18n (~> 1.0) + jekyll-sass-converter (~> 2.0) + jekyll-watch (~> 2.0) + kramdown (~> 2.3) + kramdown-parser-gfm (~> 1.0) + liquid (~> 4.0) + mercenary (~> 0.4.0) + pathutil (~> 0.9) + rouge (~> 3.0) + safe_yaml (~> 1.0) + terminal-table (~> 2.0) + jekyll-feed (0.15.1) + jekyll (>= 3.7, < 5.0) + jekyll-sass-converter (2.1.0) + sassc (> 2.0.1, < 3.0) + jekyll-seo-tag (2.7.1) + jekyll (>= 3.8, < 5.0) + jekyll-watch (2.2.1) + listen (~> 3.0) + just-the-docs (0.3.3) + jekyll (>= 3.8.5) + jekyll-seo-tag (~> 2.0) + rake (>= 12.3.1, < 13.1.0) + kramdown (2.3.1) + rexml + kramdown-parser-gfm (1.1.0) + kramdown (~> 2.0) + liquid (4.0.3) + listen (3.7.0) + rb-fsevent (~> 0.10, >= 0.10.3) + rb-inotify (~> 0.9, >= 0.9.10) + mercenary (0.4.0) + minima (2.5.1) + jekyll (>= 3.5, < 5.0) + jekyll-feed (~> 0.9) + jekyll-seo-tag (~> 2.1) + pathutil (0.16.2) + forwardable-extended (~> 2.6) + public_suffix (4.0.6) + rake (13.0.6) + rb-fsevent (0.11.0) + rb-inotify (0.10.1) + ffi (~> 1.0) + rexml (3.2.5) + rouge (3.26.0) + safe_yaml (1.0.5) + sassc (2.4.0) + ffi (~> 1.9) + terminal-table (2.0.0) + unicode-display_width (~> 1.1, >= 1.1.1) + unicode-display_width (1.7.0) + webrick (1.7.0) + +PLATFORMS + x86_64-linux + +DEPENDENCIES + jekyll (~> 4.2.0) + jekyll-feed (~> 0.12) + just-the-docs + minima (~> 2.5) + tzinfo (~> 1.2) + tzinfo-data + wdm (~> 0.1.1) + webrick (~> 1.7) + +BUNDLED WITH + 2.2.27 diff --git a/kubernetes/docs/README.md b/kubernetes/docs/README.md new file mode 100644 index 0000000..664a982 --- /dev/null +++ b/kubernetes/docs/README.md @@ -0,0 +1,5 @@ +# DevOps Knowledgebase + +## Local Development + +Run `jekyll serve --config _config.yml,_config_dev.yml` to start locally. diff --git a/kubernetes/docs/_config.yml b/kubernetes/docs/_config.yml new file mode 100644 index 0000000..3c4b10a --- /dev/null +++ b/kubernetes/docs/_config.yml @@ -0,0 +1,54 @@ +title: PyDis DevOps +email: [email protected] +description: >- + Knowledgebase for all things DevOps in Python Discord. 
+baseurl: "/kubernetes" +url: "https://python-discord.github.io" +twitter_username: PythonDiscord +github_username: python-discord + +remote_theme: just-the-docs/just-the-docs +plugins: + - jekyll-feed + +# Compression tuning +compress_html: + ignore: + envs: all + +# Code block preferences +kramdown: + syntax_highlighter_opts: + block: + line_numbers: true + +# Theme configuration +aux_links: + "Python Discord": + "https://pythondiscord.com/" +aux_links_new_tab: true + +color_scheme: dark + +# Document detection +collections: + runbooks: + permalink: "/:collection/:path/" + output: true + queries: + permalink: "/:collection/:path" + output: true + general: + permalink: "/:collection/:path" + output: true + +just_the_docs: + collections: + pages: + name: Pages + general: + name: General + runbooks: + name: Runbooks + queries: + name: Queries diff --git a/kubernetes/docs/_config_dev.yml b/kubernetes/docs/_config_dev.yml new file mode 100644 index 0000000..20fb1af --- /dev/null +++ b/kubernetes/docs/_config_dev.yml @@ -0,0 +1,2 @@ +baseurl: "" +url: "http://localhost:4000" diff --git a/kubernetes/docs/_general/index.md b/kubernetes/docs/_general/index.md new file mode 100644 index 0000000..1d84650 --- /dev/null +++ b/kubernetes/docs/_general/index.md @@ -0,0 +1,7 @@ +--- +title: General +has_children: true +layout: default +nav_exclude: true +search_exclude: true +--- diff --git a/kubernetes/docs/_general/manual-deploys.md b/kubernetes/docs/_general/manual-deploys.md new file mode 100644 index 0000000..092647a --- /dev/null +++ b/kubernetes/docs/_general/manual-deploys.md @@ -0,0 +1,20 @@ +--- +title: Manual Deploys +layout: default +--- + +# Manual Deployments + +When the DevOps team are not available, Administrators and Core Developers can redeploy our critical services, such as Bot, Site and ModMail. + +This is handled through workflow dispatches on this repository. To get started, head to the [Actions](https://github.com/python-discord/kubernetes/actions) tab of this repository and select `Manual Redeploy` in the sidebar; alternatively, navigate directly [here](https://github.com/python-discord/kubernetes/actions/workflows/manual_redeploy.yml). + +<img width="308" alt="image" src="https://user-images.githubusercontent.com/20439493/116442084-00d5f400-a84a-11eb-8e8a-e9e6bcc327dd.png"> + +Click `Run workflow` on the right-hand side and enter the name of the service that needs redeploying, keeping the branch as `main`: + +<img width="947" alt="image" src="https://user-images.githubusercontent.com/20439493/116442202-22cf7680-a84a-11eb-8cce-a3e715a1bf68.png"> + +Click `Run` and refresh the page; you'll see a new in-progress Action which you can track. Once the deployment completes, notifications will be sent to the `#dev-ops` channel on Discord. + +If you encounter errors, please copy the Action run link into Discord so the DevOps team can investigate when available.
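If you prefer a terminal over the web UI, the same dispatch can be triggered with the GitHub CLI. A sketch only, assuming the workflow input is named `service` (check `manual_redeploy.yml` for the exact input name):

```bash
# Dispatch the manual redeploy workflow against main (input name assumed to be "service")
gh workflow run manual_redeploy.yml -R python-discord/kubernetes --ref main -f service=bot

# Follow the run that was just started
gh run watch -R python-discord/kubernetes
```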
diff --git a/kubernetes/docs/_queries/index.md b/kubernetes/docs/_queries/index.md new file mode 100644 index 0000000..991f86d --- /dev/null +++ b/kubernetes/docs/_queries/index.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Queries +has_children: true +nav_exclude: true +search_exclude: true +--- diff --git a/kubernetes/docs/_queries/kubernetes.md b/kubernetes/docs/_queries/kubernetes.md new file mode 100644 index 0000000..032ad70 --- /dev/null +++ b/kubernetes/docs/_queries/kubernetes.md @@ -0,0 +1,28 @@ +--- +layout: page +title: Kubernetes +--- + +# Kubernetes tips + +## Find top pods by CPU/memory + +```bash +$ kubectl top pods --all-namespaces --sort-by='memory' +$ kubectl top pods --all-namespaces --sort-by='cpu' +``` + +## Find top nodes by CPU/memory + +```bash +$ kubectl top nodes --sort-by='cpu' +$ kubectl top nodes --sort-by='memory' +``` + +## Kubernetes cheat sheet + +[Open Kubernetes cheat sheet](https://kubernetes.io/docs/reference/kubectl/cheatsheet/){: .btn .btn-purple }{:target="_blank"} + +## Lens IDE + +[Open Lens IDE](https://k8slens.dev){: .btn .btn-purple }{:target="_blank"} diff --git a/kubernetes/docs/_queries/loki.md b/kubernetes/docs/_queries/loki.md new file mode 100644 index 0000000..5dee3c3 --- /dev/null +++ b/kubernetes/docs/_queries/loki.md @@ -0,0 +1,26 @@ +--- +layout: default +title: Loki +--- + +# Loki queries + +## Find any logs containing "ERROR" + +```sql +{job=~"default/.+"} |= "ERROR" +``` + +## Find all logs from the bot service + +```sql +{job="default/bot"} +``` + +The format is `namespace/object`. + +## Rate of logs from a service + +```sql +rate(({job="default/bot"} |= "error" != "timeout")[10s]) +``` diff --git a/kubernetes/docs/_queries/postgres.md b/kubernetes/docs/_queries/postgres.md new file mode 100644 index 0000000..13728f6 --- /dev/null +++ b/kubernetes/docs/_queries/postgres.md @@ -0,0 +1,301 @@ +--- +layout: default +title: PostgreSQL +--- + +# PostgreSQL queries + +## Disk usage + +Most of these queries vary based on the database you are connected to.
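These queries are run from a `psql` session against the relevant database. One way to get such a session, assuming the PostgreSQL deployment is named `postgres` and the `pythondiscord` role exists (both as used elsewhere in these docs):

```bash
# Open an interactive psql session inside the PostgreSQL pod
kubectl exec -it deploy/postgres -- psql -U pythondiscord

# List databases and their sizes to pick the one to connect to
kubectl exec -it deploy/postgres -- psql -U pythondiscord -c "\l+"
```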
+ +### General Table Size Information Grouped For Partitioned Tables + +```sql +WITH RECURSIVE pg_inherit(inhrelid, inhparent) AS + (select inhrelid, inhparent + FROM pg_inherits + UNION + SELECT child.inhrelid, parent.inhparent + FROM pg_inherit child, pg_inherits parent + WHERE child.inhparent = parent.inhrelid), +pg_inherit_short AS (SELECT * FROM pg_inherit WHERE inhparent NOT IN (SELECT inhrelid FROM pg_inherit)) +SELECT table_schema + , TABLE_NAME + , row_estimate + , pg_size_pretty(total_bytes) AS total + , pg_size_pretty(index_bytes) AS INDEX + , pg_size_pretty(toast_bytes) AS toast + , pg_size_pretty(table_bytes) AS TABLE + FROM ( + SELECT *, total_bytes-index_bytes-COALESCE(toast_bytes,0) AS table_bytes + FROM ( + SELECT c.oid + , nspname AS table_schema + , relname AS TABLE_NAME + , SUM(c.reltuples) OVER (partition BY parent) AS row_estimate + , SUM(pg_total_relation_size(c.oid)) OVER (partition BY parent) AS total_bytes + , SUM(pg_indexes_size(c.oid)) OVER (partition BY parent) AS index_bytes + , SUM(pg_total_relation_size(reltoastrelid)) OVER (partition BY parent) AS toast_bytes + , parent + FROM ( + SELECT pg_class.oid + , reltuples + , relname + , relnamespace + , pg_class.reltoastrelid + , COALESCE(inhparent, pg_class.oid) parent + FROM pg_class + LEFT JOIN pg_inherit_short ON inhrelid = oid + WHERE relkind IN ('r', 'p') + ) c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + ) a + WHERE oid = parent +) a +ORDER BY total_bytes DESC; +``` + +### General Table Size Information + +```sql +SELECT *, pg_size_pretty(total_bytes) AS total + , pg_size_pretty(index_bytes) AS index + , pg_size_pretty(toast_bytes) AS toast + , pg_size_pretty(table_bytes) AS table + FROM ( + SELECT *, total_bytes-index_bytes-coalesce(toast_bytes,0) AS table_bytes FROM ( + SELECT c.oid,nspname AS table_schema, relname AS table_name + , c.reltuples AS row_estimate + , pg_total_relation_size(c.oid) AS total_bytes + , pg_indexes_size(c.oid) AS index_bytes + , pg_total_relation_size(reltoastrelid) AS toast_bytes + FROM pg_class c + LEFT JOIN pg_namespace n ON n.oid = c.relnamespace + WHERE relkind = 'r' + ) a +) a; +``` + +### Finding the largest databases in your cluster + +```sql +SELECT d.datname as Name, pg_catalog.pg_get_userbyid(d.datdba) as Owner, + CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT') + THEN pg_catalog.pg_size_pretty(pg_catalog.pg_database_size(d.datname)) + ELSE 'No Access' + END as Size +FROM pg_catalog.pg_database d + order by + CASE WHEN pg_catalog.has_database_privilege(d.datname, 'CONNECT') + THEN pg_catalog.pg_database_size(d.datname) + ELSE NULL + END desc -- nulls first + LIMIT 20; +``` + +### Finding the size of your biggest relations + +Relations are objects in the database such as tables and indexes, and this query shows the size of all the individual parts. + +```sql +SELECT nspname || '.' || relname AS "relation", + pg_size_pretty(pg_relation_size(C.oid)) AS "size" + FROM pg_class C + LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) + WHERE nspname NOT IN ('pg_catalog', 'information_schema') + ORDER BY pg_relation_size(C.oid) DESC + LIMIT 20; +``` + +### Finding the total size of your biggest tables + +```sql +SELECT nspname || '.' 
|| relname AS "relation", + pg_size_pretty(pg_total_relation_size(C.oid)) AS "total_size" + FROM pg_class C + LEFT JOIN pg_namespace N ON (N.oid = C.relnamespace) + WHERE nspname NOT IN ('pg_catalog', 'information_schema') + AND C.relkind <> 'i' + AND nspname !~ '^pg_toast' + ORDER BY pg_total_relation_size(C.oid) DESC + LIMIT 20; +``` + +## Indexes + +### Index summary + +```sql +SELECT + pg_class.relname, + pg_size_pretty(pg_class.reltuples::bigint) AS rows_in_bytes, + pg_class.reltuples AS num_rows, + count(indexname) AS number_of_indexes, + CASE WHEN x.is_unique = 1 THEN 'Y' + ELSE 'N' + END AS UNIQUE, + SUM(case WHEN number_of_columns = 1 THEN 1 + ELSE 0 + END) AS single_column, + SUM(case WHEN number_of_columns IS NULL THEN 0 + WHEN number_of_columns = 1 THEN 0 + ELSE 1 + END) AS multi_column +FROM pg_namespace +LEFT OUTER JOIN pg_class ON pg_namespace.oid = pg_class.relnamespace +LEFT OUTER JOIN + (SELECT indrelid, + max(CAST(indisunique AS integer)) AS is_unique + FROM pg_index + GROUP BY indrelid) x + ON pg_class.oid = x.indrelid +LEFT OUTER JOIN + ( SELECT c.relname AS ctablename, ipg.relname AS indexname, x.indnatts AS number_of_columns FROM pg_index x + JOIN pg_class c ON c.oid = x.indrelid + JOIN pg_class ipg ON ipg.oid = x.indexrelid ) + AS foo + ON pg_class.relname = foo.ctablename +WHERE + pg_namespace.nspname='public' +AND pg_class.relkind = 'r' +GROUP BY pg_class.relname, pg_class.reltuples, x.is_unique +ORDER BY 2; +``` + +### Index size/usage statistics + +```sql +SELECT + t.schemaname, + t.tablename, + indexname, + c.reltuples AS num_rows, + pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' || quote_ident(t.tablename)::text)) AS table_size, + pg_size_pretty(pg_relation_size(quote_ident(t.schemaname)::text || '.' || quote_ident(indexrelname)::text)) AS index_size, + CASE WHEN indisunique THEN 'Y' + ELSE 'N' + END AS UNIQUE, + number_of_scans, + tuples_read, + tuples_fetched +FROM pg_tables t +LEFT OUTER JOIN pg_class c ON t.tablename = c.relname +LEFT OUTER JOIN ( + SELECT + c.relname AS ctablename, + ipg.relname AS indexname, + x.indnatts AS number_of_columns, + idx_scan AS number_of_scans, + idx_tup_read AS tuples_read, + idx_tup_fetch AS tuples_fetched, + indexrelname, + indisunique, + schemaname + FROM pg_index x + JOIN pg_class c ON c.oid = x.indrelid + JOIN pg_class ipg ON ipg.oid = x.indexrelid + JOIN pg_stat_all_indexes psai ON x.indexrelid = psai.indexrelid +) AS foo ON t.tablename = foo.ctablename AND t.schemaname = foo.schemaname +WHERE t.schemaname NOT IN ('pg_catalog', 'information_schema') +ORDER BY 1,2; +``` + +### Duplicate indexes + +```sql +SELECT pg_size_pretty(sum(pg_relation_size(idx))::bigint) as size, + (array_agg(idx))[1] as idx1, (array_agg(idx))[2] as idx2, + (array_agg(idx))[3] as idx3, (array_agg(idx))[4] as idx4 +FROM ( + SELECT indexrelid::regclass as idx, (indrelid::text ||E'\n'|| indclass::text ||E'\n'|| indkey::text ||E'\n'|| + coalesce(indexprs::text,'')||E'\n' || coalesce(indpred::text,'')) as key + FROM pg_index) sub +GROUP BY key HAVING count(*)>1 +ORDER BY sum(pg_relation_size(idx)) DESC; +``` + +## Maintenance + +[PostgreSQL wiki](https://wiki.postgresql.org/wiki/Main_Page) + +### CLUSTER-ing + +[CLUSTER](https://www.postgresql.org/docs/current/sql-cluster.html) + +```sql +CLUSTER [VERBOSE] table_name [ USING index_name ] +CLUSTER [VERBOSE] +``` + +`CLUSTER` instructs PostgreSQL to cluster the table specified by `table_name` based on the index specified by `index_name`. 
The index must already have been defined on `table_name`. + +When a table is clustered, it is physically reordered based on the index information. + +### VACUUM-ing + +Proper vacuuming, particularly autovacuum configuration, is crucial to a fast and reliable database. + +[Introduction to VACUUM, ANALYZE, EXPLAIN, and COUNT](https://wiki.postgresql.org/wiki/Introduction_to_VACUUM,_ANALYZE,_EXPLAIN,_and_COUNT) + +It is not advised to run `VACUUM FULL`; instead, look at clustering. `VACUUM FULL` is a much more intensive task and acquires an ACCESS EXCLUSIVE lock on the table, blocking reads and writes. Whilst `CLUSTER` also acquires this lock, it is a less intensive and faster process. + +#### Finding number of dead rows + +```sql +SELECT relname, n_dead_tup FROM pg_stat_user_tables WHERE n_dead_tup <> 0 ORDER BY 2 DESC; +``` + +#### Finding last vacuum/auto-vacuum date + +```sql +SELECT relname, last_vacuum, last_autovacuum FROM pg_stat_user_tables; +``` + +#### Checking auto-vacuum is enabled + +```sql +SELECT name, setting FROM pg_settings WHERE name='autovacuum'; +``` + +#### View all auto-vacuum settings + +```sql +SELECT * from pg_settings where category like 'Autovacuum'; +``` + +## Locks + +### Looking at locks that have not been granted + +```sql +SELECT relation::regclass, * FROM pg_locks WHERE NOT granted; +``` + +### Combination of blocked and blocking activity + +```sql +SELECT blocked_locks.pid AS blocked_pid, + blocked_activity.usename AS blocked_user, + blocking_locks.pid AS blocking_pid, + blocking_activity.usename AS blocking_user, + blocked_activity.query AS blocked_statement, + blocking_activity.query AS current_statement_in_blocking_process + FROM pg_catalog.pg_locks blocked_locks + JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid + JOIN pg_catalog.pg_locks blocking_locks + ON blocking_locks.locktype = blocked_locks.locktype + AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database + AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation + AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page + AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple + AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid + AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid + AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid + AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid + AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid + AND blocking_locks.pid != blocked_locks.pid + + JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid + WHERE NOT blocked_locks.granted; +``` diff --git a/kubernetes/docs/_runbooks/index.md b/kubernetes/docs/_runbooks/index.md new file mode 100644 index 0000000..357431f --- /dev/null +++ b/kubernetes/docs/_runbooks/index.md @@ -0,0 +1,7 @@ +--- +layout: default +title: Runbooks +has_children: true +nav_exclude: true +search_exclude: true +--- diff --git a/kubernetes/docs/_runbooks/postgresql-upgrade.md b/kubernetes/docs/_runbooks/postgresql-upgrade.md new file mode 100644 index 0000000..7d85de2 --- /dev/null +++ b/kubernetes/docs/_runbooks/postgresql-upgrade.md @@ -0,0 +1,123 @@ +--- +title: PostgreSQL Upgrade +layout: page +--- + +# Upgrading PostgreSQL + +<details open markdown="block"> + <summary> + Table of contents + </summary> + {: .text-delta } +1. 
TOC +{:toc} +</details> + +# Step 1 - Enable maintenance mode + +Add a worker route for `pythondiscord.com/*` to forward to the `maintenance` Cloudflare worker. + +# Step 2 - Scale down all services that use PostgreSQL + +Notably site, metricity, bitwarden and the like should be scaled down. + +Services that are read-only, such as Grafana (but NOT Metabase, which uses PostgreSQL for internal storage), do not need to be scaled down, as they do not update the database in any way. + +```bash +$ kubectl scale deploy --replicas 0 site metricity metabase bitwarden ... +``` + +# Step 3 - Take a database dump and gzip + +Using `pg_dumpall`, dump the contents of all databases to a `.sql` file. + +Make sure to gzip for faster transfer. + +Take a SHA512 sum of the output `.sql.gz` file to validate integrity after copying. + +```bash +$ pg_dumpall -U pythondiscord > backup.sql +$ gzip backup.sql +$ sha512sum backup.sql.gz +a3337bfc65a072fd93124233ac1cefcdfbe8a708e5c1d08adaca2cf8c7cbe9ae4853ffab8c5cfbe943182355eaa701012111a420b29cc4f74d1e87f9df3af459 backup.sql.gz +``` + +# Step 4 - Move database dump locally + +Use `kubectl cp` to move the `backup.sql.gz` file from the remote pod to your local machine. + +Validate the integrity of the received file. + +# Step 5 - Attempt local import to new PostgreSQL version + +Install the new version of PostgreSQL locally and import the data. Make sure you are operating on a **completely empty database server.** + +```bash +$ gzcat backup.sql.gz | psql -U joe +``` + +You can use any PostgreSQL superuser for the import. Ensure that no errors other than those mentioned below occur; you may need to attempt the import multiple times while fixing the errors listed below. + +## Handle import errors + +Monitor the output of `psql` to check that no errors appear. + +If you receive locale errors, ensure that the locale your database is configured with matches the import script; this may require some usage of `sed`: + +```bash +$ sed -i '' "s/en_US.utf8/en_GB.UTF-8/g" backup.sql +``` + +Ensure that you **RESET THESE CHANGES** before attempting an import on the remote; if the remote database comes from the PostgreSQL Docker image it will need the same locale as the export. + +# Step 7 - Spin down PostgreSQL + +Spin down PostgreSQL to 0 replicas. + +# Step 8 - Take volume backup at Linode + +Back up the volume at Linode through a clone in the Linode UI, and name it something obvious. + +# Step 9 - Remove the Linode persistent volume + +Delete the volume specified in the `volume.yaml` file in the `postgresql` directory. You must delete the `pvc` first, followed by the `pv`; you can find the relevant disks through `kubectl get pv,pvc`. + +# Step 10 - Create a new volume by re-applying the `volume.yaml` file + +Apply the `volume.yaml` so a new, empty, volume is created. + +# Step 11 - Bump the PostgreSQL version in the `deployment.yaml` file + +Update the Docker image used in the deployment manifest. + +# Step 12 - Apply the deployment + +Run `kubectl apply -f postgresql/deployment.yaml` to start the new database server. + +# Step 13 - Copy the data across + +After the pod has initialised, use `kubectl cp` to copy the gzipped backup to the new Postgres pod. + +# Step 14 - Extract and import the new data + +```bash +$ gunzip backup.sql.gz +$ psql -U pythondiscord -f backup.sql +``` + +# Step 15 - Validate data import complete + +Ensure that all logs are successful; you may get duplicate errors for the `pythondiscord` user and database, which are safe to ignore.
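A quick, hedged sanity check before moving on is to compare object counts and database sizes between the old and new servers (role and database names as used above; adjust per database):

```bash
# Count user tables in the pythondiscord database, then run the same on the old server and compare
psql -U pythondiscord -d pythondiscord -c "SELECT count(*) AS tables FROM pg_stat_user_tables;"

# Compare overall database sizes
psql -U pythondiscord -c "SELECT datname, pg_size_pretty(pg_database_size(datname)) FROM pg_database;"
```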
+ +# Step 16 - Scale up services + +Scale the services that were scaled down in step 2 back up: + +```bash +$ kubectl scale deploy --replicas 1 metricity bitwarden metabase +``` + +# Step 17 - Validate all services interact correctly + +Validate that all services reconnect successfully and start exchanging data; ensure that no abnormal logs are output and that performance remains as expected. diff --git a/kubernetes/docs/favicon.ico b/kubernetes/docs/favicon.ico Binary files differnew file mode 100644 index 0000000..45ef3cf --- /dev/null +++ b/kubernetes/docs/favicon.ico diff --git a/kubernetes/docs/index.md b/kubernetes/docs/index.md new file mode 100644 index 0000000..a9c47cb --- /dev/null +++ b/kubernetes/docs/index.md @@ -0,0 +1,16 @@ +--- +layout: default +title: Home +nav_order: 1 + +--- + +# Python Discord DevOps + +Welcome to the Python Discord DevOps knowledgebase. + +Within this set of pages you will find: +- Changelogs +- Post-mortems +- Common queries +- Runbooks diff --git a/kubernetes/docs/postmortems/2020-12-11-all-services-outage.md b/kubernetes/docs/postmortems/2020-12-11-all-services-outage.md new file mode 100644 index 0000000..35c6d70 --- /dev/null +++ b/kubernetes/docs/postmortems/2020-12-11-all-services-outage.md @@ -0,0 +1,86 @@ +--- +layout: default +title: "2020-12-11: All services outage" +parent: Postmortems +nav_order: 2 +--- + +# 2020-12-11: All services outage + +At **19:55 UTC, all services became unresponsive**. The DevOps team were already in a call, and immediately started to investigate. + +Postgres was running at 100% CPU usage due to a **VACUUM**, which caused all services that depended on it to stop working. The high CPU left the host unresponsive and it shut down. Linode Lassie noticed this and triggered a restart. + +It did not recover gracefully from this restart, with numerous core services reporting an error, so we had to manually restart core system services using Lens in order to get things working again. + +## ⚠️ Leadup + +*List the sequence of events that led to the incident* + +Postgres triggered an **AUTOVACUUM**, which led to a CPU spike. This made Postgres run at 100% CPU and become unresponsive, which caused services to stop responding. This led to a restart of the node, from which we did not recover gracefully. + +## 🥏 Impact + +*Describe how internal and external users were impacted during the incident* + +All services went down. Catastrophic failure. We did not pass go, we did not collect $200. + +- Help channel system unavailable, so people are not able to effectively ask for help. +- Gates unavailable, so people can't successfully get into the community. +- Moderation and raid prevention unavailable, which leaves us defenseless against attacks. + +## 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +We noticed that all PyDis services had stopped responding; coincidentally, our DevOps team were in a call at the time, so that was helpful. + +We may be able to improve detection time by adding monitoring of resource usage. To this end, we've added alerts for high CPU usage and low memory. + +## 🙋🏿♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the incident. + +We noticed our node was entirely unresponsive and within minutes a restart had been triggered by Lassie after a high CPU shutdown occurred. + +The node came back and we saw a number of core services offline (e.g. Calico, CoreDNS, Linode CSI).
+ +**Obstacle: no recent database back-up available**{: .text-red-200 } + +## 🙆🏽♀️ Recovery + +*How was the incident resolved? How can we improve future mitigation times?* + +Through [Lens](https://k8slens.dev/) we restarted core services one by one until they stabilised; after these core services were up, other services began to come back online. + +We finally provisioned PostgreSQL which had been removed as a component before the restart (but too late to prevent the CPU errors). Once PostgreSQL was up we restarted any components that were acting buggy (e.g. site and bot). + +## 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +- Major service outage +- **Why?** Core service failures (e.g. Calico, CoreDNS, Linode CSI) +- **Why?** Kubernetes worker node restart +- **Why?** High CPU shutdown +- **Why?** Intensive PostgreSQL AUTOVACUUM caused a CPU spike + +## 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent recurrence* + +## 🤔 Lessons learned + +*What did we learn from this incident?* + +- We must ensure we have working database backups. We are lucky that we did not lose any data this time. If this problem had caused volume corruption, we would be screwed. +- Sentry is broken for the bot. It was missing a DSN secret, which we have now restored. +- The [https://sentry.pydis.com](https://sentry.pydis.com) redirect was never migrated to the cluster. **We should do that.** + +## ☑️ Follow-up tasks + +*List any tasks we've created as a result of this incident* + +- [x] Push forward with backup plans diff --git a/kubernetes/docs/postmortems/2020-12-11-postgres-conn-surge.md b/kubernetes/docs/postmortems/2020-12-11-postgres-conn-surge.md new file mode 100644 index 0000000..3e5360c --- /dev/null +++ b/kubernetes/docs/postmortems/2020-12-11-postgres-conn-surge.md @@ -0,0 +1,96 @@ +--- +layout: default +title: "2020-12-11: Postgres connection surge" +parent: Postmortems +nav_order: 1 +--- + +# 2020-12-11: Postgres connection surge + +At **13:24 UTC,** we noticed the bot was not able to infract, and [pythondiscord.com](http://pythondiscord.com) was unavailable. The DevOps team started to investigate. + +We discovered that Postgres was not accepting new connections because it had hit 100 clients. This made it unavailable to all services that depended on it. + +Ultimately this was resolved by taking down Postgres, remounting the associated volume, and bringing it back up again. + +## ⚠️ Leadup + +*List the sequence of events that led to the incident* + +The bot infractions stopped working, and we started investigating. + +## 🥏 Impact + +*Describe how internal and external users were impacted during the incident* + +Services were unavailable both for internal and external users. + +- The Help Channel System was unavailable. +- Voice Gate and Server Gate were not working. +- Moderation commands were unavailable. +- Python Discord site & API were unavailable. CloudFlare automatically switched us to Always Online. + +## 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +We noticed HTTP 524s coming from CloudFlare; upon attempting a database connection, we observed the maximum client limit had been hit. + +We noticed this log in site: + +``` +django.db.utils.OperationalError: FATAL: sorry, too many clients already +``` + +We should be monitoring the number of clients, and the monitor should alert us when we're approaching the max.
That would have allowed for earlier detection, and possibly allowed us to prevent the incident altogether. + +We will look at [wrouesnel/postgres_exporter](https://github.com/wrouesnel/postgres_exporter) for monitoring this. + +## 🙋🏿♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the incident. The obstacles were mostly a lack of a clear response strategy. + +We should document our recovery procedure so that we're not so dependent on Joe Banks should this happen again while he's unavailable. + +## 🙆🏽♀️ Recovery + +*How was the incident resolved? How can we improve future mitigation?* + +- Delete the PostgreSQL deployment: `kubectl delete deployment/postgres` +- Delete any remaining pods, WITH force: `kubectl delete pod <pod name> --force --grace-period=0` +- Unmount volume at Linode +- Remount volume at Linode +- Reapply the deployment: `kubectl apply -f postgres/deployment.yaml` + +## 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +- Postgres was unavailable, so our services died. +- **Why?** Postgres hit max clients, and could not respond. +- **Why?** Unknown, but we saw a number of connections from previous deployments of site. This indicates that database connections are not being terminated properly. Needs further investigation. + +## 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent recurrence* + +We're not sure what the root cause is, but suspect site is not terminating database connections properly in some cases. We were unable to reproduce this problem. + +We've set up new telemetry on Grafana with alerts so that we can investigate this more closely. We will be alerted if the number of connections from site exceeds 32, or if the total number of connections exceeds 90. + +## 🤔 Lessons learned + +*What did we learn from this incident?* + +- We must ensure the DevOps team has access to Linode and other key services even if our Bitwarden is down. +- We need to ensure we're alerted of any risk factors that have the potential to make Postgres unavailable, since this causes a catastrophic outage of practically all services. +- We absolutely need backups for the databases, so that this sort of problem carries less of a risk. +- We may need to consider something like [pg_bouncer](https://wiki.postgresql.org/wiki/PgBouncer) to manage a connection pool so that we don't exceed 100 *legitimate* clients connected as we connect more services to the postgres database. + +## ☑️ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x] All database backup diff --git a/kubernetes/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.md b/kubernetes/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.md new file mode 100644 index 0000000..a8fb815 --- /dev/null +++ b/kubernetes/docs/postmortems/2021-01-10-primary-kubernetes-node-outage.md @@ -0,0 +1,86 @@ +--- +layout: default +title: "2021-01-10: Primary Kubernetes node outage" +parent: Postmortems +nav_order: 3 +--- + +# 2021-01-10: Primary Kubernetes node outage + + +We had an outage of our highest-spec node due to CPU exhaustion. The outage lasted from around 20:20 to 20:46 UTC, but was not a full service outage.
+ +## ⚠️ Leadup + +*List the sequence of events that led to the incident* + +I ran a query on Prometheus to try to figure out some statistics on the number of metrics we are holding; this ended up scanning a lot of data in the TSDB database that Prometheus uses. + +This scan caused CPU exhaustion, which caused issues with the Kubernetes node status. + +## 🥏 Impact + +*Describe how internal and external users were impacted during the incident* + +This brought down the primary node, which meant there was some service outage. Most services transferred successfully to our secondary node, which kept up some key services such as the Moderation bot and Modmail bot, as well as MongoDB. + +## 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +This was noticed when Discord services started having failures. The primary detection was through alerts though! I was paged 1 minute after we started encountering CPU exhaustion issues. + +## 🙋🏿♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the incident. + +No major obstacles were encountered during this. + +## 🙆🏽♀️ Recovery + +*How was the incident resolved? How can we improve future mitigation?* + +It was noted that in the response to `kubectl get nodes` the primary node's status was reported as `NotReady`. Looking into the reason, it was because the node had stopped responding. + +The quickest way to fix this was triggering a node restart. This shifted a lot of pods over to node 2, which encountered some capacity issues since it's not as highly specified as the first node. + +I brought the first node back by restarting it at Linode's end. Once this node was reporting as `Ready` again, I drained the second node by running `kubectl drain lke13311-20304-5ffa4d11faab`. This command stops the node from being available for scheduling and moves existing pods onto other nodes. + +Services gradually recovered as the dependencies started. The incident lasted around 26 minutes overall, though this was not a complete outage for the whole time and the bot remained functional throughout (meaning systems like the help channels were still functional). + +## 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +**What?** Partial service outage + +**Why?** We had a node outage. + +**Why?** CPU exhaustion of our primary node. + +**Why?** Large Prometheus query using a lot of CPU. + +**Why?** Prometheus had to scan millions of TSDB records, which consumed all cores. + +## 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent recurrence* + +A large query was run on Prometheus, so the solution is just to not run said queries. + +To protect against this more precisely, though, we should write resource constraints for services like this that are vulnerable to CPU exhaustion or memory consumption, which are the causes of our two past outages as well. + +## 🤔 Lessons learned + +*What did we learn from this incident?* + +- Don't run large queries; they consume CPU! +- Write resource constraints for our services. + +## ☑️ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x] Write resource constraints for our services (a sketch follows below).
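As a reference for that follow-up task, resource constraints can also be applied imperatively while the manifests are being updated. A minimal sketch only; the deployment name and values here are illustrative, not the ones actually deployed:

```bash
# Illustrative: cap Prometheus so a heavy query cannot exhaust the node again
kubectl set resources deployment prometheus \
  --requests=cpu=500m,memory=1Gi \
  --limits=cpu=2,memory=4Gi
```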
diff --git a/kubernetes/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.md b/kubernetes/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.md new file mode 100644 index 0000000..6935f02 --- /dev/null +++ b/kubernetes/docs/postmortems/2021-01-12-site-cpu-ram-exhaustion.md @@ -0,0 +1,112 @@ +--- +layout: default +title: "2021-01-12: Django site CPU/RAM exhaustion outage" +parent: Postmortems +nav_order: 4 +--- + +# 2021-01-12: Django site CPU/RAM exhaustion outage + +At 03:01 UTC on Tuesday 12th January we experienced a momentary outage of our PostgreSQL database, causing some very minor service downtime. + +# ⚠️ Leadup + +*List the sequence of events that led to the incident* + +We deleted the Developers role, which led to a large user diff, as we had to update the roles of all affected users on the site. + +The bot had been repeatedly trying to post this diff for over 24 hours, after every restart. + +We deployed the bot at 2:55 UTC on 12th January and the user sync process began once again. + +This caused a CPU & RAM spike on our Django site, which in turn triggered an OOM error on the server, which killed the Postgres process, sending it into a recovery state where queries could not be executed. + +The Django site did not have any tools in place to batch the requests, so it was trying to process all 80k user updates in a single query, something that PostgreSQL probably could handle, but not the Django ORM. During the incident, site jumped from its average RAM usage of 300-400MB to **1.5GB.** + + + +RAM and CPU usage of site throughout the incident. The period just before 3:40 where no statistics were reported is the actual outage period, where the Kubernetes node had some networking errors. + +# 🥏 Impact + +*Describe how internal and external users were impacted during the incident* + +This database outage lasted mere minutes, since Postgres recovered and healed itself and the sync process was aborted, but it did leave us with a large user diff and our database becoming further out of sync. + +Most services that did not depend on PostgreSQL stayed up, and the site remained stable after the sync had been cancelled. + +# 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +We were immediately alerted to the PostgreSQL outage on Grafana and through Sentry, meaning our response time was under a minute. + +We reduced some alert thresholds in order to catch RAM & CPU spikes faster in the future. + +It was hard to immediately see the cause of things, since there is minimal logging on the site and the bot logs did not make it evident that anything was at fault, so our only detection was through machine metrics. + +We did manage to recover exactly what PostgreSQL was trying to do at the time of crashing by examining the logs, which pointed us towards the user sync process. + +# 🙋🏿♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded to the issue; there were no real obstacles encountered other than the node being less performant than we would like due to the CPU starvation. + +# 🙆🏽♀️ Recovery + +*How was the incident resolved? How can we improve future mitigation?* + +The incident was resolved by stopping the sync process and writing a more efficient one through an internal eval script. We batched the updates into groups of 1,000 users and, instead of doing one large update, did 80 smaller ones. This led to much higher efficiency at the cost of taking a little longer (~7 minutes).
+ +```python +from bot.exts.backend.sync import _syncers +syncer = _syncers.UserSyncer +diff = await syncer._get_diff(ctx.guild) + +def chunks(lst, n): + for i in range(0, len(lst), n): + yield lst[i:i + n] + +for chunk in chunks(diff.updated, 1000): + await bot.api_client.patch("bot/users/bulk_patch", json=chunk) +``` + +Resource limits were also put into place on site to prevent RAM and CPU spikes, and throttle the CPU usage in these situations. This can be seen in the below graph: + + + +CPU throttling is where a container has hit the limits and we need to reel it in. Ideally this value stays as close to 0 as possible; however, as you can see, site hit this twice (during the periods where it was trying to sync 80k users at once). + +# 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +- We experienced a major PostgreSQL outage +- PostgreSQL was killed by the system OOM due to the RAM spike on site. +- The RAM spike on site was caused by a large query. +- This was because we do not chunk queries on the bot. +- The large query was caused by the removal of the Developers role resulting in 80k users needing updating. + +# 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent recurrence* + +The removal of the Developers role created a large diff which could not be applied by Django in a single request. + +See the follow-up tasks for exactly how we can avoid this in future; it's a relatively easy mitigation. + +# 🤔 Lessons learned + +*What did we learn from this incident?* + +- Django (or DRF) does not like huge update queries. + +# ☑️ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x] Make the bot syncer more efficient (batch requests) +- [ ] Increase logging on the bot; state when an error has been hit (we had no indication of this inside Discord, and we need that) +- [x] Adjust resource alerts to page DevOps members earlier. +- [x] Apply resource limits to site to prevent major spikes diff --git a/kubernetes/docs/postmortems/2021-01-30-nodebalancer-fails-memory.md b/kubernetes/docs/postmortems/2021-01-30-nodebalancer-fails-memory.md new file mode 100644 index 0000000..dd2d624 --- /dev/null +++ b/kubernetes/docs/postmortems/2021-01-30-nodebalancer-fails-memory.md @@ -0,0 +1,101 @@ +--- +layout: default +title: "2021-01-30: NodeBalancer networking faults due to memory pressure" +parent: Postmortems +nav_order: 5 +--- + +# 2021-01-30: NodeBalancer networking faults due to memory pressure + +At around 14:30 UTC on Saturday 30th January we started experiencing networking issues at the LoadBalancer level between Cloudflare and our Kubernetes cluster. It seems that the misconfiguration was due to memory and CPU pressure. + +~~This post-mortem is preliminary, we are still awaiting word from Linode's SysAdmins on any problems they detected.~~ + +**Update 2nd February 2021:** Linode have migrated our NodeBalancer to a different machine. + +## ⚠️ Leadup + +*List the sequence of events that led to the incident* + +At 14:30 we started receiving alerts that services were becoming unreachable. We first experienced some momentary DNS errors which resolved themselves; however, traffic ingress was still degraded. + +Upon checking Linode, our NodeBalancer (the service which balances traffic between our Kubernetes nodes) was reporting the backends (the services it balances to) as down. It reported all 4 as down (two for port 80 + two for port 443).
This status was fluctuating between up and down, meaning traffic was not reaching our cluster correctly. Scaleios correctly noted: + + + +The config seems to have been set incorrectly due to memory and CPU pressure on one of our nodes. Here is the memory throughout the incident: + + + +Here is the display from Linode: + + + +## 🥏 Impact + +*Describe how internal and external users were impacted during the incident* + +Since traffic could not correctly enter our cluster, multiple web-based services were offline, including site, Grafana and Bitwarden. It appears that no inter-node communication was affected, as this uses a WireGuard tunnel between the nodes which was not affected by the NodeBalancer. + +The lack of Grafana made diagnosis slightly more difficult, but even then it was only a short trip to the + +## 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +We were alerted fairly promptly through statping, which reported services as being down and posted a Discord notification. Subsequent alerts came in from Grafana but were limited since outbound communication was faulty. + +## 🙋🏿♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded! + +The primary obstacle was the DevOps tools being out due to the traffic ingress problems. + +## 🙆🏽♀️ Recovery + +*How was the incident resolved? How can we improve future mitigation?* + +The incident resolved itself upstream at Linode. We've opened a ticket with Linode to let them know of the faults; this might give us a better indication of what caused the issues. Our Kubernetes cluster continued posting updates to Linode to refresh the NodeBalancer configuration; inspecting these payloads, the configuration looked correct. + +We've set up alerts for when Prometheus services stop responding, since this seems to be a fairly tell-tale symptom of networking problems. This was the Prometheus status graph throughout the incident: + + + +## 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +**What?** Our service experienced an outage due to networking faults. + +**Why?** Incoming traffic could not reach our Kubernetes nodes + +**Why?** Our Linode NodeBalancers were not using correct configuration + +**Why?** Memory & CPU pressure seemed to cause invalid configuration errors upstream at Linode. + +**Why?** Unknown at this stage; NodeBalancer migrated. + +## 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent recurrence* + +The configuration of our NodeBalancer was invalid. We cannot say why at this point, since we are awaiting contact back from Linode, but indicators point to it being an upstream fault, since memory & CPU pressure should **not** cause a load balancer misconfiguration. + +Linode are going to follow up with us at some point during the week with information from their System Administrators. + +**Update 2nd February 2021:** Linode have concluded investigations at their end, taken notes and migrated our NodeBalancer to a new machine. We haven't experienced problems since. + +## 🤔 Lessons learned + +*What did we learn from this incident?* + +We should be careful not to over-schedule onto nodes, since even while operating within reasonable constraints we risk sending invalid configuration upstream to Linode and therefore preventing traffic from entering our cluster.
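A quick way to see how heavily committed each node is, relevant to the over-scheduling concern above (the node name is a placeholder):

```bash
# Requests and limits already scheduled onto a node
kubectl describe node <node-name> | grep -A 8 "Allocated resources"

# Current actual usage per node
kubectl top nodes
```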
+ +## ☑️ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x] Monitor for follow up from Linode +- [x] Carefully monitor the allocation rules for our services diff --git a/kubernetes/docs/postmortems/2021-07-11-cascading-node-failures.md b/kubernetes/docs/postmortems/2021-07-11-cascading-node-failures.md new file mode 100644 index 0000000..adf0d57 --- /dev/null +++ b/kubernetes/docs/postmortems/2021-07-11-cascading-node-failures.md @@ -0,0 +1,185 @@ +--- +layout: default +title: "2021-07-11: Cascading node failures and ensuing volume problems" +parent: Postmortems +nav_order: 6 +--- + +# 2021-07-11: Cascading node failures and ensuing volume problems + +A PostgreSQL connection spike (00:27 UTC) caused by Django moved a node to an unresponsive state (00:55 UTC); upon recycling the affected node, volumes were placed into a state where they could not be mounted. + +# ⚠️ Leadup + +*List the sequence of events that led to the incident* + +- **00:27 UTC:** Django starts rapidly using connections to our PostgreSQL database +- **00:32 UTC:** DevOps team is alerted that PostgreSQL has saturated its 115 max connections limit. Joe is paged. +- **00:33 UTC:** DevOps team is alerted that a service has claimed 34 dangerous table locks (it peaked at 61). +- **00:42 UTC:** Status incident created and backdated to 00:25 UTC. [Status incident](https://status.pythondiscord.com/incident/92712) +- **00:55 UTC:** It's clear that the node which PostgreSQL was on is no longer healthy after the Django connection surge, so it's recycled and a new one is to be added to the pool. +- **01:01 UTC:** Node `lke13311-16405-5fafd1b46dcf` begins its restart +- **01:13 UTC:** Node has restored and regained healthy status, but volumes will not mount to the node. Support ticket opened at Linode for assistance. +- **06:36 UTC:** DevOps team alerted that Python is offline. This is due to Redis being a dependency of the bot, which as a stateful service was not healthy. + +# 🥏 Impact + +*Describe how internal and external users were impacted during the incident* + +Initially, this manifested as a standard node outage where services on that node experienced some downtime as the node was restored. + +Post-restore, all stateful services (e.g. PostgreSQL, Redis, PrestaShop) could not be started due to the volume issues, and so any dependent services (e.g. Site, Bot, Hastebin) also had trouble starting. + +PostgreSQL was restored early on, so for the most part Moderation could continue. + +# 👁️ Detection + +*Report when the team detected the incident, and how we could improve detection time* + +DevOps were initially alerted at 00:32 UTC due to the PostgreSQL connection surge, and acknowledged at the same time. + +Further alerting could be used to catch surges earlier on (looking at conn delta vs. conn total), but for the most part alerting time was satisfactory here. + +# 🙋🏿♂️ Response + +*Who responded to the incident, and what obstacles did they encounter?* + +Joe Banks responded. The primary issue encountered was failure upstream at Linode to remount the affected volumes; a support ticket has been created. + +# 🙆🏽♀️ Recovery + +*How was the incident resolved? How can we improve future mitigation times?* + +Initial node restoration was performed by @Joe Banks by recycling the affected node.
+ +Subsequent volume restoration was also @Joe Banks and once Linode had unlocked the volumes affected pods were scaled down to 0, the volumes were unmounted at the Linode side and then the deployments were recreated. + +<details markdown="block"> +<summary>Support ticket sent</summary> + +<blockquote markdown="block"> +Good evening, + +We experienced a resource surge on one of our Kubernetes nodes at 00:32 UTC, causing a node to go unresponsive. To mitigate problems here the node was recycled and began restarting at 1:01 UTC. + +The node has now rejoined the ring and started picking up services, but volumes will not attach to it, meaning pods with stateful storage will not start. + +An example events log for one such pod: + +``` + Type Reason Age From Message + ---- ------ ---- ---- ------- + Normal Scheduled 2m45s default-scheduler Successfully assigned default/redis-599887d778-wggbl to lke13311-16405-5fafd1b46dcf + Warning FailedMount 103s kubelet MountVolume.MountDevice failed for volume "pvc-bb1d06139b334c1f" : rpc error: code = Internal desc = Unable to find device path out of attempted paths: [/dev/disk/by-id/linode-pvcbb1d06139b334c1f /dev/disk/by-id/scsi-0Linode_Volume_pvcbb1d06139b334c1f] + Warning FailedMount 43s kubelet Unable to attach or mount volumes: unmounted volumes=[redis-data-volume], unattached volumes=[kube-api-access-6wwfs redis-data-volume redis-config-volume]: timed out waiting for the condition + +``` + +I've been trying to manually resolve this through the Linode Web UI but get presented with attachment errors upon doing so. Please could you advise on the best way forward to restore Volumes & Nodes to a functioning state? As far as I can see there is something going on upstream since the Linode UI presents these nodes as mounted however as shown above LKE nodes are not locating them, there is also a few failed attachment logs in the Linode Audit Log. + +Thanks, + +Joe +</blockquote> +</details> + +<details markdown="block"> +<summary>Response received from Linode</summary> + +<blockquote markdown="block"> +Hi Joe, + +> Were there any known issues with Block Storage in Frankfurt today? + +Not today, though there were service issues reported for Block Storage and LKE in Frankfurt on July 8 and 9: + +- [Service Issue - Block Storage - EU-Central (Frankfurt)](https://status.linode.com/incidents/pqfxl884wbh4) +- [Service Issue - Linode Kubernetes Engine - Frankfurt](https://status.linode.com/incidents/13fpkjd32sgz) + +There was also an API issue reported on the 10th (resolved on the 11th), mentioned here: + +- [Service Issue - Cloud Manager and API](https://status.linode.com/incidents/vhjm0xpwnnn5) + +Regarding the specific error you were receiving: + +> `Unable to find device path out of attempted paths` + +I'm not certain it's specifically related to those Service Issues, considering this isn't the first time a customer has reported this error in their LKE logs. In fact, if I recall correctly, I've run across this before too, since our volumes are RWO and I had too many replicas in my deployment that I was trying to attach to, for example. + +> is this a known bug/condition that occurs with Linode CSI/LKE? + +From what I understand, yes, this is a known condition that crops up from time to time, which we are tracking. However, since there is a workaround at the moment (e.g. 
- "After some more manual attempts to fix things, scaling down deployments, unmounting at Linode and then scaling up the deployments seems to have worked and all our services have now been restored."), there is no ETA for addressing this. With that said, I've let our Storage team know that you've run into this, so as to draw further attention to it. + +If you have any further questions or concerns regarding this, let us know. + +Best regards, +[Redacted] + +Linode Support Team +</blockquote> +</details> + +<details markdown="block"> +<summary>Concluding response from Joe Banks</summary> + +<blockquote markdown="block"> +Hey [Redacted]! + +Thanks for the response. We ensure that stateful pods only ever have one volume assigned to them, either with a single replica deployment or a statefulset. It appears that the error generally manifests when a deployment is being migrated from one node to another during a redeploy, which makes sense if there is some delay on the unmount/remount. + +Confusion occurred because Linode was reporting the volume as attached when the node had been recycled, but I assume that was because the node did not cleanly shutdown and therefore could not cleanly unmount volumes. + +We've not seen any resurgence of such issues, and we'll address the software fault which overloaded the node which will helpfully mitigate such problems in the future. + +Thanks again for the response, have a great week! + +Best, + +Joe +</blockquote> +</details> + +# 🔎 Five Why's + +*Run a 5-whys analysis to understand the true cause of the incident.* + +### **What?** + +Several of our services became unavailable because their volumes could not be mounted. + +### Why? + +A node recycle left the node unable to mount volumes using the Linode CSI. + +### Why? + +A node recycle was used because PostgreSQL had a connection surge. + +### Why? + +A Django feature deadlocked a table 62 times and suddenly started using ~70 connections to the database, saturating the maximum connections limit. + +### Why? + +The root cause of why Django does this is unclear, and someone with more Django proficiency is absolutely welcome to share any knowledge they may have. I presume it's some sort of worker race condition, but I've not been able to reproduce it. + +# 🌱 Blameless root cause + +*Note the final root cause and describe what needs to change to prevent reoccurrence* + +A node being forcefully restarted left volumes in a limbo state where mounting was difficult, it took multiple hours for this to be resolved since we had to wait for the volumes to unlock so they could be cloned. + +# 🤔 Lessons learned + +*What did we learn from this incident?* + +Volumes are painful. + +We need to look at why Django is doing this and mitigations of the fault to prevent this from occurring again. + +# ☑️ Follow-up tasks + +*List any tasks we should complete that are relevant to this incident* + +- [x] [Follow up on ticket at Linode](https://www.notion.so/Cascading-node-failures-and-ensuing-volume-problems-1c6cfdfcadfc4422b719a0d7a4cc5001) +- [ ] Investigate why Django could be connection surging and locking tables diff --git a/kubernetes/docs/postmortems/index.md b/kubernetes/docs/postmortems/index.md new file mode 100644 index 0000000..5e8b509 --- /dev/null +++ b/kubernetes/docs/postmortems/index.md @@ -0,0 +1,10 @@ +--- +title: Postmortems +layout: default +has_children: true +has_toc: false +--- + +# Postmortems + +Browse the pages under this category to view historical postmortems for Python Discord outages. 
diff --git a/kubernetes/docs/static/images/2021-01-12/site_cpu_throttle.png b/kubernetes/docs/static/images/2021-01-12/site_cpu_throttle.png Binary files differnew file mode 100644 index 0000000..b530ec6 --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-12/site_cpu_throttle.png diff --git a/kubernetes/docs/static/images/2021-01-12/site_resource_abnormal.png b/kubernetes/docs/static/images/2021-01-12/site_resource_abnormal.png Binary files differnew file mode 100644 index 0000000..e1e07af --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-12/site_resource_abnormal.png diff --git a/kubernetes/docs/static/images/2021-01-30/linode_loadbalancers.png b/kubernetes/docs/static/images/2021-01-30/linode_loadbalancers.png Binary files differnew file mode 100644 index 0000000..f0eae1f --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-30/linode_loadbalancers.png diff --git a/kubernetes/docs/static/images/2021-01-30/memory_charts.png b/kubernetes/docs/static/images/2021-01-30/memory_charts.png Binary files differnew file mode 100644 index 0000000..370d19e --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-30/memory_charts.png diff --git a/kubernetes/docs/static/images/2021-01-30/prometheus_status.png b/kubernetes/docs/static/images/2021-01-30/prometheus_status.png Binary files differnew file mode 100644 index 0000000..e95b8d7 --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-30/prometheus_status.png diff --git a/kubernetes/docs/static/images/2021-01-30/scaleios.png b/kubernetes/docs/static/images/2021-01-30/scaleios.png Binary files differnew file mode 100644 index 0000000..584d74d --- /dev/null +++ b/kubernetes/docs/static/images/2021-01-30/scaleios.png diff --git a/kubernetes/namespaces/cert-manager/cert-manager/README.md b/kubernetes/namespaces/cert-manager/cert-manager/README.md new file mode 100644 index 0000000..a7389e6 --- /dev/null +++ b/kubernetes/namespaces/cert-manager/cert-manager/README.md @@ -0,0 +1,13 @@ +# cert-manager + +X.509 certificate management for Kubernetes. + +> cert-manager builds on top of Kubernetes, introducing certificate authorities and certificates as first-class resource types in the Kubernetes API. This makes it possible to provide to developers 'certificates as a service' in your Kubernetes cluster. + +We install cert-manager through [Helm using this guide](https://cert-manager.io/docs/installation/kubernetes/#installing-with-helm). + +## Directories + +`issuers`: Contains configured issuers, right now only letsencrypt production & staging. + +`certificates`: Contains TLS certificates that should be provisioned and where they should be stored.
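In practice the Helm installation from the linked guide looks roughly like the sketch below (hedged: the chart version is left unpinned here, and `values.yaml` refers to the values file kept in this directory):

```
# Add the Jetstack chart repository and refresh the index.
helm repo add jetstack https://charts.jetstack.io
helm repo update

# Install cert-manager into its own namespace, using the values file
# in this directory (which enables CRD installation).
helm install cert-manager jetstack/cert-manager \
  --namespace cert-manager \
  --create-namespace \
  --values values.yaml
```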
diff --git a/kubernetes/namespaces/cert-manager/cert-manager/certificates/pythondiscord.com.yaml b/kubernetes/namespaces/cert-manager/cert-manager/certificates/pythondiscord.com.yaml new file mode 100644 index 0000000..94bd7dc --- /dev/null +++ b/kubernetes/namespaces/cert-manager/cert-manager/certificates/pythondiscord.com.yaml @@ -0,0 +1,12 @@ +apiVersion: cert-manager.io/v1 +kind: Certificate +metadata: + name: pythondiscord-com +spec: + secretName: pythondiscord.com-tls + dnsNames: + - pythondiscord.com + - '*.pythondiscord.com' + issuerRef: + name: letsencrypt + kind: ClusterIssuer diff --git a/kubernetes/namespaces/cert-manager/cert-manager/issuers/letsencrypt-prod.yaml b/kubernetes/namespaces/cert-manager/cert-manager/issuers/letsencrypt-prod.yaml new file mode 100644 index 0000000..4321377 --- /dev/null +++ b/kubernetes/namespaces/cert-manager/cert-manager/issuers/letsencrypt-prod.yaml @@ -0,0 +1,18 @@ +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt + namespace: cert-manager +spec: + acme: + email: [email protected] + server: https://acme-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-account-key + solvers: + - dns01: + cloudflare: + email: [email protected] + apiTokenSecretRef: + name: cloudflare-credentials + key: cloudflare-api-key diff --git a/kubernetes/namespaces/cert-manager/cert-manager/issuers/letsencrypt-staging.yaml b/kubernetes/namespaces/cert-manager/cert-manager/issuers/letsencrypt-staging.yaml new file mode 100644 index 0000000..e9fdfc7 --- /dev/null +++ b/kubernetes/namespaces/cert-manager/cert-manager/issuers/letsencrypt-staging.yaml @@ -0,0 +1,18 @@ +apiVersion: cert-manager.io/v1 +kind: ClusterIssuer +metadata: + name: letsencrypt-staging + namespace: cert-manager +spec: + acme: + email: [email protected] + server: https://acme-staging-v02.api.letsencrypt.org/directory + privateKeySecretRef: + name: letsencrypt-staging-account-key + solvers: + - dns01: + cloudflare: + email: [email protected] + apiTokenSecretRef: + name: cloudflare-credentials + key: cloudflare-api-key diff --git a/kubernetes/namespaces/cert-manager/cert-manager/values.yaml b/kubernetes/namespaces/cert-manager/cert-manager/values.yaml new file mode 100644 index 0000000..1b4551c --- /dev/null +++ b/kubernetes/namespaces/cert-manager/cert-manager/values.yaml @@ -0,0 +1 @@ +installCRDs: true diff --git a/kubernetes/namespaces/default/bitwarden/README.md b/kubernetes/namespaces/default/bitwarden/README.md new file mode 100644 index 0000000..37f01eb --- /dev/null +++ b/kubernetes/namespaces/default/bitwarden/README.md @@ -0,0 +1,14 @@ +# BitWarden + +Our internal password manager, used by the admins to share passwords for our services. Hosted at https://bitwarden.pythondiscord.com + +To deploy this, first set up the secrets (see below) and then run `kubectl apply -f .` in this folder. + +## Secrets +This deployment expects a few secrets to exist in a secret called `bitwarden-secret-env`. 
+ + +| Environment | Description | +|-----------------------|-------------------------------------------| +| ADMIN_TOKEN | 64-character token used for initial login | +| DATABASE_URL | Database string: host://user:pass/db | diff --git a/kubernetes/namespaces/default/bitwarden/configmap.yaml b/kubernetes/namespaces/default/bitwarden/configmap.yaml new file mode 100644 index 0000000..c758f5d --- /dev/null +++ b/kubernetes/namespaces/default/bitwarden/configmap.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: bitwarden-config-env +data: + # Domain to access bitwarden by + DOMAIN: "https://bitwarden.pythondiscord.com" + + # Password hint must be sent to an email when this is false. + # When it's true, it'll be shown right on the page. + SHOW_PASSWORD_HINT: "false" + + # Admins only, please! + SIGNUPS_ALLOWED: "false" + + # Used for LiveSync + WEBSOCKET_ENABLED: "true" + + # Max conns to the DB + DATABASE_MAX_CONNS: "2" + + # Force bitwarden to use postgres, rather than it's own volume + I_REALLY_WANT_VOLATILE_STORAGE: "true" diff --git a/kubernetes/namespaces/default/bitwarden/deployment.yaml b/kubernetes/namespaces/default/bitwarden/deployment.yaml new file mode 100644 index 0000000..70a22ce --- /dev/null +++ b/kubernetes/namespaces/default/bitwarden/deployment.yaml @@ -0,0 +1,34 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: bitwarden +spec: + replicas: 1 + selector: + matchLabels: + app: bitwarden + template: + metadata: + labels: + app: bitwarden + spec: + containers: + - name: bitwarden + image: vaultwarden/server:latest + imagePullPolicy: Always + resources: + requests: + cpu: 1m + memory: 50Mi + limits: + cpu: 50m + memory: 100Mi + ports: + - containerPort: 80 + envFrom: + - secretRef: + name: bitwarden-secret-env + - configMapRef: + name: bitwarden-config-env + securityContext: + readOnlyRootFilesystem: true diff --git a/kubernetes/namespaces/default/bitwarden/ingress.yaml b/kubernetes/namespaces/default/bitwarden/ingress.yaml new file mode 100644 index 0000000..d0371f6 --- /dev/null +++ b/kubernetes/namespaces/default/bitwarden/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: bitwarden +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: bitwarden.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: bitwarden + port: + number: 80 diff --git a/kubernetes/namespaces/default/bitwarden/secrets.yaml b/kubernetes/namespaces/default/bitwarden/secrets.yaml Binary files differnew file mode 100644 index 0000000..34cfd7a --- /dev/null +++ b/kubernetes/namespaces/default/bitwarden/secrets.yaml diff --git a/kubernetes/namespaces/default/bitwarden/service.yaml b/kubernetes/namespaces/default/bitwarden/service.yaml new file mode 100644 index 0000000..3df8cc2 --- /dev/null +++ b/kubernetes/namespaces/default/bitwarden/service.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Service +metadata: + name: bitwarden +spec: + ports: + - port: 80 + selector: + app: bitwarden diff --git a/kubernetes/namespaces/default/black-knight/README.md b/kubernetes/namespaces/default/black-knight/README.md new file mode 100644 index 0000000..d1f8d89 --- /dev/null +++ 
b/kubernetes/namespaces/default/black-knight/README.md @@ -0,0 +1,16 @@ +## Black Knight +Deployment file for @Black-Knight, our courageous and ever present anti-raid bot. + +## Secrets +This deployment expects a number of secrets/environment variables to exist in a secret called `black-knight-env`. + +| Environment | Description | +|-----------------------|-------------------------------------------------------------------| +| BOT_TOKEN | The Discord bot token for Black Knight to connect to Discord with | +| DATABASE_URL | A full PostgreSQL connection string to the postgres db | +| BOT_SENTRY_DSN | The DSN to connect send sentry reports to | + +Black knight also requires a redis password, which is pulled from the `redis-credentials` secret. +``` +REDIS_PASSWORD - The password to redis +``` diff --git a/kubernetes/namespaces/default/black-knight/deployment.yaml b/kubernetes/namespaces/default/black-knight/deployment.yaml new file mode 100644 index 0000000..c61429a --- /dev/null +++ b/kubernetes/namespaces/default/black-knight/deployment.yaml @@ -0,0 +1,38 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: black-knight +spec: + replicas: 1 + selector: + matchLabels: + app: black-knight + template: + metadata: + labels: + app: black-knight + spec: + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + containers: + - name: black-knight + image: ghcr.io/python-discord/black-knight:latest + imagePullPolicy: Always + resources: + requests: + cpu: 500m + memory: 300Mi + limits: + cpu: 750m + memory: 500Mi + envFrom: + - secretRef: + name: black-knight-env + - secretRef: + name: redis-credentials + securityContext: + readOnlyRootFilesystem: true + imagePullSecrets: + - name: ghcr-pull-secret diff --git a/kubernetes/namespaces/default/black-knight/secrets.yaml b/kubernetes/namespaces/default/black-knight/secrets.yaml Binary files differnew file mode 100644 index 0000000..40909c9 --- /dev/null +++ b/kubernetes/namespaces/default/black-knight/secrets.yaml diff --git a/kubernetes/namespaces/default/blackbox/README.md b/kubernetes/namespaces/default/blackbox/README.md new file mode 100644 index 0000000..f53ef87 --- /dev/null +++ b/kubernetes/namespaces/default/blackbox/README.md @@ -0,0 +1,18 @@ +# Blackbox +These manifests provision a CronJob for blackbox, our database backup tool. + +You can find the repository for blackbox at [lemonsaurus/blackbox](https://github.com/lemonsaurus/blackbox). 
+ +## Secrets +blackbox requires the following secrets in a secret titled `blackbox-env`: + +| Variable | Description | +|--------------------------------|------------------------| +| **POSTGRES_USER** | Postgres username | +| **POSTGRES_PASSWORD** | Postgres password | +| **REDIS_PASSWORD** | Redis password | +| **MONGO_INITDB_ROOT_USERNAME** | MongoDB username | +| **MONGO_INITDB_ROOT_PASSWORD** | MongoDB password | +| **AWS_ACCESS_KEY_ID** | Access key for S3 | +| **AWS_SECRET_ACCESS_KEY** | Secret key for S3 | +| **DEVOPS_WEBHOOK** | Webhook for #dev-ops | diff --git a/kubernetes/namespaces/default/blackbox/blackbox-configmap.yaml b/kubernetes/namespaces/default/blackbox/blackbox-configmap.yaml new file mode 100644 index 0000000..6922b1f --- /dev/null +++ b/kubernetes/namespaces/default/blackbox/blackbox-configmap.yaml @@ -0,0 +1,47 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: blackbox-config +data: + blackbox.yaml: | + databases: + mongodb: + main_mongodb: + connection_string: mongodb://{{ MONGO_INITDB_ROOT_USERNAME }}:{{ MONGO_INITDB_ROOT_PASSWORD }}@mongodb.default.svc.cluster.local:27017 + postgres: + main_postgres: + username: {{ POSTGRES_USER }} + password: {{ POSTGRES_PASSWORD }} + host: postgres.default.svc.cluster.local + port: "5432" + redis: + main_redis: + password: {{ REDIS_PASSWORD }} + host: redis.default.svc.cluster.local + port: "6379" + + storage: + s3: + frankfurt_s3: + bucket: blackbox + endpoint: eu-central-1.linodeobjects.com + aws_access_key_id: {{ AWS_ACCESS_KEY_ID }} + aws_secret_access_key: {{ AWS_SECRET_ACCESS_KEY }} + newark_s3: + bucket: blackbox + endpoint: us-east-1.linodeobjects.com + aws_access_key_id: {{ AWS_ACCESS_KEY_ID }} + aws_secret_access_key: {{ AWS_SECRET_ACCESS_KEY }} + singapore_s3: + bucket: blackbox + endpoint: ap-south-1.linodeobjects.com + aws_access_key_id: {{ AWS_ACCESS_KEY_ID }} + aws_secret_access_key: {{ AWS_SECRET_ACCESS_KEY }} + + + notifiers: + discord: + dev_ops: + webhook: {{ DEVOPS_WEBHOOK }} + + retention_days: 7 diff --git a/kubernetes/namespaces/default/blackbox/cronjob.yaml b/kubernetes/namespaces/default/blackbox/cronjob.yaml new file mode 100644 index 0000000..405bfbe --- /dev/null +++ b/kubernetes/namespaces/default/blackbox/cronjob.yaml @@ -0,0 +1,39 @@ +apiVersion: batch/v1 +kind: CronJob +metadata: + name: blackbox +spec: + schedule: "0 15 * * *" + jobTemplate: + spec: + template: + spec: + containers: + - name: blackbox + image: lemonsaurus/blackbox:main + imagePullPolicy: Always + envFrom: + - secretRef: + name: blackbox-env + env: + - name: BLACKBOX_CONFIG_PATH + value: "/blackbox/config_file/blackbox.yaml" + volumeMounts: + - mountPath: /blackbox/config_file + name: blackbox-config + - mountPath: /tmp + name: blackbox-tmp + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: blackbox-config + configMap: + name: blackbox-config + - name: blackbox-tmp + emptyDir: {} + restartPolicy: OnFailure + nodeSelector: + # NOTE: This should be updated to match the highest spec + # instance that is being used by the cluster. 
+ beta.kubernetes.io/instance-type: g6-standard-4 + backoffLimit: 3 diff --git a/kubernetes/namespaces/default/blackbox/secrets.yaml b/kubernetes/namespaces/default/blackbox/secrets.yaml Binary files differnew file mode 100644 index 0000000..3659e59 --- /dev/null +++ b/kubernetes/namespaces/default/blackbox/secrets.yaml diff --git a/kubernetes/namespaces/default/bot/README.md b/kubernetes/namespaces/default/bot/README.md new file mode 100644 index 0000000..6a992b5 --- /dev/null +++ b/kubernetes/namespaces/default/bot/README.md @@ -0,0 +1,18 @@ +## Bot + +Deployment file for @Python, our valiant community bot and workhorse. + +## Secrets +This deployment expects a number of secrets and environment variables to exist in a secret called `bot-env`. + +| Environment | Description | +|-------------------|-------------------------------------------------------------| +| API_KEYS_GITHUB | An API key for Github's API. | +| API_KEYS_SITE_API | The token to access our site's API. | +| BOT_SENTRY_DSN | The sentry DSN to send sentry events to. | +| BOT_TOKEN | The Discord bot token to run the bot on. | +| BOT_TRACE_LOGGERS | Comma separated list of loggers to enable trace logging for | +| DEBUG | Debug mode true/false | +| METABASE_PASSWORD | Password for Metabase | +| METABASE_USERNAME | Username for Metabase | +| URLS_PASTE_URL | The URL to the paste site | diff --git a/kubernetes/namespaces/default/bot/deployment.yaml b/kubernetes/namespaces/default/bot/deployment.yaml new file mode 100644 index 0000000..e05b2ec --- /dev/null +++ b/kubernetes/namespaces/default/bot/deployment.yaml @@ -0,0 +1,46 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: bot +spec: + replicas: 1 + selector: + matchLabels: + app: bot + template: + metadata: + labels: + app: bot + spec: + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + containers: + - name: bot + image: ghcr.io/python-discord/bot:latest + imagePullPolicy: Always + resources: + requests: + cpu: 750m + memory: 600Mi + limits: + cpu: 1000m + memory: 1400Mi + envFrom: + - secretRef: + name: bot-env + - secretRef: + name: redis-credentials + volumeMounts: + - mountPath: /bot/logs + name: logs-vol + - mountPath: /.cache/python-tldextract + name: tldextract-cache + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: logs-vol + emptyDir: {} + - name: tldextract-cache + emptyDir: {} diff --git a/kubernetes/namespaces/default/bot/secrets.yaml b/kubernetes/namespaces/default/bot/secrets.yaml Binary files differnew file mode 100644 index 0000000..c48842e --- /dev/null +++ b/kubernetes/namespaces/default/bot/secrets.yaml diff --git a/kubernetes/namespaces/default/code-jam-management/README.md b/kubernetes/namespaces/default/code-jam-management/README.md new file mode 100644 index 0000000..b377130 --- /dev/null +++ b/kubernetes/namespaces/default/code-jam-management/README.md @@ -0,0 +1,11 @@ +# Code Jam Management + +This contains the deployment for the internal [code jam management](https://github.com/python-discord/code-jam-management) service. 
+ +### Required Secret +In a secret named `code-jam-management-env`: + +| Environment | Description | +|--------------|------------------------------------------------------------------------| +| API_TOKEN | A random string to use as the auth token for making requests to CJMS | +| DATABASE_URL | `postgres://<user>:<password>@<host>:<port>/<name>` | diff --git a/kubernetes/namespaces/default/code-jam-management/deployment.yaml b/kubernetes/namespaces/default/code-jam-management/deployment.yaml new file mode 100644 index 0000000..86d8328 --- /dev/null +++ b/kubernetes/namespaces/default/code-jam-management/deployment.yaml @@ -0,0 +1,40 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: code-jam-management +spec: + replicas: 1 + selector: + matchLabels: + app: code-jam-management + template: + metadata: + labels: + app: code-jam-management + spec: + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + containers: + - name: codejam-management + image: ghcr.io/python-discord/code-jam-management:latest + imagePullPolicy: Always + volumeMounts: + - mountPath: /tmp + name: code-jam-mgmt-tmp + - mountPath: /.cache + name: code-jam-mgmt-venv + ports: + - containerPort: 8000 + envFrom: + - secretRef: + name: code-jam-management-env + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: code-jam-mgmt-tmp + emptyDir: + medium: Memory + - name: code-jam-mgmt-venv + emptyDir: {} diff --git a/kubernetes/namespaces/default/code-jam-management/secrets.yaml b/kubernetes/namespaces/default/code-jam-management/secrets.yaml Binary files differnew file mode 100644 index 0000000..6400778 --- /dev/null +++ b/kubernetes/namespaces/default/code-jam-management/secrets.yaml diff --git a/kubernetes/namespaces/default/code-jam-management/service.yaml b/kubernetes/namespaces/default/code-jam-management/service.yaml new file mode 100644 index 0000000..2083adb --- /dev/null +++ b/kubernetes/namespaces/default/code-jam-management/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: code-jam-management +spec: + selector: + app: code-jam-management + ports: + - protocol: TCP + port: 8000 + targetPort: 8000 diff --git a/kubernetes/namespaces/default/forms/backend/README.md b/kubernetes/namespaces/default/forms/backend/README.md new file mode 100644 index 0000000..06fed13 --- /dev/null +++ b/kubernetes/namespaces/default/forms/backend/README.md @@ -0,0 +1,20 @@ +# Forms Backend + +Forms backend is our surveyance system for putting out forms in our community. + +This directory contains the necessary routing manifests, the deployment is located in the [python-discord/forms-backend](https://github.com/python-discord/forms-backend) repository. + +This deployment expects a number of secrets and environment variables to exist in a secret called `forms-backend-env`. 
+ +| Environment | Description | +|----------------------|------------------------------------------------------------------| +| DATABASE_URL | A MongoDB database URL | +| DISCORD_BOT_TOKEN | The bot token to connect to Discord's API with | +| DOCS_PASSWORD | The password required to access the auto-generated API docs | +| HCAPTCHA_API_SECRET | The API secret for hCaptcha's API | +| OAUTH2_CLIENT_ID | The Discord app OAuth2 client ID | +| OAUTH2_CLIENT_SECRET | The Discord app OAuth2 client secret | +| PRODUCTION | Whether the app is in production true/false | +| SECRET_KEY | The key to sign all JWTs with | +| SNEKBOX_URL | The URL to the snekbox service to use for code form submissions | +| FORMS_BACKEND_DSN | The sentry DSN to send sentry events to | diff --git a/kubernetes/namespaces/default/forms/backend/ingress.yaml b/kubernetes/namespaces/default/forms/backend/ingress.yaml new file mode 100644 index 0000000..1967ab4 --- /dev/null +++ b/kubernetes/namespaces/default/forms/backend/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: forms-backend +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: forms-api.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: forms-backend + port: + number: 80 diff --git a/kubernetes/namespaces/default/forms/backend/secrets.yaml b/kubernetes/namespaces/default/forms/backend/secrets.yaml Binary files differnew file mode 100644 index 0000000..8977e99 --- /dev/null +++ b/kubernetes/namespaces/default/forms/backend/secrets.yaml diff --git a/kubernetes/namespaces/default/forms/backend/service.yaml b/kubernetes/namespaces/default/forms/backend/service.yaml new file mode 100644 index 0000000..6e11796 --- /dev/null +++ b/kubernetes/namespaces/default/forms/backend/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: forms-backend +spec: + selector: + app: forms-backend + ports: + - protocol: TCP + port: 80 + targetPort: 8000 diff --git a/kubernetes/namespaces/default/ghost/README.md b/kubernetes/namespaces/default/ghost/README.md new file mode 100644 index 0000000..fee4f8f --- /dev/null +++ b/kubernetes/namespaces/default/ghost/README.md @@ -0,0 +1,7 @@ +# Ghost + +This folder contains the deployment manifests for Ghost, the CMS we use for https://blog.pythondiscord.com/. + +There should be no additional configuration required; there is a setup process on the domain when Ghost first boots, which you can reach by going to https://blog.pythondiscord.com/ghost/ immediately after starting the deployment. + +To deploy this application run `kubectl apply -f ghost` from the root directory of this repository. This will create a deployment, service, ingress and persistent volume.
diff --git a/kubernetes/namespaces/default/ghost/deployment.yaml b/kubernetes/namespaces/default/ghost/deployment.yaml new file mode 100644 index 0000000..3810e9d --- /dev/null +++ b/kubernetes/namespaces/default/ghost/deployment.yaml @@ -0,0 +1,42 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: ghost +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: ghost + template: + metadata: + labels: + app: ghost + spec: + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + containers: + - name: ghost + image: ghost:5.38-alpine + imagePullPolicy: Always + ports: + - containerPort: 2368 + env: + - name: url + value: https://blog.pythondiscord.com + - name: database__client + value: sqlite3 + - name: database__connection__filename + value: /var/lib/ghost/content/data/ghost.db + volumeMounts: + - mountPath: /var/lib/ghost/content + name: ghost-data + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: ghost-data + persistentVolumeClaim: + claimName: ghost-storage diff --git a/kubernetes/namespaces/default/ghost/ingress.yaml b/kubernetes/namespaces/default/ghost/ingress.yaml new file mode 100644 index 0000000..30f0589 --- /dev/null +++ b/kubernetes/namespaces/default/ghost/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: ghost +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: blog.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: ghost + port: + number: 2368 diff --git a/kubernetes/namespaces/default/ghost/service.yaml b/kubernetes/namespaces/default/ghost/service.yaml new file mode 100644 index 0000000..b030ecb --- /dev/null +++ b/kubernetes/namespaces/default/ghost/service.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Service +metadata: + name: ghost +spec: + ports: + - port: 2368 + selector: + app: ghost diff --git a/kubernetes/namespaces/default/ghost/volume.yaml b/kubernetes/namespaces/default/ghost/volume.yaml new file mode 100644 index 0000000..24c7929 --- /dev/null +++ b/kubernetes/namespaces/default/ghost/volume.yaml @@ -0,0 +1,13 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: ghost-storage + labels: + app: ghost +spec: + storageClassName: linode-block-storage-retain + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi diff --git a/kubernetes/namespaces/default/grafana/README.md b/kubernetes/namespaces/default/grafana/README.md new file mode 100644 index 0000000..03a9682 --- /dev/null +++ b/kubernetes/namespaces/default/grafana/README.md @@ -0,0 +1,11 @@ +# Grafana + +This folder contains the manifests for deploying our Grafana instance, the service we use to query our data. + +This deployment expects a number of secrets and environment variables to exist in a secret called `grafana-secret-env`. 
+ +| Environment | Description | +|------------------------------|-----------------------------------------------------| +| GF_AUTH_GITHUB_CLIENT_ID | The client ID of the Github app to use for auth | +| GF_AUTH_GITHUB_CLIENT_SECRET | The client secret of the Github app to use for auth | +| GF_SECURITY_ADMIN_PASSWORD | The admin password the the grafana admin console | diff --git a/kubernetes/namespaces/default/grafana/configmap.yaml b/kubernetes/namespaces/default/grafana/configmap.yaml new file mode 100644 index 0000000..87eeba9 --- /dev/null +++ b/kubernetes/namespaces/default/grafana/configmap.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-default +data: + # Root settings + GF_INSTANCE_NAME: "pythondiscord" + GF_SERVER_DOMAIN: "grafana.pythondiscord.com" + GF_SERVER_ROOT_URL: "https://grafana.pythondiscord.com" + GF_SECURITY_COOKIE_SECURE: "true" + + # GitHub Auth + GF_AUTH_GITHUB_ENABLED: "true" + GF_AUTH_GITHUB_SCOPES: "user:email,read:org" + # IDs can be retrieved via `gh api orgs/python-discord/teams`. + GF_AUTH_GITHUB_TEAM_IDS: "2638565,3854739,3114246" + GF_AUTH_GITHUB_AUTH_URL: "https://github.com/login/oauth/authorize" + GF_AUTH_GITHUB_TOKEN_URL: "https://github.com/login/oauth/access_token" + GF_AUTH_GITHUB_API_URL: "https://api.github.com/user" + GF_AUTH_ALLOW_SIGN_UP: "true" + + # Image renderer + GF_RENDERING_SERVER_URL: "http://grafana-image-renderer.default.svc.cluster.local:8081/render" + GF_RENDERING_CALLBACK_URL: "http://grafana.default.svc.cluster.local:3000/" + GF_RENDERING_CONCURRENT_RENDER_REQUEST_LIMIT: "3" + + # Image storage + GF_EXTERNAL_IMAGE_STORAGE_PROVIDED: "local" + + # Metrics + GF_METRICS_ENABLED: "false" + + # User sign up + GF_USERS_AUTO_ASSIGN_ORG: "true" + GF_USERS_AUTO_ASSIGN_ORG_ID: "2" + + # Feature toggles + GF_FEATURE_TOGGLES_ENABLE: "" diff --git a/kubernetes/namespaces/default/grafana/deployment-grafana.yaml b/kubernetes/namespaces/default/grafana/deployment-grafana.yaml new file mode 100644 index 0000000..3acef14 --- /dev/null +++ b/kubernetes/namespaces/default/grafana/deployment-grafana.yaml @@ -0,0 +1,47 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + spec: + containers: + - name: grafana + image: grafana/grafana:9.1.7 + imagePullPolicy: Always + ports: + - containerPort: 3000 + resources: + requests: + cpu: 200m + memory: 100Mi + limits: + cpu: 300m + memory: 250Mi + envFrom: + - configMapRef: + name: grafana-default + - secretRef: + name: grafana-secret-env + volumeMounts: + - mountPath: /var/lib/grafana + name: grafana-volume + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: grafana-volume + persistentVolumeClaim: + claimName: grafana-storage + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/grafana/deployment-image-renderer.yaml b/kubernetes/namespaces/default/grafana/deployment-image-renderer.yaml new file mode 100644 index 0000000..ea3e297 --- /dev/null +++ b/kubernetes/namespaces/default/grafana/deployment-image-renderer.yaml @@ -0,0 +1,33 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: grafana-image-renderer +spec: + replicas: 0 + selector: + matchLabels: + app: grafana-image-renderer + template: + metadata: + labels: + app: grafana-image-renderer + spec: + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + 
containers: + - name: grafana-image-renderer + image: grafana/grafana-image-renderer:latest + resources: + requests: + cpu: 500m + memory: 300Mi + limits: + cpu: 1000m + memory: 500Mi + imagePullPolicy: Always + ports: + - containerPort: 8081 + securityContext: + readOnlyRootFilesystem: true diff --git a/kubernetes/namespaces/default/grafana/ingress.yaml b/kubernetes/namespaces/default/grafana/ingress.yaml new file mode 100644 index 0000000..60138af --- /dev/null +++ b/kubernetes/namespaces/default/grafana/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: grafana +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: grafana.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: grafana + port: + number: 3000 diff --git a/kubernetes/namespaces/default/grafana/secrets.yaml b/kubernetes/namespaces/default/grafana/secrets.yaml Binary files differnew file mode 100644 index 0000000..bb492d1 --- /dev/null +++ b/kubernetes/namespaces/default/grafana/secrets.yaml diff --git a/kubernetes/namespaces/default/grafana/services.yaml b/kubernetes/namespaces/default/grafana/services.yaml new file mode 100644 index 0000000..e5430a2 --- /dev/null +++ b/kubernetes/namespaces/default/grafana/services.yaml @@ -0,0 +1,19 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana +spec: + ports: + - port: 3000 + selector: + app: grafana +--- +apiVersion: v1 +kind: Service +metadata: + name: grafana-image-renderer +spec: + ports: + - port: 8081 + selector: + app: grafana-image-renderer diff --git a/kubernetes/namespaces/default/grafana/volume.yaml b/kubernetes/namespaces/default/grafana/volume.yaml new file mode 100644 index 0000000..6283a7c --- /dev/null +++ b/kubernetes/namespaces/default/grafana/volume.yaml @@ -0,0 +1,13 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: grafana-storage + labels: + app: grafana +spec: + storageClassName: linode-block-storage-retain + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi diff --git a/kubernetes/namespaces/default/graphite/README.md b/kubernetes/namespaces/default/graphite/README.md new file mode 100644 index 0000000..1d14e36 --- /dev/null +++ b/kubernetes/namespaces/default/graphite/README.md @@ -0,0 +1,11 @@ +# Graphite + +These files provision an instance of the [graphite-statsd](https://hub.docker.com/r/graphiteapp/graphite-statsd/) image. + +The following ports are exposed by the service: + +**80**: NGINX +**8125**: StatsD Ingest +**8126**: StatsD Admin + +There is a 10Gi persistent volume mounted at `/opt/graphite/storage` which holds our statistic data. 
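As a quick smoke test, a StatsD counter can be pushed over UDP from inside the cluster. This is a hedged sketch: it assumes the service is reachable at `graphite.default.svc.cluster.local`, and the metric name is made up for illustration.

```
# Send a test counter to the StatsD ingest port (8125/UDP).
echo -n "pydis.devops.test:1|c" | nc -u -w1 graphite.default.svc.cluster.local 8125
```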
diff --git a/kubernetes/namespaces/default/graphite/deployment.yaml b/kubernetes/namespaces/default/graphite/deployment.yaml new file mode 100644 index 0000000..17c66f8 --- /dev/null +++ b/kubernetes/namespaces/default/graphite/deployment.yaml @@ -0,0 +1,38 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: graphite +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: graphite + template: + metadata: + labels: + app: graphite + spec: + containers: + - name: graphite + image: graphiteapp/graphite-statsd:latest + imagePullPolicy: Always + resources: + requests: + cpu: 200m + memory: 500Mi + limits: + cpu: 1000m + memory: 750Mi + ports: + - containerPort: 80 + - containerPort: 8125 + - containerPort: 8126 + volumeMounts: + - mountPath: /opt/graphite/storage + name: graphite-volume + volumes: + - name: graphite-volume + persistentVolumeClaim: + claimName: graphite-storage diff --git a/kubernetes/namespaces/default/graphite/service.yaml b/kubernetes/namespaces/default/graphite/service.yaml new file mode 100644 index 0000000..599dcdb --- /dev/null +++ b/kubernetes/namespaces/default/graphite/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: graphite +spec: + ports: + - port: 80 + name: nginx + - port: 8125 + name: statsd + protocol: UDP + - port: 8126 + name: statsd-admin + selector: + app: graphite diff --git a/kubernetes/namespaces/default/graphite/volume.yaml b/kubernetes/namespaces/default/graphite/volume.yaml new file mode 100644 index 0000000..ebb830a --- /dev/null +++ b/kubernetes/namespaces/default/graphite/volume.yaml @@ -0,0 +1,13 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: graphite-storage + labels: + app: graphite +spec: + storageClassName: linode-block-storage-retain + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 30Gi diff --git a/kubernetes/namespaces/default/h-asgi/README.md b/kubernetes/namespaces/default/h-asgi/README.md new file mode 100644 index 0000000..fba6b15 --- /dev/null +++ b/kubernetes/namespaces/default/h-asgi/README.md @@ -0,0 +1,3 @@ +## h + +A pure ASGI service that returns h. 
diff --git a/kubernetes/namespaces/default/h-asgi/deployment.yaml b/kubernetes/namespaces/default/h-asgi/deployment.yaml new file mode 100644 index 0000000..d195201 --- /dev/null +++ b/kubernetes/namespaces/default/h-asgi/deployment.yaml @@ -0,0 +1,31 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: h-asgi +spec: + replicas: 1 + selector: + matchLabels: + app: h-asgi + template: + metadata: + labels: + app: h-asgi + spec: + containers: + - name: h-asgi + image: ghcr.io/vcokltfre/h:latest + imagePullPolicy: Always + resources: + requests: + cpu: 30m + memory: 25Mi + limits: + cpu: 30m + memory: 25Mi + securityContext: + readOnlyRootFilesystem: true + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/h-asgi/ingress.yaml b/kubernetes/namespaces/default/h-asgi/ingress.yaml new file mode 100644 index 0000000..b6bfd31 --- /dev/null +++ b/kubernetes/namespaces/default/h-asgi/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://letterh.xyz" + name: h-asgi +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: h.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: h-asgi + port: + number: 80 diff --git a/kubernetes/namespaces/default/h-asgi/service.yaml b/kubernetes/namespaces/default/h-asgi/service.yaml new file mode 100644 index 0000000..53ac73b --- /dev/null +++ b/kubernetes/namespaces/default/h-asgi/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: h-asgi +spec: + selector: + app: h-asgi + ports: + - protocol: TCP + port: 80 + targetPort: 8000 diff --git a/kubernetes/namespaces/default/hastebin/README.md b/kubernetes/namespaces/default/hastebin/README.md new file mode 100644 index 0000000..9491ed4 --- /dev/null +++ b/kubernetes/namespaces/default/hastebin/README.md @@ -0,0 +1,11 @@ +# Hastebin +These manifests provision an instance of the hastebin service used on https://paste-old.pythondiscord.com + +## How to deploy this service +- Check the defaults in `defaults-configmap.yaml` match what you want. + +This deployment expects an environment variable to exist in a secret called `hastebin-redis-password`. 
+ +| Environment | Description | +|------------------|-------------------------------------------------------| +| STORAGE_PASSWORD | The password to the redis instance to store pastes to | diff --git a/kubernetes/namespaces/default/hastebin/defaults-configmap.yaml b/kubernetes/namespaces/default/hastebin/defaults-configmap.yaml new file mode 100644 index 0000000..b05812b --- /dev/null +++ b/kubernetes/namespaces/default/hastebin/defaults-configmap.yaml @@ -0,0 +1,50 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: hastebin-defaults +data: + # Set storage method + STORAGE_TYPE: "redis" + + # Set storage host + STORAGE_HOST: "redis.default.svc.cluster.local" + + # Set port of storage host + STORAGE_PORT: "6379" + + # Expiration of documents in seconds + STORAGE_EXPIRE_SECONDS: "2629746" # 1 month + + # Select the Redis DB to use + STORAGE_DB: "2" + + # Maximum length in characters of documents + MAX_LENGTH: "100000" + + # Logging configuration + LOGGING_LEVEL: "verbose" + LOGGING_TYPE: "Console" + LOGGING_COLORIZE: "true" + + # Host address and port + HOST: "0.0.0.0" + PORT: "7777" + + # Length of keys + KEY_LENGTH: "10" + + # Max length of static asset caching + STATIC_MAX_AGE: "86400" + + # Compress assets + RECOMPRESS_STATIC_ASSETS: "true" + + # Kegenerator + KEYGENERATOR_TYPE: "phonetic" + + # Ratelimits + RATELIMITS_NORMAL_TOTAL_REQUESTS: "500" + RATELIMITS_NORMAL_EVERY_MILLISECONDS: "60000" + + # Default documents + DOCUMENTS: "about=./about.md" diff --git a/kubernetes/namespaces/default/hastebin/deployment.yaml b/kubernetes/namespaces/default/hastebin/deployment.yaml new file mode 100644 index 0000000..7f88e05 --- /dev/null +++ b/kubernetes/namespaces/default/hastebin/deployment.yaml @@ -0,0 +1,55 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: hastebin +spec: + replicas: 1 + selector: + matchLabels: + app: hastebin + template: + metadata: + labels: + app: hastebin + spec: + containers: + - name: hastebin + # Same image as https://github.com/seejohnrun/haste-server/blob/master/Dockerfile + image: node:14.8.0-stretch + command: [ "bash", "/init/init.sh" ] + imagePullPolicy: Always + resources: + requests: + cpu: 5m + memory: 70Mi + limits: + cpu: 100m + memory: 100Mi + ports: + - containerPort: 7777 + securityContext: + readOnlyRootFilesystem: true + volumeMounts: + - name: hastebin-init-volume + mountPath: /init + - name: hastebin-code-volume + mountPath: /haste-server + - name: hastebin-npm-cache + mountPath: /home/node/ + envFrom: + - secretRef: + name: hastebin-redis-password + - configMapRef: + name: hastebin-defaults + volumes: + - name: hastebin-init-volume + configMap: + name: hastebin-init + - name: hastebin-code-volume + emptyDir: {} + - name: hastebin-npm-cache + emptyDir: {} + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/hastebin/ingress.yaml b/kubernetes/namespaces/default/hastebin/ingress.yaml new file mode 100644 index 0000000..26437ec --- /dev/null +++ b/kubernetes/namespaces/default/hastebin/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: hastebin +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: paste-old.pythondiscord.com + 
http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: hastebin + port: + number: 80 diff --git a/kubernetes/namespaces/default/hastebin/init-configmap.yaml b/kubernetes/namespaces/default/hastebin/init-configmap.yaml new file mode 100644 index 0000000..906060f --- /dev/null +++ b/kubernetes/namespaces/default/hastebin/init-configmap.yaml @@ -0,0 +1,49 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: hastebin-init +data: + monkeypatch_extensions.sh: | + #!/bin/bash + + ORIGINAL="file += '\.' + _this.lookupExtensionByType(ret\.language);" + PATCHED="\/\/file += '\.' + _this.lookupExtensionByType(ret\.language);" + FILENAME="static/application.js" + + touch changed + sed -i "s/$ORIGINAL/$PATCHED/w changed" "$FILENAME" + init.sh: | + #!/bin/bash + + # Clone the repo + git clone https://github.com/toptal/haste-server.git + cd haste-server + + # Monkey patch - don't add extensions to the URLs. + # + # This is a pretty messy monkey patch, and it may break if the hastebin + # JS code changes significantly. It makes the URL display as "file" + # instead of "file.py" when you save a file, which makes it possible + # to share the URL without triggering Discord's suspicious URL filter. + cp /init/monkeypatch_extensions.sh ./monkeypatch_extensions.sh + chmod +x monkeypatch_extensions.sh + ./monkeypatch_extensions.sh + + # Check if monkeypatch succeeded. Otherwise, fail hard. + if [ -s changed ]; then + echo "Monkey patch executed: Hastebin will no longer add extensions to URLs." + else + echo "Monkey patch for not adding extension could not be performed. Maybe the hastebin code has changed?" + exit 69 + fi + + # Install and start + npm install + + set -e + + # Generate the config file from the environment + node docker-entrypoint.js > config.js + + # Start Hastebin + npm start diff --git a/kubernetes/namespaces/default/hastebin/secrets.yaml b/kubernetes/namespaces/default/hastebin/secrets.yaml Binary files differnew file mode 100644 index 0000000..9cec074 --- /dev/null +++ b/kubernetes/namespaces/default/hastebin/secrets.yaml diff --git a/kubernetes/namespaces/default/hastebin/service.yaml b/kubernetes/namespaces/default/hastebin/service.yaml new file mode 100644 index 0000000..d34bf5c --- /dev/null +++ b/kubernetes/namespaces/default/hastebin/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: hastebin +spec: + selector: + app: hastebin + ports: + - protocol: TCP + port: 80 + targetPort: 7777 diff --git a/kubernetes/namespaces/default/king-arthur/README.md b/kubernetes/namespaces/default/king-arthur/README.md new file mode 100644 index 0000000..704d45b --- /dev/null +++ b/kubernetes/namespaces/default/king-arthur/README.md @@ -0,0 +1,12 @@ +# King Arthur + +Deployment file for @King Arthur, our DevOps helper bot. + +## Secrets +This deployment expects a number of secrets and environment variables to exist in a secret called `king-arthur-env`. 
+ +| Environment | Description | +| ---------------------------- | ------------------------------------------------------------------------- | +| KING_ARTHUR_TOKEN | The token to authorize with Discord | +| KING_ARTHUR_NOTION_API_TOKEN | The API token to the notion API | +| KING_ARTHUR_CLOUDFLARE_TOKEN | A token for the Cloudflare API used for the Cloudflare commands in Arthur | diff --git a/kubernetes/namespaces/default/king-arthur/deployment.yaml b/kubernetes/namespaces/default/king-arthur/deployment.yaml new file mode 100644 index 0000000..cbc3874 --- /dev/null +++ b/kubernetes/namespaces/default/king-arthur/deployment.yaml @@ -0,0 +1,35 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: king-arthur +spec: + replicas: 1 + selector: + matchLabels: + app: king-arthur + template: + metadata: + labels: + app: king-arthur + spec: + serviceAccountName: king-arthur + containers: + - name: king-arthur + image: ghcr.io/python-discord/king-arthur:latest + imagePullPolicy: Always + resources: + requests: + cpu: 400m + memory: 100Mi + limits: + cpu: 500m + memory: 200Mi + envFrom: + - secretRef: + name: king-arthur-env + securityContext: + readOnlyRootFilesystem: true + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/king-arthur/secrets.yaml b/kubernetes/namespaces/default/king-arthur/secrets.yaml Binary files differnew file mode 100644 index 0000000..a410013 --- /dev/null +++ b/kubernetes/namespaces/default/king-arthur/secrets.yaml diff --git a/kubernetes/namespaces/default/king-arthur/service-account.yaml b/kubernetes/namespaces/default/king-arthur/service-account.yaml new file mode 100644 index 0000000..a63a88e --- /dev/null +++ b/kubernetes/namespaces/default/king-arthur/service-account.yaml @@ -0,0 +1,27 @@ +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: king-arthur +rules: +- apiGroups: ["", "extensions", "apps", "batch", "rbac.authorization.k8s.io", "cert-manager.io"] + resources: ["*"] + verbs: ["*"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: king-arthur +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: king-arthur +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: king-arthur +subjects: + - kind: ServiceAccount + name: king-arthur + namespace: default diff --git a/kubernetes/namespaces/default/metabase/README.md b/kubernetes/namespaces/default/metabase/README.md new file mode 100644 index 0000000..b246d54 --- /dev/null +++ b/kubernetes/namespaces/default/metabase/README.md @@ -0,0 +1,14 @@ +# Metabase +These manifests provision an instance of Metabase, our database analysis tool. 
+ +## Secrets +A single secret of name `metabase-env` is used with the following values: + +| Environment | Description | +|--------------|-------------------------------------------| +| MB_DB_DBNAME | Database name for internal metabase usage | +| MB_DB_HOST | Address of PostgreSQL database | +| MB_DB_TYPE | Always postgres | +| MB_DB_PASS | Database user password | +| MB_DB_PORT | Always 5432 | +| MB_DB_USER | User for metabase internal | diff --git a/kubernetes/namespaces/default/metabase/deployment.yaml b/kubernetes/namespaces/default/metabase/deployment.yaml new file mode 100644 index 0000000..7a58851 --- /dev/null +++ b/kubernetes/namespaces/default/metabase/deployment.yaml @@ -0,0 +1,35 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: metabase +spec: + replicas: 1 + selector: + matchLabels: + app: metabase + template: + metadata: + labels: + app: metabase + spec: + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + containers: + - name: metabase + image: metabase/metabase:latest + imagePullPolicy: Always + ports: + - containerPort: 3000 + envFrom: + - secretRef: + name: metabase-env + volumeMounts: + - mountPath: /plugins + name: mb-plugins-volume + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: mb-plugins-volume + emptyDir: {} diff --git a/kubernetes/namespaces/default/metabase/ingress.yaml b/kubernetes/namespaces/default/metabase/ingress.yaml new file mode 100644 index 0000000..16b66d5 --- /dev/null +++ b/kubernetes/namespaces/default/metabase/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: metabase +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: metabase.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: metabase + port: + number: 80 diff --git a/kubernetes/namespaces/default/metabase/secrets.yaml b/kubernetes/namespaces/default/metabase/secrets.yaml Binary files differnew file mode 100644 index 0000000..2cb73c5 --- /dev/null +++ b/kubernetes/namespaces/default/metabase/secrets.yaml diff --git a/kubernetes/namespaces/default/metabase/service.yaml b/kubernetes/namespaces/default/metabase/service.yaml new file mode 100644 index 0000000..36118a0 --- /dev/null +++ b/kubernetes/namespaces/default/metabase/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: metabase +spec: + selector: + app: metabase + ports: + - protocol: TCP + port: 80 + targetPort: 3000 diff --git a/kubernetes/namespaces/default/metricity/README.md b/kubernetes/namespaces/default/metricity/README.md new file mode 100644 index 0000000..30c8b95 --- /dev/null +++ b/kubernetes/namespaces/default/metricity/README.md @@ -0,0 +1,13 @@ +# Metricity + +This folder contains the secrets for the metricity service. + +The actual metricity deployment manifest can be found inside the metricity repository at [python-discord/metricity](https://github.com/python-discord/metricity). 
+ +## Secrets +A single secret of name `metricity-env` is used with the following values: + +| Environment | Description | +|--------------|------------------------------------| +| BOT_TOKEN | The Discord bot token to run under | +| DATABASE_URI | Database URI to save the states to | diff --git a/kubernetes/namespaces/default/metricity/secrets.yaml b/kubernetes/namespaces/default/metricity/secrets.yaml Binary files differnew file mode 100644 index 0000000..f97dc51 --- /dev/null +++ b/kubernetes/namespaces/default/metricity/secrets.yaml diff --git a/kubernetes/namespaces/default/modmail/README.md b/kubernetes/namespaces/default/modmail/README.md new file mode 100644 index 0000000..92ac16b --- /dev/null +++ b/kubernetes/namespaces/default/modmail/README.md @@ -0,0 +1,19 @@ +# Modmail + +This folder contains the manifests for our Modmail service. + +## Secrets + +The services require one shared secret called `modmail` containing the following: + +| Key | Value | Description | +| ------------------------| ---------------------------------|--------------------------------------------------------------| +| `CONNECTION_URI` | MongoDB connection URI | Used for storing data | +| `DATABASE_TYPE` | `mongodb` | The type of database to use, only supports mongodb right now | +| `DATA_COLLECTION` | `False` | Disable bot metadata collection by modmail devs | +| `DISABLE_AUTOUPDATES` | `yes` | Auto-updates breaks in production | +| `GUILD_ID` | Snowflake of Discord guild | Guild to respond to commands in | +| `LOG_URL` | URL of the web portal | Used for generating links on the bot | +| `OWNERS` | Comma separated list of user IDs | Used for granting high permissions on the bot | +| `REGISTRY_PLUGINS_ONLY` | `false` | Allows the usage of plugins outside of the official registry | +| `TOKEN` | Discord Token | Used to connect to Discord | diff --git a/kubernetes/namespaces/default/modmail/bot/README.md b/kubernetes/namespaces/default/modmail/bot/README.md new file mode 100644 index 0000000..ac29ac2 --- /dev/null +++ b/kubernetes/namespaces/default/modmail/bot/README.md @@ -0,0 +1,7 @@ +# Modmail bot +These manifests will provision the resources for an instance of our Modmail bot. 
+ +To deploy this bot simply run: +``` +kubectl apply -f deployment.yaml +``` diff --git a/kubernetes/namespaces/default/modmail/bot/deployment.yaml b/kubernetes/namespaces/default/modmail/bot/deployment.yaml new file mode 100644 index 0000000..e640fdc --- /dev/null +++ b/kubernetes/namespaces/default/modmail/bot/deployment.yaml @@ -0,0 +1,48 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: modmail-bot +spec: + replicas: 1 + selector: + matchLabels: + app: modmail-bot + template: + metadata: + labels: + app: modmail-bot + spec: + containers: + - name: modmail-bot + image: ghcr.io/python-discord/modmail:latest + resources: + requests: + cpu: 75m + memory: 500Mi + limits: + cpu: 125m + memory: 750Mi + imagePullPolicy: "Always" + volumeMounts: + - mountPath: /modmailbot/plugins + name: plugins-vol + - mountPath: /modmailbot/temp + name: temp-vol + env: + - name: TMPDIR + value: /modmailbot/temp + envFrom: + - secretRef: + name: modmail + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: plugins-vol + emptyDir: {} + - name: temp-vol + emptyDir: + medium: Memory + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/modmail/secrets.yaml b/kubernetes/namespaces/default/modmail/secrets.yaml Binary files differnew file mode 100644 index 0000000..f2d5d5d --- /dev/null +++ b/kubernetes/namespaces/default/modmail/secrets.yaml diff --git a/kubernetes/namespaces/default/modmail/web/README.md b/kubernetes/namespaces/default/modmail/web/README.md new file mode 100644 index 0000000..7b7e19e --- /dev/null +++ b/kubernetes/namespaces/default/modmail/web/README.md @@ -0,0 +1,2 @@ +# Modmail web +These manifests provision an instance of the web logviewer for our Modmail system. 
diff --git a/kubernetes/namespaces/default/modmail/web/deployment.yaml b/kubernetes/namespaces/default/modmail/web/deployment.yaml new file mode 100644 index 0000000..1070e22 --- /dev/null +++ b/kubernetes/namespaces/default/modmail/web/deployment.yaml @@ -0,0 +1,36 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: modmail-web +spec: + replicas: 1 + selector: + matchLabels: + app: modmail-web + template: + metadata: + labels: + app: modmail-web + spec: + containers: + - name: modmail-web + image: ghcr.io/python-discord/logviewer:latest + imagePullPolicy: Always + resources: + requests: + cpu: 50m + memory: 100Mi + limits: + cpu: 100m + memory: 150Mi + ports: + - containerPort: 8000 + envFrom: + - secretRef: + name: modmail + securityContext: + readOnlyRootFilesystem: true + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/modmail/web/ingress.yaml b/kubernetes/namespaces/default/modmail/web/ingress.yaml new file mode 100644 index 0000000..a5990cf --- /dev/null +++ b/kubernetes/namespaces/default/modmail/web/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: modmail-web +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: modmail.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: modmail-web + port: + number: 80 diff --git a/kubernetes/namespaces/default/modmail/web/service.yaml b/kubernetes/namespaces/default/modmail/web/service.yaml new file mode 100644 index 0000000..384e638 --- /dev/null +++ b/kubernetes/namespaces/default/modmail/web/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: modmail-web +spec: + selector: + app: modmail-web + ports: + - protocol: TCP + port: 80 + targetPort: 8000 diff --git a/kubernetes/namespaces/default/mongodb/README.md b/kubernetes/namespaces/default/mongodb/README.md new file mode 100644 index 0000000..05f3f1d --- /dev/null +++ b/kubernetes/namespaces/default/mongodb/README.md @@ -0,0 +1,22 @@ +# Python Discord MongoDB +This folder contains the configuration for Python Discord's MongoDB instance. + +## Volume +A 10Gi volume is provisioned on the Linode Block Storage (Retain) storage class. + +## Secrets +| Key | Value | Description | +| ---------------------------- | -------------------------- | ------------------------------- | +| `MONGO_INITDB_ROOT_USERNAME` | `pythondiscord` | Username of root user | +| `MONGO_INITDB_ROOT_PASSWORD` | Root password for database | Password for the root user | + + +## Deployment +The deployment will pull the `mongo:latest` image from DockerHub. + +It will mount the created volume at `/data/db`. + +It will expose port `27017` to connect to MongoDB. + +## Service +A service called `mongodb` will be created to give the deployment a cluster local DNS record of `mongodb.default.svc.cluster.local`. 
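For reference, a client inside the cluster (for example the Modmail bot's `CONNECTION_URI`) would connect with a URI along these lines, where the password placeholder stands for the root password held in the `mongo-credentials` secret:

```
mongodb://pythondiscord:<root password>@mongodb.default.svc.cluster.local:27017
```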
diff --git a/kubernetes/namespaces/default/mongodb/deployment.yaml b/kubernetes/namespaces/default/mongodb/deployment.yaml new file mode 100644 index 0000000..592c342 --- /dev/null +++ b/kubernetes/namespaces/default/mongodb/deployment.yaml @@ -0,0 +1,46 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mongodb +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: mongodb + template: + metadata: + labels: + app: mongodb + spec: + containers: + - name: mongodb + image: mongo:4.4 + imagePullPolicy: Always + ports: + - containerPort: 27017 + resources: + requests: + cpu: 100m + memory: 300Mi + limits: + cpu: 200m + memory: 400Mi + volumeMounts: + - mountPath: /data/db + name: mongodb-volume + - mountPath: /tmp + name: mongo-temp + envFrom: + - secretRef: + name: mongo-credentials + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: mongodb-volume + persistentVolumeClaim: + claimName: mongodb-storage + - name: mongo-temp + emptyDir: + medium: Memory diff --git a/kubernetes/namespaces/default/mongodb/secrets.yaml b/kubernetes/namespaces/default/mongodb/secrets.yaml Binary files differnew file mode 100644 index 0000000..2ca5e82 --- /dev/null +++ b/kubernetes/namespaces/default/mongodb/secrets.yaml diff --git a/kubernetes/namespaces/default/mongodb/service.yaml b/kubernetes/namespaces/default/mongodb/service.yaml new file mode 100644 index 0000000..ed14298 --- /dev/null +++ b/kubernetes/namespaces/default/mongodb/service.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Service +metadata: + name: mongodb +spec: + ports: + - port: 27017 + selector: + app: mongodb diff --git a/kubernetes/namespaces/default/mongodb/volume.yaml b/kubernetes/namespaces/default/mongodb/volume.yaml new file mode 100644 index 0000000..094e212 --- /dev/null +++ b/kubernetes/namespaces/default/mongodb/volume.yaml @@ -0,0 +1,13 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: mongodb-storage + labels: + app: mongodb +spec: + storageClassName: linode-block-storage-retain + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi diff --git a/kubernetes/namespaces/default/olli/README.md b/kubernetes/namespaces/default/olli/README.md new file mode 100644 index 0000000..ff748d5 --- /dev/null +++ b/kubernetes/namespaces/default/olli/README.md @@ -0,0 +1,13 @@ +# Olli + +This folder contains the deployment information for [Olli](https://github.com/python-discord/olli), our Loki-Discord relay. + +The deployment manifest is located within the repository. + +The rest of the configuration can be applied through `kubectl apply -f .` in this directory. + +A secret called `olli-env` is required, containing a key `WEBHOOK_URL` set to the configured Discord webhook. 
+ +| Key | Description | +| --------------| -------------------------- | +| `WEBHOOK_URL` | The Discord webhook to use | diff --git a/kubernetes/namespaces/default/olli/config.yaml b/kubernetes/namespaces/default/olli/config.yaml new file mode 100644 index 0000000..356aba8 --- /dev/null +++ b/kubernetes/namespaces/default/olli/config.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: olli-config +data: + olli.toml: | + [olli] + interval_minutes = 30 + + [loki] + api_url = "http://loki.loki.svc.cluster.local:3100/" + jobs = [ + "default/bot", + "default/site", + "default/sir-lancebot", + "default/metricity", + "default/snekbox", + "default/quackstack", + "default/thread-bot", + "default/black-knight" + ] + + [[olli.tokens]] + token = "ERROR" + color = "#ff5f5f" + + [[olli.tokens]] + token = "CRITICAL" + color = "#ff5f5f" + + [[olli.tokens]] + token = "WARN" + color = "#ffe24d" + case_sensitive = true diff --git a/kubernetes/namespaces/default/olli/secrets.yaml b/kubernetes/namespaces/default/olli/secrets.yaml Binary files differnew file mode 100644 index 0000000..75cdf23 --- /dev/null +++ b/kubernetes/namespaces/default/olli/secrets.yaml diff --git a/kubernetes/namespaces/default/patsy/README.md b/kubernetes/namespaces/default/patsy/README.md new file mode 100644 index 0000000..78e386a --- /dev/null +++ b/kubernetes/namespaces/default/patsy/README.md @@ -0,0 +1,17 @@ +# Patsy + +Patsy is the premier project for data collection in the python-discord toolchain. It uses world-class technology in a system architected by our in-house engineers to facilitate the automatic transfer, collection, and categorization of user data to develop user-centric solutions to real-world problems. It is a marvel of engineering designed to push the limits of what we thought possible. + +This folder contains the deployment for the [Patsy API](https://git.pydis.com/patsy). There is no ingress, as Patsy is designed to be accessible only from within the cluster. + +This API is given help channel messages by the bot and stores them in Postgres for after-the-fact processing. +The hope with this project is that we can inspect what topics get asked about often in help channels, along with which ones go unanswered the most. + +## Secret + +It requires a `patsy-env` secret with the following entries: + +| Key | Description | +| -------------- | ------------------------------------------------------------ | +| `DATABASE_URL` | An asyncpg connection string to the Postgres database | +| `STATE_SECRET` | A long random string, used to lock down endpoints with auth. 
| diff --git a/kubernetes/namespaces/default/patsy/deployment.yaml b/kubernetes/namespaces/default/patsy/deployment.yaml new file mode 100644 index 0000000..79fa243 --- /dev/null +++ b/kubernetes/namespaces/default/patsy/deployment.yaml @@ -0,0 +1,40 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: patsy +spec: + replicas: 0 + selector: + matchLabels: + app: patsy + template: + metadata: + labels: + app: patsy + spec: + containers: + - name: patsy + image: ghcr.io/python-discord/patsy:latest + imagePullPolicy: Always + ports: + - containerPort: 80 + name: http + envFrom: + - secretRef: + name: patsy-env + startupProbe: + httpGet: + path: /ping + port: http + httpHeaders: + - name: Host + value: patsy.pythondiscord.com + failureThreshold: 3 + periodSeconds: 1 + initialDelaySeconds: 10 + securityContext: + readOnlyRootFilesystem: true + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/patsy/secrets.yaml b/kubernetes/namespaces/default/patsy/secrets.yaml Binary files differnew file mode 100644 index 0000000..8d5c4f3 --- /dev/null +++ b/kubernetes/namespaces/default/patsy/secrets.yaml diff --git a/kubernetes/namespaces/default/patsy/service.yaml b/kubernetes/namespaces/default/patsy/service.yaml new file mode 100644 index 0000000..1844ff3 --- /dev/null +++ b/kubernetes/namespaces/default/patsy/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: patsy +spec: + selector: + app: patsy + ports: + - protocol: TCP + port: 80 + targetPort: 80 diff --git a/kubernetes/namespaces/default/pinnwand/README.md b/kubernetes/namespaces/default/pinnwand/README.md new file mode 100644 index 0000000..945b357 --- /dev/null +++ b/kubernetes/namespaces/default/pinnwand/README.md @@ -0,0 +1,8 @@ +# pinnwand +These manifests provision an instance of the pinnwand service used on https://paste.pythondiscord.com. + +A init-service is used to download the Python Discord banner logo and save it to a volume, as pinnwand expects it to be present within the image. + +## Secrets & config +This deployment expects an env var named `PINNWAND_DATABASE_URI` to exist in a secret called `pinnwand-postgres-connection`. +All other configuration can be found in `defaults-configmap.yaml`. diff --git a/kubernetes/namespaces/default/pinnwand/defaults-configmap.yaml b/kubernetes/namespaces/default/pinnwand/defaults-configmap.yaml new file mode 100644 index 0000000..96fa074 --- /dev/null +++ b/kubernetes/namespaces/default/pinnwand/defaults-configmap.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: pinnwand-config +data: + config.toml: | + # Maximum size in bytes of pastes + paste_size = 524288 + + default_selected_lexer = "python" + # List of lexers to pin to the top of the dropdown list + preferred_lexers = ["python", "autodetect", "pytb", "pycon", "text", "markdown", "restructuredtext", "sql"] + + page_list = ["about", "removal"] + footer = 'View <a href="//github.com/supakeen/pinnwand" target="_BLANK">source code</a>, <a href="/removal">removal</a> information, or read the <a href="/about">about</a> page.' + + paste_help = '''<p>Welcome to Python Discord's pastebin, powered by <a href="//github.com/supakeen/pinnwand" target="_BLANK">pinnwand</a>. It allows you to share code with others. 
If you write code in the text area below and press the paste button you will be given a link you can share with others so they can view your code as well.</p><p>People with the link can view your pasted code, only you can remove your paste and it expires automatically. Note that anyone could guess the URI to your paste so don't rely on it being private.</p>''' + expiries.30days = 2592000 + expiries.7days = 604800 + expiries.1day = 86400 + + ratelimit.read.capacity = 100 + ratelimit.read.consume = 1 + ratelimit.read.refill = 2 + + ratelimit.create.capacity = 10 # Default is 2 + ratelimit.create.consume = 1 # Default is 2 + ratelimit.create.refill = 10 # Default is 1 + + ratelimit.delete.capacity = 2 + ratelimit.delete.consume = 2 + ratelimit.delete.refill = 1 + + report_email = "[email protected]" diff --git a/kubernetes/namespaces/default/pinnwand/deployment.yaml b/kubernetes/namespaces/default/pinnwand/deployment.yaml new file mode 100644 index 0000000..2a6525a --- /dev/null +++ b/kubernetes/namespaces/default/pinnwand/deployment.yaml @@ -0,0 +1,56 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pinnwand +spec: + replicas: 1 + selector: + matchLabels: + app: pinnwand + template: + metadata: + labels: + app: pinnwand + spec: + initContainers: + - name: init-service + image: busybox:latest + command: ["wget", "https://raw.githubusercontent.com/python-discord/branding/main/logos/badge/badge_512x172.png", "-O", "/tmp/logo.png"] + volumeMounts: + - name: pinnwand-logo + mountPath: /tmp/ + containers: + - name: pinnwand + image: ghcr.io/supakeen/pinnwand:v1.5.0-psql + command: ["venv/bin/python3", "-m", "pinnwand", "-vvvvvvvv", "--configuration-path", "/config/config.toml", "http"] + imagePullPolicy: Always + resources: + requests: + cpu: 5m + memory: 70Mi + limits: + cpu: 100m + memory: 100Mi + ports: + - containerPort: 8000 + envFrom: + - secretRef: + name: pinnwand-postgres-connection + securityContext: + readOnlyRootFilesystem: true + volumeMounts: + - name: pinnwand-config + mountPath: /config/ + - name: pinnwand-logo + mountPath: /usr/app/pinnwand/static/logo.png + subPath: logo.png + volumes: + - name: pinnwand-logo + emptyDir: {} + - name: pinnwand-config + configMap: + name: pinnwand-config + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/pinnwand/ingress.yaml b/kubernetes/namespaces/default/pinnwand/ingress.yaml new file mode 100644 index 0000000..17dcb83 --- /dev/null +++ b/kubernetes/namespaces/default/pinnwand/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: pinnwand +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: paste.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: pinnwand + port: + number: 80 diff --git a/kubernetes/namespaces/default/pinnwand/secrets.yaml b/kubernetes/namespaces/default/pinnwand/secrets.yaml Binary files differnew file mode 100644 index 0000000..7fb586b --- /dev/null +++ b/kubernetes/namespaces/default/pinnwand/secrets.yaml diff --git a/kubernetes/namespaces/default/pinnwand/service.yaml b/kubernetes/namespaces/default/pinnwand/service.yaml new file mode 100644 index 
0000000..be6bc4f --- /dev/null +++ b/kubernetes/namespaces/default/pinnwand/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: pinnwand +spec: + selector: + app: pinnwand + ports: + - protocol: TCP + port: 80 + targetPort: 8000 diff --git a/kubernetes/namespaces/default/pixels-modsite/README.md b/kubernetes/namespaces/default/pixels-modsite/README.md new file mode 100644 index 0000000..ee95650 --- /dev/null +++ b/kubernetes/namespaces/default/pixels-modsite/README.md @@ -0,0 +1,13 @@ +# Pixels + +The deployment for the [Pixels modsite project](https://git.pydis.com/pixels-modsite), hosted at https://pixels-modsite.pythondiscord.com. + +This mod site will give Discord mods easy access to moderation actions for the pixels event. + +## Secret + +It requires a `pixels-modsite-env` secret with the following entries: + +| Environment | Description | +|-------------------|-----------------------------------------------------------------| +| DISCORD_BOT_TOKEN | The Discord bot token to use to check roles of users logging in | diff --git a/kubernetes/namespaces/default/pixels-modsite/deployment.yaml b/kubernetes/namespaces/default/pixels-modsite/deployment.yaml new file mode 100644 index 0000000..2b9d632 --- /dev/null +++ b/kubernetes/namespaces/default/pixels-modsite/deployment.yaml @@ -0,0 +1,31 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pixels-modsite +spec: + replicas: 1 + selector: + matchLabels: + app: pixels-modsite + template: + metadata: + labels: + app: pixels-modsite + spec: + containers: + - name: pixels-modsite + image: ghcr.io/python-discord/pixels-modsite:latest + imagePullPolicy: Always + ports: + - containerPort: 3000 + envFrom: + - secretRef: + name: pixels-modsite-env + securityContext: + readOnlyRootFilesystem: true + imagePullSecrets: + - name: ghcr-pull-secret + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/pixels-modsite/ingress.yaml b/kubernetes/namespaces/default/pixels-modsite/ingress.yaml new file mode 100644 index 0000000..80c8dbe --- /dev/null +++ b/kubernetes/namespaces/default/pixels-modsite/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: pixels-modsite +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: pixels-modsite.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: pixels-modsite + port: + number: 80 diff --git a/kubernetes/namespaces/default/pixels-modsite/secrets.yaml b/kubernetes/namespaces/default/pixels-modsite/secrets.yaml Binary files differnew file mode 100644 index 0000000..0f4d361 --- /dev/null +++ b/kubernetes/namespaces/default/pixels-modsite/secrets.yaml diff --git a/kubernetes/namespaces/default/pixels-modsite/service.yaml b/kubernetes/namespaces/default/pixels-modsite/service.yaml new file mode 100644 index 0000000..8118f09 --- /dev/null +++ b/kubernetes/namespaces/default/pixels-modsite/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: pixels-modsite +spec: + selector: + app: pixels-modsite + ports: + - protocol: TCP + port: 80 + targetPort: 3000 diff --git a/kubernetes/namespaces/default/pixels/README.md 
b/kubernetes/namespaces/default/pixels/README.md new file mode 100644 index 0000000..10e4a5d --- /dev/null +++ b/kubernetes/namespaces/default/pixels/README.md @@ -0,0 +1,25 @@ +# Pixels + +The deployment for the [Pixels project](https://git.pydis.com/pixels-v2), hosted at https://pixels.pythondiscord.com. + +## Secret + +It requires a `pixels-env` secret with the following entries: + +| Environment | Description | +|---------------|---------------------------------------------------------------------------------------------------------| +| AUTH_URL | A Discord OAuth2 URL with scopes: identify & guilds.members.read | +| BASE_URL | Where the root endpoint can be found | +| CLIENT_ID | Discord Oauth2 client ID | +| CLIENT_SECRET | Discord Oauth2 client secret | +| DATABASE_URL | Postgres database URL. | +| FORCE_LOGIN | Whether to requires authorization for all endpoints beside the login page, and limits access to helpers | +| GUILD_ID | The guild to check for user roles in | +| HELPERS_ROLE | Helpers role ID | +| JWT_SECRET | 32 byte (64 digit hex string) secret for encoding tokens. Any value can be used. | +| LOG_LEVEL | What level to log at | +| MOD_ROLE | Moderator role ID | +| PRODUCTION | Whether the app is in production | +| REDIS_URL | Redis storage URL | +| SENTRY_DSN | The Sentry DSN to send sentry events to | +| WEBHOOK_URL | The webhook to periodically post the canvas state to | diff --git a/kubernetes/namespaces/default/pixels/deployment.yaml b/kubernetes/namespaces/default/pixels/deployment.yaml new file mode 100644 index 0000000..7775216 --- /dev/null +++ b/kubernetes/namespaces/default/pixels/deployment.yaml @@ -0,0 +1,42 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: pixels +spec: + replicas: 1 + selector: + matchLabels: + app: pixels + template: + metadata: + labels: + app: pixels + spec: + containers: + - name: pixels + image: ghcr.io/python-discord/pixels:latest + imagePullPolicy: Always + ports: + - containerPort: 8000 + envFrom: + - secretRef: + name: pixels-env + startupProbe: + httpGet: + path: /health + port: 8000 + httpHeaders: + - name: Host + value: pixels.pythondiscord.com + failureThreshold: 15 + periodSeconds: 2 + timeoutSeconds: 5 + initialDelaySeconds: 10 + securityContext: + readOnlyRootFilesystem: true + imagePullSecrets: + - name: ghcr-pull-secret + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/pixels/ingress.yaml b/kubernetes/namespaces/default/pixels/ingress.yaml new file mode 100644 index 0000000..bfc0ada --- /dev/null +++ b/kubernetes/namespaces/default/pixels/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: pixels +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: pixels.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: pixels + port: + number: 80 diff --git a/kubernetes/namespaces/default/pixels/secrets.yaml b/kubernetes/namespaces/default/pixels/secrets.yaml Binary files differnew file mode 100644 index 0000000..b3c77cd --- /dev/null +++ b/kubernetes/namespaces/default/pixels/secrets.yaml diff --git a/kubernetes/namespaces/default/pixels/service.yaml 
b/kubernetes/namespaces/default/pixels/service.yaml new file mode 100644 index 0000000..41860a1 --- /dev/null +++ b/kubernetes/namespaces/default/pixels/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: pixels +spec: + selector: + app: pixels + ports: + - protocol: TCP + port: 80 + targetPort: 8000 diff --git a/kubernetes/namespaces/default/policy-bot/README.md b/kubernetes/namespaces/default/policy-bot/README.md new file mode 100644 index 0000000..ed44a63 --- /dev/null +++ b/kubernetes/namespaces/default/policy-bot/README.md @@ -0,0 +1,24 @@ +# Policy Bot +Policy Bot is our instance of [palantir/policy-bot](https://github.com/palantir/policy-bot) for managing review policy across our GitHub repositories. + +Actual review policy is stored inside our GitHub repositories in the `.github/review-policy.yml` file, so the configuration here is purely for interacting with GitHub and some lower-level things. + +## GitHub Configuration + +Follow the instructions in the [repository](https://github.com/palantir/policy-bot#deployment) to provision a GitHub application. Our manifests are configured to run the policy bot at https://policy-bot.pythondiscord.com/. + +## Secrets + +This app requires a `policy-bot-secrets` secret with the following entries: + +| Environment | Description | +|--------------------------------------------------|-----------------------------------------------------------------------| +| GITHUB_APP_PRIVATE_KEY | Contents of the PEM certificate downloadable from the GitHub App page | +| GITHUB_APP_WEBHOOK_SECRET | Webhook secret from the GitHub App page | +| GITHUB_OAUTH_CLIENT_SECRET | OAuth2 client secret from the GitHub App page | +| POLICYBOT_OPTIONS_DO_NOT_LOAD_COMMIT_PUSHED_DATE | Set to True to avoid using the deprecated commit_pushed_date from the GitHub API | +| POLICYBOT_SESSIONS_KEY | Random characters for signing user sessions | + +Run `kubectl apply -f .` inside this directory to apply the configuration. + +Access the running application at [policy-bot.pythondiscord.com](https://policy-bot.pythondiscord.com/)! diff --git a/kubernetes/namespaces/default/policy-bot/configmap.yaml b/kubernetes/namespaces/default/policy-bot/configmap.yaml new file mode 100644 index 0000000..6183948 --- /dev/null +++ b/kubernetes/namespaces/default/policy-bot/configmap.yaml @@ -0,0 +1,49 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: policy-bot-defaults +data: + policy-bot.yml: | + # Options for the http server + server: + # The listen address and port + address: "0.0.0.0" + port: 8080 + # The public URL, used for URL generation when the server is behind a proxy + public_url: https://policy-bot.pythondiscord.com/ + + # Options for logging output + logging: + # If true, logs are printed in human-readable form. We recommend using + # "false" to output JSON-formatted logs in production + text: false + # Set a minimum logging level threshold + # Choose from: debug, info, warn, error + level: debug + + # Options for connecting to GitHub + github: + # The URL of the GitHub homepage. Can also be set by the GITHUB_WEB_URL + # environment variable. + web_url: "https://github.com" + # The base URL for v3 (REST) API requests. Can also be set by the + # GITHUB_V3_API_URL environment variable. + v3_api_url: "https://api.github.com" + # The base URL for v4 (GraphQL) API requests. Can also be set by the + # GITHUB_V4_API_URL environment variable. + v4_api_url: "https://api.github.com/graphql" + app: + # The integration ID of the GitHub app. 
Can also be set by the + # GITHUB_APP_INTEGRATION_ID environment variable. + integration_id: 91554 + oauth: + # The client ID of the OAuth app associated with the GitHub app. Can also + # be set by the GITHUB_OAUTH_CLIENT_ID environment variable. + client_id: "Iv1.5be42b7c960b1ab2" + + # Options for application behavior + options: + # The path within repositories to find the policy.yml file + policy_path: .github/review-policy.yml + # The context prefix for status checks created by the bot + status_check_context: "Review requirements" diff --git a/kubernetes/namespaces/default/policy-bot/deployment.yaml b/kubernetes/namespaces/default/policy-bot/deployment.yaml new file mode 100644 index 0000000..1b3f96b --- /dev/null +++ b/kubernetes/namespaces/default/policy-bot/deployment.yaml @@ -0,0 +1,43 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: policy-bot +spec: + replicas: 1 + selector: + matchLabels: + app: policy-bot + template: + metadata: + labels: + app: policy-bot + spec: + containers: + - name: policy-bot + image: palantirtechnologies/policy-bot:latest + imagePullPolicy: Always + resources: + requests: + cpu: 50m + memory: 50Mi + limits: + cpu: 100m + memory: 100Mi + ports: + - containerPort: 8080 + volumeMounts: + - mountPath: /secrets + name: policy-bot-config + securityContext: + readOnlyRootFilesystem: true + envFrom: + - secretRef: + name: policy-bot-secrets + volumes: + - name: policy-bot-config + configMap: + name: policy-bot-defaults + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/policy-bot/ingress.yaml b/kubernetes/namespaces/default/policy-bot/ingress.yaml new file mode 100644 index 0000000..088dd01 --- /dev/null +++ b/kubernetes/namespaces/default/policy-bot/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: policy-bot +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: policy-bot.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: policy-bot + port: + number: 80 diff --git a/kubernetes/namespaces/default/policy-bot/secrets.yaml b/kubernetes/namespaces/default/policy-bot/secrets.yaml Binary files differnew file mode 100644 index 0000000..9d91e70 --- /dev/null +++ b/kubernetes/namespaces/default/policy-bot/secrets.yaml diff --git a/kubernetes/namespaces/default/policy-bot/service.yaml b/kubernetes/namespaces/default/policy-bot/service.yaml new file mode 100644 index 0000000..da0fb64 --- /dev/null +++ b/kubernetes/namespaces/default/policy-bot/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: policy-bot +spec: + selector: + app: policy-bot + ports: + - protocol: TCP + port: 80 + targetPort: 8080 diff --git a/kubernetes/namespaces/default/postgresql/README.md b/kubernetes/namespaces/default/postgresql/README.md new file mode 100644 index 0000000..07eed4f --- /dev/null +++ b/kubernetes/namespaces/default/postgresql/README.md @@ -0,0 +1,16 @@ +# Postgres +This folder contains the manifests for Postgres, our primary database. + +You can alter the configuration file inside the `configmap.yaml` file which will be injected into the database container upon boot. 
Certain parameters (marked in the file) will require a server restart whereas others can be reloaded by using `SELECT pg_reload_conf();` inside Postgres. + +Note that there may be up to a minute before your changes to the ConfigMap are reflected inside the container, if things do not change after that you will have to restart the server for the configuration to apply. + +## Secrets + +postgres requires a `postgres-env` secret with the following entries: + +| Environment | Description | +|-------------------|-----------------------------------| +| PGDATA | The path to the pg_data directory | +| POSTGRES_PASSWORD | The default password to use | +| POSTGRES_USER | The default user to use | diff --git a/kubernetes/namespaces/default/postgresql/configmap.yaml b/kubernetes/namespaces/default/postgresql/configmap.yaml new file mode 100644 index 0000000..a2c9a4e --- /dev/null +++ b/kubernetes/namespaces/default/postgresql/configmap.yaml @@ -0,0 +1,28 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: postgres-config +data: + postgresql.conf: | + # From pgtune + max_connections = 110 + shared_buffers = 1536MB + effective_cache_size = 4608MB + maintenance_work_mem = 384MB + checkpoint_completion_target = 0.9 + wal_buffers = 16MB + default_statistics_target = 100 + random_page_cost = 1.1 + effective_io_concurrency = 200 + work_mem = 3574kB + min_wal_size = 1GB + max_wal_size = 4GB + max_worker_processes = 4 + max_parallel_workers_per_gather = 2 + max_parallel_workers = 4 + max_parallel_maintenance_workers = 2 + + # Custom conf + listen_addresses = '*' + password_encryption = md5 + shared_preload_libraries = 'pg_stat_statements' # (change requires restart) diff --git a/kubernetes/namespaces/default/postgresql/deployment.yaml b/kubernetes/namespaces/default/postgresql/deployment.yaml new file mode 100644 index 0000000..cfe6101 --- /dev/null +++ b/kubernetes/namespaces/default/postgresql/deployment.yaml @@ -0,0 +1,46 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: postgres + template: + metadata: + labels: + app: postgres + spec: + containers: + - name: postgres + image: ghcr.io/chrislovering/psql_extended:15 + imagePullPolicy: "Always" + ports: + - name: postgres + containerPort: 5432 + envFrom: + - secretRef: + name: postgres-env + args: ["-c", "config_file=/etc/postgresql/postgresql.conf"] + volumeMounts: + - name: postgres-data + mountPath: /var/lib/postgresql/data + subPath: pgdata + - name: postgres-config + mountPath: /etc/postgresql + - name: dshm + mountPath: /dev/shm + volumes: + - name: postgres-data + persistentVolumeClaim: + claimName: postgres-storage + - name: postgres-config + configMap: + name: postgres-config + - name: dshm + emptyDir: + medium: Memory + sizeLimit: 2Gi diff --git a/kubernetes/namespaces/default/postgresql/secrets.yaml b/kubernetes/namespaces/default/postgresql/secrets.yaml Binary files differnew file mode 100644 index 0000000..902f7d5 --- /dev/null +++ b/kubernetes/namespaces/default/postgresql/secrets.yaml diff --git a/kubernetes/namespaces/default/postgresql/service.yaml b/kubernetes/namespaces/default/postgresql/service.yaml new file mode 100644 index 0000000..f69deae --- /dev/null +++ b/kubernetes/namespaces/default/postgresql/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: postgres + name: postgres +spec: + ports: + - port: 5432 + selector: + app: postgres diff --git 
a/kubernetes/namespaces/default/postgresql/volume.yaml b/kubernetes/namespaces/default/postgresql/volume.yaml new file mode 100644 index 0000000..53108ce --- /dev/null +++ b/kubernetes/namespaces/default/postgresql/volume.yaml @@ -0,0 +1,13 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: postgres-storage + labels: + app: postgres +spec: + storageClassName: linode-block-storage-retain + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 25Gi diff --git a/kubernetes/namespaces/default/prestashop/README.md b/kubernetes/namespaces/default/prestashop/README.md new file mode 100644 index 0000000..183f87c --- /dev/null +++ b/kubernetes/namespaces/default/prestashop/README.md @@ -0,0 +1,12 @@ +# Prestashop + +This folder contains the ingress and values.yaml file for the deployment of Prestashop, used for our merch store. It additionally deploys MariaDB, used for data storage. + +## Deployment + +``` +$ helm repo add bitnami https://charts.bitnami.com/bitnami +$ helm install prestashop bitnami/prestashop -f prestashop/values.yaml --set prestashopPassword=<admin passsword>,mariadb.auth.rootPassword=<database password>,smtpPassword=<password from mailgun> +``` + +The Helm chart can be located [here](https://github.com/bitnami/charts/tree/master/bitnami/prestashop), including all available parameters. diff --git a/kubernetes/namespaces/default/prestashop/ingress.yaml b/kubernetes/namespaces/default/prestashop/ingress.yaml new file mode 100644 index 0000000..c4abc58 --- /dev/null +++ b/kubernetes/namespaces/default/prestashop/ingress.yaml @@ -0,0 +1,37 @@ +apiVersion: v1 +kind: Service +metadata: + name: prestashop-svc +spec: + selector: + app.kubernetes.io/instance: prestashop + app.kubernetes.io/name: prestashop + ports: + - protocol: TCP + port: 80 + targetPort: 8080 +--- +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: merch +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: merch.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prestashop-svc + port: + number: 80 diff --git a/kubernetes/namespaces/default/prestashop/values.yaml b/kubernetes/namespaces/default/prestashop/values.yaml new file mode 100644 index 0000000..865393a --- /dev/null +++ b/kubernetes/namespaces/default/prestashop/values.yaml @@ -0,0 +1,49 @@ +prestashopHost: merch.pythondiscord.com + +prestashopUsername: [email protected] +prestashopPassword: +prestashopEmail: [email protected] +prestashopFirstName: Joe +prestashopLastName: Banks + +allowEmptyPassword: false + +smtpHost: smtp.mailgun.org +smtpPort: 587 +smtpUser: [email protected] +smtpProtocol: tls + +podAnnotations: + prometheus.io/scrape: "false" + +mariadb: + primary: + persistence: + enabled: true + storageClass: linode-block-storage-retain + size: 10Gi + +containerPorts: + http: 8080 + https: 8443 + +updateStrategy: + type: Recreate + +service: + type: ClusterIP + +ingress: + enabled: false + +persistence: + enabled: true + storageClass: "linode-block-storage-retain" + + size: 10Gi + +metrics: + enabled: false + + podAnnotations: + prometheus.io/scrape: "false" diff --git a/kubernetes/namespaces/default/public-stats/README.md 
b/kubernetes/namespaces/default/public-stats/README.md new file mode 100644 index 0000000..2654eaa --- /dev/null +++ b/kubernetes/namespaces/default/public-stats/README.md @@ -0,0 +1,7 @@ +# Public Stats + +Python Discord Public Stats is the public stats portal for viewing server statistics at https://stats.pythondiscord.com/ + +The deployment manifest is located in the [python-discord/public-stats](https://github.com/python-discord/public-stats/blob/master/deployment.yaml) repo. + +To apply the service and ingress run `kubectl apply -f .` in this folder. diff --git a/kubernetes/namespaces/default/public-stats/ingress.yaml b/kubernetes/namespaces/default/public-stats/ingress.yaml new file mode 100644 index 0000000..c620eb2 --- /dev/null +++ b/kubernetes/namespaces/default/public-stats/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: public-stats +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: stats.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: public-stats + port: + number: 8000 diff --git a/kubernetes/namespaces/default/public-stats/service.yaml b/kubernetes/namespaces/default/public-stats/service.yaml new file mode 100644 index 0000000..d52055f --- /dev/null +++ b/kubernetes/namespaces/default/public-stats/service.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Service +metadata: + name: public-stats +spec: + ports: + - port: 8000 + selector: + app: public-stats diff --git a/kubernetes/namespaces/default/quackstack/README.md b/kubernetes/namespaces/default/quackstack/README.md new file mode 100644 index 0000000..c8b8a92 --- /dev/null +++ b/kubernetes/namespaces/default/quackstack/README.md @@ -0,0 +1,7 @@ +# QuackStack + +The deployment for the [QuackStack](https://github.com/python-discord/quackstack) project, hosted at https://quackstack.pythondiscord.com. + +This project doesn't need any configuration right now. + +To deploy this application run `kubectl apply -f .` from this directory. This will create a deployment, service and ingress. 
diff --git a/kubernetes/namespaces/default/quackstack/deployment.yaml b/kubernetes/namespaces/default/quackstack/deployment.yaml new file mode 100644 index 0000000..15747c2 --- /dev/null +++ b/kubernetes/namespaces/default/quackstack/deployment.yaml @@ -0,0 +1,20 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: quackstack +spec: + replicas: 1 + selector: + matchLabels: + app: quackstack + template: + metadata: + labels: + app: quackstack + spec: + containers: + - name: quackstack + image: ghcr.io/python-discord/quackstack:main + imagePullPolicy: Always + ports: + - containerPort: 80 diff --git a/kubernetes/namespaces/default/quackstack/ingress.yaml b/kubernetes/namespaces/default/quackstack/ingress.yaml new file mode 100644 index 0000000..624e153 --- /dev/null +++ b/kubernetes/namespaces/default/quackstack/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: quackstack +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: quackstack.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: quackstack + port: + number: 80 diff --git a/kubernetes/namespaces/default/quackstack/service.yaml b/kubernetes/namespaces/default/quackstack/service.yaml new file mode 100644 index 0000000..86926c5 --- /dev/null +++ b/kubernetes/namespaces/default/quackstack/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: quackstack +spec: + selector: + app: quackstack + ports: + - protocol: TCP + port: 80 + targetPort: 80 diff --git a/kubernetes/namespaces/default/redirects/README.md b/kubernetes/namespaces/default/redirects/README.md new file mode 100644 index 0000000..cbaf102 --- /dev/null +++ b/kubernetes/namespaces/default/redirects/README.md @@ -0,0 +1,8 @@ +# Redirects +Some of our subdomains point to an external service; for example, https://git.pythondiscord.com/ points towards our GitHub organisation. + +This folder contains all the redirects for our subdomains. + +Each redirect consists of an Ingress that handles the redirection through rewrite annotations. + +To deploy these routes, simply run `kubectl apply -f .` in this folder. 
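Adding a new redirect is a matter of copying one of the manifests below and changing the rewrite target and host. A hypothetical sketch, where `example-redirect`, `example.pythondiscord.com` and the target URL are placeholders, and the backend service name simply mirrors the existing manifests:

```
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  annotations:
    nginx.ingress.kubernetes.io/auth-tls-verify-client: "on"
    nginx.ingress.kubernetes.io/rewrite-target: "https://example.org/$1"
    nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle"
    nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
  name: example-redirect
spec:
  tls:
    - hosts:
        - "*.pythondiscord.com"
      secretName: pythondiscord.com-tls
  rules:
    - host: example.pythondiscord.com
      http:
        paths:
          - path: /(.*)
            pathType: Prefix
            backend:
              service:
                name: site
                port:
                  number: 80
```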
diff --git a/kubernetes/namespaces/default/redirects/github.yaml b/kubernetes/namespaces/default/redirects/github.yaml new file mode 100644 index 0000000..130b0dd --- /dev/null +++ b/kubernetes/namespaces/default/redirects/github.yaml @@ -0,0 +1,25 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/rewrite-target: "https://github.com/python-discord/$1" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: github-redirect +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: git.pythondiscord.com + http: + paths: + - path: /(.*) + pathType: Prefix + backend: + service: + name: site + port: + number: 80 diff --git a/kubernetes/namespaces/default/redirects/paypal.yaml b/kubernetes/namespaces/default/redirects/paypal.yaml new file mode 100644 index 0000000..d84afa1 --- /dev/null +++ b/kubernetes/namespaces/default/redirects/paypal.yaml @@ -0,0 +1,25 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/rewrite-target: "https://www.paypal.com/paypalme/pythondiscord" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: paypal-redirect +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: paypal.pythondiscord.com + http: + paths: + - path: /(.*) + pathType: Prefix + backend: + service: + name: site + port: + number: 80 diff --git a/kubernetes/namespaces/default/redirects/sentry.yaml b/kubernetes/namespaces/default/redirects/sentry.yaml new file mode 100644 index 0000000..c4ad8e6 --- /dev/null +++ b/kubernetes/namespaces/default/redirects/sentry.yaml @@ -0,0 +1,25 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/rewrite-target: "https://sentry.io/organizations/python-discord/issues/" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: sentry-redirect +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: sentry.pythondiscord.com + http: + paths: + - path: /(.*) + pathType: Prefix + backend: + service: + name: site + port: + number: 80 diff --git a/kubernetes/namespaces/default/redis/README.md b/kubernetes/namespaces/default/redis/README.md new file mode 100644 index 0000000..d496758 --- /dev/null +++ b/kubernetes/namespaces/default/redis/README.md @@ -0,0 +1,34 @@ +# Python Discord Redis +This folder contains the configuration for Python Discord's Redis instance. + +## ConfigMap +**We'll need to create a ConfigMap for this service, which will hold the `redis.conf` configuration.** + +Do the following: +1. Make a copy of `redis.conf.template` called `redis.conf` +2. Edit your `redis.conf` to replace `<INSERT PASSWORD>` with the password you'd like your redis instance to use. +3. Use `kubectl create configmap redis-conf --from-file=redis.conf` to create the ConfigMap +4. Delete the `redis.conf`. 
**We don't wanna commit that password anywhere!** + +## Volume +A 10Gi volume is provisioned on the Linode Block Storage (Retain) storage class. + +## Deployment +The deployment will pull the `redis:latest` image from DockerHub. + +It will mount the created volume at `/data`. + +It will expose port `6379` to connect to Redis. + +## Service +A service called `redis` will be created to give the deployment a cluster local DNS record of `redis.default.svc.cluster.local`. + +## Secrets + +Redis requires a `redis-credentials` secret with the following entries: + +| Environment | Description | +|----------------|---------------------------------------| +| REDIS_HOST | The host redis is running on | +| REDIS_PASSWORD | The password to connect to redis with | +| REDIS_PORT | The port redis is listening on | diff --git a/kubernetes/namespaces/default/redis/configmap.yaml b/kubernetes/namespaces/default/redis/configmap.yaml new file mode 100644 index 0000000..2a2f23e --- /dev/null +++ b/kubernetes/namespaces/default/redis/configmap.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: redis-conf + namespace: default +data: + redis.conf: | + # Store all commands used and replay on server startup + appendonly yes + + # Set working directory + dir /data + + # Set a memory maximum + maxmemory 1gb diff --git a/kubernetes/namespaces/default/redis/deployment.yaml b/kubernetes/namespaces/default/redis/deployment.yaml new file mode 100644 index 0000000..ef5d68c --- /dev/null +++ b/kubernetes/namespaces/default/redis/deployment.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis +spec: + replicas: 1 + strategy: + type: Recreate + selector: + matchLabels: + app: redis + template: + metadata: + labels: + app: redis + spec: + containers: + - name: redis + image: redis:latest + command: + - redis-server + args: + - /config/redis.conf + - --requirepass + - $(REDIS_PASSWORD) + imagePullPolicy: Always + resources: + requests: + cpu: 50m + memory: 100Mi + limits: + cpu: 100m + memory: 150Mi + ports: + - containerPort: 6379 + envFrom: + - secretRef: + name: redis-credentials + volumeMounts: + - name: redis-data-volume + mountPath: /data # Must match the dir in the redis.conf + - name: redis-config-volume + mountPath: /config + securityContext: + readOnlyRootFilesystem: true + + volumes: + - name: redis-data-volume + persistentVolumeClaim: + claimName: redis-storage + - name: redis-config-volume + configMap: + name: redis-conf + + securityContext: + fsGroup: 1000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/redis/redis.conf.template b/kubernetes/namespaces/default/redis/redis.conf.template new file mode 100644 index 0000000..6d8eeac --- /dev/null +++ b/kubernetes/namespaces/default/redis/redis.conf.template @@ -0,0 +1,11 @@ +# Store all commands used and replay on server startup +appendonly yes + +# Set password +requirepass <INSERT PASSWORD> + +# Set working directory +dir /data + +# Set a memory maximum +maxmemory 1gb diff --git a/kubernetes/namespaces/default/redis/secrets.yaml b/kubernetes/namespaces/default/redis/secrets.yaml Binary files differnew file mode 100644 index 0000000..29e4c15 --- /dev/null +++ b/kubernetes/namespaces/default/redis/secrets.yaml diff --git a/kubernetes/namespaces/default/redis/service.yaml b/kubernetes/namespaces/default/redis/service.yaml new file mode 100644 index 0000000..0be72e8 --- /dev/null +++ b/kubernetes/namespaces/default/redis/service.yaml @@ -0,0 +1,9 @@ +apiVersion: v1 +kind: Service 
+metadata: + name: redis +spec: + ports: + - port: 6379 # Redis default port + selector: + app: redis diff --git a/kubernetes/namespaces/default/redis/volume.yaml b/kubernetes/namespaces/default/redis/volume.yaml new file mode 100644 index 0000000..6522ea6 --- /dev/null +++ b/kubernetes/namespaces/default/redis/volume.yaml @@ -0,0 +1,13 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: redis-storage + labels: + app: redis +spec: + storageClassName: linode-block-storage-retain + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 10Gi diff --git a/kubernetes/namespaces/default/sir-lancebot/README.md b/kubernetes/namespaces/default/sir-lancebot/README.md new file mode 100644 index 0000000..293a955 --- /dev/null +++ b/kubernetes/namespaces/default/sir-lancebot/README.md @@ -0,0 +1,29 @@ +## Sir Lancebot +``` +Oh brave Sir Lancebot! + +Whereat he turned and stood with folded arms and numerous antennae, +"Why frown upon a friend? Few live that have too many." +A weary-waiting optical array, now calibrated to a sad wrath. +Hereafter, thus t'was with him that we hath forged our path. +``` + +## Secrets +This deployment expects the environment variables shown below to exist in a secret called `sir-lancebot-env`. The bot also relies on Redis credentials being available in a secret named `redis-credentials`. + + +| Environment | Description | +|---------------------------|------------------------------------------| +| BOT_SENTRY_DSN | The DSN for the Sentry project. | +| CLIENT_DEBUG | Should the bot start in DEBUG mode? | +| CLIENT_TOKEN | The bot token to run the bot on. | +| LATEX_API_URL | The URL that the LaTeX API is served from | +| TOKENS_GIPHY | API key for Giphy. | +| TOKENS_GITHUB | GitHub access token, for Hacktoberstats. | +| TOKENS_IGDB_CLIENT_ID | IGDB client ID, used to find games. | +| TOKENS_IGDB_CLIENT_SECRET | IGDB client secret, used to find games. | +| TOKENS_NASA | API key for NASA. | +| TOKENS_TMDB | Token for TMDB. Used for scarymovie.py. | +| TOKENS_UNSPLASH | Token for Unsplash. | +| TOKENS_YOUTUBE | API key for YouTube. | +| WOLFRAM_KEY | API key for Wolfram Alpha. 
| diff --git a/kubernetes/namespaces/default/sir-lancebot/deployment.yaml b/kubernetes/namespaces/default/sir-lancebot/deployment.yaml new file mode 100644 index 0000000..826af5b --- /dev/null +++ b/kubernetes/namespaces/default/sir-lancebot/deployment.yaml @@ -0,0 +1,51 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sir-lancebot +spec: + replicas: 1 + selector: + matchLabels: + app: sir-lancebot + template: + metadata: + labels: + app: sir-lancebot + spec: + containers: + - name: sir-lancebot + image: ghcr.io/python-discord/sir-lancebot:latest + imagePullPolicy: Always + resources: + requests: + cpu: 400m + memory: 100Mi + limits: + cpu: 500m + memory: 200Mi + envFrom: + - secretRef: + name: sir-lancebot-env + - secretRef: + name: redis-credentials + securityContext: + readOnlyRootFilesystem: true + volumeMounts: + - name: lancebot-data-vol + mountPath: /bot/bot/exts/fun/_latex_cache + - name: lancebot-logs-vol + mountPath: /bot/bot/log + - name: lancebot-tmp-vol + mountPath: /tmp + volumes: + - name: lancebot-data-vol + emptyDir: {} + - name: lancebot-logs-vol + emptyDir: {} + - name: lancebot-tmp-vol + emptyDir: + medium: Memory + securityContext: + fsGroup: 1000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/sir-lancebot/secrets.yaml b/kubernetes/namespaces/default/sir-lancebot/secrets.yaml Binary files differnew file mode 100644 index 0000000..a54c729 --- /dev/null +++ b/kubernetes/namespaces/default/sir-lancebot/secrets.yaml diff --git a/kubernetes/namespaces/default/sir-robin/README.md b/kubernetes/namespaces/default/sir-robin/README.md new file mode 100644 index 0000000..652cb53 --- /dev/null +++ b/kubernetes/namespaces/default/sir-robin/README.md @@ -0,0 +1,17 @@ +## Sir-Robin +These are the deployment files for @Sir-Robin, the not-quite-so-bot as Sir Lancebot and our humble events bot. +He is tasked with dealing with all the things that the event team can throw at him! + +## Secrets +This deployment expects a number of environment variables to exist in a secret called `sir-robin-env`. The bot also relies on Redis credentials being available in a secret named `redis-credentials`. + +| Environment | Description | +|---------------------------|-----------------------------------------------| +| AOC_LEADERBOARDS | A list of all AOC leaderboards to use | +| AOC_STAFF_LEADERBOARD_ID | The staff AOC leaderboard. | +| AOC_YEAR | The current year to use for AOC | +| BOT_DEBUG | Whether debug is enabled (true/false) | +| BOT_TOKEN | The bot token to run the bot on. | +| CODE_JAM_API_KEY | The API key to the code jam management system | +| SITE_API_TOKEN | The token to access the site API. | +| SITE_URL | The base URL for our website. 
| diff --git a/kubernetes/namespaces/default/sir-robin/deployment.yaml b/kubernetes/namespaces/default/sir-robin/deployment.yaml new file mode 100644 index 0000000..dc2a839 --- /dev/null +++ b/kubernetes/namespaces/default/sir-robin/deployment.yaml @@ -0,0 +1,36 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: sir-robin +spec: + replicas: 1 + selector: + matchLabels: + app: sir-robin + template: + metadata: + labels: + app: sir-robin + spec: + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + containers: + - name: sir-robin + image: ghcr.io/python-discord/sir-robin:latest + imagePullPolicy: Always + resources: + requests: + cpu: 500m + memory: 300Mi + limits: + cpu: 750m + memory: 500Mi + envFrom: + - secretRef: + name: sir-robin-env + - secretRef: + name: redis-credentials + securityContext: + readOnlyRootFilesystem: true diff --git a/kubernetes/namespaces/default/sir-robin/secrets.yaml b/kubernetes/namespaces/default/sir-robin/secrets.yaml Binary files differnew file mode 100644 index 0000000..d40e958 --- /dev/null +++ b/kubernetes/namespaces/default/sir-robin/secrets.yaml diff --git a/kubernetes/namespaces/default/site/README.md b/kubernetes/namespaces/default/site/README.md new file mode 100644 index 0000000..3eff711 --- /dev/null +++ b/kubernetes/namespaces/default/site/README.md @@ -0,0 +1,25 @@ +# Python Discord Site +This folder contains the manifests for Python Discord site. + +## Serving static files + +Usually, a web server like `nginx` should be deployed and configured to serve static files needed by Django. Then we'd put an `Ingress` +rule to route traffic to the `STATIC_URL` to that webserver. +Check the [official docs](https://docs.djangoproject.com/en/4.2/howto/static-files/deployment/) for more info. + +In this setup, we do it differently thanks to [WhiteNoise](https://whitenoise.readthedocs.io/en/stable/base.html#), which sets up +a middleware that handles the caching, compression and serving of the static files for us. + +## Secrets + +The deployment expects the following secrets to be available in `site-env`: + +| Environment | Description | +|-----------------------|------------------------------------------------------------| +| DATABASE_URL | The URL for the Postgresql database. | +| GITHUB_APP_ID | The ID of a GitHub Application (related to the above key). | +| GITHUB_APP_KEY | A PEM key for a GitHub Application. | +| GITHUB_TOKEN | An API key to the Github API | +| METRICITY_DB_URL | The URL for the Metricity database. | +| SECRET_KEY | Secret key for Django. | +| SITE_DSN | The Sentry Data Source Name. 
| diff --git a/kubernetes/namespaces/default/site/deployment.yaml b/kubernetes/namespaces/default/site/deployment.yaml new file mode 100644 index 0000000..2f88af8 --- /dev/null +++ b/kubernetes/namespaces/default/site/deployment.yaml @@ -0,0 +1,72 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: site +spec: + replicas: 2 + selector: + matchLabels: + app: site + template: + metadata: + labels: + app: site + spec: + containers: + - name: site + image: ghcr.io/python-discord/site:latest + imagePullPolicy: Always + ports: + - containerPort: 8000 + livenessProbe: + httpGet: + path: / + port: 8000 + httpHeaders: + - name: Host + value: pythondiscord.com + failureThreshold: 2 + periodSeconds: 30 + timeoutSeconds: 5 + initialDelaySeconds: 10 + startupProbe: + httpGet: + path: / + port: 8000 + httpHeaders: + - name: Host + value: pythondiscord.com + failureThreshold: 15 + periodSeconds: 2 + timeoutSeconds: 5 + initialDelaySeconds: 10 + resources: + limits: + cpu: 500m + memory: 1000Mi + requests: + cpu: 250m + memory: 400Mi + env: + # Needs to match with the variable name being read in django-prometheus + # https://github.com/korfuri/django-prometheus/blob/434a3ba36bdada45c9633451f5f6cfd145814ccf/django_prometheus/exports.py#L119 + - name: prometheus_multiproc_dir + value: /tmp + envFrom: + - secretRef: + name: site-env + volumeMounts: + # Used for `gunicorn` worker heartbeats as well as the Prometheus + # client library's multiprocessing mode. + - name: django-tmp + mountPath: /tmp + securityContext: + readOnlyRootFilesystem: true + volumes: + - name: django-tmp + emptyDir: + medium: Memory + securityContext: + fsGroup: 1000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/default/site/ingress.yaml b/kubernetes/namespaces/default/site/ingress.yaml new file mode 100644 index 0000000..9f12daf --- /dev/null +++ b/kubernetes/namespaces/default/site/ingress.yaml @@ -0,0 +1,29 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + nginx.ingress.kubernetes.io/server-snippet: | + location ~* /metrics { + deny all; + return 403; + } + name: site +spec: + tls: + - hosts: + - "*.pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: www.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: site + port: + number: 80 diff --git a/kubernetes/namespaces/default/site/redirect.yaml b/kubernetes/namespaces/default/site/redirect.yaml new file mode 100644 index 0000000..33cf7d2 --- /dev/null +++ b/kubernetes/namespaces/default/site/redirect.yaml @@ -0,0 +1,28 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/server-snippet: | + location ~* / { + return 308 https://www.pythondiscord.com$request_uri; + } + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: www-redirect +spec: + tls: + - hosts: + - "pythondiscord.com" + secretName: pythondiscord.com-tls + rules: + - host: pythondiscord.com + http: + paths: + - path: /(.*) + pathType: Prefix + backend: + service: + name: site + port: + number: 80 diff --git 
a/kubernetes/namespaces/default/site/secrets.yaml b/kubernetes/namespaces/default/site/secrets.yaml Binary files differnew file mode 100644 index 0000000..1e42248 --- /dev/null +++ b/kubernetes/namespaces/default/site/secrets.yaml diff --git a/kubernetes/namespaces/default/site/service.yaml b/kubernetes/namespaces/default/site/service.yaml new file mode 100644 index 0000000..4f06394 --- /dev/null +++ b/kubernetes/namespaces/default/site/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: site + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8000" +spec: + selector: + app: site + ports: + - protocol: TCP + port: 80 + targetPort: 8000 diff --git a/kubernetes/namespaces/default/snekbox-forms/README.md b/kubernetes/namespaces/default/snekbox-forms/README.md new file mode 100644 index 0000000..4514032 --- /dev/null +++ b/kubernetes/namespaces/default/snekbox-forms/README.md @@ -0,0 +1,5 @@ +# Snekbox-forms + +This folder contains manifests for a Snekbox service specific to the forms project. This instance has no 3rd party libs installed, unlike regular snekbox, so submissions via forms can only use the stdlib. + +The deployment manifest for this service is based on in manifest found inside the snekbox repository at [python-discord/snekbox](https://github.com/python-discord/snekbox), modified only by removing the volume mount, and 3rd party dep installation script. diff --git a/kubernetes/namespaces/default/snekbox-forms/deployment.yaml b/kubernetes/namespaces/default/snekbox-forms/deployment.yaml new file mode 100644 index 0000000..2464249 --- /dev/null +++ b/kubernetes/namespaces/default/snekbox-forms/deployment.yaml @@ -0,0 +1,69 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: snekbox-forms +spec: + replicas: 1 + selector: + matchLabels: + app: snekbox-forms + template: + metadata: + labels: + app: snekbox-forms + spec: + initContainers: + - name: init-service + image: busybox:latest + command: ["/bin/sh", "-c"] + args: + - > + wget https://files.pydis.wtf/33649562-5739-4f23-9219-28fc236b86bc/random.png -O /tmp/cj-inputs/random.png; + wget https://files.pydis.wtf/33649562-5739-4f23-9219-28fc236b86bc/shuffled1_order.txt -O /tmp/cj-inputs/shuffled1_order.txt; + wget https://files.pydis.wtf/33649562-5739-4f23-9219-28fc236b86bc/shuffled1.png -O /tmp/cj-inputs/shuffled1.png; + wget https://files.pydis.wtf/33649562-5739-4f23-9219-28fc236b86bc/shuffled4_order.txt -O /tmp/cj-inputs/shuffled4_order.txt; + wget https://files.pydis.wtf/33649562-5739-4f23-9219-28fc236b86bc/shuffled4.png -O /tmp/cj-inputs/shuffled4.png; + wget https://files.pydis.wtf/33649562-5739-4f23-9219-28fc236b86bc/shuffled8_order.txt -O /tmp/cj-inputs/shuffled8_order.txt; + wget https://files.pydis.wtf/33649562-5739-4f23-9219-28fc236b86bc/shuffled8.png -O /tmp/cj-inputs/shuffled8.png; + wget https://raw.githubusercontent.com/python-discord/snekbox/main/config/snekbox.cfg -O /tmp/config/snekbox.cfg; + echo -e 'mount {\n src:"/snekbox/cj-inputs"\n dst:"/cj-inputs"\n is_bind: true\n rw: false}' >> /tmp/config/snekbox.cfg; + volumeMounts: + - name: snekbox-cj-inputs + mountPath: /tmp/cj-inputs/ + - name: snekbox-forms-config + mountPath: /tmp/config/ + containers: + - name: snekbox-forms + image: ghcr.io/python-discord/snekbox:latest + imagePullPolicy: Always + ports: + - containerPort: 8060 + securityContext: + privileged: true + volumeMounts: + - name: snekbox-forms-user-base-volume + mountPath: /snekbox/user_base + - name: snekbox-cj-inputs + mountPath: /snekbox/cj-inputs + 
- name: snekbox-forms-config + mountPath: /snekbox/config/snekbox.cfg + subPath: snekbox.cfg + lifecycle: + postStart: + exec: + command: + - "/bin/sh" + - "-c" + - >- + PYTHONUSERBASE=/snekbox/user_base + pip install --user --upgrade + pip + pillow==10.0.0 + opencv-python-headless==4.8.0.74 + volumes: + - name: snekbox-forms-user-base-volume + emptyDir: {} + - name: snekbox-cj-inputs + emptyDir: {} + - name: snekbox-forms-config + emptyDir: {} diff --git a/kubernetes/namespaces/default/snekbox-forms/service.yaml b/kubernetes/namespaces/default/snekbox-forms/service.yaml new file mode 100644 index 0000000..99937eb --- /dev/null +++ b/kubernetes/namespaces/default/snekbox-forms/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: snekbox-forms +spec: + selector: + app: snekbox-forms + ports: + - protocol: TCP + port: 80 + targetPort: 8060 diff --git a/kubernetes/namespaces/default/snekbox/README.md b/kubernetes/namespaces/default/snekbox/README.md new file mode 100644 index 0000000..bd7446b --- /dev/null +++ b/kubernetes/namespaces/default/snekbox/README.md @@ -0,0 +1,5 @@ +# Snekbox + +This folder contains manifests for Snekbox service. + +The actual snekbox deployment manifest can be found inside the snekbox repository at [python-discord/snekbox](https://github.com/python-discord/snekbox). diff --git a/kubernetes/namespaces/default/snekbox/service.yaml b/kubernetes/namespaces/default/snekbox/service.yaml new file mode 100644 index 0000000..9ae20b0 --- /dev/null +++ b/kubernetes/namespaces/default/snekbox/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: snekbox +spec: + selector: + app: snekbox + ports: + - protocol: TCP + port: 80 + targetPort: 8060 diff --git a/kubernetes/namespaces/kube-system/nginx/README.md b/kubernetes/namespaces/kube-system/nginx/README.md new file mode 100644 index 0000000..f940702 --- /dev/null +++ b/kubernetes/namespaces/kube-system/nginx/README.md @@ -0,0 +1,7 @@ +# NGINX + +NGINX ingress is our ingress controller for all PyDis web properties. + +This directory contains resources for the Helm chart we use to deploy. + +Documentation for deploying nginx-ingress with Helm is located [here](https://kubernetes.github.io/ingress-nginx/deploy/#using-helm), the chart is located [here](https://github.com/kubernetes/ingress-nginx/tree/main/charts/ingress-nginx). 
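As a rough sketch of the Helm invocation this implies, using the `values.yaml` kept next to that README: the `kube-system` namespace and the `ingress-nginx` release name line up with what the internal Service defined just below selects on, but treat both as assumptions rather than something pinned in this directory.

```sh
# Sketch only: release name and target namespace are assumptions.
helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx
helm repo update
helm upgrade --install ingress-nginx ingress-nginx/ingress-nginx \
  --namespace kube-system \
  -f values.yaml
```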
diff --git a/kubernetes/namespaces/kube-system/nginx/internal-svc.yaml b/kubernetes/namespaces/kube-system/nginx/internal-svc.yaml new file mode 100644 index 0000000..636404a --- /dev/null +++ b/kubernetes/namespaces/kube-system/nginx/internal-svc.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + name: ingress-nginx-internal + namespace: kube-system +spec: + selector: + app.kubernetes.io/instance: ingress-nginx + ports: + - protocol: TCP + port: 80 + name: http + targetPort: 80 + - protocol: TCP + port: 443 + name: https + targetPort: 443 diff --git a/kubernetes/namespaces/kube-system/nginx/mtls/Makefile b/kubernetes/namespaces/kube-system/nginx/mtls/Makefile new file mode 100644 index 0000000..3ee6b5f --- /dev/null +++ b/kubernetes/namespaces/kube-system/nginx/mtls/Makefile @@ -0,0 +1,10 @@ +certs: + cat *.pem > ca.crt + +secret: + kubectl create secret -n kube-system generic mtls-client-crt-bundle --from-file=ca.crt=ca.crt + +all: certs secret + +delete: + kubectl delete secret -n kube-system mtls-client-crt-bundle diff --git a/kubernetes/namespaces/kube-system/nginx/mtls/ca.crt b/kubernetes/namespaces/kube-system/nginx/mtls/ca.crt new file mode 100644 index 0000000..f1567c5 --- /dev/null +++ b/kubernetes/namespaces/kube-system/nginx/mtls/ca.crt @@ -0,0 +1,55 @@ +-----BEGIN CERTIFICATE----- +MIIGCjCCA/KgAwIBAgIIV5G6lVbCLmEwDQYJKoZIhvcNAQENBQAwgZAxCzAJBgNV +BAYTAlVTMRkwFwYDVQQKExBDbG91ZEZsYXJlLCBJbmMuMRQwEgYDVQQLEwtPcmln +aW4gUHVsbDEWMBQGA1UEBxMNU2FuIEZyYW5jaXNjbzETMBEGA1UECBMKQ2FsaWZv +cm5pYTEjMCEGA1UEAxMab3JpZ2luLXB1bGwuY2xvdWRmbGFyZS5uZXQwHhcNMTkx +MDEwMTg0NTAwWhcNMjkxMTAxMTcwMDAwWjCBkDELMAkGA1UEBhMCVVMxGTAXBgNV +BAoTEENsb3VkRmxhcmUsIEluYy4xFDASBgNVBAsTC09yaWdpbiBQdWxsMRYwFAYD +VQQHEw1TYW4gRnJhbmNpc2NvMRMwEQYDVQQIEwpDYWxpZm9ybmlhMSMwIQYDVQQD +ExpvcmlnaW4tcHVsbC5jbG91ZGZsYXJlLm5ldDCCAiIwDQYJKoZIhvcNAQEBBQAD +ggIPADCCAgoCggIBAN2y2zojYfl0bKfhp0AJBFeV+jQqbCw3sHmvEPwLmqDLqynI +42tZXR5y914ZB9ZrwbL/K5O46exd/LujJnV2b3dzcx5rtiQzso0xzljqbnbQT20e +ihx/WrF4OkZKydZzsdaJsWAPuplDH5P7J82q3re88jQdgE5hqjqFZ3clCG7lxoBw +hLaazm3NJJlUfzdk97ouRvnFGAuXd5cQVx8jYOOeU60sWqmMe4QHdOvpqB91bJoY +QSKVFjUgHeTpN8tNpKJfb9LIn3pun3bC9NKNHtRKMNX3Kl/sAPq7q/AlndvA2Kw3 +Dkum2mHQUGdzVHqcOgea9BGjLK2h7SuX93zTWL02u799dr6Xkrad/WShHchfjjRn +aL35niJUDr02YJtPgxWObsrfOU63B8juLUphW/4BOjjJyAG5l9j1//aUGEi/sEe5 +lqVv0P78QrxoxR+MMXiJwQab5FB8TG/ac6mRHgF9CmkX90uaRh+OC07XjTdfSKGR +PpM9hB2ZhLol/nf8qmoLdoD5HvODZuKu2+muKeVHXgw2/A6wM7OwrinxZiyBk5Hh +CvaADH7PZpU6z/zv5NU5HSvXiKtCzFuDu4/Zfi34RfHXeCUfHAb4KfNRXJwMsxUa ++4ZpSAX2G6RnGU5meuXpU5/V+DQJp/e69XyyY6RXDoMywaEFlIlXBqjRRA2pAgMB +AAGjZjBkMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/AgECMB0GA1Ud +DgQWBBRDWUsraYuA4REzalfNVzjann3F6zAfBgNVHSMEGDAWgBRDWUsraYuA4REz +alfNVzjann3F6zANBgkqhkiG9w0BAQ0FAAOCAgEAkQ+T9nqcSlAuW/90DeYmQOW1 +QhqOor5psBEGvxbNGV2hdLJY8h6QUq48BCevcMChg/L1CkznBNI40i3/6heDn3IS +zVEwXKf34pPFCACWVMZxbQjkNRTiH8iRur9EsaNQ5oXCPJkhwg2+IFyoPAAYURoX +VcI9SCDUa45clmYHJ/XYwV1icGVI8/9b2JUqklnOTa5tugwIUi5sTfipNcJXHhgz +6BKYDl0/UP0lLKbsUETXeTGDiDpxZYIgbcFrRDDkHC6BSvdWVEiH5b9mH2BON60z +0O0j8EEKTwi9jnafVtZQXP/D8yoVowdFDjXcKkOPF/1gIh9qrFR6GdoPVgB3SkLc +5ulBqZaCHm563jsvWb/kXJnlFxW+1bsO9BDD6DweBcGdNurgmH625wBXksSdD7y/ +fakk8DagjbjKShYlPEFOAqEcliwjF45eabL0t27MJV61O/jHzHL3dknXeE4BDa2j +bA+JbyJeUMtU7KMsxvx82RmhqBEJJDBCJ3scVptvhDMRrtqDBW5JShxoAOcpFQGm +iYWicn46nPDjgTU0bX1ZPpTpryXbvciVL5RkVBuyX2ntcOLDPlZWgxZCBp96x07F +AnOzKgZk4RzZPNAxCXERVxajn/FLcOhglVAKo5H0ac+AitlQ0ip55D2/mf8o72tM +fVQ6VpyjEXdiIXWUq/o= +-----END CERTIFICATE----- +-----BEGIN CERTIFICATE----- 
+MIIDTDCCAjQCCQDLTrV5jIanizANBgkqhkiG9w0BAQsFADBoMQswCQYDVQQGEwJV +SzEXMBUGA1UECgwOUHl0aG9uIERpc2NvcmQxGjAYBgNVBAMMEXB5dGhvbmRpc2Nv +cmQuY29tMSQwIgYJKoZIhvcNAQkBFhV0bHNAcHl0aG9uZGlzY29yZC5jb20wHhcN +MjEwMzA2MjMwMzQ0WhcNMjIwMzA2MjMwMzQ0WjBoMQswCQYDVQQGEwJVSzEXMBUG +A1UECgwOUHl0aG9uIERpc2NvcmQxGjAYBgNVBAMMEXB5dGhvbmRpc2NvcmQuY29t +MSQwIgYJKoZIhvcNAQkBFhV0bHNAcHl0aG9uZGlzY29yZC5jb20wggEiMA0GCSqG +SIb3DQEBAQUAA4IBDwAwggEKAoIBAQDpRDoQqJPxGV38DsR4x3QKMV2P7lQiT6VK +fTBK9PIeExBgCTHsJN0s68eXAF9tIgInXbmwbF3ysIs+j8nXTO/OThGJ4jE6J6RA +MC31zfzHcF/0Qc4VNaQEK1x/gX7dT0SpgNIJ5eTvz8h++spMjYonEMJ3L4Mu5R8h +QDnLeD2/c7NfEtY9sv4AMTS3cn8I4q2HuwSEKTOGVs5LwAjruAtv4KvKG3W02PvX +Ja3hEisIHaHB2K7aAK2m4gKDrczeQaQWOtlPjlWmvTEnU/chI3PUXazvUJqeS449 +kw9JGoFjPwVSyY2sxYuFL9TAMNxVj+bJ/VX5GU6qCo1wW8R3ItltAgMBAAEwDQYJ +KoZIhvcNAQELBQADggEBAAaxy5Mbi0fuQFvmQ5ViR2Y6yySeUIDdYMPN/92rzubJ +w1yUS/liJ0L01RS/3VvAuchE+3uIK5ybaR2zwgnmWjIIyllC2cyOwNrzCbSTEZpH +3blSwmPr01fwIXFwANfK+Qz5NdG0LrrU6RloIajqkDXwgDXfMCfJwl6JnRORGUHk +QOGtP4mWA1KqHMtPRQKSv/7TK2s+Sbg/e1T+0iE/VbhzJZonF0/VDQk2huyD7Z7I +VJ62bzsI0V59pGmZYAen9g9EfmZXn2m3QTd+tQytzmnUKyuwfmXt4mnvxkral+ES +eB4Nzv4EDPjThS8LLp7xEL5PBS+FAF5EdZPK23FtexQ= +-----END CERTIFICATE----- diff --git a/kubernetes/namespaces/kube-system/nginx/mtls/cloudflare-cert.pem b/kubernetes/namespaces/kube-system/nginx/mtls/cloudflare-cert.pem new file mode 100644 index 0000000..965f0bf --- /dev/null +++ b/kubernetes/namespaces/kube-system/nginx/mtls/cloudflare-cert.pem @@ -0,0 +1,35 @@ +-----BEGIN CERTIFICATE----- +MIIGCjCCA/KgAwIBAgIIV5G6lVbCLmEwDQYJKoZIhvcNAQENBQAwgZAxCzAJBgNV +BAYTAlVTMRkwFwYDVQQKExBDbG91ZEZsYXJlLCBJbmMuMRQwEgYDVQQLEwtPcmln +aW4gUHVsbDEWMBQGA1UEBxMNU2FuIEZyYW5jaXNjbzETMBEGA1UECBMKQ2FsaWZv +cm5pYTEjMCEGA1UEAxMab3JpZ2luLXB1bGwuY2xvdWRmbGFyZS5uZXQwHhcNMTkx +MDEwMTg0NTAwWhcNMjkxMTAxMTcwMDAwWjCBkDELMAkGA1UEBhMCVVMxGTAXBgNV +BAoTEENsb3VkRmxhcmUsIEluYy4xFDASBgNVBAsTC09yaWdpbiBQdWxsMRYwFAYD +VQQHEw1TYW4gRnJhbmNpc2NvMRMwEQYDVQQIEwpDYWxpZm9ybmlhMSMwIQYDVQQD +ExpvcmlnaW4tcHVsbC5jbG91ZGZsYXJlLm5ldDCCAiIwDQYJKoZIhvcNAQEBBQAD +ggIPADCCAgoCggIBAN2y2zojYfl0bKfhp0AJBFeV+jQqbCw3sHmvEPwLmqDLqynI +42tZXR5y914ZB9ZrwbL/K5O46exd/LujJnV2b3dzcx5rtiQzso0xzljqbnbQT20e +ihx/WrF4OkZKydZzsdaJsWAPuplDH5P7J82q3re88jQdgE5hqjqFZ3clCG7lxoBw +hLaazm3NJJlUfzdk97ouRvnFGAuXd5cQVx8jYOOeU60sWqmMe4QHdOvpqB91bJoY +QSKVFjUgHeTpN8tNpKJfb9LIn3pun3bC9NKNHtRKMNX3Kl/sAPq7q/AlndvA2Kw3 +Dkum2mHQUGdzVHqcOgea9BGjLK2h7SuX93zTWL02u799dr6Xkrad/WShHchfjjRn +aL35niJUDr02YJtPgxWObsrfOU63B8juLUphW/4BOjjJyAG5l9j1//aUGEi/sEe5 +lqVv0P78QrxoxR+MMXiJwQab5FB8TG/ac6mRHgF9CmkX90uaRh+OC07XjTdfSKGR +PpM9hB2ZhLol/nf8qmoLdoD5HvODZuKu2+muKeVHXgw2/A6wM7OwrinxZiyBk5Hh +CvaADH7PZpU6z/zv5NU5HSvXiKtCzFuDu4/Zfi34RfHXeCUfHAb4KfNRXJwMsxUa ++4ZpSAX2G6RnGU5meuXpU5/V+DQJp/e69XyyY6RXDoMywaEFlIlXBqjRRA2pAgMB +AAGjZjBkMA4GA1UdDwEB/wQEAwIBBjASBgNVHRMBAf8ECDAGAQH/AgECMB0GA1Ud +DgQWBBRDWUsraYuA4REzalfNVzjann3F6zAfBgNVHSMEGDAWgBRDWUsraYuA4REz +alfNVzjann3F6zANBgkqhkiG9w0BAQ0FAAOCAgEAkQ+T9nqcSlAuW/90DeYmQOW1 +QhqOor5psBEGvxbNGV2hdLJY8h6QUq48BCevcMChg/L1CkznBNI40i3/6heDn3IS +zVEwXKf34pPFCACWVMZxbQjkNRTiH8iRur9EsaNQ5oXCPJkhwg2+IFyoPAAYURoX +VcI9SCDUa45clmYHJ/XYwV1icGVI8/9b2JUqklnOTa5tugwIUi5sTfipNcJXHhgz +6BKYDl0/UP0lLKbsUETXeTGDiDpxZYIgbcFrRDDkHC6BSvdWVEiH5b9mH2BON60z +0O0j8EEKTwi9jnafVtZQXP/D8yoVowdFDjXcKkOPF/1gIh9qrFR6GdoPVgB3SkLc +5ulBqZaCHm563jsvWb/kXJnlFxW+1bsO9BDD6DweBcGdNurgmH625wBXksSdD7y/ +fakk8DagjbjKShYlPEFOAqEcliwjF45eabL0t27MJV61O/jHzHL3dknXeE4BDa2j +bA+JbyJeUMtU7KMsxvx82RmhqBEJJDBCJ3scVptvhDMRrtqDBW5JShxoAOcpFQGm 
+iYWicn46nPDjgTU0bX1ZPpTpryXbvciVL5RkVBuyX2ntcOLDPlZWgxZCBp96x07F +AnOzKgZk4RzZPNAxCXERVxajn/FLcOhglVAKo5H0ac+AitlQ0ip55D2/mf8o72tM +fVQ6VpyjEXdiIXWUq/o= +-----END CERTIFICATE----- diff --git a/kubernetes/namespaces/kube-system/nginx/mtls/pydis-cert.pem b/kubernetes/namespaces/kube-system/nginx/mtls/pydis-cert.pem new file mode 100644 index 0000000..d1dba63 --- /dev/null +++ b/kubernetes/namespaces/kube-system/nginx/mtls/pydis-cert.pem @@ -0,0 +1,20 @@ +-----BEGIN CERTIFICATE----- +MIIDTDCCAjQCCQDLTrV5jIanizANBgkqhkiG9w0BAQsFADBoMQswCQYDVQQGEwJV +SzEXMBUGA1UECgwOUHl0aG9uIERpc2NvcmQxGjAYBgNVBAMMEXB5dGhvbmRpc2Nv +cmQuY29tMSQwIgYJKoZIhvcNAQkBFhV0bHNAcHl0aG9uZGlzY29yZC5jb20wHhcN +MjEwMzA2MjMwMzQ0WhcNMjIwMzA2MjMwMzQ0WjBoMQswCQYDVQQGEwJVSzEXMBUG +A1UECgwOUHl0aG9uIERpc2NvcmQxGjAYBgNVBAMMEXB5dGhvbmRpc2NvcmQuY29t +MSQwIgYJKoZIhvcNAQkBFhV0bHNAcHl0aG9uZGlzY29yZC5jb20wggEiMA0GCSqG +SIb3DQEBAQUAA4IBDwAwggEKAoIBAQDpRDoQqJPxGV38DsR4x3QKMV2P7lQiT6VK +fTBK9PIeExBgCTHsJN0s68eXAF9tIgInXbmwbF3ysIs+j8nXTO/OThGJ4jE6J6RA +MC31zfzHcF/0Qc4VNaQEK1x/gX7dT0SpgNIJ5eTvz8h++spMjYonEMJ3L4Mu5R8h +QDnLeD2/c7NfEtY9sv4AMTS3cn8I4q2HuwSEKTOGVs5LwAjruAtv4KvKG3W02PvX +Ja3hEisIHaHB2K7aAK2m4gKDrczeQaQWOtlPjlWmvTEnU/chI3PUXazvUJqeS449 +kw9JGoFjPwVSyY2sxYuFL9TAMNxVj+bJ/VX5GU6qCo1wW8R3ItltAgMBAAEwDQYJ +KoZIhvcNAQELBQADggEBAAaxy5Mbi0fuQFvmQ5ViR2Y6yySeUIDdYMPN/92rzubJ +w1yUS/liJ0L01RS/3VvAuchE+3uIK5ybaR2zwgnmWjIIyllC2cyOwNrzCbSTEZpH +3blSwmPr01fwIXFwANfK+Qz5NdG0LrrU6RloIajqkDXwgDXfMCfJwl6JnRORGUHk +QOGtP4mWA1KqHMtPRQKSv/7TK2s+Sbg/e1T+0iE/VbhzJZonF0/VDQk2huyD7Z7I +VJ62bzsI0V59pGmZYAen9g9EfmZXn2m3QTd+tQytzmnUKyuwfmXt4mnvxkral+ES +eB4Nzv4EDPjThS8LLp7xEL5PBS+FAF5EdZPK23FtexQ= +-----END CERTIFICATE----- diff --git a/kubernetes/namespaces/kube-system/nginx/values.yaml b/kubernetes/namespaces/kube-system/nginx/values.yaml new file mode 100644 index 0000000..858f041 --- /dev/null +++ b/kubernetes/namespaces/kube-system/nginx/values.yaml @@ -0,0 +1,34 @@ +controller: + # Will add custom headers before sending response traffic to the client according to: https://kubernetes.github.io/ingress-nginx/user-guide/nginx-configuration/configmap/#add-headers + addHeaders: + x-powered-by: Joe Banks + + config: + enable-real-ip: true + forwarded-for-header: cf-connecting-ip + generate-request-id: true + + extraArgs: + default-ssl-certificate: "default/pythondiscord.com-tls" + + # This section refers to the creation of the IngressClass resource + # IngressClass resources are supported since k8s >= 1.18 + ingressClassResource: + enabled: true + default: true + + kind: DaemonSet + + service: + annotations: + service.beta.kubernetes.io/linode-loadbalancer-throttle: 0 + service.beta.kubernetes.io/linode-loadbalancer-default-proxy-protocol: none + + metrics: + port: 10254 + enabled: true + + service: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "10254" diff --git a/kubernetes/namespaces/monitoring/alerts/Makefile b/kubernetes/namespaces/monitoring/alerts/Makefile new file mode 100644 index 0000000..c599ee6 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/Makefile @@ -0,0 +1,11 @@ +.PHONY: alerts alertmanager + +all: alerts alertmanager + +# Upload the alerting rules to the Kubernetes cluster +alerts: + kubectl create configmap -n monitoring prometheus-alert-rules --from-file=alerts.d/ -o yaml --dry-run=client | kubectl apply -f - + +# Upload the alertmanager configuration to the Kubernetes cluster +alertmanager: + kubectl create configmap -n monitoring alertmanager-config --from-file=alertmanager.yaml=alertmanager.yaml -o yaml 
--dry-run=client | kubectl apply -f - diff --git a/kubernetes/namespaces/monitoring/alerts/README.md b/kubernetes/namespaces/monitoring/alerts/README.md new file mode 100644 index 0000000..75f70ac --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/README.md @@ -0,0 +1,5 @@ +# Alerts + +This directory contains alerting rules and routing configuration for production. + +To build and upload this configuration, see the annotated `Makefile` in this directory. diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml new file mode 100644 index 0000000..bef166a --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager.yaml @@ -0,0 +1,24 @@ +route: + group_by: ['alertname', 'cluster', 'service'] + + group_wait: 15s + + group_interval: 1m + + receiver: devops-team + +receivers: +- name: devops-team + slack_configs: + - api_url_file: "/opt/pydis/alertmanager/webhooks/DEVOPS_HOOK" + send_resolved: true + title: '{{ if eq .Status "firing" }}[FIRING]{{ else }}[RESOLVED]{{ end }}' + text: | + {{ if eq .Status "firing" }}{{ range .Alerts }} + **{{ .Annotations.summary }}:** + {{ .Annotations.description }} [(Link)]({{.GeneratorURL}}) + + {{ end }}{{ else }}Alert has resolved.{{ end }} + fields: + - title: Alert + value: "{{ .GroupLabels.alertname }}" diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml new file mode 100644 index 0000000..4f1c322 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/deployment.yaml @@ -0,0 +1,92 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: alertmanager + namespace: monitoring +spec: + replicas: 3 + selector: + matchLabels: + app: alertmanager + template: + metadata: + labels: + app: alertmanager + spec: + serviceAccountName: prometheus + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - alertmanager + namespaces: + - monitoring + topologyKey: kubernetes.io/hostname + weight: 100 + initContainers: + - image: debian:bullseye-slim + imagePullPolicy: Always + name: alertmanager-peering-setup + command: [ + '/opt/pydis/alertmanager/init.d/find-pods.sh' + ] + volumeMounts: + - name: alertmanager-init + mountPath: /opt/pydis/alertmanager/init.d + - name: alertmanager-tmp + mountPath: /tmp + securityContext: + runAsUser: 0 + containers: + - image: prom/alertmanager:latest + imagePullPolicy: Always + name: alertmanager + command: + - /bin/sh + - -c + - | + exec /bin/alertmanager \ + --config.file=/opt/pydis/alertmanager/config.d/alertmanager.yaml \ + --web.external-url=https://alertmanager.pythondiscord.com \ + --storage.path=/data/alertmanager \ + $(cat /tmp/peers) + ports: + - name: am + containerPort: 9093 + - name: am-peering + containerPort: 9094 + volumeMounts: + - name: alertmanager-config + mountPath: /opt/pydis/alertmanager/config.d + - name: alertmanager-webhooks + mountPath: /opt/pydis/alertmanager/webhooks + - name: alertmanager-tmp-data + mountPath: /data + - name: alertmanager-tmp + mountPath: /tmp + securityContext: + readOnlyRootFilesystem: true + restartPolicy: Always + volumes: + - name: alertmanager-config + configMap: + name: alertmanager-config + - name: alertmanager-webhooks + secret: + secretName: alert-manager-hook + - name: alertmanager-tmp-data + emptyDir: {} + - name: alertmanager-tmp + emptyDir: {} 
+ - name: alertmanager-init + configMap: + name: alertmanager-init + defaultMode: 0777 + securityContext: + fsGroup: 1000 + runAsUser: 1000 diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml new file mode 100644 index 0000000..fc99e52 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: alertmanager + namespace: monitoring +spec: + tls: + - hosts: + - "*.pythondiscord.com" + rules: + - host: alertmanager.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: alertmanager + port: + number: 9093 diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml new file mode 100644 index 0000000..f1f36e2 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/initscript.yaml @@ -0,0 +1,30 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: alertmanager-init + namespace: monitoring +data: + find-pods.sh: | + #!/bin/sh + + # Install curl and jq for JSON parsing + apt update && apt install -y curl jq + + # Find the template hash + echo Finding template hash... + TEMPLATE_HASH=$(echo $HOSTNAME | cut -d- -f2) + + # Query kubernetes API for all matching pods + echo Querying Kubernetes API for pods... + PODS=$(curl \ + -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" \ + https://kubernetes.default/api/v1/namespaces/monitoring/pods\?labelSelector=pod-template-hash=$TEMPLATE_HASH\&pretty=false -sk -o /tmp/peers.json) + + echo Finding Alertmanager IPs... + AM_IPS=$(jq '.items[].status.podIP' /tmp/peers.json -r) + + echo Generating CLI flags for Alertmanager... + PEER_ARGS=$(echo $AM_IPS | sed 's/ /\n/g' | awk '{ print "--cluster.peer="$1":9094" }') + + echo Writing CLI flags to /tmp/peers... 
+ echo $PEER_ARGS > /tmp/peers diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml new file mode 100644 index 0000000..8ec901a --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/sd-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager-sd + namespace: monitoring +spec: + selector: + app: alertmanager + clusterIP: None + ports: + - port: 9093 + targetPort: 9093 + name: am + - port: 9094 + targetPort: 9094 + name: am-peering diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml Binary files differnew file mode 100644 index 0000000..7cc1d95 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/secrets.yaml diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml new file mode 100644 index 0000000..3f26311 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/service-account.yaml @@ -0,0 +1,28 @@ +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: alertmanager +rules: +- apiGroups: [""] + resources: ["pods", "endpoints"] + verbs: ["get", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: alertmanager + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: alertmanager +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: alertmanager +subjects: + - kind: ServiceAccount + name: alertmanager + namespace: monitoring diff --git a/kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml b/kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml new file mode 100644 index 0000000..145b1e2 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alertmanager/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: alertmanager + namespace: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9093" +spec: + selector: + app: alertmanager + ports: + - port: 9093 + targetPort: 9093 diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml new file mode 100644 index 0000000..b3fcad9 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/alertmanager.yaml @@ -0,0 +1,21 @@ +groups: +- name: alertmanager + rules: + + - alert: AlertManagerClusterFailedPeers + expr: alertmanager_cluster_failed_peers > 0 + for: 1m + labels: + severity: warning + annotations: + summary: "An Alertmanager node is reporting failed peers" + description: "AM {{ $labels.instance }} is reporting that {{ $value }} of it's peers is invalid." 
+ + - alert: AlertManagerHealthScore + expr: alertmanager_cluster_health_score > 0 + for: 1m + labels: + severity: warning + annotations: + summary: "An AlertManagerNode is reporting an unhealthy cluster" + description: "AM {{ $labels.instance }} is reporting that the cluster has a health score of {{ $value }} (where 0 is healthy.)" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml new file mode 100644 index 0000000..10eb3dd --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/certificates.yaml @@ -0,0 +1,13 @@ +groups: +- name: certificates + interval: 1d + rules: + + - alert: CertificateExpiringSoon + expr: (certmanager_certificate_expiration_timestamp_seconds - time()) / 60 / 60 / 24 < 7 + for: 0m + labels: + severity: warning + annotations: + summary: "Certificate is expiring in < 7 days" + description: "The certificate named {{ $labels.name }} is due for expiry in {{ $value | humanize }} days." diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml new file mode 100644 index 0000000..9daa660 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/coredns.yaml @@ -0,0 +1,20 @@ +groups: +- name: coredns + rules: + + - alert: CoreDNSPanics + expr: increase(coredns_panics_total[1m]) > 0 + for: 0m + labels: + severity: page + annotations: + summary: "CoreDNS is experiencing panic" + description: "Number of CoreDNS panics encountered: {{ $value }}" + + - alert: CoreDNSCacheMisses + expr: rate(coredns_cache_misses_total{}[10m]) / rate(coredns_cache_misses_total{}[10m] offset 10m) > 5.00 + labels: + severity: page + annotations: + summary: "High CoreDNS cache misses in last 10 minutes" + description: "This can sometimes be an indication of networking troubles, currently {{ $value | humanizePercentage }} over last 10 minutes." 
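The rule files in `alerts.d/` can be sanity-checked locally before `make alerts` uploads them to the cluster; `promtool`, which ships with the Prometheus distribution, validates both the YAML structure and the PromQL expressions:

```sh
# Run from kubernetes/namespaces/monitoring/alerts/. promtool exits non-zero
# on any syntax or expression error, so `make alerts` can be gated on it.
promtool check rules alerts.d/*.yaml
```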
diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml new file mode 100644 index 0000000..5e8868e --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/cpu.yaml @@ -0,0 +1,21 @@ +groups: +- name: cpu + rules: + + - alert: HighCPUThrottling + expr: rate(container_cpu_cfs_throttled_seconds_total{pod=~".+", container_name!="POD", image!=""}[5m]) > 1 + for: 5m + labels: + severity: page + annotations: + summary: "Container {{ $labels.container_name }} in {{ $labels.pod }} high throttling " + description: "{{ $labels.container_name }} inside {{ $labels.pod }} is at {{ $value }}" + + - alert: HighNodeCPU + expr: 100 - (avg by (kubernetes_node) (irate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[5m])) * 100) > 80 + for: 5m + labels: + severity: page + annotations: + summary: "Node {{ $labels.kubernetes_node }} has CPU over 80% for last 5 minute" + description: "CPU on {{ $labels.kubernetes_node }} is averaging {{ $value }}" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml new file mode 100644 index 0000000..723d267 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/jobs.yaml @@ -0,0 +1,20 @@ +groups: +- name: jobs + rules: + - alert: KubernetesCronjobSuspended + expr: kube_cronjob_spec_suspend != 0 + for: 0m + labels: + severity: page + annotations: + summary: "Kubernetes CronJob suspended: {{ $labels.cronjob }}" + description: "CronJob {{ $labels.kubernetes_namespace }}/{{ $labels.cronjob }} is suspended" + + - alert: KubernetesJobFailed + expr: kube_job_status_failed > 0 + for: 0m + labels: + severity: page + annotations: + summary: "Kubernetes Job failed: {{ $labels.job_name }}" + description: "Job {{$labels.kubernetes_namespacenamespace}}/{{$labels.job_name}} failed to complete" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml new file mode 100644 index 0000000..d53da5e --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/memory.yaml @@ -0,0 +1,12 @@ +groups: +- name: memory + rules: + + - alert: NodeHighMemoryUsage + expr: node_memory_Active_bytes / node_memory_MemTotal_bytes > 0.8 + for: 30s + labels: + severity: page + annotations: + summary: "Node {{ $labels.kubernetes_node }} has RAM usage >80% for 5 minutes" + description: 'RAM usage is currently {{ $value | humanizePercentage }} on {{ $labels.kubernetes_node }}' diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml new file mode 100644 index 0000000..441f7df --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nginx.yaml @@ -0,0 +1,30 @@ +groups: +- name: nginx + rules: + + - alert: NGINX4XXRequests + expr: sum by(service) (rate(nginx_ingress_controller_requests{status=~"^4..", status!="404", service!="pixels"}[1m])) / sum by(service) (rate(nginx_ingress_controller_requests[1m])) > 0.5 + for: 1m + labels: + severity: page + annotations: + summary: "High rate of 4XX requests for inbound requests" + description: "Rate of 4XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`" + + - alert: NGINX5XXRequests + expr: sum(rate(nginx_ingress_controller_requests{status=~"^5.."}[1m])) by (service) / sum(rate(nginx_ingress_controller_requests{}[1m])) by (service) > 0.5 + for: 1m + labels: + severity: page + annotations: 
+ summary: "High rate of 5XX requests for inbound requests" + description: "Rate of 5XX errors is {{ $value | humanizePercentage }} on service `{{ $labels.service }}`" + + - alert: NGINXP99Timing + expr: histogram_quantile(0.99, sum by(host, service, le) (rate(nginx_ingress_controller_request_duration_seconds_bucket{service!~"(grafana|metabase|prestashop-svc)", host!="pydis-api.default.svc.cluster.local"}[5m]))) > 3 and on(service) increase(nginx_ingress_controller_requests[5m]) > 10 + for: 5m + labels: + severity: page + annotations: + summary: "Request timing P99 has been over 3 seconds for 5 minutes" + description: "Requests to service {{ $labels.host }} (to service {{ $labels.service }}) have taken over 3 seconds (P99) to complete." diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml new file mode 100644 index 0000000..6bfa6d1 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/nodes.yaml @@ -0,0 +1,49 @@ +groups: +- name: nodes + rules: + + - alert: KubernetesNodeDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + for: 1m + labels: + severity: page + annotations: + summary: Node {{ $labels.kubernetes_node }} is experiencing disk pressure + description: "{{ $labels.kubernetes_node }} does not have adequate space to work with." + + - alert: KubernetesNodeMemoryPressure + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 + for: 15s + labels: + severity: page + annotations: + summary: Node {{ $labels.kubernetes_node }} is experiencing memory pressure + description: "{{ $labels.kubernetes_node }} does not have adequate RAM to work with." + + - alert: KubernetesNodeNetworkUnavailable + expr: kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1 + for: 15s + labels: + severity: page + annotations: + summary: Node {{ $labels.kubernetes_node }} is experiencing network problems + description: "{{ $labels.kubernetes_node }} is experiencing trouble with inbound and outbound connections" + + + - alert: KubernetesNodePIDPressure + expr: kube_node_status_condition{condition="PIDPressure",status="true"} == 1 + for: 15s + labels: + severity: page + annotations: + summary: Node {{ $labels.kubernetes_node }} is experiencing PID exhaustion + description: "{{ $labels.kubernetes_node }} does not have enough PIDs to work with." + + - alert: KubernetesNodeReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 5m + labels: + severity: page + annotations: + summary: Kubernetes node ({{ $labels.kubernetes_node }} ) is marked as unready + description: "Node {{ $labels.kubernetes_node }} has been unready for a long time" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml new file mode 100644 index 0000000..9efdffa --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/pods.yaml @@ -0,0 +1,20 @@ +groups: +- name: pods + rules: + - alert: KubernetesPodNotHealthy + expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[3m:1m]) > 0 + for: 3m + labels: + severity: page + annotations: + summary: "Kubernetes Pod not healthy: {{ $labels.namespace }}/{{ $labels.pod }}" + description: "Pod has been in a non-ready state for longer than 3 minutes." 
+ + - alert: KubernetesPodCrashLooping + expr: increase(kube_pod_container_status_restarts_total[5m]) > 3 + for: 1m + labels: + severity: warning + annotations: + summary: "Kubernetes pod crash looping: {{ $labels.kubernetes_namespace }}/{{ $labels.pod }}" + description: "Pod {{ $labels.pod }} is crash looping" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml new file mode 100644 index 0000000..399a84b --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/postgres.yaml @@ -0,0 +1,29 @@ +groups: +- name: postgres + rules: + - alert: PostgresUp + expr: pg_up == 0 + for: 0m + labels: + severity: page + annotations: + summary: "PostgreSQL is offline" + description: "Postgres Exporter cannot connect to PostgreSQL." + + - alert: PostgresTooManyConnections + expr: (sum(pg_stat_activity_count) by (instance)) / on (instance) pg_settings_max_connections * 100 > 80 + for: 1m + labels: + severity: page + annotations: + summary: PostgreSQL connections near max_connections setting + description: "PostgreSQL instance is near the maximum connection limit, currently {{ $value }} connections" + + - alert: PostgresDeadlockedTable + expr: increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 3 + for: 1m + labels: + severity: page + annotations: + summary: Too many deadlocked tables + description: "PostgreSQL has dead-locks, value: {{ $value }}" diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml new file mode 100644 index 0000000..25e555d --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/prometheus.yaml @@ -0,0 +1,13 @@ +groups: +- name: prometheus + rules: + + # Alert for any instance that is unreachable for >5 minutes. + - alert: InstanceDown + expr: up == 0 + for: 5m + labels: + severity: page + annotations: + summary: "Instance {{ $labels.instance }} down" + description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes." diff --git a/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml new file mode 100644 index 0000000..6b946f6 --- /dev/null +++ b/kubernetes/namespaces/monitoring/alerts/alerts.d/redis.yaml @@ -0,0 +1,20 @@ +groups: +- name: redis + rules: + - alert: RedisDown + expr: redis_up == 0 + for: 1m + labels: + severity: page + annotations: + summary: "Redis is offline" + description: "Redis Exporter cannot connect to Redis." + + - alert: RedisOutOfMemory + expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.9 + for: 0m + labels: + severity: page + annotations: + summary: "Redis is approaching it's memory limit" + description: "Redis is currently using {{ $value | humanizePercentage }} of configured memory." 
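One plausible rollout flow for a rule change, sketched under the assumption that it is run from `kubernetes/namespaces/monitoring/alerts/` and that the Prometheus deployment defined further down in this changeset (which mounts the `prometheus-alert-rules` ConfigMap and starts with `--web.enable-lifecycle`) is the consumer:

```sh
# Re-create the prometheus-alert-rules ConfigMap from alerts.d/ (see Makefile).
make alerts

# The ConfigMap volume takes a short while (typically under a minute) to sync
# into the pod; after that, a lifecycle reload picks up the new rules without
# a restart. Port-forward locally, since the public ingress requires an mTLS
# client certificate.
kubectl -n monitoring port-forward deploy/prometheus 9090:9090 &
sleep 2  # give the port-forward a moment to establish
curl -X POST http://localhost:9090/-/reload
```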
diff --git a/kubernetes/namespaces/monitoring/calico-metrics-svc.yaml b/kubernetes/namespaces/monitoring/calico-metrics-svc.yaml new file mode 100644 index 0000000..5690881 --- /dev/null +++ b/kubernetes/namespaces/monitoring/calico-metrics-svc.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + name: felix-metrics-svc + namespace: kube-system + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9091" +spec: + selector: + k8s-app: calico-node + ports: + - port: 9091 + targetPort: 9091 diff --git a/kubernetes/namespaces/monitoring/exporters/README.md b/kubernetes/namespaces/monitoring/exporters/README.md new file mode 100644 index 0000000..6ed79f5 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/README.md @@ -0,0 +1,8 @@ +# Exporters +This directory contains prometheus exporters for various services running on our cluster. + +If any secrets are required for each exporter they will be in a secrets.yaml file next to the deployment. + +Below is a list of the exporters: +- [postgres_exporter](https://github.com/wrouesnel/postgres_exporter) +- [redis_exporter](https://github.com/oliver006/redis_exporter) diff --git a/kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml b/kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml new file mode 100644 index 0000000..5542d74 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/postgres/postgres_exporter.yaml @@ -0,0 +1,65 @@ +# Exporter for taking statistics on our PostgreSQL instance +apiVersion: apps/v1 +kind: Deployment +metadata: + name: postgres-exporter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app: postgres-exporter + template: + metadata: + labels: + app: postgres-exporter + spec: + containers: + - name: postgres-exporter + image: quay.io/prometheuscommunity/postgres-exporter:latest + imagePullPolicy: Always + resources: + requests: + cpu: 5m + memory: 20Mi + limits: + cpu: 20m + memory: 50Mi + ports: + - containerPort: 9187 + env: + - name: PG_EXPORTER_EXTEND_QUERY_PATH + value: /opt/python-discord/queries/queries.yaml + envFrom: + - secretRef: + name: postgres-exporter-env + securityContext: + readOnlyRootFilesystem: true + volumeMounts: + - mountPath: /opt/python-discord/queries + name: queries + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + volumes: + - configMap: + defaultMode: 420 + name: postgres-exporter-queries + name: queries +--- +apiVersion: v1 +kind: Service +metadata: + name: postgres-exporter + namespace: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9187" +spec: + selector: + app: postgres-exporter + ports: + - protocol: TCP + port: 9187 + targetPort: 9187 diff --git a/kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml b/kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml Binary files differnew file mode 100644 index 0000000..bec9067 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/postgres/secrets.yaml diff --git a/kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml b/kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml new file mode 100644 index 0000000..28a8489 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/redis/redis_exporter.yaml @@ -0,0 +1,54 @@ +# Exporter for taking statistics on our Redis instance +apiVersion: apps/v1 +kind: Deployment +metadata: + name: redis-exporter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + 
app: redis-exporter + template: + metadata: + labels: + app: redis-exporter + spec: + containers: + - name: redis-exporter + image: oliver006/redis_exporter:latest + imagePullPolicy: Always + resources: + requests: + cpu: 5m + memory: 20Mi + limits: + cpu: 20m + memory: 50Mi + ports: + - containerPort: 9187 + envFrom: + - secretRef: + name: redis-exporter-env + securityContext: + readOnlyRootFilesystem: true + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true +--- +apiVersion: v1 +kind: Service +metadata: + name: redis-exporter + namespace: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "9121" +spec: + selector: + app: redis-exporter + ports: + - protocol: TCP + port: 9121 + targetPort: 9121 diff --git a/kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml b/kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml Binary files differnew file mode 100644 index 0000000..f6ce9d0 --- /dev/null +++ b/kubernetes/namespaces/monitoring/exporters/redis/secrets.yaml diff --git a/kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml b/kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml new file mode 100644 index 0000000..5b5c2e7 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kube-state-metrics/deployment.yaml @@ -0,0 +1,30 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kube-state-metrics + namespace: monitoring +spec: + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + containers: + - image: ghcr.io/python-discord/kube-state-metrics:v2.1.0 + imagePullPolicy: Always + args: + - --metric-labels-allowlist=pods=[*] + name: kube-state-metrics + securityContext: + readOnlyRootFilesystem: true + imagePullSecrets: + - name: ghcr-pull-secret + restartPolicy: Always + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true diff --git a/kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml b/kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml new file mode 100644 index 0000000..17b56cb --- /dev/null +++ b/kubernetes/namespaces/monitoring/kube-state-metrics/service-account.yaml @@ -0,0 +1,136 @@ +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: kube-state-metrics + namespace: monitoring +rules: + - apiGroups: + - "" + resources: + - configmaps + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - list + - watch + - apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + - ingresses + verbs: + - list + - watch + - apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch + - apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch + - apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch + - apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create + - apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create + - apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch + - apiGroups: + - certificates.k8s.io + resources: + - certificatesigningrequests + verbs: + - list + - watch + - apiGroups: + - 
storage.k8s.io + resources: + - storageclasses + - volumeattachments + verbs: + - list + - watch + - apiGroups: + - admissionregistration.k8s.io + resources: + - mutatingwebhookconfigurations + - validatingwebhookconfigurations + verbs: + - list + - watch + - apiGroups: + - networking.k8s.io + resources: + - networkpolicies + verbs: + - list + - watch + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: + - list + - watch +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: + - kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml b/kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml new file mode 100644 index 0000000..7faa2c1 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kube-state-metrics/service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: kube-state-metrics + namespace: monitoring + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" +spec: + selector: + app: kube-state-metrics + ports: + - protocol: TCP + port: 8080 + targetPort: 8080 diff --git a/kubernetes/namespaces/monitoring/kubewatch/README.md b/kubernetes/namespaces/monitoring/kubewatch/README.md new file mode 100644 index 0000000..294c666 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/README.md @@ -0,0 +1,3 @@ +# Kubewatch + +> **kubewatch** is a Kubernetes watcher that currently publishes notification to available collaboration hubs/notification channels. Run it in your k8s cluster, and you will get event notifications through webhooks. 
diff --git a/kubernetes/namespaces/monitoring/kubewatch/configmap.yaml b/kubernetes/namespaces/monitoring/kubewatch/configmap.yaml new file mode 100644 index 0000000..902cfbc --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/configmap.yaml @@ -0,0 +1,34 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: kubewatch-config + namespace: monitoring +data: + .kubewatch.yaml: | + namespace: "" + handler: + discord: + webhook: "" + ignores: + - pixels-discord-channel + - cert-manager-cainjector-leader-election + - cert-manager-controller + - ingress-controller-leader-nginx + - cluster-autoscaler-status + - ingress-controller-leader + resource: + deployment: true + replicationcontroller: true + replicaset: true + daemonset: true + services: true + pod: true + job: true + node: false + clusterrole: true + serviceaccount: true + persistentvolume: true + namespace: true + secret: true + configmap: true + ingress: true diff --git a/kubernetes/namespaces/monitoring/kubewatch/deployment.yaml b/kubernetes/namespaces/monitoring/kubewatch/deployment.yaml new file mode 100644 index 0000000..a674648 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/deployment.yaml @@ -0,0 +1,32 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: kubewatch + namespace: monitoring +spec: + selector: + matchLabels: + app: kubewatch + template: + metadata: + labels: + app: kubewatch + spec: + serviceAccountName: kubewatch + containers: + - image: ghcr.io/python-discord/kubewatch:latest + imagePullPolicy: Always + name: kubewatch + volumeMounts: + - name: config-volume + mountPath: /root + envFrom: + - secretRef: + name: kubewatch-secrets + securityContext: + readOnlyRootFilesystem: true + restartPolicy: Always + volumes: + - name: config-volume + configMap: + name: kubewatch-config diff --git a/kubernetes/namespaces/monitoring/kubewatch/secrets.yaml b/kubernetes/namespaces/monitoring/kubewatch/secrets.yaml Binary files differnew file mode 100644 index 0000000..7427da2 --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/secrets.yaml diff --git a/kubernetes/namespaces/monitoring/kubewatch/service-account.yaml b/kubernetes/namespaces/monitoring/kubewatch/service-account.yaml new file mode 100644 index 0000000..f0748ba --- /dev/null +++ b/kubernetes/namespaces/monitoring/kubewatch/service-account.yaml @@ -0,0 +1,30 @@ +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: kubewatch + namespace: monitoring +rules: +- apiGroups: ["", "extensions", "apps", "batch", "rbac.authorization.k8s.io", ] + resources: ["*"] + verbs: ["get", "watch", "list"] +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kubewatch + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kubewatch + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kubewatch +subjects: + - kind: ServiceAccount + name: kubewatch + namespace: monitoring diff --git a/kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml b/kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml new file mode 100644 index 0000000..075b1b7 --- /dev/null +++ b/kubernetes/namespaces/monitoring/node_exporter/daemonset.yaml @@ -0,0 +1,84 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: node-exporter + namespace: monitoring +spec: + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 + selector: + matchLabels: + name: node-exporter + phase: prod + template: + 
metadata: + labels: + name: node-exporter + phase: prod + annotations: + seccomp.security.alpha.kubernetes.io/pod: 'docker/default' + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/os + operator: In + values: + - linux + - matchExpressions: + - key: beta.kubernetes.io/os + operator: In + values: + - linux + securityContext: + runAsNonRoot: true + runAsUser: 65534 + hostPID: true + containers: + - name: node-exporter + image: quay.io/prometheus/node-exporter:v1.2.0 + args: + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --path.rootfs=/host/root + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker|var/lib/containerd|var/lib/containers/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ + ports: + - name: metrics + containerPort: 9100 + securityContext: + readOnlyRootFilesystem: true + resources: + requests: + cpu: 10m + memory: 24Mi + limits: + cpu: 200m + memory: 100Mi + volumeMounts: + - name: proc + mountPath: /host/proc + readOnly: true + - name: sys + mountPath: /host/sys + readOnly: true + - name: root + mountPath: /host/root + readOnly: true + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + - name: root + hostPath: + path: / diff --git a/kubernetes/namespaces/monitoring/node_exporter/service.yaml b/kubernetes/namespaces/monitoring/node_exporter/service.yaml new file mode 100644 index 0000000..b6be8d5 --- /dev/null +++ b/kubernetes/namespaces/monitoring/node_exporter/service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + name: node-exporter + namespace: monitoring + annotations: + prometheus.io/scrape: 'true' +spec: + type: ClusterIP + clusterIP: None + selector: + name: node-exporter + phase: prod + ports: + - name: metrics + protocol: TCP + port: 80 + targetPort: 9100 diff --git a/kubernetes/namespaces/monitoring/prometheus/deployment.yaml b/kubernetes/namespaces/monitoring/prometheus/deployment.yaml new file mode 100644 index 0000000..5a806ff --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/deployment.yaml @@ -0,0 +1,58 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus + namespace: monitoring +spec: + strategy: + type: Recreate + selector: + matchLabels: + app: prometheus + template: + metadata: + labels: + app: prometheus + spec: + serviceAccountName: prometheus + containers: + - image: prom/prometheus:latest + imagePullPolicy: Always + args: [ + "--storage.tsdb.path", "/opt/prometheus/data", + "--config.file", "/etc/prometheus/prometheus.yaml", + "--web.external-url", "https://prometheus.pythondiscord.com", + "--web.enable-lifecycle", + "--web.enable-admin-api", + "--web.page-title", "Python Discord Prometheus", + "--storage.tsdb.retention.size", "28GB", + "--storage.tsdb.retention.time", "100d" + ] + name: prometheus + ports: + - name: prometheus + containerPort: 9090 + securityContext: + readOnlyRootFilesystem: true + volumeMounts: + - name: prometheus-data + mountPath: /opt/prometheus/data + - name: prometheus-config + mountPath: /etc/prometheus + - name: prometheus-alerts + mountPath: /opt/pydis/prometheus/alerts.d + restartPolicy: Always + securityContext: + fsGroup: 2000 + runAsUser: 1000 + runAsNonRoot: true + volumes: + - name: 
prometheus-data + persistentVolumeClaim: + claimName: prometheus-storage + - name: prometheus-config + configMap: + name: prometheus-config + - name: prometheus-alerts + configMap: + name: prometheus-alert-rules diff --git a/kubernetes/namespaces/monitoring/prometheus/ingress.yaml b/kubernetes/namespaces/monitoring/prometheus/ingress.yaml new file mode 100644 index 0000000..69e240a --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/ingress.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + annotations: + nginx.ingress.kubernetes.io/auth-tls-verify-client: "on" + nginx.ingress.kubernetes.io/auth-tls-secret: "kube-system/mtls-client-crt-bundle" + nginx.ingress.kubernetes.io/auth-tls-error-page: "https://www.youtube.com/watch?v=dQw4w9WgXcQ" + name: prometheus + namespace: monitoring +spec: + tls: + - hosts: + - "*.pythondiscord.com" + rules: + - host: prometheus.pythondiscord.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: prometheus + port: + number: 9090 diff --git a/kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml b/kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml new file mode 100644 index 0000000..7ad047c --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/prometheus-config.yaml @@ -0,0 +1,267 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-config + namespace: monitoring +data: + prometheus.yaml: |- + # Global config + global: + scrape_interval: 15s + + rule_files: + - /opt/pydis/prometheus/alerts.d/*.yaml + + alerting: + alertmanagers: + - scheme: http + dns_sd_configs: + - names: + - alertmanager-sd.monitoring.svc.cluster.local + type: A + port: 9093 + + # Scrape configs for running Prometheus on a Kubernetes cluster. + # This uses separate scrape configs for cluster components (i.e. API server, node) + # and services to allow each to use different authentication configs. + # + # Kubernetes labels will be added as Prometheus labels on metrics via the + # `labelmap` relabeling action. + scrape_configs: + + # Scrape config for API servers. + # + # Kubernetes exposes API servers as endpoints to the default/kubernetes + # service so this uses `endpoints` role and uses relabelling to only keep + # the endpoints associated with the default/kubernetes service using the + # default named port `https`. This works for single API server deployments as + # well as HA API server deployments. + - job_name: 'kubernetes-apiservers' + kubernetes_sd_configs: + - role: endpoints + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # Using endpoints to discover kube-apiserver targets finds the pod IP + # (host IP since apiserver uses host network) which is not used in + # the server certificate. + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + # Keep only the default/kubernetes service endpoints for the https port. This + # will add targets for each API server which Kubernetes adds an endpoint to + # the default/kubernetes service. + relabel_configs: + - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name] + action: keep + regex: default;kubernetes;https + - replacement: apiserver + action: replace + target_label: job + + # Scrape config for node (i.e. kubelet) /metrics (e.g. 'kubelet_'). Explore + # metrics from a node by scraping kubelet (127.0.0.1:10250/metrics). 
+ - job_name: 'kubelet' + kubernetes_sd_configs: + - role: node + + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + # Kubelet certs don't have any fixed IP SANs + insecure_skip_verify: true + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + - replacement: 'monitoring' + target_label: kubernetes_namespace + + metric_relabel_configs: + - source_labels: + - namespace + action: replace + regex: (.+) + target_label: kubernetes_namespace + + # Scrape config for Kubelet cAdvisor. Explore metrics from a node by + # scraping kubelet (127.0.0.1:10250/metrics/cadvisor). + - job_name: 'kubernetes-cadvisor' + kubernetes_sd_configs: + - role: node + + scheme: https + metrics_path: /metrics/cadvisor + tls_config: + # Kubelet certs don't have any fixed IP SANs + insecure_skip_verify: true + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token + + relabel_configs: + - action: labelmap + regex: __meta_kubernetes_node_label_(.+) + metric_relabel_configs: + - source_labels: + - namespace + action: replace + target_label: kubernetes_namespace + - source_labels: + - pod + regex: (.*) + replacement: $1 + action: replace + target_label: pod_name + - source_labels: + - container + regex: (.*) + replacement: $1 + action: replace + target_label: container_name + + # Scrap etcd metrics from masters via etcd-scraper-proxy + - job_name: 'etcd' + kubernetes_sd_configs: + - role: pod + scheme: http + relabel_configs: + - source_labels: [__meta_kubernetes_namespace] + action: keep + regex: 'kube-system' + - source_labels: [__meta_kubernetes_pod_label_component] + action: keep + regex: 'etcd-scraper-proxy' + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + + # Scrape config for service endpoints. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/scrape`: Only scrape services that have a value of `true` + # * `prometheus.io/scheme`: If the metrics endpoint is secured then you will need + # to set this to `https` & most likely set the `tls_config` of the scrape config. + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: If the metrics are exposed on a different port to the + # service then set this appropriately. + - job_name: 'kubernetes-service-endpoints' + + kubernetes_sd_configs: + - role: endpoints + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme] + action: replace + target_label: __scheme__ + regex: (https?) 
+ - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port] + action: replace + target_label: __address__ + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_service_name] + action: replace + target_label: job + - action: replace + source_labels: + - __meta_kubernetes_pod_node_name + target_label: kubernetes_node + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + metric_relabel_configs: + - source_labels: + - namespace + action: replace + regex: (.+) + target_label: kubernetes_namespace + + # Example scrape config for probing services via the Blackbox Exporter. + # + # The relabeling allows the actual service scrape endpoint to be configured + # via the following annotations: + # + # * `prometheus.io/probe`: Only probe services that have a value of `true` + - job_name: 'kubernetes-services' + + metrics_path: /probe + params: + module: [http_2xx] + + kubernetes_sd_configs: + - role: service + + relabel_configs: + - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe] + action: keep + regex: true + - source_labels: [__address__] + target_label: __param_target + - target_label: __address__ + replacement: blackbox + - source_labels: [__param_target] + target_label: instance + - action: labelmap + regex: __meta_kubernetes_service_label_(.+) + - source_labels: [__meta_kubernetes_service_name] + target_label: job + metric_relabel_configs: + - source_labels: + - namespace + action: replace + regex: (.+) + target_label: kubernetes_namespace + + # Example scrape config for pods + # + # The relabeling allows the actual pod scrape endpoint to be configured via the + # following annotations: + # + # * `prometheus.io/scrape`: Only scrape pods that have a value of `true` + # * `prometheus.io/path`: If the metrics path is not `/metrics` override this. + # * `prometheus.io/port`: Scrape the pod on the indicated port instead of the + # pod's declared ports (default is a port-free target if none are declared). 
+ - job_name: 'kubernetes-pods' + + kubernetes_sd_configs: + - role: pod + + relabel_configs: + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape] + action: keep + regex: true + - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path] + action: replace + target_label: __metrics_path__ + regex: (.+) + - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port] + action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + target_label: __address__ + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - source_labels: [__meta_kubernetes_namespace] + action: replace + target_label: kubernetes_namespace + - source_labels: [__meta_kubernetes_pod_name] + action: replace + target_label: kubernetes_pod_name + metric_relabel_configs: + - source_labels: + - namespace + action: replace + regex: (.+) + target_label: kubernetes_namespace diff --git a/kubernetes/namespaces/monitoring/prometheus/service-account.yaml b/kubernetes/namespaces/monitoring/prometheus/service-account.yaml new file mode 100644 index 0000000..00cf0c2 --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/service-account.yaml @@ -0,0 +1,32 @@ +--- +kind: ClusterRole +apiVersion: rbac.authorization.k8s.io/v1 +metadata: + name: prometheus +rules: +- apiGroups: ["*"] + resources: ["*"] + verbs: ["get", "list", "watch"] +- nonResourceURLs: + - "/metrics" + verbs: + - get +--- +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus +subjects: + - kind: ServiceAccount + name: prometheus + namespace: monitoring diff --git a/kubernetes/namespaces/monitoring/prometheus/service.yaml b/kubernetes/namespaces/monitoring/prometheus/service.yaml new file mode 100644 index 0000000..5ec3a21 --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/service.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus + namespace: monitoring +spec: + selector: + app: prometheus + ports: + - port: 9090 + targetPort: 9090 diff --git a/kubernetes/namespaces/monitoring/prometheus/volume.yaml b/kubernetes/namespaces/monitoring/prometheus/volume.yaml new file mode 100644 index 0000000..4468a20 --- /dev/null +++ b/kubernetes/namespaces/monitoring/prometheus/volume.yaml @@ -0,0 +1,14 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: prometheus-storage + namespace: monitoring + labels: + app: prometheus +spec: + storageClassName: linode-block-storage-retain + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 30Gi diff --git a/kubernetes/scripts/__init__.py b/kubernetes/scripts/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/kubernetes/scripts/__init__.py diff --git a/kubernetes/scripts/lint_manifests.py b/kubernetes/scripts/lint_manifests.py new file mode 100644 index 0000000..8660608 --- /dev/null +++ b/kubernetes/scripts/lint_manifests.py @@ -0,0 +1,28 @@ +import os +import sys +from pathlib import Path + + +def get_all_manifests() -> list[str]: + """Return a list of file paths that look like k8s manifests.""" + likely_manifests = [] + for file in Path.cwd().glob("**/*.yaml"): + if file.name in ("secrets.yaml", "ghcr-pull-secrets.yaml"): + # Don't lint secret files as they're git-crypted + continue + if file.stem.startswith("_"): + # Ignore manifests that 
start with _ + continue + if "apiVersion:" not in file.read_text(): + # Probably not a manifest + continue + likely_manifests.append(str(file)) + return likely_manifests + + + if __name__ == "__main__": + if sys.argv[1] == "diff": + arg = " -f ".join([""] + get_all_manifests()) + os.system("kubectl diff" + arg) # noqa: S605 + elif sys.argv[1] == "find": + print("\n".join(get_all_manifests())) # noqa: T201
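The `alerting` section of `prometheus-config.yaml` above discovers Alertmanager through a DNS A-record lookup of `alertmanager-sd.monitoring.svc.cluster.local` on port 9093. For that lookup to return one record per Alertmanager replica, the name would normally resolve via a headless Service; the actual manifest lives with the Alertmanager resources rather than in this diff, but a minimal sketch of such a Service (the `app: alertmanager` selector is an assumption) could look like:

```yaml
apiVersion: v1
kind: Service
metadata:
  name: alertmanager-sd
  namespace: monitoring
spec:
  clusterIP: None          # headless: DNS returns an A record per ready pod instead of a cluster IP
  selector:
    app: alertmanager      # assumed label; the real selector is defined alongside Alertmanager
  ports:
    - port: 9093           # matches the port given in the dns_sd_configs block
      targetPort: 9093
```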
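The `kubernetes-service-endpoints` job implements the annotation contract described in its comment block: endpoints are only scraped when their Service carries `prometheus.io/scrape: "true"`, and the `prometheus.io/scheme`, `prometheus.io/path` and `prometheus.io/port` annotations override the defaults. As an illustration only (the service name, namespace and ports below are made up), a Service opting in would look roughly like:

```yaml
apiVersion: v1
kind: Service
metadata:
  name: example-app               # hypothetical service, not part of this diff
  namespace: default
  annotations:
    prometheus.io/scrape: "true"    # required for the kubernetes-service-endpoints job to keep the target
    prometheus.io/port: "8080"      # scrape this port rather than the service port
    prometheus.io/path: "/metrics"  # the default path, shown only for completeness
spec:
  selector:
    app: example-app
  ports:
    - port: 80
      targetPort: 8080
```

A Service annotated with `prometheus.io/probe: "true"` is instead picked up by the `kubernetes-services` job, which rewrites the target address to the in-cluster `blackbox` exporter and probes the service with the `http_2xx` module.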
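The `kubernetes-pods` job applies the same convention at the pod level: pods are scraped only when they carry `prometheus.io/scrape: "true"`, with `prometheus.io/path` and `prometheus.io/port` overriding the defaults. A minimal, purely illustrative Deployment (all names and the image are placeholders) annotated for this job:

```yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: example-worker           # hypothetical workload, not part of this diff
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: example-worker
  template:
    metadata:
      labels:
        app: example-worker
      annotations:
        prometheus.io/scrape: "true"   # opt the pod into the kubernetes-pods job
        prometheus.io/port: "9100"     # scrape this port instead of the pod's declared ports
    spec:
      containers:
        - name: worker
          image: ghcr.io/example/worker:latest   # placeholder image
          ports:
            - containerPort: 9100
```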
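As a usage note for `kubernetes/scripts/lint_manifests.py` (assuming Python 3.9+ for the `list[str]` annotation and `kubectl` on the PATH): `python3 kubernetes/scripts/lint_manifests.py find` prints the manifests it detects under the current working directory, and `python3 kubernetes/scripts/lint_manifests.py diff` passes each of them to `kubectl diff -f`.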