diff options
author | 2021-04-10 13:10:51 -0700 | |
---|---|---|
committer | 2021-04-11 11:33:29 -0700 | |
commit | 150713eaf7b5c61667385f3e46587aca059191f4 (patch) | |
tree | 40582cc5870ac005463bdbdd660d6415d93c2476 | |
parent | Filtering: remove invisible characters before checking filters (diff) |
Filtering: use a more thorough regex for zalgo & invisible chars
Install the regex package to take advantage of its support for Unicode
categories.
-rw-r--r-- | Pipfile | 1 | ||||
-rw-r--r-- | Pipfile.lock | 89 | ||||
-rw-r--r-- | bot/exts/filters/filtering.py | 36 |
3 files changed, 91 insertions, 35 deletions
@@ -25,6 +25,7 @@ more_itertools = "~=8.2" python-dateutil = "~=2.8" python-frontmatter = "~=1.0.0" pyyaml = "~=5.1" +regex = "==2021.4.4" requests = "~=2.22" sentry-sdk = "~=0.19" sphinx = "~=2.2" diff --git a/Pipfile.lock b/Pipfile.lock index cbec48ef0..d6792ac35 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "91b5639198b35740611e7ac923cfc262e5897b8cbc3ca243dc98335705804ba7" + "sha256": "fc3421fc4c95d73b620f2b8b0a7dea288d4fc559e0d288ed4ad6cf4eb312f630" }, "pipfile-spec": 6, "requires": { @@ -221,6 +221,7 @@ "sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b", "sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2" ], + "index": "pypi", "markers": "sys_platform == 'win32'", "version": "==0.4.4" }, @@ -250,11 +251,11 @@ }, "docutils": { "hashes": [ - "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af", - "sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc" + "sha256:a71042bb7207c03d5647f280427f14bfbd1a65c9eb84f4b341d85fafb6bb4bdf", + "sha256:e2ffeea817964356ba4470efba7c2f42b6b0de0b04e66378507e3e2504bbff4c" ], "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", - "version": "==0.16" + "version": "==0.17" }, "emoji": { "hashes": [ @@ -605,6 +606,15 @@ "markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'", "version": "==2.4.7" }, + "pyreadline": { + "hashes": [ + "sha256:4530592fc2e85b25b1a9f79664433da09237c1a270e4d78ea5aa3a2c7229e2d1", + "sha256:65540c21bfe14405a3a77e4c085ecfce88724743a4ead47c66b84defcf82c32e", + "sha256:9ce5fa65b8992dfa373bddc5b6e0864ead8f291c94fbfec05fbd5c836162e67b" + ], + "markers": "sys_platform == 'win32'", + "version": "==2.1" + }, "python-dateutil": { "hashes": [ "sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c", @@ -671,6 +681,53 @@ "markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'", "version": "==3.5.3" }, + "regex": { + "hashes": [ + "sha256:01afaf2ec48e196ba91b37451aa353cb7eda77efe518e481707e0515025f0cd5", + "sha256:11d773d75fa650cd36f68d7ca936e3c7afaae41b863b8c387a22aaa78d3c5c79", + "sha256:18c071c3eb09c30a264879f0d310d37fe5d3a3111662438889ae2eb6fc570c31", + "sha256:1e1c20e29358165242928c2de1482fb2cf4ea54a6a6dea2bd7a0e0d8ee321500", + "sha256:281d2fd05555079448537fe108d79eb031b403dac622621c78944c235f3fcf11", + "sha256:314d66636c494ed9c148a42731b3834496cc9a2c4251b1661e40936814542b14", + "sha256:32e65442138b7b76dd8173ffa2cf67356b7bc1768851dded39a7a13bf9223da3", + "sha256:339456e7d8c06dd36a22e451d58ef72cef293112b559010db3d054d5560ef439", + "sha256:3916d08be28a1149fb97f7728fca1f7c15d309a9f9682d89d79db75d5e52091c", + "sha256:3a9cd17e6e5c7eb328517969e0cb0c3d31fd329298dd0c04af99ebf42e904f82", + "sha256:47bf5bf60cf04d72bf6055ae5927a0bd9016096bf3d742fa50d9bf9f45aa0711", + "sha256:4c46e22a0933dd783467cf32b3516299fb98cfebd895817d685130cc50cd1093", + "sha256:4c557a7b470908b1712fe27fb1ef20772b78079808c87d20a90d051660b1d69a", + "sha256:52ba3d3f9b942c49d7e4bc105bb28551c44065f139a65062ab7912bef10c9afb", + "sha256:563085e55b0d4fb8f746f6a335893bda5c2cef43b2f0258fe1020ab1dd874df8", + "sha256:598585c9f0af8374c28edd609eb291b5726d7cbce16be6a8b95aa074d252ee17", + "sha256:619d71c59a78b84d7f18891fe914446d07edd48dc8328c8e149cbe0929b4e000", + "sha256:67bdb9702427ceddc6ef3dc382455e90f785af4c13d495f9626861763ee13f9d", + "sha256:6d1b01031dedf2503631d0903cb563743f397ccaf6607a5e3b19a3d76fc10480", + "sha256:741a9647fcf2e45f3a1cf0e24f5e17febf3efe8d4ba1281dcc3aa0459ef424dc", + "sha256:7c2a1af393fcc09e898beba5dd59196edaa3116191cc7257f9224beaed3e1aa0", + "sha256:7d9884d86dd4dd489e981d94a65cd30d6f07203d90e98f6f657f05170f6324c9", + "sha256:90f11ff637fe8798933fb29f5ae1148c978cccb0452005bf4c69e13db951e765", + "sha256:919859aa909429fb5aa9cf8807f6045592c85ef56fdd30a9a3747e513db2536e", + "sha256:96fcd1888ab4d03adfc9303a7b3c0bd78c5412b2bfbe76db5b56d9eae004907a", + "sha256:97f29f57d5b84e73fbaf99ab3e26134e6687348e95ef6b48cfd2c06807005a07", + "sha256:980d7be47c84979d9136328d882f67ec5e50008681d94ecc8afa8a65ed1f4a6f", + "sha256:a91aa8619b23b79bcbeb37abe286f2f408d2f2d6f29a17237afda55bb54e7aac", + "sha256:ade17eb5d643b7fead300a1641e9f45401c98eee23763e9ed66a43f92f20b4a7", + "sha256:b9c3db21af35e3b3c05764461b262d6f05bbca08a71a7849fd79d47ba7bc33ed", + "sha256:bd28bc2e3a772acbb07787c6308e00d9626ff89e3bfcdebe87fa5afbfdedf968", + "sha256:bf5824bfac591ddb2c1f0a5f4ab72da28994548c708d2191e3b87dd207eb3ad7", + "sha256:c0502c0fadef0d23b128605d69b58edb2c681c25d44574fc673b0e52dce71ee2", + "sha256:c38c71df845e2aabb7fb0b920d11a1b5ac8526005e533a8920aea97efb8ec6a4", + "sha256:ce15b6d103daff8e9fee13cf7f0add05245a05d866e73926c358e871221eae87", + "sha256:d3029c340cfbb3ac0a71798100ccc13b97dddf373a4ae56b6a72cf70dfd53bc8", + "sha256:e512d8ef5ad7b898cdb2d8ee1cb09a8339e4f8be706d27eaa180c2f177248a10", + "sha256:e8e5b509d5c2ff12f8418006d5a90e9436766133b564db0abaec92fd27fcee29", + "sha256:ee54ff27bf0afaf4c3b3a62bcd016c12c3fdb4ec4f413391a90bd38bc3624605", + "sha256:fa4537fb4a98fe8fde99626e4681cc644bdcf2a795038533f9f711513a862ae6", + "sha256:fd45ff9293d9274c5008a2054ecef86a9bfe819a67c7be1afb65e69b405b3042" + ], + "index": "pypi", + "version": "==2021.4.4" + }, "requests": { "hashes": [ "sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804", @@ -976,11 +1033,11 @@ }, "flake8-annotations": { "hashes": [ - "sha256:40a4d504cdf64126ea0bdca39edab1608bc6d515e96569b7e7c3c59c84f66c36", - "sha256:eabbfb2dd59ae0e9835f509f930e79cd99fa4ff1026fe6ca073503a57407037c" + "sha256:0d6cd2e770b5095f09689c9d84cc054c51b929c41a68969ea1beb4b825cac515", + "sha256:d10c4638231f8a50c0a597c4efce42bd7b7d85df4f620a0ddaca526138936a4f" ], "index": "pypi", - "version": "==2.6.1" + "version": "==2.6.2" }, "flake8-bugbear": { "hashes": [ @@ -1038,11 +1095,11 @@ }, "identify": { "hashes": [ - "sha256:43cb1965e84cdd247e875dec6d13332ef5be355ddc16776396d98089b9053d87", - "sha256:c7c0f590526008911ccc5ceee6ed7b085cbc92f7b6591d0ee5913a130ad64034" + "sha256:398cb92a7599da0b433c65301a1b62b9b1f4bb8248719b84736af6c0b22289d6", + "sha256:4537474817e0bbb8cea3e5b7504b7de6d44e3f169a90846cbc6adb0fc8294502" ], "markers": "python_full_version >= '3.6.1'", - "version": "==2.2.2" + "version": "==2.2.3" }, "idna": { "hashes": [ @@ -1061,10 +1118,10 @@ }, "nodeenv": { "hashes": [ - "sha256:5304d424c529c997bc888453aeaa6362d242b6b4631e90f3d4bf1b290f1c84a9", - "sha256:ab45090ae383b716c4ef89e690c41ff8c2b257b85b309f01f3654df3d084bd7c" + "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b", + "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7" ], - "version": "==1.5.0" + "version": "==1.6.0" }, "pep8-naming": { "hashes": [ @@ -1076,11 +1133,11 @@ }, "pre-commit": { "hashes": [ - "sha256:94c82f1bf5899d56edb1d926732f4e75a7df29a0c8c092559c77420c9d62428b", - "sha256:de55c5c72ce80d79106e48beb1b54104d16495ce7f95b0c7b13d4784193a00af" + "sha256:029d53cb83c241fe7d66eeee1e24db426f42c858f15a38d20bcefd8d8e05c9da", + "sha256:46b6ffbab37986c47d0a35e40906ae029376deed89a0eb2e446fb6e67b220427" ], "index": "pypi", - "version": "==2.11.1" + "version": "==2.12.0" }, "pycodestyle": { "hashes": [ diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py index 1ae2610aa..464732453 100644 --- a/bot/exts/filters/filtering.py +++ b/bot/exts/filters/filtering.py @@ -6,6 +6,7 @@ from typing import Any, Dict, List, Mapping, NamedTuple, Optional, Tuple, Union import dateutil import discord.errors +import regex from async_rediscache import RedisCache from dateutil.relativedelta import relativedelta from discord import Colour, HTTPException, Member, Message, NotFound, TextChannel @@ -34,7 +35,11 @@ CODE_BLOCK_RE = re.compile( EVERYONE_PING_RE = re.compile(rf"@everyone|<@&{Guild.id}>|@here") SPOILER_RE = re.compile(r"(\|\|.+?\|\|)", re.DOTALL) URL_RE = re.compile(r"(https?://[^\s]+)", flags=re.IGNORECASE) -ZALGO_RE = re.compile(r"[\u0300-\u036F\u0489]") + +# Exclude variation selectors from zalgo because they're actually invisible. +VARIATION_SELECTORS = r"\uFE00-\uFE0F\U000E0100-\U000E01EF" +INVISIBLE_RE = regex.compile(rf"[{VARIATION_SELECTORS}\p{{UNASSIGNED}}\p{{FORMAT}}\p{{CONTROL}}--\s]", regex.V1) +ZALGO_RE = regex.compile(rf"[\p{{NONSPACING MARK}}\p{{ENCLOSING MARK}}--[{VARIATION_SELECTORS}]]", regex.V1) # Other constants. DAYS_BETWEEN_ALERTS = 3 @@ -178,7 +183,7 @@ class Filtering(Cog): def get_name_matches(self, name: str) -> List[re.Match]: """Check bad words from passed string (name). Return list of matches.""" - name = self.remove_invisible_chars(name) + name = self.clean_input(name) matches = [] watchlist_patterns = self._get_filterlist_items('filter_token', allowed=False) for pattern in watchlist_patterns: @@ -445,7 +450,7 @@ class Filtering(Cog): if SPOILER_RE.search(text): text = self._expand_spoilers(text) - text = self.remove_invisible_chars(text) + text = self.clean_input(text) # Make sure it's not a URL if URL_RE.search(text): @@ -465,7 +470,7 @@ class Filtering(Cog): Second return value is a reason of URL blacklisting (can be None). """ - text = self.remove_invisible_chars(text) + text = self.clean_input(text) if not URL_RE.search(text): return False, None @@ -496,7 +501,7 @@ class Filtering(Cog): Attempts to catch some of common ways to try to cheat the system. """ - text = self.remove_invisible_chars(text) + text = self.clean_input(text) # Remove backslashes to prevent escape character aroundfuckery like # discord\.gg/gdudes-pony-farm @@ -635,20 +640,13 @@ class Filtering(Cog): log.info(f"Deleted the offensive message with id {msg['id']}.") @staticmethod - def remove_invisible_chars(string: str) -> str: - """ - Remove invisible characters from `string`. - - Removed characters: - - - mongolian vowel separator - - zero width space - - zero width non-joiner - - zero width joiner - - word joiner - - zero width non-breaking space - """ - return re.sub("[\u180e\u200b\u200c\u200d\u2060\ufeff]", "", string) + def clean_input(string: str) -> str: + """Remove zalgo and invisible characters from `string`.""" + # For future consideration: remove characters in the Mc, Sk, and Lm categories too. + # Can be normalised with form C to merge char + combining char into a single char to avoid + # removing legit diacritics, but this would open up a way to bypass filters. + no_zalgo = ZALGO_RE.sub("", string) + return INVISIBLE_RE.sub("", no_zalgo) def setup(bot: Bot) -> None: |