about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: MarkKoz <[email protected]> 2021-04-10 13:10:51 -0700
committer: MarkKoz <[email protected]> 2021-04-11 11:33:29 -0700
commit: 150713eaf7b5c61667385f3e46587aca059191f4 (patch)
tree: 40582cc5870ac005463bdbdd660d6415d93c2476
parent: Filtering: remove invisible characters before checking filters (diff)
Filtering: use a more thorough regex for zalgo & invisible chars
Install the regex package to take advantage of its support for Unicode categories.
-rw-r--r-- Pipfile 1
-rw-r--r-- Pipfile.lock 89
-rw-r--r-- bot/exts/filters/filtering.py 36
3 files changed, 91 insertions, 35 deletions
diff --git a/Pipfile b/Pipfile
index 7fab198f3..2ac5645dd 100644
--- a/Pipfile
+++ b/Pipfile
@@ -25,6 +25,7 @@ more_itertools = "~=8.2"
python-dateutil = "~=2.8"
python-frontmatter = "~=1.0.0"
pyyaml = "~=5.1"
+regex = "==2021.4.4"
requests = "~=2.22"
sentry-sdk = "~=0.19"
sphinx = "~=2.2"
diff --git a/Pipfile.lock b/Pipfile.lock
index cbec48ef0..d6792ac35 100644
--- a/Pipfile.lock
+++ b/Pipfile.lock
@@ -1,7 +1,7 @@
{
"_meta": {
"hash": {
- "sha256": "91b5639198b35740611e7ac923cfc262e5897b8cbc3ca243dc98335705804ba7"
+ "sha256": "fc3421fc4c95d73b620f2b8b0a7dea288d4fc559e0d288ed4ad6cf4eb312f630"
},
"pipfile-spec": 6,
"requires": {
@@ -221,6 +221,7 @@
"sha256:5941b2b48a20143d2267e95b1c2a7603ce057ee39fd88e7329b0c292aa16869b",
"sha256:9f47eda37229f68eee03b24b9748937c7dc3868f906e8ba69fbcbdd3bc5dc3e2"
],
+ "index": "pypi",
"markers": "sys_platform == 'win32'",
"version": "==0.4.4"
},
@@ -250,11 +251,11 @@
},
"docutils": {
"hashes": [
- "sha256:0c5b78adfbf7762415433f5515cd5c9e762339e23369dbe8000d84a4bf4ab3af",
- "sha256:c2de3a60e9e7d07be26b7f2b00ca0309c207e06c100f9cc2a94931fc75a478fc"
+ "sha256:a71042bb7207c03d5647f280427f14bfbd1a65c9eb84f4b341d85fafb6bb4bdf",
+ "sha256:e2ffeea817964356ba4470efba7c2f42b6b0de0b04e66378507e3e2504bbff4c"
],
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
- "version": "==0.16"
+ "version": "==0.17"
},
"emoji": {
"hashes": [
@@ -605,6 +606,15 @@
"markers": "python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3'",
"version": "==2.4.7"
},
+ "pyreadline": {
+ "hashes": [
+ "sha256:4530592fc2e85b25b1a9f79664433da09237c1a270e4d78ea5aa3a2c7229e2d1",
+ "sha256:65540c21bfe14405a3a77e4c085ecfce88724743a4ead47c66b84defcf82c32e",
+ "sha256:9ce5fa65b8992dfa373bddc5b6e0864ead8f291c94fbfec05fbd5c836162e67b"
+ ],
+ "markers": "sys_platform == 'win32'",
+ "version": "==2.1"
+ },
"python-dateutil": {
"hashes": [
"sha256:73ebfe9dbf22e832286dafa60473e4cd239f8592f699aa5adaf10050e6e1823c",
@@ -671,6 +681,53 @@
"markers": "python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4'",
"version": "==3.5.3"
},
+ "regex": {
+ "hashes": [
+ "sha256:01afaf2ec48e196ba91b37451aa353cb7eda77efe518e481707e0515025f0cd5",
+ "sha256:11d773d75fa650cd36f68d7ca936e3c7afaae41b863b8c387a22aaa78d3c5c79",
+ "sha256:18c071c3eb09c30a264879f0d310d37fe5d3a3111662438889ae2eb6fc570c31",
+ "sha256:1e1c20e29358165242928c2de1482fb2cf4ea54a6a6dea2bd7a0e0d8ee321500",
+ "sha256:281d2fd05555079448537fe108d79eb031b403dac622621c78944c235f3fcf11",
+ "sha256:314d66636c494ed9c148a42731b3834496cc9a2c4251b1661e40936814542b14",
+ "sha256:32e65442138b7b76dd8173ffa2cf67356b7bc1768851dded39a7a13bf9223da3",
+ "sha256:339456e7d8c06dd36a22e451d58ef72cef293112b559010db3d054d5560ef439",
+ "sha256:3916d08be28a1149fb97f7728fca1f7c15d309a9f9682d89d79db75d5e52091c",
+ "sha256:3a9cd17e6e5c7eb328517969e0cb0c3d31fd329298dd0c04af99ebf42e904f82",
+ "sha256:47bf5bf60cf04d72bf6055ae5927a0bd9016096bf3d742fa50d9bf9f45aa0711",
+ "sha256:4c46e22a0933dd783467cf32b3516299fb98cfebd895817d685130cc50cd1093",
+ "sha256:4c557a7b470908b1712fe27fb1ef20772b78079808c87d20a90d051660b1d69a",
+ "sha256:52ba3d3f9b942c49d7e4bc105bb28551c44065f139a65062ab7912bef10c9afb",
+ "sha256:563085e55b0d4fb8f746f6a335893bda5c2cef43b2f0258fe1020ab1dd874df8",
+ "sha256:598585c9f0af8374c28edd609eb291b5726d7cbce16be6a8b95aa074d252ee17",
+ "sha256:619d71c59a78b84d7f18891fe914446d07edd48dc8328c8e149cbe0929b4e000",
+ "sha256:67bdb9702427ceddc6ef3dc382455e90f785af4c13d495f9626861763ee13f9d",
+ "sha256:6d1b01031dedf2503631d0903cb563743f397ccaf6607a5e3b19a3d76fc10480",
+ "sha256:741a9647fcf2e45f3a1cf0e24f5e17febf3efe8d4ba1281dcc3aa0459ef424dc",
+ "sha256:7c2a1af393fcc09e898beba5dd59196edaa3116191cc7257f9224beaed3e1aa0",
+ "sha256:7d9884d86dd4dd489e981d94a65cd30d6f07203d90e98f6f657f05170f6324c9",
+ "sha256:90f11ff637fe8798933fb29f5ae1148c978cccb0452005bf4c69e13db951e765",
+ "sha256:919859aa909429fb5aa9cf8807f6045592c85ef56fdd30a9a3747e513db2536e",
+ "sha256:96fcd1888ab4d03adfc9303a7b3c0bd78c5412b2bfbe76db5b56d9eae004907a",
+ "sha256:97f29f57d5b84e73fbaf99ab3e26134e6687348e95ef6b48cfd2c06807005a07",
+ "sha256:980d7be47c84979d9136328d882f67ec5e50008681d94ecc8afa8a65ed1f4a6f",
+ "sha256:a91aa8619b23b79bcbeb37abe286f2f408d2f2d6f29a17237afda55bb54e7aac",
+ "sha256:ade17eb5d643b7fead300a1641e9f45401c98eee23763e9ed66a43f92f20b4a7",
+ "sha256:b9c3db21af35e3b3c05764461b262d6f05bbca08a71a7849fd79d47ba7bc33ed",
+ "sha256:bd28bc2e3a772acbb07787c6308e00d9626ff89e3bfcdebe87fa5afbfdedf968",
+ "sha256:bf5824bfac591ddb2c1f0a5f4ab72da28994548c708d2191e3b87dd207eb3ad7",
+ "sha256:c0502c0fadef0d23b128605d69b58edb2c681c25d44574fc673b0e52dce71ee2",
+ "sha256:c38c71df845e2aabb7fb0b920d11a1b5ac8526005e533a8920aea97efb8ec6a4",
+ "sha256:ce15b6d103daff8e9fee13cf7f0add05245a05d866e73926c358e871221eae87",
+ "sha256:d3029c340cfbb3ac0a71798100ccc13b97dddf373a4ae56b6a72cf70dfd53bc8",
+ "sha256:e512d8ef5ad7b898cdb2d8ee1cb09a8339e4f8be706d27eaa180c2f177248a10",
+ "sha256:e8e5b509d5c2ff12f8418006d5a90e9436766133b564db0abaec92fd27fcee29",
+ "sha256:ee54ff27bf0afaf4c3b3a62bcd016c12c3fdb4ec4f413391a90bd38bc3624605",
+ "sha256:fa4537fb4a98fe8fde99626e4681cc644bdcf2a795038533f9f711513a862ae6",
+ "sha256:fd45ff9293d9274c5008a2054ecef86a9bfe819a67c7be1afb65e69b405b3042"
+ ],
+ "index": "pypi",
+ "version": "==2021.4.4"
+ },
"requests": {
"hashes": [
"sha256:27973dd4a904a4f13b263a19c866c13b92a39ed1c964655f025f3f8d3d75b804",
@@ -976,11 +1033,11 @@
},
"flake8-annotations": {
"hashes": [
- "sha256:40a4d504cdf64126ea0bdca39edab1608bc6d515e96569b7e7c3c59c84f66c36",
- "sha256:eabbfb2dd59ae0e9835f509f930e79cd99fa4ff1026fe6ca073503a57407037c"
+ "sha256:0d6cd2e770b5095f09689c9d84cc054c51b929c41a68969ea1beb4b825cac515",
+ "sha256:d10c4638231f8a50c0a597c4efce42bd7b7d85df4f620a0ddaca526138936a4f"
],
"index": "pypi",
- "version": "==2.6.1"
+ "version": "==2.6.2"
},
"flake8-bugbear": {
"hashes": [
@@ -1038,11 +1095,11 @@
},
"identify": {
"hashes": [
- "sha256:43cb1965e84cdd247e875dec6d13332ef5be355ddc16776396d98089b9053d87",
- "sha256:c7c0f590526008911ccc5ceee6ed7b085cbc92f7b6591d0ee5913a130ad64034"
+ "sha256:398cb92a7599da0b433c65301a1b62b9b1f4bb8248719b84736af6c0b22289d6",
+ "sha256:4537474817e0bbb8cea3e5b7504b7de6d44e3f169a90846cbc6adb0fc8294502"
],
"markers": "python_full_version >= '3.6.1'",
- "version": "==2.2.2"
+ "version": "==2.2.3"
},
"idna": {
"hashes": [
@@ -1061,10 +1118,10 @@
},
"nodeenv": {
"hashes": [
- "sha256:5304d424c529c997bc888453aeaa6362d242b6b4631e90f3d4bf1b290f1c84a9",
- "sha256:ab45090ae383b716c4ef89e690c41ff8c2b257b85b309f01f3654df3d084bd7c"
+ "sha256:3ef13ff90291ba2a4a7a4ff9a979b63ffdd00a464dbe04acf0ea6471517a4c2b",
+ "sha256:621e6b7076565ddcacd2db0294c0381e01fd28945ab36bcf00f41c5daf63bef7"
],
- "version": "==1.5.0"
+ "version": "==1.6.0"
},
"pep8-naming": {
"hashes": [
@@ -1076,11 +1133,11 @@
},
"pre-commit": {
"hashes": [
- "sha256:94c82f1bf5899d56edb1d926732f4e75a7df29a0c8c092559c77420c9d62428b",
- "sha256:de55c5c72ce80d79106e48beb1b54104d16495ce7f95b0c7b13d4784193a00af"
+ "sha256:029d53cb83c241fe7d66eeee1e24db426f42c858f15a38d20bcefd8d8e05c9da",
+ "sha256:46b6ffbab37986c47d0a35e40906ae029376deed89a0eb2e446fb6e67b220427"
],
"index": "pypi",
- "version": "==2.11.1"
+ "version": "==2.12.0"
},
"pycodestyle": {
"hashes": [
diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py
index 1ae2610aa..464732453 100644
--- a/bot/exts/filters/filtering.py
+++ b/bot/exts/filters/filtering.py
@@ -6,6 +6,7 @@ from typing import Any, Dict, List, Mapping, NamedTuple, Optional, Tuple, Union
import dateutil
import discord.errors
+import regex
from async_rediscache import RedisCache
from dateutil.relativedelta import relativedelta
from discord import Colour, HTTPException, Member, Message, NotFound, TextChannel
@@ -34,7 +35,11 @@ CODE_BLOCK_RE = re.compile(
EVERYONE_PING_RE = re.compile(rf"@everyone|<@&{Guild.id}>|@here")
SPOILER_RE = re.compile(r"(\|\|.+?\|\|)", re.DOTALL)
URL_RE = re.compile(r"(https?://[^\s]+)", flags=re.IGNORECASE)
-ZALGO_RE = re.compile(r"[\u0300-\u036F\u0489]")
+
+# Exclude variation selectors from zalgo because they're actually invisible.
+VARIATION_SELECTORS = r"\uFE00-\uFE0F\U000E0100-\U000E01EF"
+INVISIBLE_RE = regex.compile(rf"[{VARIATION_SELECTORS}\p{{UNASSIGNED}}\p{{FORMAT}}\p{{CONTROL}}--\s]", regex.V1)
+ZALGO_RE = regex.compile(rf"[\p{{NONSPACING MARK}}\p{{ENCLOSING MARK}}--[{VARIATION_SELECTORS}]]", regex.V1)
# Other constants.
DAYS_BETWEEN_ALERTS = 3
@@ -178,7 +183,7 @@ class Filtering(Cog):
def get_name_matches(self, name: str) -> List[re.Match]:
"""Check bad words from passed string (name). Return list of matches."""
- name = self.remove_invisible_chars(name)
+ name = self.clean_input(name)
matches = []
watchlist_patterns = self._get_filterlist_items('filter_token', allowed=False)
for pattern in watchlist_patterns:
@@ -445,7 +450,7 @@ class Filtering(Cog):
if SPOILER_RE.search(text):
text = self._expand_spoilers(text)
- text = self.remove_invisible_chars(text)
+ text = self.clean_input(text)
# Make sure it's not a URL
if URL_RE.search(text):
@@ -465,7 +470,7 @@ class Filtering(Cog):
Second return value is a reason of URL blacklisting (can be None).
"""
- text = self.remove_invisible_chars(text)
+ text = self.clean_input(text)
if not URL_RE.search(text):
return False, None
@@ -496,7 +501,7 @@ class Filtering(Cog):
Attempts to catch some of common ways to try to cheat the system.
"""
- text = self.remove_invisible_chars(text)
+ text = self.clean_input(text)
# Remove backslashes to prevent escape character aroundfuckery like
# discord\.gg/gdudes-pony-farm
@@ -635,20 +640,13 @@ class Filtering(Cog):
log.info(f"Deleted the offensive message with id {msg['id']}.")
@staticmethod
- def remove_invisible_chars(string: str) -> str:
- """
- Remove invisible characters from `string`.
-
- Removed characters:
-
- - mongolian vowel separator
- - zero width space
- - zero width non-joiner
- - zero width joiner
- - word joiner
- - zero width non-breaking space
- """
- return re.sub("[\u180e\u200b\u200c\u200d\u2060\ufeff]", "", string)
+ def clean_input(string: str) -> str:
+ """Remove zalgo and invisible characters from `string`."""
+ # For future consideration: remove characters in the Mc, Sk, and Lm categories too.
+ # Can be normalised with form C to merge char + combining char into a single char to avoid
+ # removing legit diacritics, but this would open up a way to bypass filters.
+ no_zalgo = ZALGO_RE.sub("", string)
+ return INVISIBLE_RE.sub("", no_zalgo)
def setup(bot: Bot) -> None: