diff options
author | 2019-09-24 09:54:17 +0200 | |
---|---|---|
committer | 2019-09-24 10:42:06 +0200 | |
commit | 072a0a65f9fc964e6b9975b50a262e88d5c43b08 (patch) | |
tree | 0fb2d4352958aea7cba81ecaea6c4281f18035c3 | |
parent | Merge pull request #449 from python-discord/duration-converter-fix (diff) |
Prevent false-positives of the rich embed filter
https://github.com/python-discord/bot/issues/293
The rich embed filter is plagued by false positives now that Discord has
added more custom preview embeds for various websites. Since these
embeds have the `rich` type instead of the `link` type, they
triggered the filter we had in place.
This commit remedies that by using the existing URL regex pattern to
list all the URLs contained in the message content and then checking
if the embed's URL is a member of that list. If so, it's very likely
that the embed was auto-generated from that URL, so we should ignore
it. This approach deviates slightly from that outlined in #293.
This does increase the probability of a false-negative, as a "true"
user-generated rich embed could also have a URL that's contained in
the message body. However, I've checked most of the triggers we have
had in the past and none of the legitimate triggers would have been a
false-negative under the new rules. Therefore, I think it's very
reasonable to adopt this strategy.
In addition to the change in behavior of the rich embed filter, I
have also kaizened the existing regex patterns by compiling them at
load time. Since we check a lot of regex patterns for every message
received by the bot, this should be beneficial for performance.
-rw-r--r-- | bot/cogs/filtering.py | 48 |
1 file changed, 33 insertions, 15 deletions
diff --git a/bot/cogs/filtering.py b/bot/cogs/filtering.py index 9cd1b7203..bd8c6ed67 100644 --- a/bot/cogs/filtering.py +++ b/bot/cogs/filtering.py @@ -15,18 +15,26 @@ from bot.constants import ( log = logging.getLogger(__name__) -INVITE_RE = ( +INVITE_RE = re.compile( r"(?:discord(?:[\.,]|dot)gg|" # Could be discord.gg/ r"discord(?:[\.,]|dot)com(?:\/|slash)invite|" # or discord.com/invite/ r"discordapp(?:[\.,]|dot)com(?:\/|slash)invite|" # or discordapp.com/invite/ r"discord(?:[\.,]|dot)me|" # or discord.me r"discord(?:[\.,]|dot)io" # or discord.io. r")(?:[\/]|slash)" # / or 'slash' - r"([a-zA-Z0-9]+)" # the invite code itself + r"([a-zA-Z0-9]+)", # the invite code itself + flags=re.IGNORECASE ) -URL_RE = r"(https?://[^\s]+)" -ZALGO_RE = r"[\u0300-\u036F\u0489]" +URL_RE = re.compile(r"(https?://[^\s]+)", flags=re.IGNORECASE) +ZALGO_RE = re.compile(r"[\u0300-\u036F\u0489]") + +WORD_WATCHLIST_PATTERNS = [ + re.compile(fr'\b{expression}\b', flags=re.IGNORECASE) for expression in Filter.word_watchlist +] +TOKEN_WATCHLIST_PATTERNS = [ + re.compile(fr'{expression}', flags=re.IGNORECASE) for expression in Filter.token_watchlist +] class Filtering(Cog): @@ -228,8 +236,8 @@ class Filtering(Cog): Only matches words with boundaries before and after the expression. """ - for expression in Filter.word_watchlist: - if re.search(fr"\b{expression}\b", text, re.IGNORECASE): + for regex_pattern in WORD_WATCHLIST_PATTERNS: + if regex_pattern.search(text): return True return False @@ -241,11 +249,11 @@ class Filtering(Cog): This will match the expression even if it does not have boundaries before and after. 
""" - for expression in Filter.token_watchlist: - if re.search(fr"{expression}", text, re.IGNORECASE): + for regex_pattern in TOKEN_WATCHLIST_PATTERNS: + if regex_pattern.search(text): # Make sure it's not a URL - if not re.search(URL_RE, text, re.IGNORECASE): + if not URL_RE.search(text): return True return False @@ -253,7 +261,7 @@ class Filtering(Cog): @staticmethod async def _has_urls(text: str) -> bool: """Returns True if the text contains one of the blacklisted URLs from the config file.""" - if not re.search(URL_RE, text, re.IGNORECASE): + if not URL_RE.search(text): return False text = text.lower() @@ -271,7 +279,7 @@ class Filtering(Cog): Zalgo range is \u0300 – \u036F and \u0489. """ - return bool(re.search(ZALGO_RE, text)) + return bool(ZALGO_RE.search(text)) async def _has_invites(self, text: str) -> Union[dict, bool]: """ @@ -286,7 +294,7 @@ class Filtering(Cog): # discord\.gg/gdudes-pony-farm text = text.replace("\\", "") - invites = re.findall(INVITE_RE, text, re.IGNORECASE) + invites = INVITE_RE.findall(text) invite_data = dict() for invite in invites: if invite in invite_data: @@ -323,11 +331,21 @@ class Filtering(Cog): @staticmethod async def _has_rich_embed(msg: Message) -> bool: - """Returns True if any of the embeds in the message are of type 'rich', but are not twitter embeds.""" + """Determines if `msg` contains any rich embeds not auto-generated from a URL.""" if msg.embeds: for embed in msg.embeds: - if embed.type == "rich" and (not embed.url or "twitter.com" not in embed.url): - return True + if embed.type == "rich": + urls = URL_RE.findall(msg.content) + if not embed.url or embed.url not in urls: + # If `embed.url` does not exist or if `embed.url` is not part of the content + # of the message, it's unlikely to be an auto-generated embed by Discord. + return True + else: + log.trace( + "Found a rich embed sent by a regular user account, " + "but it was likely just an automatic URL embed." 
+ ) + return False return False async def notify_member(self, filtered_member: Member, reason: str, channel: TextChannel) -> None: |