Prevent false positives in the rich embed filter

https://github.com/python-discord/bot/issues/293 The rich embed filter is plagued by false positives now that Discord has added more custom preview embeds for various websites. Since these embeds have the `rich` type instead of the `link` type, they triggered the filter we had in place. This commit remedies that by using the existing URL regex pattern to list all the URLs contained in the message content and then checking whether the embed URL is a member of that list. If so, it's very likely that the embed was auto-generated from that URL, so we should ignore it. This approach deviates slightly from the one outlined in #293. It does increase the probability of a false negative, as a "true" user-generated rich embed could also have a URL that's contained in the message body. However, I've checked most of the triggers we have had in the past, and none of the legitimate triggers would have been a false negative under the new rules. Therefore, I think it's very reasonable to adopt this strategy. In addition to the change in behavior of the rich embed filter, I have also improved ("kaizened") the existing regex patterns by compiling them at load time. Since we check a lot of regex patterns for every message received by the bot, this should be beneficial for performance.
author: Sebastiaan Zeeff <[email protected]> 2019-09-24 09:54:17 +0200
committer: Sebastiaan Zeeff <[email protected]> 2019-09-24 10:42:06 +0200
commit: 072a0a65f9fc964e6b9975b50a262e88d5c43b08 (patch)
tree: 0fb2d4352958aea7cba81ecaea6c4281f18035c3
parent: Merge pull request #449 from python-discord/duration-converter-fix (diff)
1 files changed, 33 insertions, 15 deletions
diff --git a/bot/cogs/filtering.py b/bot/cogs/filtering.py
index 9cd1b7203..bd8c6ed67 100644
--- a/bot/cogs/filtering.py
+++ b/bot/cogs/filtering.py
@@ -15,18 +15,26 @@ from bot.constants import (
 
 log = logging.getLogger(__name__)
 
-INVITE_RE = (
+INVITE_RE = re.compile(
     r"(?:discord(?:[\.,]|dot)gg|"                     # Could be discord.gg/
     r"discord(?:[\.,]|dot)com(?:\/|slash)invite|"     # or discord.com/invite/
     r"discordapp(?:[\.,]|dot)com(?:\/|slash)invite|"  # or discordapp.com/invite/
     r"discord(?:[\.,]|dot)me|"                        # or discord.me
     r"discord(?:[\.,]|dot)io"                         # or discord.io.
     r")(?:[\/]|slash)"                                # / or 'slash'
-    r"([a-zA-Z0-9]+)"                                 # the invite code itself
+    r"([a-zA-Z0-9]+)",                                # the invite code itself
+    flags=re.IGNORECASE
 )
 
-URL_RE = r"(https?://[^\s]+)"
-ZALGO_RE = r"[\u0300-\u036F\u0489]"
+URL_RE = re.compile(r"(https?://[^\s]+)", flags=re.IGNORECASE)
+ZALGO_RE = re.compile(r"[\u0300-\u036F\u0489]")
+
+WORD_WATCHLIST_PATTERNS = [
+    re.compile(fr'\b{expression}\b', flags=re.IGNORECASE) for expression in Filter.word_watchlist
+]
+TOKEN_WATCHLIST_PATTERNS = [
+    re.compile(fr'{expression}', flags=re.IGNORECASE) for expression in Filter.token_watchlist
+]
 
 
 class Filtering(Cog):
@@ -228,8 +236,8 @@ class Filtering(Cog):
 
         Only matches words with boundaries before and after the expression.
         """
-        for expression in Filter.word_watchlist:
-            if re.search(fr"\b{expression}\b", text, re.IGNORECASE):
+        for regex_pattern in WORD_WATCHLIST_PATTERNS:
+            if regex_pattern.search(text):
                 return True
 
         return False
@@ -241,11 +249,11 @@ class Filtering(Cog):
 
         This will match the expression even if it does not have boundaries before and after.
         """
-        for expression in Filter.token_watchlist:
-            if re.search(fr"{expression}", text, re.IGNORECASE):
+        for regex_pattern in TOKEN_WATCHLIST_PATTERNS:
+            if regex_pattern.search(text):
 
                 # Make sure it's not a URL
-                if not re.search(URL_RE, text, re.IGNORECASE):
+                if not URL_RE.search(text):
                     return True
 
         return False
@@ -253,7 +261,7 @@ class Filtering(Cog):
     @staticmethod
     async def _has_urls(text: str) -> bool:
         """Returns True if the text contains one of the blacklisted URLs from the config file."""
-        if not re.search(URL_RE, text, re.IGNORECASE):
+        if not URL_RE.search(text):
             return False
 
         text = text.lower()
@@ -271,7 +279,7 @@ class Filtering(Cog):
 
         Zalgo range is \u0300 – \u036F and \u0489.
         """
-        return bool(re.search(ZALGO_RE, text))
+        return bool(ZALGO_RE.search(text))
 
     async def _has_invites(self, text: str) -> Union[dict, bool]:
         """
@@ -286,7 +294,7 @@ class Filtering(Cog):
         # discord\.gg/gdudes-pony-farm
         text = text.replace("\\", "")
 
-        invites = re.findall(INVITE_RE, text, re.IGNORECASE)
+        invites = INVITE_RE.findall(text)
         invite_data = dict()
         for invite in invites:
             if invite in invite_data:
@@ -323,11 +331,21 @@ class Filtering(Cog):
 
     @staticmethod
     async def _has_rich_embed(msg: Message) -> bool:
-        """Returns True if any of the embeds in the message are of type 'rich', but are not twitter embeds."""
+        """Determines if `msg` contains any rich embeds not auto-generated from a URL."""
         if msg.embeds:
             for embed in msg.embeds:
-                if embed.type == "rich" and (not embed.url or "twitter.com" not in embed.url):
-                    return True
+                if embed.type == "rich":
+                    urls = URL_RE.findall(msg.content)
+                    if not embed.url or embed.url not in urls:
+                        # If `embed.url` does not exist or if `embed.url` is not part of the content
+                        # of the message, it's unlikely to be an auto-generated embed by Discord.
+                        return True
+                    else:
+                        log.trace(
+                            "Found a rich embed sent by a regular user account, "
+                            "but it was likely just an automatic URL embed."
+                        )
+                        return False
         return False
 
     async def notify_member(self, filtered_member: Member, reason: str, channel: TextChannel) -> None:
author	Sebastiaan Zeeff <[email protected]>	2019-09-24 09:54:17 +0200
committer	Sebastiaan Zeeff <[email protected]>	2019-09-24 10:42:06 +0200
commit	072a0a65f9fc964e6b9975b50a262e88d5c43b08 (patch)
tree	0fb2d4352958aea7cba81ecaea6c4281f18035c3
parent	Merge pull request #449 from python-discord/duration-converter-fix (diff)