Merge pull request #450 from python-discord/rich-embed-false-positive-fix

Prevent false-positives of the rich embed filter
author: S. Co1 <[email protected]> 2019-09-24 13:35:28 -0400
committer: GitHub <[email protected]> 2019-09-24 13:35:28 -0400
commit: cea29d3600f83ffcf4cd4593024ae435c9d8979d (patch)
tree: ebcc8c52948189abd5da9048cf789507c1cfc48d
parent: Merge pull request #443 from Akarys42/master (diff)
parent: Merge branch 'master' into rich-embed-false-positive-fix (diff)
1 file changed, 33 insertions, 15 deletions
diff --git a/bot/cogs/filtering.py b/bot/cogs/filtering.py
index 9cd1b7203..bd8c6ed67 100644
--- a/bot/cogs/filtering.py
+++ b/bot/cogs/filtering.py
@@ -15,18 +15,26 @@ from bot.constants import (
 
 log = logging.getLogger(__name__)
 
-INVITE_RE = (
+INVITE_RE = re.compile(
     r"(?:discord(?:[\.,]|dot)gg|"                     # Could be discord.gg/
     r"discord(?:[\.,]|dot)com(?:\/|slash)invite|"     # or discord.com/invite/
     r"discordapp(?:[\.,]|dot)com(?:\/|slash)invite|"  # or discordapp.com/invite/
     r"discord(?:[\.,]|dot)me|"                        # or discord.me
     r"discord(?:[\.,]|dot)io"                         # or discord.io.
     r")(?:[\/]|slash)"                                # / or 'slash'
-    r"([a-zA-Z0-9]+)"                                 # the invite code itself
+    r"([a-zA-Z0-9]+)",                                # the invite code itself
+    flags=re.IGNORECASE
 )
 
-URL_RE = r"(https?://[^\s]+)"
-ZALGO_RE = r"[\u0300-\u036F\u0489]"
+URL_RE = re.compile(r"(https?://[^\s]+)", flags=re.IGNORECASE)
+ZALGO_RE = re.compile(r"[\u0300-\u036F\u0489]")
+
+WORD_WATCHLIST_PATTERNS = [
+    re.compile(fr'\b{expression}\b', flags=re.IGNORECASE) for expression in Filter.word_watchlist
+]
+TOKEN_WATCHLIST_PATTERNS = [
+    re.compile(fr'{expression}', flags=re.IGNORECASE) for expression in Filter.token_watchlist
+]
 
 
 class Filtering(Cog):
@@ -228,8 +236,8 @@ class Filtering(Cog):
 
         Only matches words with boundaries before and after the expression.
         """
-        for expression in Filter.word_watchlist:
-            if re.search(fr"\b{expression}\b", text, re.IGNORECASE):
+        for regex_pattern in WORD_WATCHLIST_PATTERNS:
+            if regex_pattern.search(text):
                 return True
 
         return False
@@ -241,11 +249,11 @@ class Filtering(Cog):
 
         This will match the expression even if it does not have boundaries before and after.
         """
-        for expression in Filter.token_watchlist:
-            if re.search(fr"{expression}", text, re.IGNORECASE):
+        for regex_pattern in TOKEN_WATCHLIST_PATTERNS:
+            if regex_pattern.search(text):
 
                 # Make sure it's not a URL
-                if not re.search(URL_RE, text, re.IGNORECASE):
+                if not URL_RE.search(text):
                     return True
 
         return False
@@ -253,7 +261,7 @@ class Filtering(Cog):
     @staticmethod
     async def _has_urls(text: str) -> bool:
         """Returns True if the text contains one of the blacklisted URLs from the config file."""
-        if not re.search(URL_RE, text, re.IGNORECASE):
+        if not URL_RE.search(text):
             return False
 
         text = text.lower()
@@ -271,7 +279,7 @@ class Filtering(Cog):
 
         Zalgo range is \u0300 – \u036F and \u0489.
         """
-        return bool(re.search(ZALGO_RE, text))
+        return bool(ZALGO_RE.search(text))
 
     async def _has_invites(self, text: str) -> Union[dict, bool]:
         """
@@ -286,7 +294,7 @@ class Filtering(Cog):
         # discord\.gg/gdudes-pony-farm
         text = text.replace("\\", "")
 
-        invites = re.findall(INVITE_RE, text, re.IGNORECASE)
+        invites = INVITE_RE.findall(text)
         invite_data = dict()
         for invite in invites:
             if invite in invite_data:
@@ -323,11 +331,21 @@ class Filtering(Cog):
 
     @staticmethod
     async def _has_rich_embed(msg: Message) -> bool:
-        """Returns True if any of the embeds in the message are of type 'rich', but are not twitter embeds."""
+        """Determines if `msg` contains any rich embeds not auto-generated from a URL."""
         if msg.embeds:
             for embed in msg.embeds:
-                if embed.type == "rich" and (not embed.url or "twitter.com" not in embed.url):
-                    return True
+                if embed.type == "rich":
+                    urls = URL_RE.findall(msg.content)
+                    if not embed.url or embed.url not in urls:
+                        # If `embed.url` does not exist or if `embed.url` is not part of the content
+                        # of the message, it's unlikely to be an auto-generated embed by Discord.
+                        return True
+                    else:
+                        log.trace(
+                            "Found a rich embed sent by a regular user account, "
+                            "but it was likely just an automatic URL embed."
+                        )
+                        return False
         return False
 
     async def notify_member(self, filtered_member: Member, reason: str, channel: TextChannel) -> None:
author	S. Co1 <[email protected]>	2019-09-24 13:35:28 -0400
committer	GitHub <[email protected]>	2019-09-24 13:35:28 -0400
commit	cea29d3600f83ffcf4cd4593024ae435c9d8979d (patch)
tree	ebcc8c52948189abd5da9048cf789507c1cfc48d
parent	Merge pull request #443 from Akarys42/master (diff)
parent	Merge branch 'master' into rich-embed-false-positive-fix (diff)