diff options
| author | 2019-09-24 13:35:28 -0400 | |
|---|---|---|
| committer | 2019-09-24 13:35:28 -0400 | |
| commit | cea29d3600f83ffcf4cd4593024ae435c9d8979d (patch) | |
| tree | ebcc8c52948189abd5da9048cf789507c1cfc48d | |
| parent | Merge pull request #443 from Akarys42/master (diff) | |
| parent | Merge branch 'master' into rich-embed-false-positive-fix (diff) | |
Merge pull request #450 from python-discord/rich-embed-false-positive-fix
Prevent false-positives of the rich embed filter
| -rw-r--r-- | bot/cogs/filtering.py | 48 | 
1 file changed, 33 insertions, 15 deletions
diff --git a/bot/cogs/filtering.py b/bot/cogs/filtering.py index 9cd1b7203..bd8c6ed67 100644 --- a/bot/cogs/filtering.py +++ b/bot/cogs/filtering.py @@ -15,18 +15,26 @@ from bot.constants import (  log = logging.getLogger(__name__) -INVITE_RE = ( +INVITE_RE = re.compile(      r"(?:discord(?:[\.,]|dot)gg|"                     # Could be discord.gg/      r"discord(?:[\.,]|dot)com(?:\/|slash)invite|"     # or discord.com/invite/      r"discordapp(?:[\.,]|dot)com(?:\/|slash)invite|"  # or discordapp.com/invite/      r"discord(?:[\.,]|dot)me|"                        # or discord.me      r"discord(?:[\.,]|dot)io"                         # or discord.io.      r")(?:[\/]|slash)"                                # / or 'slash' -    r"([a-zA-Z0-9]+)"                                 # the invite code itself +    r"([a-zA-Z0-9]+)",                                # the invite code itself +    flags=re.IGNORECASE  ) -URL_RE = r"(https?://[^\s]+)" -ZALGO_RE = r"[\u0300-\u036F\u0489]" +URL_RE = re.compile(r"(https?://[^\s]+)", flags=re.IGNORECASE) +ZALGO_RE = re.compile(r"[\u0300-\u036F\u0489]") + +WORD_WATCHLIST_PATTERNS = [ +    re.compile(fr'\b{expression}\b', flags=re.IGNORECASE) for expression in Filter.word_watchlist +] +TOKEN_WATCHLIST_PATTERNS = [ +    re.compile(fr'{expression}', flags=re.IGNORECASE) for expression in Filter.token_watchlist +]  class Filtering(Cog): @@ -228,8 +236,8 @@ class Filtering(Cog):          Only matches words with boundaries before and after the expression.          """ -        for expression in Filter.word_watchlist: -            if re.search(fr"\b{expression}\b", text, re.IGNORECASE): +        for regex_pattern in WORD_WATCHLIST_PATTERNS: +            if regex_pattern.search(text):                  return True          return False @@ -241,11 +249,11 @@ class Filtering(Cog):          This will match the expression even if it does not have boundaries before and after.          
""" -        for expression in Filter.token_watchlist: -            if re.search(fr"{expression}", text, re.IGNORECASE): +        for regex_pattern in TOKEN_WATCHLIST_PATTERNS: +            if regex_pattern.search(text):                  # Make sure it's not a URL -                if not re.search(URL_RE, text, re.IGNORECASE): +                if not URL_RE.search(text):                      return True          return False @@ -253,7 +261,7 @@ class Filtering(Cog):      @staticmethod      async def _has_urls(text: str) -> bool:          """Returns True if the text contains one of the blacklisted URLs from the config file.""" -        if not re.search(URL_RE, text, re.IGNORECASE): +        if not URL_RE.search(text):              return False          text = text.lower() @@ -271,7 +279,7 @@ class Filtering(Cog):          Zalgo range is \u0300 – \u036F and \u0489.          """ -        return bool(re.search(ZALGO_RE, text)) +        return bool(ZALGO_RE.search(text))      async def _has_invites(self, text: str) -> Union[dict, bool]:          """ @@ -286,7 +294,7 @@ class Filtering(Cog):          # discord\.gg/gdudes-pony-farm          text = text.replace("\\", "") -        invites = re.findall(INVITE_RE, text, re.IGNORECASE) +        invites = INVITE_RE.findall(text)          invite_data = dict()          for invite in invites:              if invite in invite_data: @@ -323,11 +331,21 @@ class Filtering(Cog):      @staticmethod      async def _has_rich_embed(msg: Message) -> bool: -        """Returns True if any of the embeds in the message are of type 'rich', but are not twitter embeds.""" +        """Determines if `msg` contains any rich embeds not auto-generated from a URL."""          if msg.embeds:              for embed in msg.embeds: -                if embed.type == "rich" and (not embed.url or "twitter.com" not in embed.url): -                    return True +                if embed.type == "rich": +                    urls = URL_RE.findall(msg.content) +  
                  if not embed.url or embed.url not in urls: +                        # If `embed.url` does not exist or if `embed.url` is not part of the content +                        # of the message, it's unlikely to be an auto-generated embed by Discord. +                        return True +                    else: +                        log.trace( +                            "Found a rich embed sent by a regular user account, " +                            "but it was likely just an automatic URL embed." +                        ) +                        return False          return False      async def notify_member(self, filtered_member: Member, reason: str, channel: TextChannel) -> None:  |