Merge pull request #846 from python-discord/feat/filter/322/merge-tokens-words

Filtering: merge the word and token watch filters
author: S. Co1 <[email protected]> 2020-03-31 13:46:22 -0400
committer: GitHub <[email protected]> 2020-03-31 13:46:22 -0400
commit: 624748b26003b3758d9f2b8dd71a68728caa48ea (patch)
tree: 486464fc26f8db4ab5fb234136ecb1a230d5e334
parent: Merge pull request #833 from ks129/zen-match-fix (diff)
parent: Merge branch 'master' into feat/filter/322/merge-tokens-words (diff)
3 files changed, 21 insertions, 41 deletions
diff --git a/bot/cogs/filtering.py b/bot/cogs/filtering.py
index 6651d38e4..3f3dbb853 100644
--- a/bot/cogs/filtering.py
+++ b/bot/cogs/filtering.py
@@ -38,6 +38,7 @@ WORD_WATCHLIST_PATTERNS = [
 TOKEN_WATCHLIST_PATTERNS = [
     re.compile(fr'{expression}', flags=re.IGNORECASE) for expression in Filter.token_watchlist
 ]
+WATCHLIST_PATTERNS = WORD_WATCHLIST_PATTERNS + TOKEN_WATCHLIST_PATTERNS
 
 
 def expand_spoilers(text: str) -> str:
@@ -88,24 +89,18 @@ class Filtering(Cog):
                     f"Your URL has been removed because it matched a blacklisted domain. {staff_mistake_str}"
                 )
             },
+            "watch_regex": {
+                "enabled": Filter.watch_regex,
+                "function": self._has_watch_regex_match,
+                "type": "watchlist",
+                "content_only": True,
+            },
             "watch_rich_embeds": {
                 "enabled": Filter.watch_rich_embeds,
                 "function": self._has_rich_embed,
                 "type": "watchlist",
                 "content_only": False,
             },
-            "watch_words": {
-                "enabled": Filter.watch_words,
-                "function": self._has_watchlist_words,
-                "type": "watchlist",
-                "content_only": True,
-            },
-            "watch_tokens": {
-                "enabled": Filter.watch_tokens,
-                "function": self._has_watchlist_tokens,
-                "type": "watchlist",
-                "content_only": True,
-            },
         }
 
     @property
@@ -191,8 +186,8 @@ class Filtering(Cog):
                         else:
                             channel_str = f"in {msg.channel.mention}"
 
-                        # Word and match stats for watch_words and watch_tokens
-                        if filter_name in ("watch_words", "watch_tokens"):
+                        # Word and match stats for watch_regex
+                        if filter_name == "watch_regex":
                             surroundings = match.string[max(match.start() - 10, 0): match.end() + 10]
                             message_content = (
                                 f"**Match:** '{match[0]}'\n"
@@ -248,37 +243,24 @@ class Filtering(Cog):
                         break  # We don't want multiple filters to trigger
 
     @staticmethod
-    async def _has_watchlist_words(text: str) -> Union[bool, re.Match]:
+    async def _has_watch_regex_match(text: str) -> Union[bool, re.Match]:
         """
-        Returns True if the text contains one of the regular expressions from the word_watchlist in our filter config.
+        Return True if `text` matches any regex from `word_watchlist` or `token_watchlist` configs.
 
-        Only matches words with boundaries before and after the expression.
+        `word_watchlist`'s patterns are placed between word boundaries while `token_watchlist` is
+        matched as-is. Spoilers are expanded, if any, and URLs are ignored.
         """
         if SPOILER_RE.search(text):
             text = expand_spoilers(text)
-        for regex_pattern in WORD_WATCHLIST_PATTERNS:
-            match = regex_pattern.search(text)
-            if match:
-                return match  # match objects always have a boolean value of True
 
-        return False
-
-    @staticmethod
-    async def _has_watchlist_tokens(text: str) -> Union[bool, re.Match]:
-        """
-        Returns True if the text contains one of the regular expressions from the token_watchlist in our filter config.
+        # Make sure it's not a URL
+        if URL_RE.search(text):
+            return False
 
-        This will match the expression even if it does not have boundaries before and after.
-        """
-        for regex_pattern in TOKEN_WATCHLIST_PATTERNS:
-            match = regex_pattern.search(text)
+        for pattern in WATCHLIST_PATTERNS:
+            match = pattern.search(text)
             if match:
-
-                # Make sure it's not a URL
-                if not URL_RE.search(text):
-                    return match  # match objects always have a boolean value of True
-
-        return False
+                return match
 
     @staticmethod
     async def _has_urls(text: str) -> bool:
diff --git a/bot/constants.py b/bot/constants.py
index 14f8dc094..549e69c8f 100644
--- a/bot/constants.py
+++ b/bot/constants.py
@@ -206,9 +206,8 @@ class Filter(metaclass=YAMLGetter):
     filter_zalgo: bool
     filter_invites: bool
     filter_domains: bool
+    watch_regex: bool
     watch_rich_embeds: bool
-    watch_words: bool
-    watch_tokens: bool
 
     # Notifications are not expected for "watchlist" type filters
     notify_user_zalgo: bool
diff --git a/config-default.yml b/config-default.yml
index 5788d1e12..ef0ed970f 100644
--- a/config-default.yml
+++ b/config-default.yml
@@ -248,9 +248,8 @@ filter:
     filter_zalgo:       false
     filter_invites:     true
     filter_domains:     true
+    watch_regex:        true
     watch_rich_embeds:  true
-    watch_words:        true
-    watch_tokens:       true
 
     # Notify user on filter?
     # Notifications are not expected for "watchlist" type filters
author	S. Co1 <[email protected]>	2020-03-31 13:46:22 -0400
committer	GitHub <[email protected]>	2020-03-31 13:46:22 -0400
commit	624748b26003b3758d9f2b8dd71a68728caa48ea (patch)
tree	486464fc26f8db4ab5fb234136ecb1a230d5e334
parent	Merge pull request #833 from ks129/zen-match-fix (diff)
parent	Merge branch 'master' into feat/filter/322/merge-tokens-words (diff)