From aa08fe2258ce4205272c7f27e1e2380c37275552 Mon Sep 17 00:00:00 2001 From: Chris Lovering Date: Mon, 18 Oct 2021 22:22:47 +0100 Subject: Normalise names before checking for matches --- bot/exts/filters/filtering.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py index 79b7abe9f..e51d2aad6 100644 --- a/bot/exts/filters/filtering.py +++ b/bot/exts/filters/filtering.py @@ -2,6 +2,7 @@ import asyncio import re from datetime import timedelta from typing import Any, Dict, List, Mapping, NamedTuple, Optional, Tuple, Union +from unicodedata import normalize import arrow import dateutil.parser @@ -207,12 +208,19 @@ class Filtering(Cog): def get_name_matches(self, name: str) -> List[re.Match]: """Check bad words from passed string (name). Return list of matches.""" - name = self.clean_input(name) + normalised_name = normalize("NFKC", name) matches = [] + + # Run filters against normalized and original version, + # in case we have filters for one but not the other. + names_to_check = (name, normalised_name) + watchlist_patterns = self._get_filterlist_items('filter_token', allowed=False) for pattern in watchlist_patterns: - if match := re.search(pattern, name, flags=re.IGNORECASE): - matches.append(match) + for name in names_to_check: + if match := re.search(pattern, name, flags=re.IGNORECASE): + matches.append(match) + break # No need to see if other variations of this name match too. 
return matches async def check_send_alert(self, member: Member) -> bool: -- cgit v1.2.3 From baf8239be8c6a4f6da4bd7ce8f8b2abeaf55e58a Mon Sep 17 00:00:00 2001 From: Chris Lovering Date: Mon, 18 Oct 2021 22:51:31 +0100 Subject: Check if we recently alerted about a bad name before running all filter tokens again --- bot/exts/filters/filtering.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py index e51d2aad6..4b1de9638 100644 --- a/bot/exts/filters/filtering.py +++ b/bot/exts/filters/filtering.py @@ -237,10 +237,14 @@ class Filtering(Cog): """Send a mod alert every 3 days if a username still matches a watchlist pattern.""" # Use lock to avoid race conditions async with self.name_lock: + # Check if we recently alerted about this user first, + # to avoid running all the filter tokens against their name again. + if not await self.check_send_alert(member): + return + # Check whether the users display name contains any words in our blacklist matches = self.get_name_matches(member.display_name) - - if not matches or not await self.check_send_alert(member): + if not matches: return log.info(f"Sending bad nickname alert for '{member.display_name}' ({member.id}).") -- cgit v1.2.3 From 8efbff61aa9a8697ddb140fa5978630a6c609054 Mon Sep 17 00:00:00 2001 From: Chris Lovering Date: Mon, 18 Oct 2021 22:56:22 +0100 Subject: Return early when getting name matches As soon as we get a match for a bad name, return it, rather than running it against the rest of the filters. 
--- bot/exts/filters/filtering.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py index 4b1de9638..fb1d62e48 100644 --- a/bot/exts/filters/filtering.py +++ b/bot/exts/filters/filtering.py @@ -206,10 +206,9 @@ class Filtering(Cog): delta = relativedelta(after.edited_at, before.edited_at).microseconds await self._filter_message(after, delta) - def get_name_matches(self, name: str) -> List[re.Match]: - """Check bad words from passed string (name). Return list of matches.""" + def get_name_match(self, name: str) -> Optional[re.Match]: + """Check bad words from passed string (name). Return the first match found.""" normalised_name = normalize("NFKC", name) - matches = [] # Run filters against normalized and original version, # in case we have filters for one but not the other. @@ -219,9 +218,8 @@ class Filtering(Cog): for pattern in watchlist_patterns: for name in names_to_check: if match := re.search(pattern, name, flags=re.IGNORECASE): - matches.append(match) - break # No need to see if other variations of this name match too. 
- return matches + return match + return None async def check_send_alert(self, member: Member) -> bool: """When there is less than 3 days after last alert, return `False`, otherwise `True`.""" @@ -243,8 +241,8 @@ class Filtering(Cog): return # Check whether the users display name contains any words in our blacklist - matches = self.get_name_matches(member.display_name) - if not matches: + match = self.get_name_match(member.display_name) + if not match: return log.info(f"Sending bad nickname alert for '{member.display_name}' ({member.id}).") @@ -252,7 +250,7 @@ class Filtering(Cog): log_string = ( f"**User:** {format_user(member)}\n" f"**Display Name:** {escape_markdown(member.display_name)}\n" - f"**Bad Matches:** {', '.join(match.group() for match in matches)}" + f"**Bad Match:** {match.group()}" ) await self.mod_log.send_log_message( -- cgit v1.2.3 From 5901ac0ba4544f2bd479a74d5d6a345b3d31cb01 Mon Sep 17 00:00:00 2001 From: Chris Lovering Date: Tue, 19 Oct 2021 17:00:30 +0100 Subject: Also run name filters against a cleaned version of the normalised name --- bot/exts/filters/filtering.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py index fb1d62e48..21ed090ea 100644 --- a/bot/exts/filters/filtering.py +++ b/bot/exts/filters/filtering.py @@ -1,8 +1,8 @@ import asyncio import re +import unicodedata from datetime import timedelta from typing import Any, Dict, List, Mapping, NamedTuple, Optional, Tuple, Union -from unicodedata import normalize import arrow import dateutil.parser @@ -208,11 +208,12 @@ class Filtering(Cog): def get_name_match(self, name: str) -> Optional[re.Match]: """Check bad words from passed string (name). 
Return the first match found.""" - normalised_name = normalize("NFKC", name) + normalised_name = unicodedata.normalize("NFKC", name) + cleaned_normalised_name = "".join(c for c in normalised_name if not unicodedata.combining(c)) - # Run filters against normalized and original version, + # Run filters against normalised, cleaned normalised and the original name, # in case we have filters for one but not the other. - names_to_check = (name, normalised_name) + names_to_check = (name, normalised_name, cleaned_normalised_name) watchlist_patterns = self._get_filterlist_items('filter_token', allowed=False) for pattern in watchlist_patterns: -- cgit v1.2.3 From d0dc7a0e4e3fc6618ae49d43b24938c84793dcf0 Mon Sep 17 00:00:00 2001 From: Chris Lovering Date: Mon, 6 Dec 2021 22:59:35 +0000 Subject: Build an intermediate list for speed in filtering cog --- bot/exts/filters/filtering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py index 21ed090ea..8accc61f8 100644 --- a/bot/exts/filters/filtering.py +++ b/bot/exts/filters/filtering.py @@ -209,7 +209,7 @@ class Filtering(Cog): def get_name_match(self, name: str) -> Optional[re.Match]: """Check bad words from passed string (name). Return the first match found.""" normalised_name = unicodedata.normalize("NFKC", name) - cleaned_normalised_name = "".join(c for c in normalised_name if not unicodedata.combining(c)) + cleaned_normalised_name = "".join([c for c in normalised_name if not unicodedata.combining(c)]) # Run filters against normalised, cleaned normalised and the original name, # in case we have filters for one but not the other. -- cgit v1.2.3