From b7a468d88c296e6603e7ae1962265b07a24f1782 Mon Sep 17 00:00:00 2001 From: Amrou Bellalouna Date: Wed, 28 Sep 2022 08:29:21 +0100 Subject: add remove_subdomain_from_url in bot util helpers --- bot/utils/helpers.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/bot/utils/helpers.py b/bot/utils/helpers.py index 3501a3933..3e45a71a3 100644 --- a/bot/utils/helpers.py +++ b/bot/utils/helpers.py @@ -1,5 +1,6 @@ from abc import ABCMeta from typing import Optional +from urllib.parse import urlparse from discord.ext.commands import CogMeta @@ -30,3 +31,14 @@ def has_lines(string: str, count: int) -> bool: def pad_base64(data: str) -> str: """Return base64 `data` with padding characters to ensure its length is a multiple of 4.""" return data + "=" * (-len(data) % 4) + + +def remove_subdomain_from_url(url: str) -> str: + """Transforms potential relative urls to absolute ones.""" + parsed_url = urlparse(url) + netloc_components = parsed_url.netloc.split(".") + # Eliminate subdomain and use the second level domain and top level domain only + netloc_components[:] = netloc_components[-2:] + netloc = ".".join(netloc_components) + parsed_url = parsed_url._replace(netloc=netloc) + return parsed_url.geturl() -- cgit v1.2.3 From 6a0da9a2d24f4acdc6b1d5044c821e6dc2271dc3 Mon Sep 17 00:00:00 2001 From: Amrou Bellalouna Date: Wed, 28 Sep 2022 08:32:36 +0100 Subject: transform urls matched in messages We add the transformed urls to the set to account for the way discord renders relative urls in cases like Twitter's --- bot/exts/filters/filtering.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py index e4df0b1fd..db13df9b2 100644 --- a/bot/exts/filters/filtering.py +++ b/bot/exts/filters/filtering.py @@ -23,10 +23,12 @@ from bot.constants import Bot as BotConfig, Channels, Colours, Filter, Guild, Ic from bot.exts.events.code_jams._channels import CATEGORY_NAME as JAM_CATEGORY_NAME from bot.exts.moderation.modlog import ModLog from bot.log import get_logger +from bot.utils.helpers import remove_subdomain_from_url from bot.utils.messages import format_user log = get_logger(__name__) + # Regular expressions CODE_BLOCK_RE = re.compile( r"(?P``?)[^`]+?(?P=delim)(?!`+)" # Inline codeblock @@ -583,7 +585,7 @@ class Filtering(Cog): """ text = self.clean_input(text) - # Remove backslashes to prevent escape character aroundfuckery like + # Remove backslashes to prevent escape character around fuckery like # discord\.gg/gdudes-pony-farm text = text.replace("\\", "") @@ -648,7 +650,12 @@ class Filtering(Cog): if msg.embeds: for embed in msg.embeds: if embed.type == "rich": - urls = URL_RE.findall(msg.content) + urls = set(URL_RE.findall(msg.content)) + # This is due to way discord renders relative urls in Embdes + # if we send the following url: https://mobile.twitter.com/something + # Discord renders it as https://twitter.com/something + for url in urls: + urls.add(remove_subdomain_from_url(url)) if not embed.url or embed.url not in urls: # If `embed.url` does not exist or if `embed.url` is not part of the content # of the message, it's unlikely to be an auto-generated embed by Discord. -- cgit v1.2.3 From ce16838adeff343156175d644b90dc68b0e88519 Mon Sep 17 00:00:00 2001 From: Amrou Bellalouna Date: Wed, 28 Sep 2022 08:39:48 +0100 Subject: collect all urls in a set to avoid duplicates --- bot/exts/filters/filtering.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py index db13df9b2..b61204f4a 100644 --- a/bot/exts/filters/filtering.py +++ b/bot/exts/filters/filtering.py @@ -650,12 +650,13 @@ class Filtering(Cog): if msg.embeds: for embed in msg.embeds: if embed.type == "rich": - urls = set(URL_RE.findall(msg.content)) + urls = URL_RE.findall(msg.content) + final_urls = set(urls) # This is due to way discord renders relative urls in Embdes # if we send the following url: https://mobile.twitter.com/something # Discord renders it as https://twitter.com/something for url in urls: - urls.add(remove_subdomain_from_url(url)) + final_urls.add(remove_subdomain_from_url(url)) if not embed.url or embed.url not in urls: # If `embed.url` does not exist or if `embed.url` is not part of the content # of the message, it's unlikely to be an auto-generated embed by Discord. -- cgit v1.2.3 From caa26982526518dbfa06c187a560078c731ca774 Mon Sep 17 00:00:00 2001 From: Amrou Bellalouna Date: Wed, 28 Sep 2022 09:42:52 +0100 Subject: check for the url existence in the final_urls set --- bot/exts/filters/filtering.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/exts/filters/filtering.py b/bot/exts/filters/filtering.py index b61204f4a..3fb40b719 100644 --- a/bot/exts/filters/filtering.py +++ b/bot/exts/filters/filtering.py @@ -657,7 +657,7 @@ class Filtering(Cog): # Discord renders it as https://twitter.com/something for url in urls: final_urls.add(remove_subdomain_from_url(url)) - if not embed.url or embed.url not in urls: + if not embed.url or embed.url not in final_urls: # If `embed.url` does not exist or if `embed.url` is not part of the content # of the message, it's unlikely to be an auto-generated embed by Discord. return msg.embeds -- cgit v1.2.3 From 0bf93ebc2a7b4b7bb0a3a5618353fda5c576939c Mon Sep 17 00:00:00 2001 From: Amrou Bellalouna Date: Wed, 28 Sep 2022 13:38:35 +0100 Subject: use tldextract for a correct url decomposition --- bot/utils/helpers.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bot/utils/helpers.py b/bot/utils/helpers.py index 3e45a71a3..75bcc2ede 100644 --- a/bot/utils/helpers.py +++ b/bot/utils/helpers.py @@ -3,6 +3,7 @@ from typing import Optional from urllib.parse import urlparse from discord.ext.commands import CogMeta +from tldextract import extract class CogABCMeta(CogMeta, ABCMeta): @@ -34,11 +35,10 @@ def pad_base64(data: str) -> str: def remove_subdomain_from_url(url: str) -> str: - """Transforms potential relative urls to absolute ones.""" + """Removes subdomains from a URL whilst preserving the original URL composition.""" parsed_url = urlparse(url) - netloc_components = parsed_url.netloc.split(".") - # Eliminate subdomain and use the second level domain and top level domain only - netloc_components[:] = netloc_components[-2:] - netloc = ".".join(netloc_components) + extracted_url = extract(url) + # Eliminate subdomain by using the registered domain only + netloc = extracted_url.registered_domain parsed_url = parsed_url._replace(netloc=netloc) return parsed_url.geturl() -- cgit v1.2.3