Token remover: match only base64 in regex

Making the regex more accurate reduces false positives at an earlier stage. There's no benefit to matching non-base64 strings, as they would just be weeded out as invalid at a later stage anyway when the cog tries to decode them.
author: MarkKoz <[email protected]> 2020-05-21 21:34:10 -0700
committer: MarkKoz <[email protected]> 2020-05-21 21:34:10 -0700
commit: 95ef2dc01143902289c9aacde7969fb5c9e1a85c (patch)
tree: f0b66ab847b06fc5cd9ebf2f3a1221037fdd555b
parent: Token remover: decode ID using URL-safe base64 (diff)
1 files changed, 6 insertions, 7 deletions
diff --git a/bot/cogs/token_remover.py b/bot/cogs/token_remover.py
index 5b4598959..fa0647828 100644
--- a/bot/cogs/token_remover.py
+++ b/bot/cogs/token_remover.py
@@ -29,13 +29,12 @@ DELETION_MESSAGE_TEMPLATE = (
 )
 DISCORD_EPOCH = 1_420_070_400_000
 TOKEN_EPOCH = 1_293_840_000
-TOKEN_RE = re.compile(
-    r"[^\s\.()\"']+"  # Matches token part 1: The user ID string, encoded as base64
-    r"\."             # Matches a literal dot between the token parts
-    r"[^\s\.()\"']+"  # Matches token part 2: The creation timestamp, as an integer
-    r"\."             # Matches a literal dot between the token parts
-    r"[^\s\.()\"']+"  # Matches token part 3: The HMAC, unused by us, but check that it isn't empty
-)
+
+# Three parts delimited by dots: user ID, creation timestamp, HMAC.
+# The HMAC isn't parsed further, but it's in the regex to ensure it at least exists in the string.
+# Each part only matches base64 URL-safe characters ('-' is escaped so it isn't read as a range).
+# Padding has never been observed, but the padding character '=' is matched just in case.
+TOKEN_RE = re.compile(r"[\w\-=]+\.[\w\-=]+\.[\w\-=]+", re.ASCII)
 
 
 class TokenRemover(Cog):
author	MarkKoz <[email protected]>	2020-05-21 21:34:10 -0700
committer	MarkKoz <[email protected]>	2020-05-21 21:34:10 -0700
commit	95ef2dc01143902289c9aacde7969fb5c9e1a85c (patch)
tree	f0b66ab847b06fc5cd9ebf2f3a1221037fdd555b
parent	Token remover: decode ID using URL-safe base64 (diff)