diff options
author | 2020-10-14 16:33:35 -0700 | |
---|---|---|
committer | 2020-10-14 16:33:35 -0700 | |
commit | 4255fdef8b7a75c59f45928abf9da250e24ddc00 (patch) | |
tree | 15164d4d7fa32f88539248cf5baa929899056499 | |
parent | Remove trailing whitespace from verification.py (diff) | |
parent | Utils: clarify why has_lines counts by splitting by newlines (diff) |
Merge pull request #886 from python-discord/bug/utility/829/non-py-code-detection
Code block detection rewrite
-rw-r--r-- | LICENSE-THIRD-PARTY | 36 | ||||
-rw-r--r-- | bot/constants.py | 9 | ||||
-rw-r--r-- | bot/exts/help_channels.py | 45 | ||||
-rw-r--r-- | bot/exts/info/codeblock/__init__.py | 8 | ||||
-rw-r--r-- | bot/exts/info/codeblock/_cog.py | 186 | ||||
-rw-r--r-- | bot/exts/info/codeblock/_instructions.py | 184 | ||||
-rw-r--r-- | bot/exts/info/codeblock/_parsing.py | 224 | ||||
-rw-r--r-- | bot/exts/info/stats.py | 5 | ||||
-rw-r--r-- | bot/exts/utils/bot.py | 326 | ||||
-rw-r--r-- | bot/utils/__init__.py | 4 | ||||
-rw-r--r-- | bot/utils/channel.py | 33 | ||||
-rw-r--r-- | bot/utils/helpers.py | 9 | ||||
-rw-r--r-- | config-default.yml | 21 |
13 files changed, 733 insertions, 357 deletions
diff --git a/LICENSE-THIRD-PARTY b/LICENSE-THIRD-PARTY new file mode 100644 index 000000000..3349d7c05 --- /dev/null +++ b/LICENSE-THIRD-PARTY @@ -0,0 +1,36 @@ +BSD 3-Clause License + +Applies to: +- _RE_PYTHON_REPL and portions of _RE_IPYTHON_REPL in bot/cogs/codeblock/parsing.py + +- Copyright (c) 2008-Present, IPython Development Team +- Copyright (c) 2001-2007, Fernando Perez <[email protected]> +- Copyright (c) 2001, Janko Hauser <[email protected]> +- Copyright (c) 2001, Nathaniel Gray <[email protected]> + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/bot/constants.py b/bot/constants.py index c21fd52e0..6c8b933af 100644 --- a/bot/constants.py +++ b/bot/constants.py @@ -528,6 +528,15 @@ class BigBrother(metaclass=YAMLGetter): header_message_limit: int +class CodeBlock(metaclass=YAMLGetter): + section = 'code_block' + + channel_whitelist: List[int] + cooldown_channels: List[int] + cooldown_seconds: int + minimum_lines: int + + class Free(metaclass=YAMLGetter): section = 'free' diff --git a/bot/exts/help_channels.py b/bot/exts/help_channels.py index f5c9a5dd0..062d4fcfe 100644 --- a/bot/exts/help_channels.py +++ b/bot/exts/help_channels.py @@ -14,6 +14,7 @@ from discord.ext import commands from bot import constants from bot.bot import Bot +from bot.utils import channel as channel_utils from bot.utils.scheduling import Scheduler log = logging.getLogger(__name__) @@ -378,11 +379,18 @@ class HelpChannels(commands.Cog): log.trace("Getting the CategoryChannel objects for the help categories.") try: - self.available_category = await self.try_get_channel( - constants.Categories.help_available + self.available_category = await channel_utils.try_get_channel( + constants.Categories.help_available, + self.bot + ) + self.in_use_category = await channel_utils.try_get_channel( + constants.Categories.help_in_use, + self.bot + ) + self.dormant_category = await channel_utils.try_get_channel( + constants.Categories.help_dormant, + self.bot ) - self.in_use_category = await self.try_get_channel(constants.Categories.help_in_use) - self.dormant_category = await self.try_get_channel(constants.Categories.help_dormant) except discord.HTTPException: log.exception("Failed to get a category; cog will be removed") self.bot.remove_cog(self.qualified_name) @@ -442,12 +450,6 @@ class HelpChannels(commands.Cog): return False return message.author == self.bot.user and bot_msg_desc.strip() == description.strip() - @staticmethod - def is_in_category(channel: discord.TextChannel, category_id: int) -> bool: - """Return True if `channel` is within a category with `category_id`.""" - actual_category = getattr(channel, "category", None) - return actual_category is not None and actual_category.id == category_id - async def move_idle_channel(self, channel: discord.TextChannel, has_task: bool = True) -> None: """ Make the `channel` dormant if idle or schedule the move if still active. @@ -498,7 +500,7 @@ class HelpChannels(commands.Cog): options should be avoided, as it may interfere with the category move we perform. """ # Get a fresh copy of the category from the bot to avoid the cache mismatch issue we had. - category = await self.try_get_channel(category_id) + category = await channel_utils.try_get_channel(category_id, self.bot) payload = [{"id": c.id, "position": c.position} for c in category.channels] @@ -646,7 +648,7 @@ class HelpChannels(commands.Cog): channel = message.channel # Confirm the channel is an in use help channel - if self.is_in_category(channel, constants.Categories.help_in_use): + if channel_utils.is_in_category(channel, constants.Categories.help_in_use): log.trace(f"Checking if #{channel} ({channel.id}) has been answered.") # Check if there is an entry in unanswered @@ -671,7 +673,8 @@ class HelpChannels(commands.Cog): await self.check_for_answer(message) - if not self.is_in_category(channel, constants.Categories.help_available) or self.is_excluded_channel(channel): + is_available = channel_utils.is_in_category(channel, constants.Categories.help_available) + if not is_available or self.is_excluded_channel(channel): return # Ignore messages outside the Available category or in excluded channels. log.trace("Waiting for the cog to be ready before processing messages.") @@ -681,7 +684,7 @@ class HelpChannels(commands.Cog): async with self.on_message_lock: log.trace(f"on_message lock acquired for {message.id}.") - if not self.is_in_category(channel, constants.Categories.help_available): + if not channel_utils.is_in_category(channel, constants.Categories.help_available): log.debug( f"Message {message.id} will not make #{channel} ({channel.id}) in-use " f"because another message in the channel already triggered that." @@ -719,7 +722,7 @@ class HelpChannels(commands.Cog): The new time for the dormant task is configured with `HelpChannels.deleted_idle_minutes`. """ - if not self.is_in_category(msg.channel, constants.Categories.help_in_use): + if not channel_utils.is_in_category(msg.channel, constants.Categories.help_in_use): return if not await self.is_empty(msg.channel): @@ -844,18 +847,6 @@ class HelpChannels(commands.Cog): log.trace(f"Dormant message not found in {channel_info}; sending a new message.") await channel.send(embed=embed) - async def try_get_channel(self, channel_id: int) -> discord.abc.GuildChannel: - """Attempt to get or fetch a channel and return it.""" - log.trace(f"Getting the channel {channel_id}.") - - channel = self.bot.get_channel(channel_id) - if not channel: - log.debug(f"Channel {channel_id} is not in cache; fetching from API.") - channel = await self.bot.fetch_channel(channel_id) - - log.trace(f"Channel #{channel} ({channel_id}) retrieved.") - return channel - async def pin_wrapper(self, msg_id: int, channel: discord.TextChannel, *, pin: bool) -> bool: """ Pin message `msg_id` in `channel` if `pin` is True or unpin if it's False. diff --git a/bot/exts/info/codeblock/__init__.py b/bot/exts/info/codeblock/__init__.py new file mode 100644 index 000000000..5c55bc5e3 --- /dev/null +++ b/bot/exts/info/codeblock/__init__.py @@ -0,0 +1,8 @@ +from bot.bot import Bot + + +def setup(bot: Bot) -> None: + """Load the CodeBlockCog cog.""" + # Defer import to reduce side effects from importing the codeblock package. + from bot.exts.info.codeblock._cog import CodeBlockCog + bot.add_cog(CodeBlockCog(bot)) diff --git a/bot/exts/info/codeblock/_cog.py b/bot/exts/info/codeblock/_cog.py new file mode 100644 index 000000000..1e0feab0d --- /dev/null +++ b/bot/exts/info/codeblock/_cog.py @@ -0,0 +1,186 @@ +import logging +import time +from typing import Optional + +import discord +from discord import Message, RawMessageUpdateEvent +from discord.ext.commands import Cog + +from bot import constants +from bot.bot import Bot +from bot.exts.filters.token_remover import TokenRemover +from bot.exts.filters.webhook_remover import WEBHOOK_URL_RE +from bot.exts.info.codeblock._instructions import get_instructions +from bot.utils import has_lines +from bot.utils.channel import is_help_channel +from bot.utils.messages import wait_for_deletion + +log = logging.getLogger(__name__) + + +class CodeBlockCog(Cog, name="Code Block"): + """ + Detect improperly formatted Markdown code blocks and suggest proper formatting. + + There are four basic ways in which a code block is considered improperly formatted: + + 1. The code is not within a code block at all + * Ignored if the code is not valid Python or Python REPL code + 2. Incorrect characters are used for backticks + 3. A language for syntax highlighting is not specified + * Ignored if the code is not valid Python or Python REPL code + 4. A syntax highlighting language is incorrectly specified + * Ignored if the language specified doesn't look like it was meant for Python + * This can go wrong in two ways: + 1. Spaces before the language + 2. No newline immediately following the language + + Messages or code blocks must meet a minimum line count to be detected. Detecting multiple code + blocks is supported. However, if at least one code block is correct, then instructions will not + be sent even if others are incorrect. When multiple incorrect code blocks are found, only the + first one is used as the basis for the instructions sent. + + When an issue is detected, an embed is sent containing specific instructions on fixing what + is wrong. If the user edits their message to fix the code block, the instructions will be + removed. If they fail to fix the code block with an edit, the instructions will be updated to + show what is still incorrect after the user's edit. The embed can be manually deleted with a + reaction. Otherwise, it will automatically be removed after 5 minutes. + + The cog only detects messages in whitelisted channels. Channels may also have a cooldown on the + instructions being sent. Note all help channels are also whitelisted with cooldowns enabled. + + For configurable parameters, see the `code_block` section in config-default.py. + """ + + def __init__(self, bot: Bot): + self.bot = bot + + # Stores allowed channels plus epoch times since the last instructional messages sent. + self.channel_cooldowns = dict.fromkeys(constants.CodeBlock.cooldown_channels, 0.0) + + # Maps users' messages to the messages the bot sent with instructions. + self.codeblock_message_ids = {} + + @staticmethod + def create_embed(instructions: str) -> discord.Embed: + """Return an embed which displays code block formatting `instructions`.""" + return discord.Embed(description=instructions) + + async def get_sent_instructions(self, payload: RawMessageUpdateEvent) -> Optional[Message]: + """ + Return the bot's sent instructions message associated with a user's message `payload`. + + Return None if the message cannot be found. In this case, it's likely the message was + deleted either manually via a reaction or automatically by a timer. + """ + log.trace(f"Retrieving instructions message for ID {payload.message_id}") + channel = self.bot.get_channel(payload.channel_id) + + try: + return await channel.fetch_message(self.codeblock_message_ids[payload.message_id]) + except discord.NotFound: + log.debug("Could not find instructions message; it was probably deleted.") + return None + + def is_on_cooldown(self, channel: discord.TextChannel) -> bool: + """ + Return True if an embed was sent too recently for `channel`. + + The cooldown is configured by `constants.CodeBlock.cooldown_seconds`. + Note: only channels in the `channel_cooldowns` have cooldowns enabled. + """ + log.trace(f"Checking if #{channel} is on cooldown.") + cooldown = constants.CodeBlock.cooldown_seconds + return (time.time() - self.channel_cooldowns.get(channel.id, 0)) < cooldown + + def is_valid_channel(self, channel: discord.TextChannel) -> bool: + """Return True if `channel` is a help channel, may be on a cooldown, or is whitelisted.""" + log.trace(f"Checking if #{channel} qualifies for code block detection.") + return ( + is_help_channel(channel) + or channel.id in self.channel_cooldowns + or channel.id in constants.CodeBlock.channel_whitelist + ) + + async def send_instructions(self, message: discord.Message, instructions: str) -> None: + """ + Send an embed with `instructions` on fixing an incorrect code block in a `message`. + + The embed will be deleted automatically after 5 minutes. + """ + log.info(f"Sending code block formatting instructions for message {message.id}.") + + embed = self.create_embed(instructions) + bot_message = await message.channel.send(f"Hey {message.author.mention}!", embed=embed) + self.codeblock_message_ids[message.id] = bot_message.id + + self.bot.loop.create_task(wait_for_deletion(bot_message, (message.author.id,), self.bot)) + + # Increase amount of codeblock correction in stats + self.bot.stats.incr("codeblock_corrections") + + def should_parse(self, message: discord.Message) -> bool: + """ + Return True if `message` should be parsed. + + A qualifying message: + + 1. Is not authored by a bot + 2. Is in a valid channel + 3. Has more than 3 lines + 4. Has no bot or webhook token + """ + return ( + not message.author.bot + and self.is_valid_channel(message.channel) + and has_lines(message.content, constants.CodeBlock.minimum_lines) + and not TokenRemover.find_token_in_message(message) + and not WEBHOOK_URL_RE.search(message.content) + ) + + @Cog.listener() + async def on_message(self, msg: Message) -> None: + """Detect incorrect Markdown code blocks in `msg` and send instructions to fix them.""" + if not self.should_parse(msg): + log.trace(f"Skipping code block detection of {msg.id}: message doesn't qualify.") + return + + # When debugging, ignore cooldowns. + if self.is_on_cooldown(msg.channel) and not constants.DEBUG_MODE: + log.trace(f"Skipping code block detection of {msg.id}: #{msg.channel} is on cooldown.") + return + + instructions = get_instructions(msg.content) + if instructions: + await self.send_instructions(msg, instructions) + + if msg.channel.id not in constants.CodeBlock.channel_whitelist: + log.debug(f"Adding #{msg.channel} to the channel cooldowns.") + self.channel_cooldowns[msg.channel.id] = time.time() + + @Cog.listener() + async def on_raw_message_edit(self, payload: RawMessageUpdateEvent) -> None: + """Delete the instructional message if an edited message had its code blocks fixed.""" + if payload.message_id not in self.codeblock_message_ids: + log.trace(f"Ignoring message edit {payload.message_id}: message isn't being tracked.") + return + + if payload.data.get("content") is None or payload.data.get("channel_id") is None: + log.trace(f"Ignoring message edit {payload.message_id}: missing content or channel ID.") + return + + # Parse the message to see if the code blocks have been fixed. + content = payload.data.get("content") + instructions = get_instructions(content) + + bot_message = await self.get_sent_instructions(payload) + if not bot_message: + return + + if not instructions: + log.info("User's incorrect code block has been fixed. Removing instructions message.") + await bot_message.delete() + del self.codeblock_message_ids[payload.message_id] + else: + log.info("Message edited but still has invalid code blocks; editing the instructions.") + await bot_message.edit(embed=self.create_embed(instructions)) diff --git a/bot/exts/info/codeblock/_instructions.py b/bot/exts/info/codeblock/_instructions.py new file mode 100644 index 000000000..508f157fb --- /dev/null +++ b/bot/exts/info/codeblock/_instructions.py @@ -0,0 +1,184 @@ +"""This module generates and formats instructional messages about fixing Markdown code blocks.""" + +import logging +from typing import Optional + +from bot.exts.info.codeblock import _parsing + +log = logging.getLogger(__name__) + +_EXAMPLE_PY = "{lang}\nprint('Hello, world!')" # Make sure to escape any Markdown symbols here. +_EXAMPLE_CODE_BLOCKS = ( + "\\`\\`\\`{content}\n\\`\\`\\`\n\n" + "**This will result in the following:**\n" + "```{content}```" +) + + +def _get_example(language: str) -> str: + """Return an example of a correct code block using `language` for syntax highlighting.""" + # Determine the example code to put in the code block based on the language specifier. + if language.lower() in _parsing.PY_LANG_CODES: + log.trace(f"Code block has a Python language specifier `{language}`.") + content = _EXAMPLE_PY.format(lang=language) + elif language: + log.trace(f"Code block has a foreign language specifier `{language}`.") + # It's not feasible to determine what would be a valid example for other languages. + content = f"{language}\n..." + else: + log.trace("Code block has no language specifier.") + content = "\nHello, world!" + + return _EXAMPLE_CODE_BLOCKS.format(content=content) + + +def _get_bad_ticks_message(code_block: _parsing.CodeBlock) -> Optional[str]: + """Return instructions on using the correct ticks for `code_block`.""" + log.trace("Creating instructions for incorrect code block ticks.") + + valid_ticks = f"\\{_parsing.BACKTICK}" * 3 + instructions = ( + "It looks like you are trying to paste code into this channel.\n\n" + "You seem to be using the wrong symbols to indicate where the code block should start. " + f"The correct symbols would be {valid_ticks}, not `{code_block.tick * 3}`." + ) + + log.trace("Check if the bad ticks code block also has issues with the language specifier.") + addition_msg = _get_bad_lang_message(code_block.content) + if not addition_msg and not code_block.language: + addition_msg = _get_no_lang_message(code_block.content) + + # Combine the back ticks message with the language specifier message. The latter will + # already have an example code block. + if addition_msg: + log.trace("Language specifier issue found; appending additional instructions.") + + # The first line has double newlines which are not desirable when appending the msg. + addition_msg = addition_msg.replace("\n\n", " ", 1) + + # Make the first character of the addition lower case. + instructions += "\n\nFurthermore, " + addition_msg[0].lower() + addition_msg[1:] + else: + log.trace("No issues with the language specifier found.") + example_blocks = _get_example(code_block.language) + instructions += f"\n\n**Here is an example of how it should look:**\n{example_blocks}" + + return instructions + + +def _get_no_ticks_message(content: str) -> Optional[str]: + """If `content` is Python/REPL code, return instructions on using code blocks.""" + log.trace("Creating instructions for a missing code block.") + + if _parsing.is_python_code(content): + example_blocks = _get_example("python") + return ( + "It looks like you're trying to paste code into this channel.\n\n" + "Discord has support for Markdown, which allows you to post code with full " + "syntax highlighting. Please use these whenever you paste code, as this " + "helps improve the legibility and makes it easier for us to help you.\n\n" + f"**To do this, use the following method:**\n{example_blocks}" + ) + else: + log.trace("Aborting missing code block instructions: content is not Python code.") + + +def _get_bad_lang_message(content: str) -> Optional[str]: + """ + Return instructions on fixing the Python language specifier for a code block. + + If `code_block` does not have a Python language specifier, return None. + If there's nothing wrong with the language specifier, return None. + """ + log.trace("Creating instructions for a poorly specified language.") + + info = _parsing.parse_bad_language(content) + if not info: + log.trace("Aborting bad language instructions: language specified isn't Python.") + return + + lines = [] + language = info.language + + if info.has_leading_spaces: + log.trace("Language specifier was preceded by a space.") + lines.append(f"Make sure there are no spaces between the back ticks and `{language}`.") + + if not info.has_terminal_newline: + log.trace("Language specifier was not followed by a newline.") + lines.append( + f"Make sure you put your code on a new line following `{language}`. " + f"There must not be any spaces after `{language}`." + ) + + if lines: + lines = " ".join(lines) + example_blocks = _get_example(language) + + # Note that _get_bad_ticks_message expects the first line to have two newlines. + return ( + f"It looks like you incorrectly specified a language for your code block.\n\n{lines}" + f"\n\n**Here is an example of how it should look:**\n{example_blocks}" + ) + else: + log.trace("Nothing wrong with the language specifier; no instructions to return.") + + +def _get_no_lang_message(content: str) -> Optional[str]: + """ + Return instructions on specifying a language for a code block. + + If `content` is not valid Python or Python REPL code, return None. + """ + log.trace("Creating instructions for a missing language.") + + if _parsing.is_python_code(content): + example_blocks = _get_example("python") + + # Note that _get_bad_ticks_message expects the first line to have two newlines. + return ( + "It looks like you pasted Python code without syntax highlighting.\n\n" + "Please use syntax highlighting to improve the legibility of your code and make " + "it easier for us to help you.\n\n" + f"**To do this, use the following method:**\n{example_blocks}" + ) + else: + log.trace("Aborting missing language instructions: content is not Python code.") + + +def get_instructions(content: str) -> Optional[str]: + """ + Parse `content` and return code block formatting instructions if something is wrong. + + Return None if `content` lacks code block formatting issues. + """ + log.trace("Getting formatting instructions.") + + blocks = _parsing.find_code_blocks(content) + if blocks is None: + log.trace("At least one valid code block found; no instructions to return.") + return + + if not blocks: + log.trace("No code blocks were found in message.") + instructions = _get_no_ticks_message(content) + else: + log.trace("Searching results for a code block with invalid ticks.") + block = next((block for block in blocks if block.tick != _parsing.BACKTICK), None) + + if block: + log.trace("A code block exists but has invalid ticks.") + instructions = _get_bad_ticks_message(block) + else: + log.trace("A code block exists but is missing a language.") + block = blocks[0] + + # Check for a bad language first to avoid parsing content into an AST. + instructions = _get_bad_lang_message(block.content) + if not instructions: + instructions = _get_no_lang_message(block.content) + + if instructions: + instructions += "\nYou can **edit your original message** to correct your code block." + + return instructions diff --git a/bot/exts/info/codeblock/_parsing.py b/bot/exts/info/codeblock/_parsing.py new file mode 100644 index 000000000..e67224494 --- /dev/null +++ b/bot/exts/info/codeblock/_parsing.py @@ -0,0 +1,224 @@ +"""This module provides functions for parsing Markdown code blocks.""" + +import ast +import logging +import re +import textwrap +from typing import NamedTuple, Optional, Sequence + +from bot import constants +from bot.utils import has_lines + +log = logging.getLogger(__name__) + +BACKTICK = "`" +PY_LANG_CODES = ("python", "pycon", "py") # Order is important; "py" is last cause it's a subset. +_TICKS = { + BACKTICK, + "'", + '"', + "\u00b4", # ACUTE ACCENT + "\u2018", # LEFT SINGLE QUOTATION MARK + "\u2019", # RIGHT SINGLE QUOTATION MARK + "\u2032", # PRIME + "\u201c", # LEFT DOUBLE QUOTATION MARK + "\u201d", # RIGHT DOUBLE QUOTATION MARK + "\u2033", # DOUBLE PRIME + "\u3003", # VERTICAL KANA REPEAT MARK UPPER HALF +} + +_RE_PYTHON_REPL = re.compile(r"^(>>>|\.\.\.)( |$)") +_RE_IPYTHON_REPL = re.compile(r"^((In|Out) \[\d+\]: |\s*\.{3,}: ?)") + +_RE_CODE_BLOCK = re.compile( + fr""" + (?P<ticks> + (?P<tick>[{''.join(_TICKS)}]) # Put all ticks into a character class within a group. + \2{{2}} # Match previous group 2 more times to ensure the same char. + ) + (?P<lang>[^\W_]+\n)? # Optionally match a language specifier followed by a newline. + (?P<code>.+?) # Match the actual code within the block. + \1 # Match the same 3 ticks used at the start of the block. + """, + re.DOTALL | re.VERBOSE +) + +_RE_LANGUAGE = re.compile( + fr""" + ^(?P<spaces>\s+)? # Optionally match leading spaces from the beginning. + (?P<lang>{'|'.join(PY_LANG_CODES)}) # Match a Python language. + (?P<newline>\n)? # Optionally match a newline following the language. + """, + re.IGNORECASE | re.VERBOSE +) + + +class CodeBlock(NamedTuple): + """Represents a Markdown code block.""" + + content: str + language: str + tick: str + + +class BadLanguage(NamedTuple): + """Parsed information about a poorly formatted language specifier.""" + + language: str + has_leading_spaces: bool + has_terminal_newline: bool + + +def find_code_blocks(message: str) -> Optional[Sequence[CodeBlock]]: + """ + Find and return all Markdown code blocks in the `message`. + + Code blocks with 3 or fewer lines are excluded. + + If the `message` contains at least one code block with valid ticks and a specified language, + return None. This is based on the assumption that if the user managed to get one code block + right, they already know how to fix the rest themselves. + """ + log.trace("Finding all code blocks in a message.") + + code_blocks = [] + for match in _RE_CODE_BLOCK.finditer(message): + # Used to ensure non-matched groups have an empty string as the default value. + groups = match.groupdict("") + language = groups["lang"].strip() # Strip the newline cause it's included in the group. + + if groups["tick"] == BACKTICK and language: + log.trace("Message has a valid code block with a language; returning None.") + return None + elif has_lines(groups["code"], constants.CodeBlock.minimum_lines): + code_block = CodeBlock(groups["code"], language, groups["tick"]) + code_blocks.append(code_block) + else: + log.trace("Skipped a code block shorter than 4 lines.") + + return code_blocks + + +def _is_python_code(content: str) -> bool: + """Return True if `content` is valid Python consisting of more than just expressions.""" + log.trace("Checking if content is Python code.") + try: + # Attempt to parse the message into an AST node. + # Invalid Python code will raise a SyntaxError. + tree = ast.parse(content) + except SyntaxError: + log.trace("Code is not valid Python.") + return False + + # Multiple lines of single words could be interpreted as expressions. + # This check is to avoid all nodes being parsed as expressions. + # (e.g. words over multiple lines) + if not all(isinstance(node, ast.Expr) for node in tree.body): + log.trace("Code is valid python.") + return True + else: + log.trace("Code consists only of expressions.") + return False + + +def _is_repl_code(content: str, threshold: int = 3) -> bool: + """Return True if `content` has at least `threshold` number of (I)Python REPL-like lines.""" + log.trace(f"Checking if content is (I)Python REPL code using a threshold of {threshold}.") + + repl_lines = 0 + patterns = (_RE_PYTHON_REPL, _RE_IPYTHON_REPL) + + for line in content.splitlines(): + # Check the line against all patterns. + for pattern in patterns: + if pattern.match(line): + repl_lines += 1 + + # Once a pattern is matched, only use that pattern for the remaining lines. + patterns = (pattern,) + break + + if repl_lines == threshold: + log.trace("Content is (I)Python REPL code.") + return True + + log.trace("Content is not (I)Python REPL code.") + return False + + +def is_python_code(content: str) -> bool: + """Return True if `content` is valid Python code or (I)Python REPL output.""" + dedented = textwrap.dedent(content) + + # Parse AST twice in case _fix_indentation ends up breaking code due to its inaccuracies. + return ( + _is_python_code(dedented) + or _is_repl_code(dedented) + or _is_python_code(_fix_indentation(content)) + ) + + +def parse_bad_language(content: str) -> Optional[BadLanguage]: + """ + Return information about a poorly formatted Python language in code block `content`. + + If the language is not Python, return None. + """ + log.trace("Parsing bad language.") + + match = _RE_LANGUAGE.match(content) + if not match: + return None + + return BadLanguage( + language=match["lang"], + has_leading_spaces=match["spaces"] is not None, + has_terminal_newline=match["newline"] is not None, + ) + + +def _get_leading_spaces(content: str) -> int: + """Return the number of spaces at the start of the first line in `content`.""" + leading_spaces = 0 + for char in content: + if char == " ": + leading_spaces += 1 + else: + return leading_spaces + + +def _fix_indentation(content: str) -> str: + """ + Attempt to fix badly indented code in `content`. + + In most cases, this works like textwrap.dedent. However, if the first line ends with a colon, + all subsequent lines are re-indented to only be one level deep relative to the first line. + The intent is to fix cases where the leading spaces of the first line of code were accidentally + not copied, which makes the first line appear not indented. + + This is fairly naïve and inaccurate. Therefore, it may break some code that was otherwise valid. + It's meant to catch really common cases, so that's acceptable. Its flaws are: + + - It assumes that if the first line ends with a colon, it is the start of an indented block + - It uses 4 spaces as the indentation, regardless of what the rest of the code uses + """ + lines = content.splitlines(keepends=True) + + # Dedent the first line + first_indent = _get_leading_spaces(content) + first_line = lines[0][first_indent:] + + second_indent = _get_leading_spaces(lines[1]) + + # If the first line ends with a colon, all successive lines need to be indented one + # additional level (assumes an indent width of 4). + if first_line.rstrip().endswith(":"): + second_indent -= 4 + + # All lines must be dedented at least by the same amount as the first line. + first_indent = max(first_indent, second_indent) + + # Dedent the rest of the lines and join them together with the first line. + content = first_line + "".join(line[first_indent:] for line in lines[1:]) + + return content diff --git a/bot/exts/info/stats.py b/bot/exts/info/stats.py index 21aa91873..4d8bb645e 100644 --- a/bot/exts/info/stats.py +++ b/bot/exts/info/stats.py @@ -6,7 +6,7 @@ from discord.ext.tasks import loop from bot.bot import Bot from bot.constants import Categories, Channels, Guild - +from bot.utils.channel import is_in_category CHANNEL_NAME_OVERRIDES = { Channels.off_topic_0: "off_topic_0", @@ -35,8 +35,7 @@ class Stats(Cog): if message.guild.id != Guild.id: return - cat = getattr(message.channel, "category", None) - if cat is not None and cat.id == Categories.modmail: + if is_in_category(message.channel, Categories.modmail): if message.channel.id != Channels.incidents: # Do not report modmail channels to stats, there are too many # of them for interesting statistics to be drawn out of this. diff --git a/bot/exts/utils/bot.py b/bot/exts/utils/bot.py index ba1fd2a5c..69d623581 100644 --- a/bot/exts/utils/bot.py +++ b/bot/exts/utils/bot.py @@ -1,22 +1,14 @@ -import ast import logging -import re -import time -from typing import Optional, Tuple +from typing import Optional -from discord import Embed, Message, RawMessageUpdateEvent, TextChannel +from discord import Embed, TextChannel from discord.ext.commands import Cog, Context, command, group, has_any_role from bot.bot import Bot -from bot.constants import Categories, Channels, DEBUG_MODE, Guild, MODERATION_ROLES, Roles, URLs -from bot.exts.filters.token_remover import TokenRemover -from bot.exts.filters.webhook_remover import WEBHOOK_URL_RE -from bot.utils.messages import wait_for_deletion +from bot.constants import Guild, MODERATION_ROLES, Roles, URLs log = logging.getLogger(__name__) -RE_MARKDOWN = re.compile(r'([*_~`|>])') - class BotCog(Cog, name="Bot"): """Bot information commands.""" @@ -24,19 +16,6 @@ class BotCog(Cog, name="Bot"): def __init__(self, bot: Bot): self.bot = bot - # Stores allowed channels plus epoch time since last call. - self.channel_cooldowns = { - Channels.python_discussion: 0, - } - - # These channels will also work, but will not be subject to cooldown - self.channel_whitelist = ( - Channels.bot_commands, - ) - - # Stores improperly formatted Python codeblock message ids and the corresponding bot message - self.codeblock_message_ids = {} - @group(invoke_without_command=True, name="bot", hidden=True) @has_any_role(Roles.verified) async def botinfo_group(self, ctx: Context) -> None: @@ -81,305 +60,6 @@ class BotCog(Cog, name="Bot"): else: await channel.send(embed=embed) - def codeblock_stripping(self, msg: str, bad_ticks: bool) -> Optional[Tuple[Tuple[str, ...], str]]: - """ - Strip msg in order to find Python code. - - Tries to strip out Python code out of msg and returns the stripped block or - None if the block is a valid Python codeblock. - """ - if msg.count("\n") >= 3: - # Filtering valid Python codeblocks and exiting if a valid Python codeblock is found. - if re.search("```(?:py|python)\n(.*?)```", msg, re.IGNORECASE | re.DOTALL) and not bad_ticks: - log.trace( - "Someone wrote a message that was already a " - "valid Python syntax highlighted code block. No action taken." - ) - return None - - else: - # Stripping backticks from every line of the message. - log.trace(f"Stripping backticks from message.\n\n{msg}\n\n") - content = "" - for line in msg.splitlines(keepends=True): - content += line.strip("`") - - content = content.strip() - - # Remove "Python" or "Py" from start of the message if it exists. - log.trace(f"Removing 'py' or 'python' from message.\n\n{content}\n\n") - pycode = False - if content.lower().startswith("python"): - content = content[6:] - pycode = True - elif content.lower().startswith("py"): - content = content[2:] - pycode = True - - if pycode: - content = content.splitlines(keepends=True) - - # Check if there might be code in the first line, and preserve it. - first_line = content[0] - if " " in content[0]: - first_space = first_line.index(" ") - content[0] = first_line[first_space:] - content = "".join(content) - - # If there's no code we can just get rid of the first line. - else: - content = "".join(content[1:]) - - # Strip it again to remove any leading whitespace. This is necessary - # if the first line of the message looked like ```python <code> - old = content.strip() - - # Strips REPL code out of the message if there is any. - content, repl_code = self.repl_stripping(old) - if old != content: - return (content, old), repl_code - - # Try to apply indentation fixes to the code. - content = self.fix_indentation(content) - - # Check if the code contains backticks, if it does ignore the message. - if "`" in content: - log.trace("Detected ` inside the code, won't reply") - return None - else: - log.trace(f"Returning message.\n\n{content}\n\n") - return (content,), repl_code - - def fix_indentation(self, msg: str) -> str: - """Attempts to fix badly indented code.""" - def unindent(code: str, skip_spaces: int = 0) -> str: - """Unindents all code down to the number of spaces given in skip_spaces.""" - final = "" - current = code[0] - leading_spaces = 0 - - # Get numbers of spaces before code in the first line. - while current == " ": - current = code[leading_spaces + 1] - leading_spaces += 1 - leading_spaces -= skip_spaces - - # If there are any, remove that number of spaces from every line. - if leading_spaces > 0: - for line in code.splitlines(keepends=True): - line = line[leading_spaces:] - final += line - return final - else: - return code - - # Apply fix for "all lines are overindented" case. - msg = unindent(msg) - - # If the first line does not end with a colon, we can be - # certain the next line will be on the same indentation level. - # - # If it does end with a colon, we will need to indent all successive - # lines one additional level. - first_line = msg.splitlines()[0] - code = "".join(msg.splitlines(keepends=True)[1:]) - if not first_line.endswith(":"): - msg = f"{first_line}\n{unindent(code)}" - else: - msg = f"{first_line}\n{unindent(code, 4)}" - return msg - - def repl_stripping(self, msg: str) -> Tuple[str, bool]: - """ - Strip msg in order to extract Python code out of REPL output. - - Tries to strip out REPL Python code out of msg and returns the stripped msg. - - Returns True for the boolean if REPL code was found in the input msg. - """ - final = "" - for line in msg.splitlines(keepends=True): - if line.startswith(">>>") or line.startswith("..."): - final += line[4:] - log.trace(f"Formatted: \n\n{msg}\n\n to \n\n{final}\n\n") - if not final: - log.trace(f"Found no REPL code in \n\n{msg}\n\n") - return msg, False - else: - log.trace(f"Found REPL code in \n\n{msg}\n\n") - return final.rstrip(), True - - def has_bad_ticks(self, msg: Message) -> bool: - """Check to see if msg contains ticks that aren't '`'.""" - not_backticks = [ - "'''", '"""', "\u00b4\u00b4\u00b4", "\u2018\u2018\u2018", "\u2019\u2019\u2019", - "\u2032\u2032\u2032", "\u201c\u201c\u201c", "\u201d\u201d\u201d", "\u2033\u2033\u2033", - "\u3003\u3003\u3003" - ] - - return msg.content[:3] in not_backticks - - @Cog.listener() - async def on_message(self, msg: Message) -> None: - """ - Detect poorly formatted Python code in new messages. - - If poorly formatted code is detected, send the user a helpful message explaining how to do - properly formatted Python syntax highlighting codeblocks. - """ - is_help_channel = ( - getattr(msg.channel, "category", None) - and msg.channel.category.id in (Categories.help_available, Categories.help_in_use) - ) - parse_codeblock = ( - ( - is_help_channel - or msg.channel.id in self.channel_cooldowns - or msg.channel.id in self.channel_whitelist - ) - and not msg.author.bot - and len(msg.content.splitlines()) > 3 - and not TokenRemover.find_token_in_message(msg) - and not WEBHOOK_URL_RE.search(msg.content) - ) - - if parse_codeblock: # no token in the msg - on_cooldown = (time.time() - self.channel_cooldowns.get(msg.channel.id, 0)) < 300 - if not on_cooldown or DEBUG_MODE: - try: - if self.has_bad_ticks(msg): - ticks = msg.content[:3] - content = self.codeblock_stripping(f"```{msg.content[3:-3]}```", True) - if content is None: - return - - content, repl_code = content - - if len(content) == 2: - content = content[1] - else: - content = content[0] - - space_left = 204 - if len(content) >= space_left: - current_length = 0 - lines_walked = 0 - for line in content.splitlines(keepends=True): - if current_length + len(line) > space_left or lines_walked == 10: - break - current_length += len(line) - lines_walked += 1 - content = content[:current_length] + "#..." - content_escaped_markdown = RE_MARKDOWN.sub(r'\\\1', content) - howto = ( - "It looks like you are trying to paste code into this channel.\n\n" - "You seem to be using the wrong symbols to indicate where the codeblock should start. " - f"The correct symbols would be \\`\\`\\`, not `{ticks}`.\n\n" - "**Here is an example of how it should look:**\n" - f"\\`\\`\\`python\n{content_escaped_markdown}\n\\`\\`\\`\n\n" - "**This will result in the following:**\n" - f"```python\n{content}\n```" - ) - - else: - howto = "" - content = self.codeblock_stripping(msg.content, False) - if content is None: - return - - content, repl_code = content - # Attempts to parse the message into an AST node. - # Invalid Python code will raise a SyntaxError. - tree = ast.parse(content[0]) - - # Multiple lines of single words could be interpreted as expressions. - # This check is to avoid all nodes being parsed as expressions. - # (e.g. words over multiple lines) - if not all(isinstance(node, ast.Expr) for node in tree.body) or repl_code: - # Shorten the code to 10 lines and/or 204 characters. - space_left = 204 - if content and repl_code: - content = content[1] - else: - content = content[0] - - if len(content) >= space_left: - current_length = 0 - lines_walked = 0 - for line in content.splitlines(keepends=True): - if current_length + len(line) > space_left or lines_walked == 10: - break - current_length += len(line) - lines_walked += 1 - content = content[:current_length] + "#..." - - content_escaped_markdown = RE_MARKDOWN.sub(r'\\\1', content) - howto += ( - "It looks like you're trying to paste code into this channel.\n\n" - "Discord has support for Markdown, which allows you to post code with full " - "syntax highlighting. Please use these whenever you paste code, as this " - "helps improve the legibility and makes it easier for us to help you.\n\n" - f"**To do this, use the following method:**\n" - f"\\`\\`\\`python\n{content_escaped_markdown}\n\\`\\`\\`\n\n" - "**This will result in the following:**\n" - f"```python\n{content}\n```" - ) - - log.debug(f"{msg.author} posted something that needed to be put inside python code " - "blocks. Sending the user some instructions.") - else: - log.trace("The code consists only of expressions, not sending instructions") - - if howto != "": - # Increase amount of codeblock correction in stats - self.bot.stats.incr("codeblock_corrections") - howto_embed = Embed(description=howto) - bot_message = await msg.channel.send(f"Hey {msg.author.mention}!", embed=howto_embed) - self.codeblock_message_ids[msg.id] = bot_message.id - - self.bot.loop.create_task( - wait_for_deletion(bot_message, (msg.author.id,), self.bot) - ) - else: - return - - if msg.channel.id not in self.channel_whitelist: - self.channel_cooldowns[msg.channel.id] = time.time() - - except SyntaxError: - log.trace( - f"{msg.author} posted in a help channel, and when we tried to parse it as Python code, " - "ast.parse raised a SyntaxError. This probably just means it wasn't Python code. " - f"The message that was posted was:\n\n{msg.content}\n\n" - ) - - @Cog.listener() - async def on_raw_message_edit(self, payload: RawMessageUpdateEvent) -> None: - """Check to see if an edited message (previously called out) still contains poorly formatted code.""" - if ( - # Checks to see if the message was called out by the bot - payload.message_id not in self.codeblock_message_ids - # Makes sure that there is content in the message - or payload.data.get("content") is None - # Makes sure there's a channel id in the message payload - or payload.data.get("channel_id") is None - ): - return - - # Retrieve channel and message objects for use later - channel = self.bot.get_channel(int(payload.data.get("channel_id"))) - user_message = await channel.fetch_message(payload.message_id) - - # Checks to see if the user has corrected their codeblock. If it's fixed, has_fixed_codeblock will be None - has_fixed_codeblock = self.codeblock_stripping(payload.data.get("content"), self.has_bad_ticks(user_message)) - - # If the message is fixed, delete the bot message and the entry from the id dictionary - if has_fixed_codeblock is None: - bot_message = await channel.fetch_message(self.codeblock_message_ids[payload.message_id]) - await bot_message.delete() - del self.codeblock_message_ids[payload.message_id] - log.trace("User's incorrect code block has been fixed. Removing bot formatting message.") - def setup(bot: Bot) -> None: """Load the Bot cog.""" diff --git a/bot/utils/__init__.py b/bot/utils/__init__.py index 60170a88f..13533a467 100644 --- a/bot/utils/__init__.py +++ b/bot/utils/__init__.py @@ -1,4 +1,4 @@ -from bot.utils.helpers import CogABCMeta, find_nth_occurrence, pad_base64 +from bot.utils.helpers import CogABCMeta, find_nth_occurrence, has_lines, pad_base64 from bot.utils.services import send_to_paste_service -__all__ = ['CogABCMeta', 'find_nth_occurrence', 'pad_base64', 'send_to_paste_service'] +__all__ = ['CogABCMeta', 'find_nth_occurrence', 'has_lines', 'pad_base64', 'send_to_paste_service'] diff --git a/bot/utils/channel.py b/bot/utils/channel.py new file mode 100644 index 000000000..851f9e1fe --- /dev/null +++ b/bot/utils/channel.py @@ -0,0 +1,33 @@ +import logging + +import discord + +from bot.constants import Categories + +log = logging.getLogger(__name__) + + +def is_help_channel(channel: discord.TextChannel) -> bool: + """Return True if `channel` is in one of the help categories (excluding dormant).""" + log.trace(f"Checking if #{channel} is a help channel.") + categories = (Categories.help_available, Categories.help_in_use) + + return any(is_in_category(channel, category) for category in categories) + + +def is_in_category(channel: discord.TextChannel, category_id: int) -> bool: + """Return True if `channel` is within a category with `category_id`.""" + return getattr(channel, "category_id", None) == category_id + + +async def try_get_channel(channel_id: int, client: discord.Client) -> discord.abc.GuildChannel: + """Attempt to get or fetch a channel and return it.""" + log.trace(f"Getting the channel {channel_id}.") + + channel = client.get_channel(channel_id) + if not channel: + log.debug(f"Channel {channel_id} is not in cache; fetching from API.") + channel = await client.fetch_channel(channel_id) + + log.trace(f"Channel #{channel} ({channel_id}) retrieved.") + return channel diff --git a/bot/utils/helpers.py b/bot/utils/helpers.py index d9b60af07..3501a3933 100644 --- a/bot/utils/helpers.py +++ b/bot/utils/helpers.py @@ -18,6 +18,15 @@ def find_nth_occurrence(string: str, substring: str, n: int) -> Optional[int]: return index +def has_lines(string: str, count: int) -> bool: + """Return True if `string` has at least `count` lines.""" + # Benchmarks show this is significantly faster than using str.count("\n") or a for loop & break. + split = string.split("\n", count - 1) + + # Make sure the last part isn't empty, which would happen if there was a final newline. + return split[-1] and len(split) == count + + def pad_base64(data: str) -> str: """Return base64 `data` with padding characters to ensure its length is a multiple of 4.""" return data + "=" * (-len(data) % 4) diff --git a/config-default.yml b/config-default.yml index 4f7b1e217..0e7ebf2e3 100644 --- a/config-default.yml +++ b/config-default.yml @@ -145,8 +145,8 @@ guild: dev_log: &DEV_LOG 622895325144940554 # Discussion - meta: 429409067623251969 - python_discussion: 267624335836053506 + meta: 429409067623251969 + python_discussion: &PY_DISCUSSION 267624335836053506 # Python Help: Available how_to_get_help: 704250143020417084 @@ -394,6 +394,23 @@ big_brother: header_message_limit: 15 +code_block: + # The channels in which code blocks will be detected. They are not subject to a cooldown. + channel_whitelist: + - *BOT_CMD + + # The channels which will be affected by a cooldown. These channels are also whitelisted. + cooldown_channels: + - *PY_DISCUSSION + + # Sending instructions triggers a cooldown on a per-channel basis. + # More instruction messages will not be sent in the same channel until the cooldown has elapsed. + cooldown_seconds: 300 + + # The minimum amount of lines a message or code block must have for instructions to be sent. + minimum_lines: 4 + + free: # Seconds to elapse for a channel # to be considered inactive. |