diff options
| -rw-r--r-- | bot/cogs/codeblock/cog.py | 128 | ||||
| -rw-r--r-- | bot/cogs/codeblock/parsing.py | 117 |
2 files changed, 126 insertions, 119 deletions
diff --git a/bot/cogs/codeblock/cog.py b/bot/cogs/codeblock/cog.py index d0ffcab3f..dad0cc9cc 100644 --- a/bot/cogs/codeblock/cog.py +++ b/bot/cogs/codeblock/cog.py @@ -1,8 +1,6 @@ -import ast import logging -import re import time -from typing import NamedTuple, Optional, Sequence +from typing import Optional import discord from discord import Embed, Message, RawMessageUpdateEvent @@ -11,46 +9,10 @@ from discord.ext.commands import Bot, Cog from bot.cogs.token_remover import TokenRemover from bot.constants import Categories, Channels, DEBUG_MODE from bot.utils.messages import wait_for_deletion +from . import parsing log = logging.getLogger(__name__) -RE_MARKDOWN = re.compile(r'([*_~`|>])') -RE_CODE_BLOCK_LANGUAGE = re.compile(r"```(?:[^\W_]+)\n(.*?)```", re.DOTALL) -BACKTICK = "`" -TICKS = { - BACKTICK, - "'", - '"', - "\u00b4", # ACUTE ACCENT - "\u2018", # LEFT SINGLE QUOTATION MARK - "\u2019", # RIGHT SINGLE QUOTATION MARK - "\u2032", # PRIME - "\u201c", # LEFT DOUBLE QUOTATION MARK - "\u201d", # RIGHT DOUBLE QUOTATION MARK - "\u2033", # DOUBLE PRIME - "\u3003", # VERTICAL KANA REPEAT MARK UPPER HALF -} -RE_CODE_BLOCK = re.compile( - fr""" - ( - ([{''.join(TICKS)}]) # Put all ticks into a character class within a group. - \2{{2}} # Match the previous group 2 more times to ensure it's the same char. - ) - ([^\W_]+\n)? # Optionally match a language specifier followed by a newline. - (.+?) # Match the actual code within the block. - \1 # Match the same 3 ticks used at the start of the block. - """, - re.DOTALL | re.VERBOSE -) - - -class CodeBlock(NamedTuple): - """Represents a Markdown code block.""" - - content: str - language: str - tick: str - class CodeBlockCog(Cog, name="Code Block"): """Detect improperly formatted code blocks and suggest proper formatting.""" @@ -85,8 +47,8 @@ class CodeBlockCog(Cog, name="Code Block"): else: content = content[0] - content = self.truncate(content) - content_escaped_markdown = RE_MARKDOWN.sub(r'\\\1', content) + content = parsing.truncate(content) + content_escaped_markdown = parsing.RE_MARKDOWN.sub(r'\\\1', content) return ( "It looks like you are trying to paste code into this channel.\n\n" @@ -106,7 +68,7 @@ class CodeBlockCog(Cog, name="Code Block"): content, repl_code = content - if not repl_code and not self.is_python_code(content[0]): + if not repl_code and not parsing.is_python_code(content[0]): return if content and repl_code: @@ -114,14 +76,14 @@ class CodeBlockCog(Cog, name="Code Block"): else: content = content[0] - content = self.truncate(content) + content = parsing.truncate(content) log.debug( f"{message.author} posted something that needed to be put inside python code " f"blocks. Sending the user some instructions." ) - content_escaped_markdown = RE_MARKDOWN.sub(r'\\\1', content) + content_escaped_markdown = parsing.RE_MARKDOWN.sub(r'\\\1', content) return ( "It looks like you're trying to paste code into this channel.\n\n" "Discord has support for Markdown, which allows you to post code with full " @@ -134,44 +96,6 @@ class CodeBlockCog(Cog, name="Code Block"): ) @staticmethod - def find_code_blocks(message: str) -> Sequence[CodeBlock]: - """ - Find and return all Markdown code blocks in the `message`. - - Code blocks with 3 or less lines are excluded. - - If the `message` contains at least one code block with valid ticks and a specified language, - return an empty sequence. This is based on the assumption that if the user managed to get - one code block right, they already know how to fix the rest themselves. - """ - code_blocks = [] - for _, tick, language, content in RE_CODE_BLOCK.finditer(message): - language = language.strip() - if tick == BACKTICK and language: - return () - elif len(content.split("\n", 3)) > 3: - code_block = CodeBlock(content, language, tick) - code_blocks.append(code_block) - - @staticmethod - def is_repl_code(content: str, threshold: int = 3) -> bool: - """Return True if `content` has at least `threshold` number of Python REPL-like lines.""" - repl_lines = 0 - for line in content.splitlines(): - if line.startswith(">>> ") or line.startswith("... "): - repl_lines += 1 - - if repl_lines == threshold: - return True - - return False - - @staticmethod - def has_bad_ticks(message: discord.Message) -> bool: - """Return True if `message` starts with 3 characters which look like but aren't '`'.""" - return message.content[:3] in TICKS - - @staticmethod def is_help_channel(channel: discord.TextChannel) -> bool: """Return True if `channel` is in one of the help categories.""" return ( @@ -187,26 +111,6 @@ class CodeBlockCog(Cog, name="Code Block"): """ return (time.time() - self.channel_cooldowns.get(channel.id, 0)) < 300 - @staticmethod - def is_python_code(content: str) -> bool: - """Return True if `content` is valid Python consisting of more than just expressions.""" - try: - # Attempt to parse the message into an AST node. - # Invalid Python code will raise a SyntaxError. - tree = ast.parse(content) - except SyntaxError: - log.trace("Code is not valid Python.") - return False - - # Multiple lines of single words could be interpreted as expressions. - # This check is to avoid all nodes being parsed as expressions. - # (e.g. words over multiple lines) - if not all(isinstance(node, ast.Expr) for node in tree.body): - return True - else: - log.trace("Code consists only of expressions.") - return False - def is_valid_channel(self, channel: discord.TextChannel) -> bool: """Return True if `channel` is a help channel, may be on cooldown, or is whitelisted.""" return ( @@ -247,20 +151,6 @@ class CodeBlockCog(Cog, name="Code Block"): and not TokenRemover.find_token_in_message(message) ) - @staticmethod - def truncate(content: str, max_chars: int = 204, max_lines: int = 10) -> str: - """Return `content` truncated to be at most `max_chars` or `max_lines` in length.""" - current_length = 0 - lines_walked = 0 - - for line in content.splitlines(keepends=True): - if current_length + len(line) > max_chars or lines_walked == max_lines: - break - current_length += len(line) - lines_walked += 1 - - return content[:current_length] + "#..." - @Cog.listener() async def on_message(self, msg: Message) -> None: """ @@ -277,7 +167,7 @@ class CodeBlockCog(Cog, name="Code Block"): return try: - if self.has_bad_ticks(msg): + if parsing.has_bad_ticks(msg): description = self.format_bad_ticks_message(msg) else: description = self.format_guide_message(msg) @@ -311,7 +201,7 @@ class CodeBlockCog(Cog, name="Code Block"): user_message = await channel.fetch_message(payload.message_id) # Checks to see if the user has corrected their codeblock. If it's fixed, has_fixed_codeblock will be None - has_fixed_codeblock = self.codeblock_stripping(payload.data.get("content"), self.has_bad_ticks(user_message)) + has_fixed_codeblock = self.codeblock_stripping(payload.data.get("content"), parsing.has_bad_ticks(user_message)) # If the message is fixed, delete the bot message and the entry from the id dictionary if has_fixed_codeblock is None: diff --git a/bot/cogs/codeblock/parsing.py b/bot/cogs/codeblock/parsing.py new file mode 100644 index 000000000..7a096758b --- /dev/null +++ b/bot/cogs/codeblock/parsing.py @@ -0,0 +1,117 @@ +import ast +import logging +import re +from typing import NamedTuple, Sequence + +import discord + +log = logging.getLogger(__name__) + +RE_MARKDOWN = re.compile(r'([*_~`|>])') +RE_CODE_BLOCK_LANGUAGE = re.compile(r"```(?:[^\W_]+)\n(.*?)```", re.DOTALL) +BACKTICK = "`" +TICKS = { + BACKTICK, + "'", + '"', + "\u00b4", # ACUTE ACCENT + "\u2018", # LEFT SINGLE QUOTATION MARK + "\u2019", # RIGHT SINGLE QUOTATION MARK + "\u2032", # PRIME + "\u201c", # LEFT DOUBLE QUOTATION MARK + "\u201d", # RIGHT DOUBLE QUOTATION MARK + "\u2033", # DOUBLE PRIME + "\u3003", # VERTICAL KANA REPEAT MARK UPPER HALF +} +RE_CODE_BLOCK = re.compile( + fr""" + ( + ([{''.join(TICKS)}]) # Put all ticks into a character class within a group. + \2{{2}} # Match the previous group 2 more times to ensure it's the same char. + ) + ([^\W_]+\n)? # Optionally match a language specifier followed by a newline. + (.+?) # Match the actual code within the block. + \1 # Match the same 3 ticks used at the start of the block. + """, + re.DOTALL | re.VERBOSE +) + + +class CodeBlock(NamedTuple): + """Represents a Markdown code block.""" + + content: str + language: str + tick: str + + +def find_code_blocks(message: str) -> Sequence[CodeBlock]: + """ + Find and return all Markdown code blocks in the `message`. + + Code blocks with 3 or less lines are excluded. + + If the `message` contains at least one code block with valid ticks and a specified language, + return an empty sequence. This is based on the assumption that if the user managed to get + one code block right, they already know how to fix the rest themselves. + """ + code_blocks = [] + for _, tick, language, content in RE_CODE_BLOCK.finditer(message): + language = language.strip() + if tick == BACKTICK and language: + return () + elif len(content.split("\n", 3)) > 3: + code_block = CodeBlock(content, language, tick) + code_blocks.append(code_block) + + +def has_bad_ticks(message: discord.Message) -> bool: + """Return True if `message` starts with 3 characters which look like but aren't '`'.""" + return message.content[:3] in TICKS + + +def is_python_code(content: str) -> bool: + """Return True if `content` is valid Python consisting of more than just expressions.""" + try: + # Attempt to parse the message into an AST node. + # Invalid Python code will raise a SyntaxError. + tree = ast.parse(content) + except SyntaxError: + log.trace("Code is not valid Python.") + return False + + # Multiple lines of single words could be interpreted as expressions. + # This check is to avoid all nodes being parsed as expressions. + # (e.g. words over multiple lines) + if not all(isinstance(node, ast.Expr) for node in tree.body): + return True + else: + log.trace("Code consists only of expressions.") + return False + + +def is_repl_code(content: str, threshold: int = 3) -> bool: + """Return True if `content` has at least `threshold` number of Python REPL-like lines.""" + repl_lines = 0 + for line in content.splitlines(): + if line.startswith(">>> ") or line.startswith("... "): + repl_lines += 1 + + if repl_lines == threshold: + return True + + return False + + +def truncate(content: str, max_chars: int = 204, max_lines: int = 10) -> str: + """Return `content` truncated to be at most `max_chars` or `max_lines` in length.""" + current_length = 0 + lines_walked = 0 + + for line in content.splitlines(keepends=True): + if current_length + len(line) > max_chars or lines_walked == max_lines: + break + current_length += len(line) + lines_walked += 1 + + return content[:current_length] + "#..." |