Code block: use regex to parse incorrect languages

Regex is simpler and more versatile in this case. The functions in the `instructions` module should be more focused on formatting than parsing, so the parsing was moved to the `parsing` module.

* Move _PY_LANG_CODES to the `parsing` module
* Create a separate function in the `parsing` module to parse bad languages
author: MarkKoz <[email protected]> 2020-05-07 18:33:34 -0700
committer: MarkKoz <[email protected]> 2020-06-13 11:21:08 -0700
commit: 2bfac307c4b06682db93e2a75108012a586d1c7d (patch)
tree: 03e9cd15cf695db05e93b144f0f1209b61c8be04
parent: Code block: create a function to format the example code blocks (diff)
2 files changed, 51 insertions, 18 deletions
diff --git a/bot/cogs/codeblock/instructions.py b/bot/cogs/codeblock/instructions.py
index bba84c66a..c1a6645b3 100644
--- a/bot/cogs/codeblock/instructions.py
+++ b/bot/cogs/codeblock/instructions.py
@@ -5,7 +5,6 @@ from . import parsing
 
 log = logging.getLogger(__name__)
 
-_PY_LANG_CODES = ("python", "py")  # Order is important; "py" is second cause it's a subset.
 _EXAMPLE_PY = "{lang}\nprint('Hello, world!')"  # Make sure to escape any Markdown symbols here.
 _EXAMPLE_CODE_BLOCKS = (
     "\\`\\`\\`{content}\n\\`\\`\\`\n\n"
@@ -16,16 +15,14 @@ _EXAMPLE_CODE_BLOCKS = (
 
 def _get_example(language: str) -> str:
     """Return an example of a correct code block using `language` for syntax highlighting."""
-    language_lower = language.lower()  # It's only valid if it's all lowercase.
-
     # Determine the example code to put in the code block based on the language specifier.
-    if language_lower in _PY_LANG_CODES:
+    if language.lower() in parsing.PY_LANG_CODES:
         log.trace(f"Code block has a Python language specifier `{language}`.")
-        content = _EXAMPLE_PY.format(lang=language_lower)
-    elif language_lower:
+        content = _EXAMPLE_PY.format(lang=language)
+    elif language:
         log.trace(f"Code block has a foreign language specifier `{language}`.")
         # It's not feasible to determine what would be a valid example for other languages.
-        content = f"{language_lower}\n..."
+        content = f"{language}\n..."
     else:
         log.trace("Code block has no language specifier.")
         content = "Hello, world!"
@@ -92,26 +89,25 @@ def _get_bad_lang_message(content: str) -> Optional[str]:
     If `content` doesn't start with "python" or "py" as the language specifier, return None.
     """
     log.trace("Creating instructions for a poorly specified language.")
+    info = parsing.parse_bad_language(content)
 
-    stripped = content.lstrip().lower()
-    lang = next((lang for lang in _PY_LANG_CODES if stripped.startswith(lang)), None)
-
-    if lang:
+    if info:
         # Note that _get_bad_ticks_message expects the first line to have an extra newline.
         lines = ["It looks like you incorrectly specified a language for your code block.\n"]
+        language = info.language
 
-        if content.startswith(" "):
+        if info.leading_spaces:
             log.trace("Language specifier was preceded by a space.")
-            lines.append(f"Make sure there are no spaces between the back ticks and `{lang}`.")
+            lines.append(f"Make sure there are no spaces between the back ticks and `{language}`.")
 
-        if stripped[len(lang)] != "\n":
+        if not info.terminal_newline:
             log.trace("Language specifier was not followed by a newline.")
             lines.append(
-                f"Make sure you put your code on a new line following `{lang}`. "
-                f"There must not be any spaces after `{lang}`."
+                f"Make sure you put your code on a new line following `{language}`. "
+                f"There must not be any spaces after `{language}`."
             )
 
-        example_blocks = _get_example(lang)
+        example_blocks = _get_example(language)
         lines.append(f"\n**Here is an example of how it should look:**\n{example_blocks}")
 
         return "\n".join(lines)
diff --git a/bot/cogs/codeblock/parsing.py b/bot/cogs/codeblock/parsing.py
index a49ecc8f7..6fa6811cc 100644
--- a/bot/cogs/codeblock/parsing.py
+++ b/bot/cogs/codeblock/parsing.py
@@ -22,7 +22,7 @@ _TICKS = {
 _RE_CODE_BLOCK = re.compile(
     fr"""
     (?P<ticks>
-        (?P<tick>[{''.join(_TICKS)}])  # Put all ticks into a character class within a group.
+        (?P<tick>[{''.join(_TICKS)}]) # Put all ticks into a character class within a group.
         \2{{2}}                       # Match previous group 2 more times to ensure the same char.
     )
     (?P<lang>[^\W_]+\n)?              # Optionally match a language specifier followed by a newline.
@@ -32,6 +32,16 @@ _RE_CODE_BLOCK = re.compile(
     re.DOTALL | re.VERBOSE
 )
 
+PY_LANG_CODES = ("python", "py")  # Order is important; "py" is second cause it's a subset.
+_RE_LANGUAGE = re.compile(
+    fr"""
+    ^(?P<spaces>\s+)?                    # Optionally match leading spaces from the beginning.
+    (?P<lang>{'|'.join(PY_LANG_CODES)})  # Match a Python language.
+    (?P<newline>\n)?                     # Optionally match a newline following the language.
+    """,
+    re.IGNORECASE | re.VERBOSE
+)
+
 
 class CodeBlock(NamedTuple):
     """Represents a Markdown code block."""
@@ -41,6 +51,14 @@ class CodeBlock(NamedTuple):
     tick: str
 
 
+class BadLanguage(NamedTuple):
+    """Parsed information about a poorly formatted language specifier."""
+
+    language: str
+    leading_spaces: bool
+    terminal_newline: bool
+
+
 def find_code_blocks(message: str) -> Optional[Sequence[CodeBlock]]:
     """
     Find and return all Markdown code blocks in the `message`.
@@ -108,3 +126,22 @@ def is_repl_code(content: str, threshold: int = 3) -> bool:
 
     log.trace("Content is not Python REPL code.")
     return False
+
+
+def parse_bad_language(content: str) -> Optional[BadLanguage]:
+    """
+    Return information about a poorly formatted Python language in code block `content`.
+
+    If the language is not Python, return None.
+    """
+    log.trace("Parsing bad language.")
+
+    match = _RE_LANGUAGE.match(content)
+    if not match:
+        return None
+
+    return BadLanguage(
+        language=match["lang"],
+        leading_spaces=match["spaces"] is not None,
+        terminal_newline=match["newline"] is not None,
+    )
author	MarkKoz <[email protected]>	2020-05-07 18:33:34 -0700
committer	MarkKoz <[email protected]>	2020-06-13 11:21:08 -0700
commit	2bfac307c4b06682db93e2a75108012a586d1c7d (patch)
tree	03e9cd15cf695db05e93b144f0f1209b61c8be04
parent	Code block: create a function to format the example code blocks (diff)