Move markdown truncation into parser module

author: Numerlor <[email protected]> 2020-07-19 03:13:02 +0200
committer: Numerlor <[email protected]> 2020-07-19 03:13:02 +0200
commit: eb8361d7fa9d0eb0dd5982c6df0fd35b80d40ba6 (patch)
tree: 3358f127db8b6f4c1ef18ccaa0823b07a5aac50b
parent: Move main parsing methods into a new module (diff)
2 files changed, 31 insertions, 25 deletions
diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py
index 4a275c7c6..bd4e9d4d1 100644
--- a/bot/cogs/doc/cog.py
+++ b/bot/cogs/doc/cog.py
@@ -25,7 +25,7 @@ from bot.decorators import with_role
 from bot.pagination import LinePaginator
 from bot.utils.messages import wait_for_deletion
 from .cache import async_cache
-from .parser import get_soup_from_url, parse_module_symbol, parse_symbol
+from .parser import get_soup_from_url, parse_module_symbol, parse_symbol, truncate_markdown
 
 log = logging.getLogger(__name__)
 logging.getLogger('urllib3').setLevel(logging.WARNING)
@@ -270,30 +270,7 @@ class DocCog(commands.Cog):
         self.bot.stats.incr(f"doc_fetches.{symbol_obj.package.lower()}")
         signatures = scraped_html[0]
         permalink = symbol_obj.url
-        description = markdownify(scraped_html[1], url=permalink)
-
-        # Truncate the description of the embed to the last occurrence
-        # of a double newline (interpreted as a paragraph) before index 1000.
-        if len(description) > 1000:
-            shortened = description[:1000]
-            description_cutoff = shortened.rfind('\n\n', 100)
-            if description_cutoff == -1:
-                # Search the shortened version for cutoff points in decreasing desirability,
-                # cutoff at 1000 if none are found.
-                for string in (". ", ", ", ",", " "):
-                    description_cutoff = shortened.rfind(string)
-                    if description_cutoff != -1:
-                        break
-                else:
-                    description_cutoff = 1000
-            description = description[:description_cutoff]
-
-            # If there is an incomplete code block, cut it out
-            if description.count("```") % 2:
-                codeblock_start = description.rfind('```py')
-                description = description[:codeblock_start].rstrip()
-            description += f"... [read more]({permalink})"
-
+        description = truncate_markdown(markdownify(scraped_html[1], url=permalink), permalink, 1000)
         description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description)
         if signatures is None:
             # If symbol is a module, don't show signature.
diff --git a/bot/cogs/doc/parser.py b/bot/cogs/doc/parser.py
index 67621591b..010826a96 100644
--- a/bot/cogs/doc/parser.py
+++ b/bot/cogs/doc/parser.py
@@ -83,6 +83,35 @@ def find_all_children_until_tag(
     return text
 
 
+def truncate_markdown(markdown: str, permalink: str, max_length: int) -> str:
+    """
+    Shorten `markdown` that exceeds `max_length` characters and end it with a "read more" link to `permalink`.
+
+    The text is cut at the last paragraph break, sentence end, comma or space (in that order of preference)
+    within its first `max_length` characters, or sliced there if none is found; the link adds to the length.
+    """
+    if len(markdown) > max_length:
+        shortened = markdown[:max_length]
+        description_cutoff = shortened.rfind('\n\n', 100)
+        if description_cutoff == -1:
+            # No paragraph break from index 100 onwards: search the shortened version for cutoff points
+            # in decreasing desirability, and cut off at `max_length` if none are found.
+            for string in (". ", ", ", ",", " "):
+                description_cutoff = shortened.rfind(string)
+                if description_cutoff != -1:
+                    break
+            else:
+                description_cutoff = max_length
+        markdown = markdown[:description_cutoff]
+
+        # An odd number of ``` means the cut left a code block open; cut it out from its last ```py opener.
+        if markdown.count("```") % 2:
+            codeblock_start = markdown.rfind('```py')
+            markdown = markdown[:codeblock_start].rstrip()
+        markdown += f"... [read more]({permalink})"
+    return markdown
+
+
 @async_cache(arg_offset=1)
 async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
     """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
author	Numerlor <[email protected]>	2020-07-19 03:13:02 +0200
committer	Numerlor <[email protected]>	2020-07-19 03:13:02 +0200
commit	eb8361d7fa9d0eb0dd5982c6df0fd35b80d40ba6 (patch)
tree	3358f127db8b6f4c1ef18ccaa0823b07a5aac50b
parent	Move main parsing methods into a new module (diff)