diff options
| author | 2020-07-19 03:13:02 +0200 | |
|---|---|---|
| committer | 2020-07-19 03:13:02 +0200 | |
| commit | eb8361d7fa9d0eb0dd5982c6df0fd35b80d40ba6 (patch) | |
| tree | 3358f127db8b6f4c1ef18ccaa0823b07a5aac50b | |
| parent | Move main parsing methods into a new module (diff) | |
Move markdown truncation into parser module
| -rw-r--r-- | bot/cogs/doc/cog.py | 27 | ||||
| -rw-r--r-- | bot/cogs/doc/parser.py | 29 |
2 files changed, 31 insertions, 25 deletions
diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py index 4a275c7c6..bd4e9d4d1 100644 --- a/bot/cogs/doc/cog.py +++ b/bot/cogs/doc/cog.py @@ -25,7 +25,7 @@ from bot.decorators import with_role from bot.pagination import LinePaginator from bot.utils.messages import wait_for_deletion from .cache import async_cache -from .parser import get_soup_from_url, parse_module_symbol, parse_symbol +from .parser import get_soup_from_url, parse_module_symbol, parse_symbol, truncate_markdown log = logging.getLogger(__name__) logging.getLogger('urllib3').setLevel(logging.WARNING) @@ -270,30 +270,7 @@ class DocCog(commands.Cog): self.bot.stats.incr(f"doc_fetches.{symbol_obj.package.lower()}") signatures = scraped_html[0] permalink = symbol_obj.url - description = markdownify(scraped_html[1], url=permalink) - - # Truncate the description of the embed to the last occurrence - # of a double newline (interpreted as a paragraph) before index 1000. - if len(description) > 1000: - shortened = description[:1000] - description_cutoff = shortened.rfind('\n\n', 100) - if description_cutoff == -1: - # Search the shortened version for cutoff points in decreasing desirability, - # cutoff at 1000 if none are found. - for string in (". ", ", ", ",", " "): - description_cutoff = shortened.rfind(string) - if description_cutoff != -1: - break - else: - description_cutoff = 1000 - description = description[:description_cutoff] - - # If there is an incomplete code block, cut it out - if description.count("```") % 2: - codeblock_start = description.rfind('```py') - description = description[:codeblock_start].rstrip() - description += f"... [read more]({permalink})" - + description = truncate_markdown(markdownify(scraped_html[1], url=permalink), permalink, 1000) description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is None: # If symbol is a module, don't show signature. diff --git a/bot/cogs/doc/parser.py b/bot/cogs/doc/parser.py index 67621591b..010826a96 100644 --- a/bot/cogs/doc/parser.py +++ b/bot/cogs/doc/parser.py @@ -83,6 +83,35 @@ def find_all_children_until_tag( return text +def truncate_markdown(markdown: str, permalink: str, max_length: int) -> str: + """ + Truncate `markdown` to be at most `max_length` characters. + + The markdown string is searched for substrings to cut at, to keep its structure, + but if none are found the string is simply sliced. + """ + if len(markdown) > max_length: + shortened = markdown[:max_length] + description_cutoff = shortened.rfind('\n\n', 100) + if description_cutoff == -1: + # Search the shortened version for cutoff points in decreasing desirability, + # cutoff at 1000 if none are found. + for string in (". ", ", ", ",", " "): + description_cutoff = shortened.rfind(string) + if description_cutoff != -1: + break + else: + description_cutoff = max_length + markdown = markdown[:description_cutoff] + + # If there is an incomplete code block, cut it out + if markdown.count("```") % 2: + codeblock_start = markdown.rfind('```py') + markdown = markdown[:codeblock_start].rstrip() + markdown += f"... [read more]({permalink})" + return markdown + + @async_cache(arg_offset=1) async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" |