diff options
| author | 2020-09-21 00:30:08 +0200 | |
|---|---|---|
| committer | 2020-09-21 00:30:56 +0200 | |
| commit | 7ab949e09a22d7547f74caa447d81299f7b52e47 (patch) | |
| tree | fbc59528840a94e996434643ffa112c00af7d0f2 | |
| parent | Log exceptions from parsing task (diff) | |
Properly truncate description markdown
The previous truncating implementation used a naive method that
disregarded the actual markdown formatting, possibly resulting in
it getting cut out. With the introduction of proper href tags this
became impossible to manage without writing an actual parser; so the
process was moved to happen when the gathered bs4 elements are being
converted into markdown
| -rw-r--r-- | bot/cogs/doc/markdown.py | 7 | ||||
| -rw-r--r-- | bot/cogs/doc/parsing.py | 86 |
2 files changed, 49 insertions, 44 deletions
diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py index dca477d35..a95e94991 100644 --- a/bot/cogs/doc/markdown.py +++ b/bot/cogs/doc/markdown.py @@ -4,7 +4,7 @@ from bs4.element import PageElement from markdownify import MarkdownConverter -class _DocMarkdownConverter(MarkdownConverter): +class DocMarkdownConverter(MarkdownConverter): """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" def __init__(self, *, page_url: str, **options): @@ -51,8 +51,3 @@ class _DocMarkdownConverter(MarkdownConverter): if parent is not None and parent.name == "li": return f"{text}\n" return super().convert_p(el, text) - - -def markdownify(html: str, *, url: str = "") -> str: - """Create a DocMarkdownConverter object from the input html.""" - return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 21a3065f4..ed6343cd8 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -5,13 +5,13 @@ import re import string import textwrap from functools import partial -from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union +from typing import Callable, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union from bs4 import BeautifulSoup from bs4.element import NavigableString, PageElement, Tag from .html import Strainer -from .markdown import markdownify +from .markdown import DocMarkdownConverter if TYPE_CHECKING: from .cog import DocItem @@ -39,6 +39,8 @@ _NO_SIGNATURE_GROUPS = { "templatetag", "term", } +_MAX_DESCRIPTION_LENGTH = 1800 +_TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace def _find_elements_until_tag( @@ -80,7 +82,7 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def _get_general_description(start_element: PageElement) -> Optional[str]: +def _get_general_description(start_element: PageElement) -> Iterable[Union[Tag, NavigableString]]: """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. @@ -89,18 +91,13 @@ def _get_general_description(start_element: PageElement) -> Optional[str]: """ header = start_element.find_next("a", attrs={"class": "headerlink"}) start_tag = header.parent if header is not None else start_element - description = "".join( - str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) - ) + return _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) - return description - -def _get_dd_description(symbol: PageElement) -> str: - """Get the string contents of the next dd tag, up to a dt or a dl tag.""" +def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: + """Get the contents of the next dd tag, up to a dt or a dl tag.""" description_tag = symbol.find_next("dd") - description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) - return "".join(str(tag) for tag in description_contents) + return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) def _get_signatures(start_signature: PageElement) -> List[str]: @@ -124,43 +121,57 @@ def _get_signatures(start_signature: PageElement) -> List[str]: return signatures -def _truncate_markdown(markdown: str, max_length: int) -> str: +def _get_truncated_description( + elements: Iterable[Union[Tag, NavigableString]], + markdown_converter: DocMarkdownConverter, + max_length: int, +) -> str: """ - Truncate `markdown` to be at most `max_length` characters. + Truncate markdown from `elements` to be at most `max_length` characters visually. - The markdown string is searched for substrings to cut at, to keep its structure, - but if none are found the string is simply sliced. + `max_length` limits the length of the rendered characters in the string, + with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits """ - if len(markdown) > max_length: - shortened = markdown[:max_length] - description_cutoff = shortened.rfind('\n\n', 100) - if description_cutoff == -1: - # Search the shortened version for cutoff points in decreasing desirability, - # cutoff at 1000 if none are found. - for cutoff_string in (". ", ", ", ",", " "): - description_cutoff = shortened.rfind(cutoff_string) - if description_cutoff != -1: - break + visual_length = 0 + real_length = 0 + result = [] + shortened = False + + for element in elements: + is_tag = isinstance(element, Tag) + element_length = len(element.text) if is_tag else len(element) + if visual_length + element_length < max_length: + if is_tag: + element_markdown = markdown_converter.process_tag(element) + else: + element_markdown = markdown_converter.process_text(element) + + element_markdown_length = len(element_markdown) + if real_length + element_markdown_length < _MAX_DESCRIPTION_LENGTH: + result.append(element_markdown) else: - description_cutoff = max_length - markdown = markdown[:description_cutoff] + shortened = True + break + real_length += element_markdown_length + visual_length += element_length + else: + shortened = True + break - # If there is an incomplete code block, cut it out - if markdown.count("```") % 2: - codeblock_start = markdown.rfind('```py') - markdown = markdown[:codeblock_start].rstrip() - markdown = markdown.rstrip(string.punctuation) + "..." - return markdown + markdown_string = "".join(result) + if shortened: + markdown_string = markdown_string.rstrip(_TRUNCATE_STRIP_CHARACTERS) + "..." + return markdown_string -def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: str) -> str: +def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: """ Create a markdown string with the signatures at the top, and the converted html description below them. The signatures are wrapped in python codeblocks, separated from the description by a newline. The result string is truncated to be max 1000 symbols long. """ - description = _truncate_markdown(markdownify(description, url=url), 1000) + description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750) description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is not None: formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) @@ -204,5 +215,4 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: else: signature = _get_signatures(symbol_heading) description = _get_dd_description(symbol_heading) - - return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url) + return _parse_into_markdown(signature, description, symbol_data.url).replace('¶', '') |