diff options
| -rw-r--r-- | bot/cogs/doc/markdown.py | 7 | ||||
| -rw-r--r-- | bot/cogs/doc/parsing.py | 86 |
2 files changed, 49 insertions, 44 deletions
diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py index dca477d35..a95e94991 100644 --- a/bot/cogs/doc/markdown.py +++ b/bot/cogs/doc/markdown.py @@ -4,7 +4,7 @@ from bs4.element import PageElement from markdownify import MarkdownConverter -class _DocMarkdownConverter(MarkdownConverter): +class DocMarkdownConverter(MarkdownConverter): """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" def __init__(self, *, page_url: str, **options): @@ -51,8 +51,3 @@ class _DocMarkdownConverter(MarkdownConverter): if parent is not None and parent.name == "li": return f"{text}\n" return super().convert_p(el, text) - - -def markdownify(html: str, *, url: str = "") -> str: - """Create a DocMarkdownConverter object from the input html.""" - return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 21a3065f4..ed6343cd8 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -5,13 +5,13 @@ import re import string import textwrap from functools import partial -from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union +from typing import Callable, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union from bs4 import BeautifulSoup from bs4.element import NavigableString, PageElement, Tag from .html import Strainer -from .markdown import markdownify +from .markdown import DocMarkdownConverter if TYPE_CHECKING: from .cog import DocItem @@ -39,6 +39,8 @@ _NO_SIGNATURE_GROUPS = { "templatetag", "term", } +_MAX_DESCRIPTION_LENGTH = 1800 +_TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace def _find_elements_until_tag( @@ -80,7 +82,7 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def _get_general_description(start_element: PageElement) -> Optional[str]: +def _get_general_description(start_element: PageElement) -> Iterable[Union[Tag, NavigableString]]: """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. @@ -89,18 +91,13 @@ def _get_general_description(start_element: PageElement) -> Optional[str]: """ header = start_element.find_next("a", attrs={"class": "headerlink"}) start_tag = header.parent if header is not None else start_element - description = "".join( - str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) - ) + return _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) - return description - -def _get_dd_description(symbol: PageElement) -> str: - """Get the string contents of the next dd tag, up to a dt or a dl tag.""" +def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: + """Get the contents of the next dd tag, up to a dt or a dl tag.""" description_tag = symbol.find_next("dd") - description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) - return "".join(str(tag) for tag in description_contents) + return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) def _get_signatures(start_signature: PageElement) -> List[str]: @@ -124,43 +121,57 @@ def _get_signatures(start_signature: PageElement) -> List[str]: return signatures -def _truncate_markdown(markdown: str, max_length: int) -> str: +def _get_truncated_description( + elements: Iterable[Union[Tag, NavigableString]], + markdown_converter: DocMarkdownConverter, + max_length: int, +) -> str: """ - Truncate `markdown` to be at most `max_length` characters. + Truncate markdown from `elements` to be at most `max_length` characters visually. - The markdown string is searched for substrings to cut at, to keep its structure, - but if none are found the string is simply sliced. + `max_length` limits the length of the rendered characters in the string, + with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits """ - if len(markdown) > max_length: - shortened = markdown[:max_length] - description_cutoff = shortened.rfind('\n\n', 100) - if description_cutoff == -1: - # Search the shortened version for cutoff points in decreasing desirability, - # cutoff at 1000 if none are found. - for cutoff_string in (". ", ", ", ",", " "): - description_cutoff = shortened.rfind(cutoff_string) - if description_cutoff != -1: - break + visual_length = 0 + real_length = 0 + result = [] + shortened = False + + for element in elements: + is_tag = isinstance(element, Tag) + element_length = len(element.text) if is_tag else len(element) + if visual_length + element_length < max_length: + if is_tag: + element_markdown = markdown_converter.process_tag(element) + else: + element_markdown = markdown_converter.process_text(element) + + element_markdown_length = len(element_markdown) + if real_length + element_markdown_length < _MAX_DESCRIPTION_LENGTH: + result.append(element_markdown) else: - description_cutoff = max_length - markdown = markdown[:description_cutoff] + shortened = True + break + real_length += element_markdown_length + visual_length += element_length + else: + shortened = True + break - # If there is an incomplete code block, cut it out - if markdown.count("```") % 2: - codeblock_start = markdown.rfind('```py') - markdown = markdown[:codeblock_start].rstrip() - markdown = markdown.rstrip(string.punctuation) + "..." - return markdown + markdown_string = "".join(result) + if shortened: + markdown_string = markdown_string.rstrip(_TRUNCATE_STRIP_CHARACTERS) + "..." + return markdown_string -def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: str) -> str: +def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: """ Create a markdown string with the signatures at the top, and the converted html description below them. The signatures are wrapped in python codeblocks, separated from the description by a newline. The result string is truncated to be max 1000 symbols long. """ - description = _truncate_markdown(markdownify(description, url=url), 1000) + description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750) description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is not None: formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) @@ -204,5 +215,4 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: else: signature = _get_signatures(symbol_heading) description = _get_dd_description(symbol_heading) - - return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url) + return _parse_into_markdown(signature, description, symbol_data.url).replace('¶', '') |