diff options
Diffstat (limited to '')
| -rw-r--r-- | bot/cogs/doc/markdown.py | 7 | ||||
| -rw-r--r-- | bot/cogs/doc/parsing.py | 86 | 
2 files changed, 49 insertions, 44 deletions
| diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py index dca477d35..a95e94991 100644 --- a/bot/cogs/doc/markdown.py +++ b/bot/cogs/doc/markdown.py @@ -4,7 +4,7 @@ from bs4.element import PageElement  from markdownify import MarkdownConverter -class _DocMarkdownConverter(MarkdownConverter): +class DocMarkdownConverter(MarkdownConverter):      """Subclass markdownify's MarkdownCoverter to provide custom conversion methods."""      def __init__(self, *, page_url: str, **options): @@ -51,8 +51,3 @@ class _DocMarkdownConverter(MarkdownConverter):          if parent is not None and parent.name == "li":              return f"{text}\n"          return super().convert_p(el, text) - - -def markdownify(html: str, *, url: str = "") -> str: -    """Create a DocMarkdownConverter object from the input html.""" -    return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 21a3065f4..ed6343cd8 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -5,13 +5,13 @@ import re  import string  import textwrap  from functools import partial -from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union +from typing import Callable, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union  from bs4 import BeautifulSoup  from bs4.element import NavigableString, PageElement, Tag  from .html import Strainer -from .markdown import markdownify +from .markdown import DocMarkdownConverter  if TYPE_CHECKING:      from .cog import DocItem @@ -39,6 +39,8 @@ _NO_SIGNATURE_GROUPS = {      "templatetag",      "term",  } +_MAX_DESCRIPTION_LENGTH = 1800 +_TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace  def _find_elements_until_tag( @@ -80,7 +82,7 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful  _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def _get_general_description(start_element: PageElement) -> Optional[str]: +def _get_general_description(start_element: PageElement) -> Iterable[Union[Tag, NavigableString]]:      """      Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. @@ -89,18 +91,13 @@ def _get_general_description(start_element: PageElement) -> Optional[str]:      """      header = start_element.find_next("a", attrs={"class": "headerlink"})      start_tag = header.parent if header is not None else start_element -    description = "".join( -        str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) -    ) +    return _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) -    return description - -def _get_dd_description(symbol: PageElement) -> str: -    """Get the string contents of the next dd tag, up to a dt or a dl tag.""" +def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: +    """Get the contents of the next dd tag, up to a dt or a dl tag."""      description_tag = symbol.find_next("dd") -    description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) -    return "".join(str(tag) for tag in description_contents) +    return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)  def _get_signatures(start_signature: PageElement) -> List[str]: @@ -124,43 +121,57 @@ def _get_signatures(start_signature: PageElement) -> List[str]:      return signatures -def _truncate_markdown(markdown: str, max_length: int) -> str: +def _get_truncated_description( +        elements: Iterable[Union[Tag, NavigableString]], +        markdown_converter: DocMarkdownConverter, +        max_length: int, +) -> str:      """ -    Truncate `markdown` to be at most `max_length` characters. +    Truncate markdown from `elements` to be at most `max_length` characters visually. -    The markdown string is searched for substrings to cut at, to keep its structure, -    but if none are found the string is simply sliced. +    `max_length` limits the length of the rendered characters in the string, +    with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits      """ -    if len(markdown) > max_length: -        shortened = markdown[:max_length] -        description_cutoff = shortened.rfind('\n\n', 100) -        if description_cutoff == -1: -            # Search the shortened version for cutoff points in decreasing desirability, -            # cutoff at 1000 if none are found. -            for cutoff_string in (". ", ", ", ",", " "): -                description_cutoff = shortened.rfind(cutoff_string) -                if description_cutoff != -1: -                    break +    visual_length = 0 +    real_length = 0 +    result = [] +    shortened = False + +    for element in elements: +        is_tag = isinstance(element, Tag) +        element_length = len(element.text) if is_tag else len(element) +        if visual_length + element_length < max_length: +            if is_tag: +                element_markdown = markdown_converter.process_tag(element) +            else: +                element_markdown = markdown_converter.process_text(element) + +            element_markdown_length = len(element_markdown) +            if real_length + element_markdown_length < _MAX_DESCRIPTION_LENGTH: +                result.append(element_markdown)              else: -                description_cutoff = max_length -        markdown = markdown[:description_cutoff] +                shortened = True +                break +            real_length += element_markdown_length +            visual_length += element_length +        else: +            shortened = True +            break -        # If there is an incomplete code block, cut it out -        if markdown.count("```") % 2: -            codeblock_start = markdown.rfind('```py') -            markdown = markdown[:codeblock_start].rstrip() -        markdown = markdown.rstrip(string.punctuation) + "..." -    return markdown +    markdown_string = "".join(result) +    if shortened: +        markdown_string = markdown_string.rstrip(_TRUNCATE_STRIP_CHARACTERS) + "..." +    return markdown_string -def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: str) -> str: +def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str:      """      Create a markdown string with the signatures at the top, and the converted html description below them.      The signatures are wrapped in python codeblocks, separated from the description by a newline.      The result string is truncated to be max 1000 symbols long.      """ -    description = _truncate_markdown(markdownify(description, url=url), 1000) +    description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750)      description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description)      if signatures is not None:          formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) @@ -204,5 +215,4 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str:      else:          signature = _get_signatures(symbol_heading)          description = _get_dd_description(symbol_heading) - -    return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url) +    return _parse_into_markdown(signature, description, symbol_data.url).replace('¶', '') | 
