aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--bot/cogs/doc/markdown.py7
-rw-r--r--bot/cogs/doc/parsing.py86
2 files changed, 49 insertions, 44 deletions
diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py
index dca477d35..a95e94991 100644
--- a/bot/cogs/doc/markdown.py
+++ b/bot/cogs/doc/markdown.py
@@ -4,7 +4,7 @@ from bs4.element import PageElement
from markdownify import MarkdownConverter
-class _DocMarkdownConverter(MarkdownConverter):
+class DocMarkdownConverter(MarkdownConverter):
"""Subclass markdownify's MarkdownCoverter to provide custom conversion methods."""
def __init__(self, *, page_url: str, **options):
@@ -51,8 +51,3 @@ class _DocMarkdownConverter(MarkdownConverter):
if parent is not None and parent.name == "li":
return f"{text}\n"
return super().convert_p(el, text)
-
-
-def markdownify(html: str, *, url: str = "") -> str:
- """Create a DocMarkdownConverter object from the input html."""
- return _DocMarkdownConverter(bullets='•', page_url=url).convert(html)
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index 21a3065f4..ed6343cd8 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -5,13 +5,13 @@ import re
import string
import textwrap
from functools import partial
-from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union
+from typing import Callable, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union
from bs4 import BeautifulSoup
from bs4.element import NavigableString, PageElement, Tag
from .html import Strainer
-from .markdown import markdownify
+from .markdown import DocMarkdownConverter
if TYPE_CHECKING:
from .cog import DocItem
@@ -39,6 +39,8 @@ _NO_SIGNATURE_GROUPS = {
"templatetag",
"term",
}
+_MAX_DESCRIPTION_LENGTH = 1800
+_TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace
def _find_elements_until_tag(
@@ -80,7 +82,7 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful
_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
-def _get_general_description(start_element: PageElement) -> Optional[str]:
+def _get_general_description(start_element: PageElement) -> Iterable[Union[Tag, NavigableString]]:
"""
Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.
@@ -89,18 +91,13 @@ def _get_general_description(start_element: PageElement) -> Optional[str]:
"""
header = start_element.find_next("a", attrs={"class": "headerlink"})
start_tag = header.parent if header is not None else start_element
- description = "".join(
- str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True)
- )
+ return _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True)
- return description
-
-def _get_dd_description(symbol: PageElement) -> str:
- """Get the string contents of the next dd tag, up to a dt or a dl tag."""
+def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]:
+ """Get the contents of the next dd tag, up to a dt or a dl tag."""
description_tag = symbol.find_next("dd")
- description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
- return "".join(str(tag) for tag in description_contents)
+ return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
def _get_signatures(start_signature: PageElement) -> List[str]:
@@ -124,43 +121,57 @@ def _get_signatures(start_signature: PageElement) -> List[str]:
return signatures
-def _truncate_markdown(markdown: str, max_length: int) -> str:
+def _get_truncated_description(
+ elements: Iterable[Union[Tag, NavigableString]],
+ markdown_converter: DocMarkdownConverter,
+ max_length: int,
+) -> str:
"""
- Truncate `markdown` to be at most `max_length` characters.
+ Truncate markdown from `elements` to be at most `max_length` characters visually.
- The markdown string is searched for substrings to cut at, to keep its structure,
- but if none are found the string is simply sliced.
+ `max_length` limits the length of the rendered characters in the string,
+ with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits
"""
- if len(markdown) > max_length:
- shortened = markdown[:max_length]
- description_cutoff = shortened.rfind('\n\n', 100)
- if description_cutoff == -1:
- # Search the shortened version for cutoff points in decreasing desirability,
- # cutoff at 1000 if none are found.
- for cutoff_string in (". ", ", ", ",", " "):
- description_cutoff = shortened.rfind(cutoff_string)
- if description_cutoff != -1:
- break
+ visual_length = 0
+ real_length = 0
+ result = []
+ shortened = False
+
+ for element in elements:
+ is_tag = isinstance(element, Tag)
+ element_length = len(element.text) if is_tag else len(element)
+ if visual_length + element_length < max_length:
+ if is_tag:
+ element_markdown = markdown_converter.process_tag(element)
+ else:
+ element_markdown = markdown_converter.process_text(element)
+
+ element_markdown_length = len(element_markdown)
+ if real_length + element_markdown_length < _MAX_DESCRIPTION_LENGTH:
+ result.append(element_markdown)
else:
- description_cutoff = max_length
- markdown = markdown[:description_cutoff]
+ shortened = True
+ break
+ real_length += element_markdown_length
+ visual_length += element_length
+ else:
+ shortened = True
+ break
- # If there is an incomplete code block, cut it out
- if markdown.count("```") % 2:
- codeblock_start = markdown.rfind('```py')
- markdown = markdown[:codeblock_start].rstrip()
- markdown = markdown.rstrip(string.punctuation) + "..."
- return markdown
+ markdown_string = "".join(result)
+ if shortened:
+ markdown_string = markdown_string.rstrip(_TRUNCATE_STRIP_CHARACTERS) + "..."
+ return markdown_string
-def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: str) -> str:
+def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str:
"""
Create a markdown string with the signatures at the top, and the converted html description below them.
The signatures are wrapped in python codeblocks, separated from the description by a newline.
The result string is truncated to be max 1000 symbols long.
"""
- description = _truncate_markdown(markdownify(description, url=url), 1000)
+ description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750)
description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description)
if signatures is not None:
formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures)
@@ -204,5 +215,4 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str:
else:
signature = _get_signatures(symbol_heading)
description = _get_dd_description(symbol_heading)
-
- return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url)
+ return _parse_into_markdown(signature, description, symbol_data.url).replace('¶', '')