diff options
| -rw-r--r-- | bot/cogs/doc/markdown.py | 58 | ||||
| -rw-r--r-- | bot/cogs/doc/parsing.py | 59 |
2 files changed, 60 insertions, 57 deletions
diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py
new file mode 100644
index 000000000..dca477d35
--- /dev/null
+++ b/bot/cogs/doc/markdown.py
@@ -0,0 +1,58 @@
+from urllib.parse import urljoin
+
+from bs4.element import PageElement
+from markdownify import MarkdownConverter
+
+
+class _DocMarkdownConverter(MarkdownConverter):
+    """Subclass markdownify's MarkdownCoverter to provide custom conversion methods."""
+
+    def __init__(self, *, page_url: str, **options):
+        super().__init__(**options)
+        self.page_url = page_url
+
+    def convert_li(self, el: PageElement, text: str) -> str:
+        """Fix markdownify's erroneous indexing in ol tags."""
+        parent = el.parent
+        if parent is not None and parent.name == 'ol':
+            li_tags = parent.find_all("li")
+            bullet = '%s.' % (li_tags.index(el)+1)
+        else:
+            depth = -1
+            while el:
+                if el.name == 'ul':
+                    depth += 1
+                el = el.parent
+            bullets = self.options['bullets']
+            bullet = bullets[depth % len(bullets)]
+        return '%s %s\n' % (bullet, text or '')
+
+    def convert_hn(self, _n: int, el: PageElement, text: str) -> str:
+        """Convert h tags to bold text with ** instead of adding #."""
+        return f"**{text}**\n\n"
+
+    def convert_code(self, el: PageElement, text: str) -> str:
+        """Undo `markdownify`s underscore escaping."""
+        return f"`{text}`".replace('\\', '')
+
+    def convert_pre(self, el: PageElement, text: str) -> str:
+        """Wrap any codeblocks in `py` for syntax highlighting."""
+        code = ''.join(el.strings)
+        return f"```py\n{code}```"
+
+    def convert_a(self, el: PageElement, text: str) -> str:
+        """Resolve relative URLs to `self.page_url`."""
+        el["href"] = urljoin(self.page_url, el["href"])
+        return super().convert_a(el, text)
+
+    def convert_p(self, el: PageElement, text: str) -> str:
+        """Include only one newline instead of two when the parent is a li tag."""
+        parent = el.parent
+        if parent is not None and parent.name == "li":
+            return f"{text}\n"
+        return super().convert_p(el, text)
+
+
+def markdownify(html: str, *, url: str = "") -> str:
+    """Create a DocMarkdownConverter object from the input html."""
+    return _DocMarkdownConverter(bullets='•', page_url=url).convert(html)
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index ac8a94e3f..93daf3faf 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -4,15 +4,14 @@ import string
 import textwrap
 from functools import partial
 from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union
-from urllib.parse import urljoin
 
 from aiohttp import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import NavigableString, PageElement, Tag
-from markdownify import MarkdownConverter
 
 from .cache import async_cache
 from .html import Strainer
+from .markdown import markdownify
 
 if TYPE_CHECKING:
     from .cog import DocItem
@@ -42,60 +41,6 @@ _NO_SIGNATURE_GROUPS = {
 }
 
 
-class _DocMarkdownConverter(MarkdownConverter):
-    """Subclass markdownify's MarkdownCoverter to provide custom conversion methods."""
-
-    def __init__(self, *, page_url: str, **options):
-        super().__init__(**options)
-        self.page_url = page_url
-
-    def convert_li(self, el: PageElement, text: str) -> str:
-        """Fix markdownify's erroneous indexing in ol tags."""
-        parent = el.parent
-        if parent is not None and parent.name == 'ol':
-            li_tags = parent.find_all("li")
-            bullet = '%s.' % (li_tags.index(el)+1)
-        else:
-            depth = -1
-            while el:
-                if el.name == 'ul':
-                    depth += 1
-                el = el.parent
-            bullets = self.options['bullets']
-            bullet = bullets[depth % len(bullets)]
-        return '%s %s\n' % (bullet, text or '')
-
-    def convert_hn(self, _n: int, el: PageElement, text: str) -> str:
-        """Convert h tags to bold text with ** instead of adding #."""
-        return f"**{text}**\n\n"
-
-    def convert_code(self, el: PageElement, text: str) -> str:
-        """Undo `markdownify`s underscore escaping."""
-        return f"`{text}`".replace('\\', '')
-
-    def convert_pre(self, el: PageElement, text: str) -> str:
-        """Wrap any codeblocks in `py` for syntax highlighting."""
-        code = ''.join(el.strings)
-        return f"```py\n{code}```"
-
-    def convert_a(self, el: PageElement, text: str) -> str:
-        """Resolve relative URLs to `self.page_url`."""
-        el["href"] = urljoin(self.page_url, el["href"])
-        return super().convert_a(el, text)
-
-    def convert_p(self, el: PageElement, text: str) -> str:
-        """Include only one newline instead of two when the parent is a li tag."""
-        parent = el.parent
-        if parent is not None and parent.name == "li":
-            return f"{text}\n"
-        return super().convert_p(el, text)
-
-
-def _markdownify(html: str, *, url: str = "") -> str:
-    """Create a DocMarkdownConverter object from the input html."""
-    return _DocMarkdownConverter(bullets='•', page_url=url).convert(html)
-
-
 def _find_elements_until_tag(
         start_element: PageElement,
         tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]],
@@ -215,7 +160,7 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url:
     The signatures are wrapped in python codeblocks, separated from the description by a newline.
     The result string is truncated to be max 1000 symbols long.
     """
-    description = _truncate_markdown(_markdownify(description, url=url), 1000)
+    description = _truncate_markdown(markdownify(description, url=url), 1000)
     description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description)
     if signatures is not None:
         formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures)