diff options
| author | 2020-07-26 15:11:45 +0200 | |
|---|---|---|
| committer | 2020-07-26 15:11:45 +0200 | |
| commit | 13030b8c54dd2ed37047349c5b09e4ded2c83391 (patch) | |
| tree | ce7b816826960b3348144dd817706384837f021f | |
| parent | Fix markdownify's handling of h tags. (diff) | |
Move MarkdownConverter subclass to separate module
| -rw-r--r-- | bot/cogs/doc/markdown.py | 58 | ||||
| -rw-r--r-- | bot/cogs/doc/parsing.py | 59 | 
2 files changed, 60 insertions, 57 deletions
| diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py new file mode 100644 index 000000000..dca477d35 --- /dev/null +++ b/bot/cogs/doc/markdown.py @@ -0,0 +1,58 @@ +from urllib.parse import urljoin + +from bs4.element import PageElement +from markdownify import MarkdownConverter + + +class _DocMarkdownConverter(MarkdownConverter): +    """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" + +    def __init__(self, *, page_url: str, **options): +        super().__init__(**options) +        self.page_url = page_url + +    def convert_li(self, el: PageElement, text: str) -> str: +        """Fix markdownify's erroneous indexing in ol tags.""" +        parent = el.parent +        if parent is not None and parent.name == 'ol': +            li_tags = parent.find_all("li") +            bullet = '%s.' % (li_tags.index(el)+1) +        else: +            depth = -1 +            while el: +                if el.name == 'ul': +                    depth += 1 +                el = el.parent +            bullets = self.options['bullets'] +            bullet = bullets[depth % len(bullets)] +        return '%s %s\n' % (bullet, text or '') + +    def convert_hn(self, _n: int, el: PageElement, text: str) -> str: +        """Convert h tags to bold text with ** instead of adding #.""" +        return f"**{text}**\n\n" + +    def convert_code(self, el: PageElement, text: str) -> str: +        """Undo `markdownify`s underscore escaping.""" +        return f"`{text}`".replace('\\', '') + +    def convert_pre(self, el: PageElement, text: str) -> str: +        """Wrap any codeblocks in `py` for syntax highlighting.""" +        code = ''.join(el.strings) +        return f"```py\n{code}```" + +    def convert_a(self, el: PageElement, text: str) -> str: +        """Resolve relative URLs to `self.page_url`.""" +        el["href"] = urljoin(self.page_url, el["href"]) +        return super().convert_a(el, text) + +    def convert_p(self, el: PageElement, text: str) -> str: +        """Include only one newline instead of two when the parent is a li tag.""" +        parent = el.parent +        if parent is not None and parent.name == "li": +            return f"{text}\n" +        return super().convert_p(el, text) + + +def markdownify(html: str, *, url: str = "") -> str: +    """Create a DocMarkdownConverter object from the input html.""" +    return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index ac8a94e3f..93daf3faf 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -4,15 +4,14 @@ import string  import textwrap  from functools import partial  from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union -from urllib.parse import urljoin  from aiohttp import ClientSession  from bs4 import BeautifulSoup  from bs4.element import NavigableString, PageElement, Tag -from markdownify import MarkdownConverter  from .cache import async_cache  from .html import Strainer +from .markdown import markdownify  if TYPE_CHECKING:      from .cog import DocItem @@ -42,60 +41,6 @@ _NO_SIGNATURE_GROUPS = {  } -class _DocMarkdownConverter(MarkdownConverter): -    """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" - -    def __init__(self, *, page_url: str, **options): -        super().__init__(**options) -        self.page_url = page_url - -    def convert_li(self, el: PageElement, text: str) -> str: -        """Fix markdownify's erroneous indexing in ol tags.""" -        parent = el.parent -        if parent is not None and parent.name == 'ol': -            li_tags = parent.find_all("li") -            bullet = '%s.' % (li_tags.index(el)+1) -        else: -            depth = -1 -            while el: -                if el.name == 'ul': -                    depth += 1 -                el = el.parent -            bullets = self.options['bullets'] -            bullet = bullets[depth % len(bullets)] -        return '%s %s\n' % (bullet, text or '') - -    def convert_hn(self, _n: int, el: PageElement, text: str) -> str: -        """Convert h tags to bold text with ** instead of adding #.""" -        return f"**{text}**\n\n" - -    def convert_code(self, el: PageElement, text: str) -> str: -        """Undo `markdownify`s underscore escaping.""" -        return f"`{text}`".replace('\\', '') - -    def convert_pre(self, el: PageElement, text: str) -> str: -        """Wrap any codeblocks in `py` for syntax highlighting.""" -        code = ''.join(el.strings) -        return f"```py\n{code}```" - -    def convert_a(self, el: PageElement, text: str) -> str: -        """Resolve relative URLs to `self.page_url`.""" -        el["href"] = urljoin(self.page_url, el["href"]) -        return super().convert_a(el, text) - -    def convert_p(self, el: PageElement, text: str) -> str: -        """Include only one newline instead of two when the parent is a li tag.""" -        parent = el.parent -        if parent is not None and parent.name == "li": -            return f"{text}\n" -        return super().convert_p(el, text) - - -def _markdownify(html: str, *, url: str = "") -> str: -    """Create a DocMarkdownConverter object from the input html.""" -    return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) - -  def _find_elements_until_tag(          start_element: PageElement,          tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], @@ -215,7 +160,7 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url:      The signatures are wrapped in python codeblocks, separated from the description by a newline.      The result string is truncated to be max 1000 symbols long.      """ -    description = _truncate_markdown(_markdownify(description, url=url), 1000) +    description = _truncate_markdown(markdownify(description, url=url), 1000)      description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description)      if signatures is not None:          formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) | 
