-rw-r--r--   bot/cogs/doc/parsing.py   92
1 files changed, 71 insertions, 21 deletions
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index 8756e0694..a2c6564b3 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -3,7 +3,7 @@ import re
 import string
 import textwrap
 from functools import partial
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union
 from urllib.parse import urljoin
 
 from aiohttp import ClientSession
@@ -12,13 +12,15 @@ from bs4.element import PageElement, Tag
 from markdownify import MarkdownConverter
 
 from .cache import async_cache
+if TYPE_CHECKING:
+    from .cog import DocItem
 
 log = logging.getLogger(__name__)
 
-UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
-WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
+_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
+_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
 
-SEARCH_END_TAG_ATTRS = (
+_SEARCH_END_TAG_ATTRS = (
     "data",
     "function",
     "class",
@@ -29,8 +31,17 @@ SEARCH_END_TAG_ATTRS = (
     "sphinxsidebar",
 )
 
+_NO_SIGNATURE_GROUPS = {
+    "attribute",
+    "envvar",
+    "setting",
+    "tempaltefilter",
+    "templatetag",
+    "term",
+}
 
-class DocMarkdownConverter(MarkdownConverter):
+
+class _DocMarkdownConverter(MarkdownConverter):
     """Subclass markdownify's MarkdownCoverter to provide custom conversion methods."""
 
     def __init__(self, *, page_url: str, **options):
@@ -75,12 +86,12 @@ class DocMarkdownConverter(MarkdownConverter):
         return super().convert_p(el, text)
 
 
-def markdownify(html: str, *, url: str = "") -> str:
+def _markdownify(html: str, *, url: str = "") -> str:
     """Create a DocMarkdownConverter object from the input html."""
-    return DocMarkdownConverter(bullets='•', page_url=url).convert(html)
+    return _DocMarkdownConverter(bullets='•', page_url=url).convert(html)
 
 
-def find_elements_until_tag(
+def _find_elements_until_tag(
         start_element: PageElement,
         tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]],
         *,
@@ -109,9 +120,9 @@ def find_elements_until_tag(
     return elements
 
 
-find_next_children_until_tag = partial(find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
-find_next_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
-find_previous_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
+_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
+_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
+_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
 
 
 def get_module_description(start_element: PageElement) -> Optional[str]:
@@ -123,12 +134,19 @@ def get_module_description(start_element: PageElement) -> Optional[str]:
     """
     header = start_element.find("a", attrs={"class": "headerlink"})
     start_tag = header.parent if header is not None else start_element
-    description = "".join(str(tag) for tag in find_next_siblings_until_tag(start_tag, _match_end_tag))
+    description = "".join(str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag))
 
     return description
 
 
-def get_signatures(start_signature: PageElement) -> List[str]:
+def _get_symbol_description(symbol: PageElement) -> str:
+    """Get the string contents of the next dd tag, up to a dt or a dl tag."""
+    description_tag = symbol.find_next("dd")
+    description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"))
+    return "".join(str(tag) for tag in description_contents)
+
+
+def _get_signatures(start_signature: PageElement) -> List[str]:
     """
     Collect up to 3 signatures from dt tags around the `start_signature` dt tag.
 
@@ -137,11 +155,11 @@ def get_signatures(start_signature: PageElement) -> List[str]:
 
     signatures = []
     for element in (
-            *reversed(find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
+            *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
             start_signature,
-            *find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
+            *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
     )[-3:]:
-        signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
+        signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
 
         if signature:
             signatures.append(signature)
@@ -149,7 +167,7 @@
     return signatures
 
 
-def truncate_markdown(markdown: str, max_length: int) -> str:
+def _truncate_markdown(markdown: str, max_length: int) -> str:
     """
     Truncate `markdown` to be at most `max_length` characters.
 
@@ -185,8 +203,8 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url:
     The signatures are wrapped in python codeblocks, separated from the description by a newline.
     The result string is truncated to be max 1000 symbols long.
     """
-    description = truncate_markdown(markdownify(description, url=url), 1000)
-    description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description)
+    description = _truncate_markdown(_markdownify(description, url=url), 1000)
+    description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description)
     if signatures is not None:
         formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures)
     else:
@@ -197,7 +215,7 @@
 
 
 @async_cache(arg_offset=1)
-async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
+async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
     """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
     log.trace(f"Sending a request to {url}.")
     async with http_session.get(url) as response:
@@ -208,8 +226,40 @@
 
 def _match_end_tag(tag: Tag) -> bool:
     """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
-    for attr in SEARCH_END_TAG_ATTRS:
+    for attr in _SEARCH_END_TAG_ATTRS:
         if attr in tag.get("class", ()):
             return True
 
     return tag.name == "table"
+
+
+async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str:
+    """
+    Return parsed markdown of the passed symbol, truncated to 1000 characters.
+
+    A request through `http_session` is made to the url associated with `symbol_data` for the html contents;
+    the contents are then parsed depending on what group the symbol belongs to.
+    """
+    if "#" in symbol_data.url:
+        request_url, symbol_id = symbol_data.url.rsplit('#')
+    else:
+        request_url = symbol_data.url
+        symbol_id = None
+
+    soup = await _get_soup_from_url(http_session, request_url)
+    symbol_heading = soup.find(id=symbol_id)
+
+    # Handle doc symbols as modules, because they either link to the page of a module,
+    # or don't contain any useful info to be parsed.
+    signature = None
+    if symbol_data.group in {"module", "doc"}:
+        description = get_module_description(symbol_heading)
+
+    elif symbol_data.group in _NO_SIGNATURE_GROUPS:
+        description = _get_symbol_description(symbol_heading)
+
+    else:
+        signature = _get_signatures(symbol_heading)
+        description = _get_symbol_description(symbol_heading)
+
+    return _parse_into_markdown(signature, description, symbol_data.url)
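After this change, get_symbol_markdown is the module's only public coroutine; every helper it dispatches to is underscore-prefixed. The sketch below shows roughly how a caller might drive it. It is a minimal, hypothetical example rather than the cog's actual code: the real DocItem lives in .cog and is only imported here under TYPE_CHECKING, so a stand-in with just the url and group attributes used by this module is defined, the example URL and group values are assumptions, and it presumes bot.cogs.doc.parsing imports cleanly outside the running bot.

import asyncio
from typing import NamedTuple

from aiohttp import ClientSession

from bot.cogs.doc import parsing


class DocItem(NamedTuple):
    """Stand-in for the real .cog DocItem; only the attributes parsing uses."""

    url: str
    group: str


async def main() -> None:
    # Hypothetical symbol; the fragment after '#' is the element id that
    # get_symbol_markdown looks up in the fetched page.
    item = DocItem(
        url="https://docs.python.org/3/library/functools.html#functools.partial",
        group="function",
    )
    async with ClientSession() as session:
        # "function" is neither "module"/"doc" nor in _NO_SIGNATURE_GROUPS,
        # so the result contains py-codeblock signatures plus the description.
        print(await parsing.get_symbol_markdown(session, item))


asyncio.run(main())

Because _get_soup_from_url keeps its async_cache(arg_offset=1) decorator, repeated lookups against the same documentation page reuse the cached soup instead of re-requesting it.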
