diff options
| author | 2020-07-22 02:35:13 +0200 | |
|---|---|---|
| committer | 2020-07-22 02:35:13 +0200 | |
| commit | f562c4b4551caa8ed3710ac5e9841150cb8a2492 (patch) | |
| tree | 3ae31b07dab104cc1a6f55c8d902cff00c71ea51 | |
| parent | Create a function for getting the result markdown. (diff) | |
Create the parsing interface function.
Other functions from the module are not intended to be used directly,
with the interface of it being the added function which accepts the
symbol and calls internals.
All other names except imports and log had the underscore prefix added
to accommodate this.
| -rw-r--r-- | bot/cogs/doc/parsing.py | 92 |
1 files changed, 71 insertions, 21 deletions
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 8756e0694..a2c6564b3 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -3,7 +3,7 @@ import re import string import textwrap from functools import partial -from typing import Callable, List, Optional, Tuple, Union +from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union from urllib.parse import urljoin from aiohttp import ClientSession @@ -12,13 +12,15 @@ from bs4.element import PageElement, Tag from markdownify import MarkdownConverter from .cache import async_cache +if TYPE_CHECKING: + from .cog import DocItem log = logging.getLogger(__name__) -UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") -WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") +_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") -SEARCH_END_TAG_ATTRS = ( +_SEARCH_END_TAG_ATTRS = ( "data", "function", "class", @@ -29,8 +31,17 @@ SEARCH_END_TAG_ATTRS = ( "sphinxsidebar", ) +_NO_SIGNATURE_GROUPS = { + "attribute", + "envvar", + "setting", + "tempaltefilter", + "templatetag", + "term", +} -class DocMarkdownConverter(MarkdownConverter): + +class _DocMarkdownConverter(MarkdownConverter): """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" def __init__(self, *, page_url: str, **options): @@ -75,12 +86,12 @@ class DocMarkdownConverter(MarkdownConverter): return super().convert_p(el, text) -def markdownify(html: str, *, url: str = "") -> str: +def _markdownify(html: str, *, url: str = "") -> str: """Create a DocMarkdownConverter object from the input html.""" - return DocMarkdownConverter(bullets='•', page_url=url).convert(html) + return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) -def find_elements_until_tag( +def _find_elements_until_tag( start_element: PageElement, tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], *, @@ -109,9 +120,9 @@ def find_elements_until_tag( return elements -find_next_children_until_tag = partial(find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) -find_next_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_next_siblings) -find_previous_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) +_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) +_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) def get_module_description(start_element: PageElement) -> Optional[str]: @@ -123,12 +134,19 @@ def get_module_description(start_element: PageElement) -> Optional[str]: """ header = start_element.find("a", attrs={"class": "headerlink"}) start_tag = header.parent if header is not None else start_element - description = "".join(str(tag) for tag in find_next_siblings_until_tag(start_tag, _match_end_tag)) + description = "".join(str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag)) return description -def get_signatures(start_signature: PageElement) -> List[str]: +def _get_symbol_description(symbol: PageElement) -> str: + """Get the string contents of the next dd tag, up to a dt or a dl tag.""" + description_tag = symbol.find_next("dd") + description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl")) + return "".join(str(tag) for tag in description_contents) + + +def _get_signatures(start_signature: PageElement) -> List[str]: """ Collect up to 3 signatures from dt tags around the `start_signature` dt tag. @@ -137,11 +155,11 @@ def get_signatures(start_signature: PageElement) -> List[str]: """ signatures = [] for element in ( - *reversed(find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), + *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), start_signature, - *find_next_siblings_until_tag(start_signature, ("dd",), limit=2), + *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), )[-3:]: - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) if signature: signatures.append(signature) @@ -149,7 +167,7 @@ def get_signatures(start_signature: PageElement) -> List[str]: return signatures -def truncate_markdown(markdown: str, max_length: int) -> str: +def _truncate_markdown(markdown: str, max_length: int) -> str: """ Truncate `markdown` to be at most `max_length` characters. @@ -185,8 +203,8 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: The signatures are wrapped in python codeblocks, separated from the description by a newline. The result string is truncated to be max 1000 symbols long. """ - description = truncate_markdown(markdownify(description, url=url), 1000) - description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) + description = _truncate_markdown(_markdownify(description, url=url), 1000) + description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) if signatures is not None: formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) else: @@ -197,7 +215,7 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: @async_cache(arg_offset=1) -async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: +async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup: """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed.""" log.trace(f"Sending a request to {url}.") async with http_session.get(url) as response: @@ -208,8 +226,40 @@ async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulS def _match_end_tag(tag: Tag) -> bool: """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" - for attr in SEARCH_END_TAG_ATTRS: + for attr in _SEARCH_END_TAG_ATTRS: if attr in tag.get("class", ()): return True return tag.name == "table" + + +async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str: + """ + Return parsed markdown of the passed symbol, truncated to 1000 characters. + + A request through `http_session` is made to the url associated with `symbol_data` for the html contents; + the contents are then parsed depending on what group the symbol belongs to. + """ + if "#" in symbol_data.url: + request_url, symbol_id = symbol_data.url.rsplit('#') + else: + request_url = symbol_data.url + symbol_id = None + + soup = await _get_soup_from_url(http_session, request_url) + symbol_heading = soup.find(id=symbol_id) + + # Handle doc symbols as modules, because they either link to the page of a module, + # or don't contain any useful info to be parsed. + signature = None + if symbol_data.group in {"module", "doc"}: + description = get_module_description(symbol_heading) + + elif symbol_data.group in _NO_SIGNATURE_GROUPS: + description = _get_symbol_description(symbol_heading) + + else: + signature = _get_signatures(symbol_heading) + description = _get_symbol_description(symbol_heading) + + return _parse_into_markdown(signature, description, symbol_data.url) |