Create the parsing interface function.

The other functions in the module are not intended to be used directly; the module's interface is the newly added function, which accepts the symbol and calls the internals. To reflect this, an underscore prefix was added to all other names except the imports and log.
author: Numerlor <[email protected]> 2020-07-22 02:35:13 +0200
committer: Numerlor <[email protected]> 2020-07-22 02:35:13 +0200
commit: f562c4b4551caa8ed3710ac5e9841150cb8a2492 (patch)
tree: 3ae31b07dab104cc1a6f55c8d902cff00c71ea51
parent: Create a function for getting the result markdown. (diff)
1 files changed, 71 insertions, 21 deletions
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index 8756e0694..a2c6564b3 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -3,7 +3,7 @@ import re
 import string
 import textwrap
 from functools import partial
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union
 from urllib.parse import urljoin
 
 from aiohttp import ClientSession
@@ -12,13 +12,15 @@ from bs4.element import PageElement, Tag
 from markdownify import MarkdownConverter
 
 from .cache import async_cache
+if TYPE_CHECKING:
+    from .cog import DocItem
 
 log = logging.getLogger(__name__)
 
-UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
-WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
+_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
+_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
 
-SEARCH_END_TAG_ATTRS = (
+_SEARCH_END_TAG_ATTRS = (
     "data",
     "function",
     "class",
@@ -29,8 +31,17 @@ SEARCH_END_TAG_ATTRS = (
     "sphinxsidebar",
 )
 
+_NO_SIGNATURE_GROUPS = {  # symbol groups that are rendered without signatures
+    "attribute",
+    "envvar",
+    "setting",
+    "templatefilter",
+    "templatetag",
+    "term",
+}
 
-class DocMarkdownConverter(MarkdownConverter):
+
+class _DocMarkdownConverter(MarkdownConverter):
     """Subclass markdownify's MarkdownCoverter to provide custom conversion methods."""
 
     def __init__(self, *, page_url: str, **options):
@@ -75,12 +86,12 @@ class DocMarkdownConverter(MarkdownConverter):
         return super().convert_p(el, text)
 
 
-def markdownify(html: str, *, url: str = "") -> str:
+def _markdownify(html: str, *, url: str = "") -> str:
     """Create a DocMarkdownConverter object from the input html."""
-    return DocMarkdownConverter(bullets='•', page_url=url).convert(html)
+    return _DocMarkdownConverter(bullets='•', page_url=url).convert(html)
 
 
-def find_elements_until_tag(
+def _find_elements_until_tag(
         start_element: PageElement,
         tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]],
         *,
@@ -109,9 +120,9 @@ def find_elements_until_tag(
     return elements
 
 
-find_next_children_until_tag = partial(find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
-find_next_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
-find_previous_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
+_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
+_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
+_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
 
 
 def get_module_description(start_element: PageElement) -> Optional[str]:
@@ -123,12 +134,19 @@ def get_module_description(start_element: PageElement) -> Optional[str]:
     """
     header = start_element.find("a", attrs={"class": "headerlink"})
     start_tag = header.parent if header is not None else start_element
-    description = "".join(str(tag) for tag in find_next_siblings_until_tag(start_tag, _match_end_tag))
+    description = "".join(str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag))
 
     return description
 
 
-def get_signatures(start_signature: PageElement) -> List[str]:
+def _get_symbol_description(symbol: PageElement) -> str:
+    """Return the joined string form of the contents of the dd tag after `symbol`, up to a dt or a dl tag."""
+    dd_tag = symbol.find_next("dd")
+    children = _find_next_children_until_tag(dd_tag, ("dt", "dl"))
+    return "".join(map(str, children))
+
+
+def _get_signatures(start_signature: PageElement) -> List[str]:
     """
     Collect up to 3 signatures from dt tags around the `start_signature` dt tag.
 
@@ -137,11 +155,11 @@ def get_signatures(start_signature: PageElement) -> List[str]:
     """
     signatures = []
     for element in (
-            *reversed(find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
+            *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
             start_signature,
-            *find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
+            *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
     )[-3:]:
-        signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
+        signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
 
         if signature:
             signatures.append(signature)
@@ -149,7 +167,7 @@ def get_signatures(start_signature: PageElement) -> List[str]:
     return signatures
 
 
-def truncate_markdown(markdown: str, max_length: int) -> str:
+def _truncate_markdown(markdown: str, max_length: int) -> str:
     """
     Truncate `markdown` to be at most `max_length` characters.
 
@@ -185,8 +203,8 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url:
     The signatures are wrapped in python codeblocks, separated from the description by a newline.
     The result string is truncated to be max 1000 symbols long.
     """
-    description = truncate_markdown(markdownify(description, url=url), 1000)
-    description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description)
+    description = _truncate_markdown(_markdownify(description, url=url), 1000)
+    description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description)
     if signatures is not None:
         formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures)
     else:
@@ -197,7 +215,7 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url:
 
 
 @async_cache(arg_offset=1)
-async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
+async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
     """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
     log.trace(f"Sending a request to {url}.")
     async with http_session.get(url) as response:
@@ -208,8 +226,40 @@ async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulS
 
 def _match_end_tag(tag: Tag) -> bool:
     """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
-    for attr in SEARCH_END_TAG_ATTRS:
+    for attr in _SEARCH_END_TAG_ATTRS:
         if attr in tag.get("class", ()):
             return True
 
     return tag.name == "table"
+
+
+async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str:
+    """
+    Return the parsed markdown of the passed symbol, with its description truncated to 1000 characters.
+
+    A request through `http_session` is made to the url of `symbol_data` (minus any "#" fragment) for the html;
+    the element with the fragment as its id is then parsed depending on what group the symbol belongs to.
+    """
+    if "#" in symbol_data.url:
+        request_url, symbol_id = symbol_data.url.rsplit('#', 1)  # maxsplit=1 keeps the 2-name unpacking safe
+    else:
+        request_url = symbol_data.url
+        symbol_id = None
+
+    soup = await _get_soup_from_url(http_session, request_url)
+    symbol_heading = soup.find(id=symbol_id)
+
+    # Handle doc symbols as modules, because they either link to the page of a module,
+    # or don't contain any useful info to be parsed.
+    signature = None
+    if symbol_data.group in {"module", "doc"}:
+        description = get_module_description(symbol_heading)
+
+    elif symbol_data.group in _NO_SIGNATURE_GROUPS:
+        description = _get_symbol_description(symbol_heading)
+
+    else:
+        signature = _get_signatures(symbol_heading)
+        description = _get_symbol_description(symbol_heading)
+
+    return _parse_into_markdown(signature, description, symbol_data.url)
author	Numerlor <[email protected]>	2020-07-22 02:35:13 +0200
committer	Numerlor <[email protected]>	2020-07-22 02:35:13 +0200
commit	f562c4b4551caa8ed3710ac5e9841150cb8a2492 (patch)
tree	3ae31b07dab104cc1a6f55c8d902cff00c71ea51
parent	Create a function for getting the result markdown. (diff)