 bot/cogs/doc/parsing.py | 92 ++++++++++++++++++++++++++++----------
 1 file changed, 71 insertions(+), 21 deletions(-)
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index 8756e0694..a2c6564b3 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -3,7 +3,7 @@ import re
import string
import textwrap
from functools import partial
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union
from urllib.parse import urljoin
from aiohttp import ClientSession
@@ -12,13 +12,15 @@ from bs4.element import PageElement, Tag
from markdownify import MarkdownConverter
from .cache import async_cache
+if TYPE_CHECKING:
+ from .cog import DocItem
log = logging.getLogger(__name__)
-UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
-WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
+_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
+_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
-SEARCH_END_TAG_ATTRS = (
+_SEARCH_END_TAG_ATTRS = (
"data",
"function",
"class",
@@ -29,8 +31,17 @@ SEARCH_END_TAG_ATTRS = (
"sphinxsidebar",
)
+_NO_SIGNATURE_GROUPS = {
+ "attribute",
+ "envvar",
+ "setting",
+    "templatefilter",
+ "templatetag",
+ "term",
+}
-class DocMarkdownConverter(MarkdownConverter):
+
+class _DocMarkdownConverter(MarkdownConverter):
    """Subclass markdownify's MarkdownConverter to provide custom conversion methods."""
def __init__(self, *, page_url: str, **options):
@@ -75,12 +86,12 @@ class DocMarkdownConverter(MarkdownConverter):
return super().convert_p(el, text)
-def markdownify(html: str, *, url: str = "") -> str:
+def _markdownify(html: str, *, url: str = "") -> str:
"""Create a DocMarkdownConverter object from the input html."""
- return DocMarkdownConverter(bullets='•', page_url=url).convert(html)
+ return _DocMarkdownConverter(bullets='•', page_url=url).convert(html)
-def find_elements_until_tag(
+def _find_elements_until_tag(
start_element: PageElement,
tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]],
*,
@@ -109,9 +120,9 @@ def find_elements_until_tag(
return elements
-find_next_children_until_tag = partial(find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
-find_next_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
-find_previous_siblings_until_tag = partial(find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
+_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
+_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
+_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
def get_module_description(start_element: PageElement) -> Optional[str]:
@@ -123,12 +134,19 @@ def get_module_description(start_element: PageElement) -> Optional[str]:
"""
header = start_element.find("a", attrs={"class": "headerlink"})
start_tag = header.parent if header is not None else start_element
- description = "".join(str(tag) for tag in find_next_siblings_until_tag(start_tag, _match_end_tag))
+ description = "".join(str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag))
return description
-def get_signatures(start_signature: PageElement) -> List[str]:
+def _get_symbol_description(symbol: PageElement) -> str:
+ """Get the string contents of the next dd tag, up to a dt or a dl tag."""
+ description_tag = symbol.find_next("dd")
+ description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"))
+ return "".join(str(tag) for tag in description_contents)
+
+
+def _get_signatures(start_signature: PageElement) -> List[str]:
"""
Collect up to 3 signatures from dt tags around the `start_signature` dt tag.
@@ -137,11 +155,11 @@ def get_signatures(start_signature: PageElement) -> List[str]:
"""
signatures = []
for element in (
- *reversed(find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
+ *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
start_signature,
- *find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
+ *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
)[-3:]:
- signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
+ signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
if signature:
signatures.append(signature)
@@ -149,7 +167,7 @@ def get_signatures(start_signature: PageElement) -> List[str]:
return signatures
-def truncate_markdown(markdown: str, max_length: int) -> str:
+def _truncate_markdown(markdown: str, max_length: int) -> str:
"""
Truncate `markdown` to be at most `max_length` characters.
@@ -185,8 +203,8 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url:
The signatures are wrapped in python codeblocks, separated from the description by a newline.
The result string is truncated to be max 1000 symbols long.
"""
- description = truncate_markdown(markdownify(description, url=url), 1000)
- description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description)
+ description = _truncate_markdown(_markdownify(description, url=url), 1000)
+ description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description)
if signatures is not None:
formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures)
else:
@@ -197,7 +215,7 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url:
@async_cache(arg_offset=1)
-async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
+async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
"""Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
log.trace(f"Sending a request to {url}.")
async with http_session.get(url) as response:
@@ -208,8 +226,40 @@ async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulS
def _match_end_tag(tag: Tag) -> bool:
-    """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
+    """Matches `tag` if its class value is in `_SEARCH_END_TAG_ATTRS` or the tag is a table."""
- for attr in SEARCH_END_TAG_ATTRS:
+ for attr in _SEARCH_END_TAG_ATTRS:
if attr in tag.get("class", ()):
return True
return tag.name == "table"
+
+
+async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str:
+ """
+ Return parsed markdown of the passed symbol, truncated to 1000 characters.
+
+ A request through `http_session` is made to the url associated with `symbol_data` for the html contents;
+ the contents are then parsed depending on what group the symbol belongs to.
+ """
+ if "#" in symbol_data.url:
+        request_url, symbol_id = symbol_data.url.rsplit('#', maxsplit=1)
+ else:
+ request_url = symbol_data.url
+ symbol_id = None
+
+ soup = await _get_soup_from_url(http_session, request_url)
+ symbol_heading = soup.find(id=symbol_id)
+
+ # Handle doc symbols as modules, because they either link to the page of a module,
+ # or don't contain any useful info to be parsed.
+ signature = None
+ if symbol_data.group in {"module", "doc"}:
+ description = get_module_description(symbol_heading)
+
+ elif symbol_data.group in _NO_SIGNATURE_GROUPS:
+ description = _get_symbol_description(symbol_heading)
+
+ else:
+ signature = _get_signatures(symbol_heading)
+ description = _get_symbol_description(symbol_heading)
+
+ return _parse_into_markdown(signature, description, symbol_data.url)
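
Below is a minimal usage sketch of the new get_symbol_markdown entry point, assuming the bot package is importable in your environment and using a SimpleNamespace as a hypothetical stand-in for cog.DocItem (only the `url` and `group` attributes are read here; the real DocItem defined in bot/cogs/doc/cog.py may carry more fields):

    import asyncio
    from types import SimpleNamespace

    from aiohttp import ClientSession

    from bot.cogs.doc import parsing


    async def main() -> None:
        # Hypothetical stand-in for cog.DocItem; get_symbol_markdown only reads `url` and `group`.
        item = SimpleNamespace(
            url="https://docs.python.org/3/library/functools.html#functools.partial",
            group="function",
        )
        async with ClientSession() as session:
            markdown = await parsing.get_symbol_markdown(session, item)
        print(markdown)


    asyncio.run(main())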