aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--bot/exts/info/doc/__init__.py5
-rw-r--r--bot/exts/info/doc/_cog.py4
-rw-r--r--bot/exts/info/doc/_html.py112
-rw-r--r--bot/exts/info/doc/_parsing.py125
4 files changed, 126 insertions, 120 deletions
diff --git a/bot/exts/info/doc/__init__.py b/bot/exts/info/doc/__init__.py
index e9eb9428c..af0bbff2d 100644
--- a/bot/exts/info/doc/__init__.py
+++ b/bot/exts/info/doc/__init__.py
@@ -1,6 +1,11 @@
from bot.bot import Bot
from ._cog import DocCog
+MAX_SIGNATURE_AMOUNT = 3
+PRIORITY_PACKAGES = (
+ "python",
+)
+
def setup(bot: Bot) -> None:
"""Load the Doc cog."""
diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py
index 3f7604072..fd211d9f1 100644
--- a/bot/exts/info/doc/_cog.py
+++ b/bot/exts/info/doc/_cog.py
@@ -24,6 +24,7 @@ from bot.pagination import LinePaginator
from bot.utils.lock import lock
from bot.utils.messages import send_denial, wait_for_deletion
from bot.utils.scheduling import Scheduler
+from . import PRIORITY_PACKAGES
from ._inventory_parser import INVENTORY_DICT, fetch_inventory
from ._parsing import get_symbol_markdown
from ._redis_cache import DocRedisCache
@@ -38,9 +39,6 @@ FORCE_PREFIX_GROUPS = (
"pdbcommand",
"term",
)
-PRIORITY_PACKAGES = (
- "python",
-)
WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay
# Delay to wait before trying to reach a rescheduled inventory again, in minutes
diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py
index 88fbc8825..f9fe542ce 100644
--- a/bot/exts/info/doc/_html.py
+++ b/bot/exts/info/doc/_html.py
@@ -1,10 +1,27 @@
import logging
-from typing import List, Union
+import re
+from functools import partial
+from typing import Callable, Container, Iterable, List, Union
-from bs4.element import PageElement, SoupStrainer
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString, PageElement, SoupStrainer, Tag
+
+from . import MAX_SIGNATURE_AMOUNT
log = logging.getLogger(__name__)
+_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
+_SEARCH_END_TAG_ATTRS = (
+ "data",
+ "function",
+ "class",
+ "exception",
+ "seealso",
+ "section",
+ "rubric",
+ "sphinxsidebar",
+)
+
class Strainer(SoupStrainer):
"""Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s."""
@@ -26,3 +43,94 @@ class Strainer(SoupStrainer):
return markup
else:
return super().search(markup)
+
+
+def _find_elements_until_tag(
+ start_element: PageElement,
+ end_tag_filter: Union[Container[str], Callable[[Tag], bool]],
+ *,
+ func: Callable,
+ include_strings: bool = False,
+ limit: int = None,
+) -> List[Union[Tag, NavigableString]]:
+ """
+ Get all elements up to `limit` or until a tag matching `end_tag_filter` is found.
+
+ `end_tag_filter` can be either a container of string names to check against,
+ or a filtering callable that's applied to tags.
+
+ When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s.
+
+ `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`.
+ The method is then iterated over and all elements until the matching tag or the limit are added to the return list.
+ """
+ use_container_filter = not callable(end_tag_filter)
+ elements = []
+
+ for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit):
+ if isinstance(element, Tag):
+ if use_container_filter:
+ if element.name in end_tag_filter:
+ break
+ elif end_tag_filter(element):
+ break
+ elements.append(element)
+
+ return elements
+
+
+_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
+_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all)
+_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
+_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
+
+
+def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]:
+ """Create a callable that returns True when the passed in tag's class is in `class_names` or when it is a table."""
+ def match_tag(tag: Tag) -> bool:
+ for attr in class_names:
+ if attr in tag.get("class", ()):
+ return True
+ return tag.name == "table"
+
+ return match_tag
+
+
+def get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]:
+ """
+ Get page content to a table or a tag with its class in `_SEARCH_END_TAG_ATTRS`.
+
+ A headerlink `a` tag is searched for in order to skip repeating the symbol information in the description;
+ if it's found, its parent is used as the tag to start the search from instead of the `start_element`.
+ """
+ child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100)
+ header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None)
+ start_tag = header.parent if header is not None else start_element
+ return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True)
+
+
+def get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]:
+ """Get the contents of the next dd tag, up to a dt or a dl tag."""
+ description_tag = symbol.find_next("dd")
+ return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
+
+
+def get_signatures(start_signature: PageElement) -> List[str]:
+ """
+ Collect up to `MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag.
+
+ First the signatures under the `start_signature` are included;
+ if less than 2 are found, tags above the start signature are added to the result if any are present.
+ """
+ signatures = []
+ for element in (
+ *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
+ start_signature,
+ *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
+ )[-MAX_SIGNATURE_AMOUNT:]:
+ signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
+
+ if signature:
+ signatures.append(signature)
+
+ return signatures
diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py
index 46ae33b92..d68f7c8d7 100644
--- a/bot/exts/info/doc/_parsing.py
+++ b/bot/exts/info/doc/_parsing.py
@@ -5,37 +5,23 @@ import re
import string
import textwrap
from collections import namedtuple
-from functools import partial
-from typing import Callable, Collection, Container, Iterable, Iterator, List, Optional, TYPE_CHECKING, Union
+from typing import Collection, Iterable, Iterator, List, Optional, TYPE_CHECKING, Union
from bs4 import BeautifulSoup
-from bs4.element import NavigableString, PageElement, Tag
+from bs4.element import NavigableString, Tag
from bot.utils.helpers import find_nth_occurrence
-from ._html import Strainer
+from . import MAX_SIGNATURE_AMOUNT
+from ._html import get_dd_description, get_general_description, get_signatures
from ._markdown import DocMarkdownConverter
if TYPE_CHECKING:
from ._cog import DocItem
log = logging.getLogger(__name__)
-_MAX_SIGNATURE_AMOUNT = 3
-
-_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
_PARAMETERS_RE = re.compile(r"\((.+)\)")
-_SEARCH_END_TAG_ATTRS = (
- "data",
- "function",
- "class",
- "exception",
- "seealso",
- "section",
- "rubric",
- "sphinxsidebar",
-)
-
_NO_SIGNATURE_GROUPS = {
"attribute",
"envvar",
@@ -46,7 +32,7 @@ _NO_SIGNATURE_GROUPS = {
}
_EMBED_CODE_BLOCK_LINE_LENGTH = 61
# _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight
-_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT
+_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * MAX_SIGNATURE_AMOUNT
# Maximum discord message length - signatures on top
_MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH
_TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace
@@ -118,86 +104,6 @@ def _split_parameters(parameters_string: str) -> Iterator[str]:
yield parameters_string[last_split:]
-def _find_elements_until_tag(
- start_element: PageElement,
- end_tag_filter: Union[Container[str], Callable[[Tag], bool]],
- *,
- func: Callable,
- include_strings: bool = False,
- limit: int = None,
-) -> List[Union[Tag, NavigableString]]:
- """
- Get all elements up to `limit` or until a tag matching `tag_filter` is found.
-
- `end_tag_filter` can be either a container of string names to check against,
- or a filtering callable that's applied to tags.
-
- When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s.
-
- `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`.
- The method is then iterated over and all elements until the matching tag or the limit are added to the return list.
- """
- use_container_filter = not callable(end_tag_filter)
- elements = []
-
- for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit):
- if isinstance(element, Tag):
- if use_container_filter:
- if element.name in end_tag_filter:
- break
- elif end_tag_filter(element):
- break
- elements.append(element)
-
- return elements
-
-
-_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
-_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all)
-_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
-_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
-
-
-def _get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]:
- """
- Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.
-
- A headerlink a tag is attempted to be found to skip repeating the symbol information in the description,
- if it's found it's used as the tag to start the search from instead of the `start_element`.
- """
- child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100)
- header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None)
- start_tag = header.parent if header is not None else start_element
- return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True)
-
-
-def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]:
- """Get the contents of the next dd tag, up to a dt or a dl tag."""
- description_tag = symbol.find_next("dd")
- return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
-
-
-def _get_signatures(start_signature: PageElement) -> List[str]:
- """
- Collect up to `_MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag.
-
- First the signatures under the `start_signature` are included;
- if less than 2 are found, tags above the start signature are added to the result if any are present.
- """
- signatures = []
- for element in (
- *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
- start_signature,
- *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
- )[-(_MAX_SIGNATURE_AMOUNT):]:
- signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
-
- if signature:
- signatures.append(signature)
-
- return signatures
-
-
def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collection[str]]:
"""
Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`.
@@ -210,7 +116,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec
if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH:
return signatures
- max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures))
+ max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (MAX_SIGNATURE_AMOUNT + 1 - len(signatures))
formatted_signatures = []
for signature in signatures:
signature = signature.strip()
@@ -317,17 +223,6 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag]
return formatted_markdown
-def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]:
- """Create callable that returns True when the passed in tag's class is in `class_names` or when it's is a table."""
- def match_tag(tag: Tag) -> bool:
- for attr in class_names:
- if attr in tag.get("class", ()):
- return True
- return tag.name == "table"
-
- return match_tag
-
-
def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[str]:
"""
Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters.
@@ -342,12 +237,12 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[s
# Modules, doc pages and labels don't point to description list tags but to tags like divs,
# no special parsing can be done so we only try to include what's under them.
if symbol_data.group in {"module", "doc", "label"} or symbol_heading.name != "dt":
- description = _get_general_description(symbol_heading)
+ description = get_general_description(symbol_heading)
elif symbol_data.group in _NO_SIGNATURE_GROUPS:
- description = _get_dd_description(symbol_heading)
+ description = get_dd_description(symbol_heading)
else:
- signature = _get_signatures(symbol_heading)
- description = _get_dd_description(symbol_heading)
+ signature = get_signatures(symbol_heading)
+ description = get_dd_description(symbol_heading)
return _create_markdown(signature, description, symbol_data.url).replace('¶', '').strip()