aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorGravatar Numerlor <[email protected]>2021-01-10 03:58:43 +0100
committerGravatar Numerlor <[email protected]>2021-01-10 06:16:10 +0100
commit695044167756eb2b6b4d953ef17f0359ba688246 (patch)
tree8e311175a1f8ce813cbc17b896370837c6b0c18d
parentExpand docstring (diff)
Move functions strictly related to parsing html to the _html module
Some constants need to be shared between the html and parsing modules; because the cog user may also want to edit them to change the behaviour, they were moved into the package's init.
-rw-r--r--bot/exts/info/doc/__init__.py5
-rw-r--r--bot/exts/info/doc/_cog.py4
-rw-r--r--bot/exts/info/doc/_html.py112
-rw-r--r--bot/exts/info/doc/_parsing.py125
4 files changed, 126 insertions, 120 deletions
diff --git a/bot/exts/info/doc/__init__.py b/bot/exts/info/doc/__init__.py
index e9eb9428c..af0bbff2d 100644
--- a/bot/exts/info/doc/__init__.py
+++ b/bot/exts/info/doc/__init__.py
@@ -1,6 +1,11 @@
from bot.bot import Bot
from ._cog import DocCog
+MAX_SIGNATURE_AMOUNT = 3
+PRIORITY_PACKAGES = (
+ "python",
+)
+
def setup(bot: Bot) -> None:
"""Load the Doc cog."""
diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py
index 3f7604072..fd211d9f1 100644
--- a/bot/exts/info/doc/_cog.py
+++ b/bot/exts/info/doc/_cog.py
@@ -24,6 +24,7 @@ from bot.pagination import LinePaginator
from bot.utils.lock import lock
from bot.utils.messages import send_denial, wait_for_deletion
from bot.utils.scheduling import Scheduler
+from . import PRIORITY_PACKAGES
from ._inventory_parser import INVENTORY_DICT, fetch_inventory
from ._parsing import get_symbol_markdown
from ._redis_cache import DocRedisCache
@@ -38,9 +39,6 @@ FORCE_PREFIX_GROUPS = (
"pdbcommand",
"term",
)
-PRIORITY_PACKAGES = (
- "python",
-)
WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay
# Delay to wait before trying to reach a rescheduled inventory again, in minutes
diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py
index 88fbc8825..f9fe542ce 100644
--- a/bot/exts/info/doc/_html.py
+++ b/bot/exts/info/doc/_html.py
@@ -1,10 +1,27 @@
import logging
-from typing import List, Union
+import re
+from functools import partial
+from typing import Callable, Container, Iterable, List, Union
-from bs4.element import PageElement, SoupStrainer
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString, PageElement, SoupStrainer, Tag
+
+from . import MAX_SIGNATURE_AMOUNT
log = logging.getLogger(__name__)
+_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
+_SEARCH_END_TAG_ATTRS = (
+ "data",
+ "function",
+ "class",
+ "exception",
+ "seealso",
+ "section",
+ "rubric",
+ "sphinxsidebar",
+)
+
class Strainer(SoupStrainer):
"""Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s."""
@@ -26,3 +43,94 @@ class Strainer(SoupStrainer):
return markup
else:
return super().search(markup)
+
+
+def _find_elements_until_tag(
+ start_element: PageElement,
+ end_tag_filter: Union[Container[str], Callable[[Tag], bool]],
+ *,
+ func: Callable,
+ include_strings: bool = False,
+ limit: int = None,
+) -> List[Union[Tag, NavigableString]]:
+ """
+    Get all elements up to `limit` or until a tag matching `end_tag_filter` is found.
+
+ `end_tag_filter` can be either a container of string names to check against,
+ or a filtering callable that's applied to tags.
+
+ When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s.
+
+ `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`.
+ The method is then iterated over and all elements until the matching tag or the limit are added to the return list.
+ """
+ use_container_filter = not callable(end_tag_filter)
+ elements = []
+
+ for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit):
+ if isinstance(element, Tag):
+ if use_container_filter:
+ if element.name in end_tag_filter:
+ break
+ elif end_tag_filter(element):
+ break
+ elements.append(element)
+
+ return elements
+
+
+_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
+_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all)
+_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
+_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
+
+
+def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]:
+    """Create callable that returns True when the passed in tag's class is in `class_names` or when it is a table."""
+ def match_tag(tag: Tag) -> bool:
+ for attr in class_names:
+ if attr in tag.get("class", ()):
+ return True
+ return tag.name == "table"
+
+ return match_tag
+
+
+def get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]:
+ """
+    Get page content to a table or a tag with its class in `_SEARCH_END_TAG_ATTRS`.
+
+    An attempt is made to find a headerlink `a` tag, to avoid repeating the symbol information in the description;
+    if one is found, it is used as the tag to start the search from instead of the `start_element`.
+ """
+ child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100)
+ header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None)
+ start_tag = header.parent if header is not None else start_element
+ return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True)
+
+
+def get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]:
+ """Get the contents of the next dd tag, up to a dt or a dl tag."""
+ description_tag = symbol.find_next("dd")
+ return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
+
+
+def get_signatures(start_signature: PageElement) -> List[str]:
+ """
+    Collect up to `MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag.
+
+ First the signatures under the `start_signature` are included;
+ if less than 2 are found, tags above the start signature are added to the result if any are present.
+ """
+ signatures = []
+ for element in (
+ *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
+ start_signature,
+ *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
+ )[-MAX_SIGNATURE_AMOUNT:]:
+ signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
+
+ if signature:
+ signatures.append(signature)
+
+ return signatures
diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py
index 46ae33b92..d68f7c8d7 100644
--- a/bot/exts/info/doc/_parsing.py
+++ b/bot/exts/info/doc/_parsing.py
@@ -5,37 +5,23 @@ import re
import string
import textwrap
from collections import namedtuple
-from functools import partial
-from typing import Callable, Collection, Container, Iterable, Iterator, List, Optional, TYPE_CHECKING, Union
+from typing import Collection, Iterable, Iterator, List, Optional, TYPE_CHECKING, Union
from bs4 import BeautifulSoup
-from bs4.element import NavigableString, PageElement, Tag
+from bs4.element import NavigableString, Tag
from bot.utils.helpers import find_nth_occurrence
-from ._html import Strainer
+from . import MAX_SIGNATURE_AMOUNT
+from ._html import get_dd_description, get_general_description, get_signatures
from ._markdown import DocMarkdownConverter
if TYPE_CHECKING:
from ._cog import DocItem
log = logging.getLogger(__name__)
-_MAX_SIGNATURE_AMOUNT = 3
-
-_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
_PARAMETERS_RE = re.compile(r"\((.+)\)")
-_SEARCH_END_TAG_ATTRS = (
- "data",
- "function",
- "class",
- "exception",
- "seealso",
- "section",
- "rubric",
- "sphinxsidebar",
-)
-
_NO_SIGNATURE_GROUPS = {
"attribute",
"envvar",
@@ -46,7 +32,7 @@ _NO_SIGNATURE_GROUPS = {
}
_EMBED_CODE_BLOCK_LINE_LENGTH = 61
# _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight
-_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT
+_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * MAX_SIGNATURE_AMOUNT
# Maximum discord message length - signatures on top
_MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH
_TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace
@@ -118,86 +104,6 @@ def _split_parameters(parameters_string: str) -> Iterator[str]:
yield parameters_string[last_split:]
-def _find_elements_until_tag(
- start_element: PageElement,
- end_tag_filter: Union[Container[str], Callable[[Tag], bool]],
- *,
- func: Callable,
- include_strings: bool = False,
- limit: int = None,
-) -> List[Union[Tag, NavigableString]]:
- """
- Get all elements up to `limit` or until a tag matching `tag_filter` is found.
-
- `end_tag_filter` can be either a container of string names to check against,
- or a filtering callable that's applied to tags.
-
- When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s.
-
- `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`.
- The method is then iterated over and all elements until the matching tag or the limit are added to the return list.
- """
- use_container_filter = not callable(end_tag_filter)
- elements = []
-
- for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit):
- if isinstance(element, Tag):
- if use_container_filter:
- if element.name in end_tag_filter:
- break
- elif end_tag_filter(element):
- break
- elements.append(element)
-
- return elements
-
-
-_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False))
-_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all)
-_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings)
-_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
-
-
-def _get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]:
- """
- Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.
-
- A headerlink a tag is attempted to be found to skip repeating the symbol information in the description,
- if it's found it's used as the tag to start the search from instead of the `start_element`.
- """
- child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100)
- header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None)
- start_tag = header.parent if header is not None else start_element
- return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True)
-
-
-def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]:
- """Get the contents of the next dd tag, up to a dt or a dl tag."""
- description_tag = symbol.find_next("dd")
- return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
-
-
-def _get_signatures(start_signature: PageElement) -> List[str]:
- """
- Collect up to `_MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag.
-
- First the signatures under the `start_signature` are included;
- if less than 2 are found, tags above the start signature are added to the result if any are present.
- """
- signatures = []
- for element in (
- *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)),
- start_signature,
- *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2),
- )[-(_MAX_SIGNATURE_AMOUNT):]:
- signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
-
- if signature:
- signatures.append(signature)
-
- return signatures
-
-
def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collection[str]]:
"""
Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`.
@@ -210,7 +116,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec
if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH:
return signatures
- max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures))
+ max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (MAX_SIGNATURE_AMOUNT + 1 - len(signatures))
formatted_signatures = []
for signature in signatures:
signature = signature.strip()
@@ -317,17 +223,6 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag]
return formatted_markdown
-def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]:
- """Create callable that returns True when the passed in tag's class is in `class_names` or when it's is a table."""
- def match_tag(tag: Tag) -> bool:
- for attr in class_names:
- if attr in tag.get("class", ()):
- return True
- return tag.name == "table"
-
- return match_tag
-
-
def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[str]:
"""
Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters.
@@ -342,12 +237,12 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[s
# Modules, doc pages and labels don't point to description list tags but to tags like divs,
# no special parsing can be done so we only try to include what's under them.
if symbol_data.group in {"module", "doc", "label"} or symbol_heading.name != "dt":
- description = _get_general_description(symbol_heading)
+ description = get_general_description(symbol_heading)
elif symbol_data.group in _NO_SIGNATURE_GROUPS:
- description = _get_dd_description(symbol_heading)
+ description = get_dd_description(symbol_heading)
else:
- signature = _get_signatures(symbol_heading)
- description = _get_dd_description(symbol_heading)
+ signature = get_signatures(symbol_heading)
+ description = get_dd_description(symbol_heading)
return _create_markdown(signature, description, symbol_data.url).replace('¶', '').strip()