diff options
| -rw-r--r-- | bot/exts/info/doc/__init__.py | 5 | ||||
| -rw-r--r-- | bot/exts/info/doc/_cog.py | 4 | ||||
| -rw-r--r-- | bot/exts/info/doc/_html.py | 112 | ||||
| -rw-r--r-- | bot/exts/info/doc/_parsing.py | 125 |
4 files changed, 126 insertions, 120 deletions
diff --git a/bot/exts/info/doc/__init__.py b/bot/exts/info/doc/__init__.py index e9eb9428c..af0bbff2d 100644 --- a/bot/exts/info/doc/__init__.py +++ b/bot/exts/info/doc/__init__.py @@ -1,6 +1,11 @@ from bot.bot import Bot from ._cog import DocCog +MAX_SIGNATURE_AMOUNT = 3 +PRIORITY_PACKAGES = ( + "python", +) + def setup(bot: Bot) -> None: """Load the Doc cog.""" diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py index 3f7604072..fd211d9f1 100644 --- a/bot/exts/info/doc/_cog.py +++ b/bot/exts/info/doc/_cog.py @@ -24,6 +24,7 @@ from bot.pagination import LinePaginator from bot.utils.lock import lock from bot.utils.messages import send_denial, wait_for_deletion from bot.utils.scheduling import Scheduler +from . import PRIORITY_PACKAGES from ._inventory_parser import INVENTORY_DICT, fetch_inventory from ._parsing import get_symbol_markdown from ._redis_cache import DocRedisCache @@ -38,9 +39,6 @@ FORCE_PREFIX_GROUPS = ( "pdbcommand", "term", ) -PRIORITY_PACKAGES = ( - "python", -) WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay # Delay to wait before trying to reach a rescheduled inventory again, in minutes diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py index 88fbc8825..f9fe542ce 100644 --- a/bot/exts/info/doc/_html.py +++ b/bot/exts/info/doc/_html.py @@ -1,10 +1,27 @@ import logging -from typing import List, Union +import re +from functools import partial +from typing import Callable, Container, Iterable, List, Union -from bs4.element import PageElement, SoupStrainer +from bs4 import BeautifulSoup +from bs4.element import NavigableString, PageElement, SoupStrainer, Tag + +from . 
import MAX_SIGNATURE_AMOUNT log = logging.getLogger(__name__) +_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +_SEARCH_END_TAG_ATTRS = ( + "data", + "function", + "class", + "exception", + "seealso", + "section", + "rubric", + "sphinxsidebar", +) + class Strainer(SoupStrainer): """Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s.""" @@ -26,3 +43,94 @@ class Strainer(SoupStrainer): return markup else: return super().search(markup) + + +def _find_elements_until_tag( + start_element: PageElement, + end_tag_filter: Union[Container[str], Callable[[Tag], bool]], + *, + func: Callable, + include_strings: bool = False, + limit: int = None, +) -> List[Union[Tag, NavigableString]]: + """ + Get all elements up to `limit` or until a tag matching `end_tag_filter` is found. + + `end_tag_filter` can be either a container of string names to check against, + or a filtering callable that's applied to tags. + + When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. + + `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. + The method is then iterated over and all elements until the matching tag or the limit are added to the return list.
+ """ + use_container_filter = not callable(end_tag_filter) + elements = [] + + for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): + if isinstance(element, Tag): + if use_container_filter: + if element.name in end_tag_filter: + break + elif end_tag_filter(element): + break + elements.append(element) + + return elements + + +_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all) +_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) +_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) + + +def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]: + """Create callable that returns True when the passed in tag's class is in `class_names` or when it's a table.""" + def match_tag(tag: Tag) -> bool: + for attr in class_names: + if attr in tag.get("class", ()): + return True + return tag.name == "table" + + return match_tag + + +def get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]: + """ + Get page content up to a table or a tag with its class in `_SEARCH_END_TAG_ATTRS`. + + A headerlink `a` tag is attempted to be found to skip repeating the symbol information in the description, + if it's found it's used as the tag to start the search from instead of the `start_element`.
+ """ + child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100) + header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None) + start_tag = header.parent if header is not None else start_element + return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True) + + +def get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: + """Get the contents of the next dd tag, up to a dt or a dl tag.""" + description_tag = symbol.find_next("dd") + return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) + + +def get_signatures(start_signature: PageElement) -> List[str]: + """ + Collect up to `MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag. + + First the signatures under the `start_signature` are included; + if fewer than 2 are found, tags above the start signature are added to the result if any are present.
+ """ + signatures = [] + for element in ( + *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), + start_signature, + *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), + )[-MAX_SIGNATURE_AMOUNT:]: + signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + + if signature: + signatures.append(signature) + + return signatures diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py index 46ae33b92..d68f7c8d7 100644 --- a/bot/exts/info/doc/_parsing.py +++ b/bot/exts/info/doc/_parsing.py @@ -5,37 +5,23 @@ import re import string import textwrap from collections import namedtuple -from functools import partial -from typing import Callable, Collection, Container, Iterable, Iterator, List, Optional, TYPE_CHECKING, Union +from typing import Collection, Iterable, Iterator, List, Optional, TYPE_CHECKING, Union from bs4 import BeautifulSoup -from bs4.element import NavigableString, PageElement, Tag +from bs4.element import NavigableString, Tag from bot.utils.helpers import find_nth_occurrence -from ._html import Strainer +from . 
import MAX_SIGNATURE_AMOUNT +from ._html import get_dd_description, get_general_description, get_signatures from ._markdown import DocMarkdownConverter if TYPE_CHECKING: from ._cog import DocItem log = logging.getLogger(__name__) -_MAX_SIGNATURE_AMOUNT = 3 - -_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") _WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") _PARAMETERS_RE = re.compile(r"\((.+)\)") -_SEARCH_END_TAG_ATTRS = ( - "data", - "function", - "class", - "exception", - "seealso", - "section", - "rubric", - "sphinxsidebar", -) - _NO_SIGNATURE_GROUPS = { "attribute", "envvar", @@ -46,7 +32,7 @@ _NO_SIGNATURE_GROUPS = { } _EMBED_CODE_BLOCK_LINE_LENGTH = 61 # _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight -_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT +_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LINE_LENGTH + 8) * MAX_SIGNATURE_AMOUNT # Maximum discord message length - signatures on top _MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH _TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace @@ -118,86 +104,6 @@ def _split_parameters(parameters_string: str) -> Iterator[str]: yield parameters_string[last_split:] -def _find_elements_until_tag( - start_element: PageElement, - end_tag_filter: Union[Container[str], Callable[[Tag], bool]], - *, - func: Callable, - include_strings: bool = False, - limit: int = None, -) -> List[Union[Tag, NavigableString]]: - """ - Get all elements up to `limit` or until a tag matching `tag_filter` is found. - - `end_tag_filter` can be either a container of string names to check against, - or a filtering callable that's applied to tags. - - When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. - - `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. 
- The method is then iterated over and all elements until the matching tag or the limit are added to the return list. - """ - use_container_filter = not callable(end_tag_filter) - elements = [] - - for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): - if isinstance(element, Tag): - if use_container_filter: - if element.name in end_tag_filter: - break - elif end_tag_filter(element): - break - elements.append(element) - - return elements - - -_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) -_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all) -_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) -_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) - - -def _get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]: - """ - Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. - - A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, - if it's found it's used as the tag to start the search from instead of the `start_element`. 
- """ - child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100) - header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None) - start_tag = header.parent if header is not None else start_element - return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True) - - -def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: - """Get the contents of the next dd tag, up to a dt or a dl tag.""" - description_tag = symbol.find_next("dd") - return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) - - -def _get_signatures(start_signature: PageElement) -> List[str]: - """ - Collect up to `_MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag. - - First the signatures under the `start_signature` are included; - if less than 2 are found, tags above the start signature are added to the result if any are present. - """ - signatures = [] - for element in ( - *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), - start_signature, - *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), - )[-(_MAX_SIGNATURE_AMOUNT):]: - signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) - - if signature: - signatures.append(signature) - - return signatures - - def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collection[str]]: """ Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`. 
@@ -210,7 +116,7 @@ def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collec if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH: return signatures - max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) + max_signature_length = _EMBED_CODE_BLOCK_LINE_LENGTH * (MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) formatted_signatures = [] for signature in signatures: signature = signature.strip() @@ -317,17 +223,6 @@ def _create_markdown(signatures: Optional[List[str]], description: Iterable[Tag] return formatted_markdown -def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]: - """Create callable that returns True when the passed in tag's class is in `class_names` or when it's is a table.""" - def match_tag(tag: Tag) -> bool: - for attr in class_names: - if attr in tag.get("class", ()): - return True - return tag.name == "table" - - return match_tag - - def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[str]: """ Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. @@ -342,12 +237,12 @@ def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> Optional[s # Modules, doc pages and labels don't point to description list tags but to tags like divs, # no special parsing can be done so we only try to include what's under them. 
if symbol_data.group in {"module", "doc", "label"} or symbol_heading.name != "dt": - description = _get_general_description(symbol_heading) + description = get_general_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: - description = _get_dd_description(symbol_heading) + description = get_dd_description(symbol_heading) else: - signature = _get_signatures(symbol_heading) - description = _get_dd_description(symbol_heading) + signature = get_signatures(symbol_heading) + description = get_dd_description(symbol_heading) return _create_markdown(signature, description, symbol_data.url).replace('¶', '').strip() |