diff options
| author | 2020-07-26 15:06:35 +0200 | |
|---|---|---|
| committer | 2020-07-26 15:06:35 +0200 | |
| commit | 2cc7ec9e26b013b2967841372898f1f8954d8f8f (patch) | |
| tree | f5a725fc965a4b429a67a41a57a691e0f449e089 | |
| parent | Ensure all renamed symbols are kept (diff) | |
Parse NavigableStrings in symbol descriptions.
When a symbol, such as [term.numpy](https://matplotlib.org/3.1.1/glossary/index.html#term-numpy) had NavigableStrings as direct
children, they were not included as bs4's SoupStrainer won't include
both strings and tags in its filters.
The implementation goes around the limitation by introducing a new
optional flag, bypassing the default check which skips matching tags
when the `text` argument is present.
| -rw-r--r-- | bot/cogs/doc/html.py | 33 | ||||
| -rw-r--r-- | bot/cogs/doc/parsing.py | 36 |
2 files changed, 55 insertions, 14 deletions
diff --git a/bot/cogs/doc/html.py b/bot/cogs/doc/html.py new file mode 100644 index 000000000..bc705130d --- /dev/null +++ b/bot/cogs/doc/html.py @@ -0,0 +1,33 @@ +from collections.abc import Iterable +from typing import List, Union + +from bs4.element import NavigableString, PageElement, SoupStrainer, Tag + + +class Strainer(SoupStrainer): + """Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s.""" + + def __init__(self, *, include_strings: bool, **kwargs): + self.include_strings = include_strings + super().__init__(**kwargs) + + markup_hint = Union[PageElement, List["markup_hint"]] + + def search(self, markup: markup_hint) -> Union[PageElement, str]: + """Extend default SoupStrainer behaviour to allow matching both `Tag`s` and `NavigableString`s.""" + if isinstance(markup, Iterable) and not isinstance(markup, (Tag, str)): + for element in markup: + if isinstance(element, NavigableString) and self.search(element): + return element + elif isinstance(markup, Tag): + # Also include tags while we're searching for strings and tags. + if self.include_strings or (not self.text or self.name or self.attrs): + return self.search_tag(markup) + + elif isinstance(markup, str): + # Let everything through the text filter if we're including strings and tags. + text_filter = None if not self.include_strings else True + if not self.name and not self.attrs and self._matches(markup, text_filter): + return markup + else: + raise Exception(f"I don't know how to match against a {markup.__class__}") diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py index 79f3bbf69..050c49447 100644 --- a/bot/cogs/doc/parsing.py +++ b/bot/cogs/doc/parsing.py @@ -8,10 +8,11 @@ from urllib.parse import urljoin from aiohttp import ClientSession from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag +from bs4.element import NavigableString, PageElement, Tag from markdownify import MarkdownConverter from .cache import async_cache +from .html import Strainer if TYPE_CHECKING: from .cog import DocItem @@ -96,25 +97,30 @@ def _find_elements_until_tag( tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], *, func: Callable, + include_strings: bool = False, limit: int = None, -) -> List[Tag]: +) -> List[Union[Tag, NavigableString]]: """ - Get all tags until a tag matching `tag_filter` is found. + Get all elements up to `limit` or until a tag matching `tag_filter` is found. `tag_filter` can be either a tuple of string names to check against, - or a filtering t.Callable that's applied to the tags. + or a filtering callable that's applied to tags. + + When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. - That method is then iterated over and all tags until the matching tag are added to the return list as strings. + The method is then iterated over and all elements until the matching tag or the limit are added to the return list. """ + use_tuple_filter = isinstance(tag_filter, tuple) elements = [] - for element in func(start_element, limit=limit): - if isinstance(tag_filter, tuple): - if element.name in tag_filter: + for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): + if isinstance(element, Tag): + if use_tuple_filter: + if element.name in tag_filter: + break + elif tag_filter(element): break - elif tag_filter(element): - break elements.append(element) return elements @@ -125,7 +131,7 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) -def get_module_description(start_element: PageElement) -> Optional[str]: +def _get_module_description(start_element: PageElement) -> Optional[str]: """ Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. @@ -134,7 +140,9 @@ def get_module_description(start_element: PageElement) -> Optional[str]: """ header = start_element.find("a", attrs={"class": "headerlink"}) start_tag = header.parent if header is not None else start_element - description = "".join(str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag)) + description = "".join( + str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) + ) return description @@ -142,7 +150,7 @@ def get_module_description(start_element: PageElement) -> Optional[str]: def _get_symbol_description(symbol: PageElement) -> str: """Get the string contents of the next dd tag, up to a dt or a dl tag.""" description_tag = symbol.find_next("dd") - description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl")) + description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) return "".join(str(tag) for tag in description_contents) @@ -253,7 +261,7 @@ async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem # or don't contain any useful info to be parsed. signature = None if symbol_data.group in {"module", "doc"}: - description = get_module_description(symbol_heading) + description = _get_module_description(symbol_heading) elif symbol_data.group in _NO_SIGNATURE_GROUPS: description = _get_symbol_description(symbol_heading) |