Parse NavigableStrings in symbol descriptions.

When a symbol, such as [term.numpy](https://matplotlib.org/3.1.1/glossary/index.html#term-numpy), had NavigableStrings as direct children, they were not included, because bs4's SoupStrainer won't include both strings and tags in its filters. The implementation works around the limitation by introducing a new optional flag that bypasses the default check, which skips matching tags when the `text` argument is present.
author: Numerlor <[email protected]> 2020-07-26 15:06:35 +0200
committer: Numerlor <[email protected]> 2020-07-26 15:06:35 +0200
commit: 2cc7ec9e26b013b2967841372898f1f8954d8f8f (patch)
tree: f5a725fc965a4b429a67a41a57a691e0f449e089
parent: Ensure all renamed symbols are kept (diff)
2 files changed, 55 insertions, 14 deletions
diff --git a/bot/cogs/doc/html.py b/bot/cogs/doc/html.py
new file mode 100644
index 000000000..bc705130d
--- /dev/null
+++ b/bot/cogs/doc/html.py
@@ -0,0 +1,33 @@
+from collections.abc import Iterable
+from typing import List, Union
+
+from bs4.element import NavigableString, PageElement, SoupStrainer, Tag
+
+
+class Strainer(SoupStrainer):
+    """Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s."""
+
+    def __init__(self, *, include_strings: bool, **kwargs):
+        self.include_strings = include_strings
+        super().__init__(**kwargs)
+
+    markup_hint = Union[PageElement, List["markup_hint"]]
+
+    def search(self, markup: markup_hint) -> Union[PageElement, str]:
+        """Extend default SoupStrainer behaviour to allow matching both `Tag`s` and `NavigableString`s."""
+        if isinstance(markup, Iterable) and not isinstance(markup, (Tag, str)):
+            for element in markup:
+                if isinstance(element, NavigableString) and self.search(element):
+                    return element
+        elif isinstance(markup, Tag):
+            # Also include tags while we're searching for strings and tags.
+            if self.include_strings or (not self.text or self.name or self.attrs):
+                return self.search_tag(markup)
+
+        elif isinstance(markup, str):
+            # Let everything through the text filter if we're including strings and tags.
+            text_filter = None if not self.include_strings else True
+            if not self.name and not self.attrs and self._matches(markup, text_filter):
+                return markup
+        else:
+            raise Exception(f"I don't know how to match against a {markup.__class__}")
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index 79f3bbf69..050c49447 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -8,10 +8,11 @@ from urllib.parse import urljoin
 
 from aiohttp import ClientSession
 from bs4 import BeautifulSoup
-from bs4.element import PageElement, Tag
+from bs4.element import NavigableString, PageElement, Tag
 from markdownify import MarkdownConverter
 
 from .cache import async_cache
+from .html import Strainer
 if TYPE_CHECKING:
     from .cog import DocItem
 
@@ -96,25 +97,30 @@ def _find_elements_until_tag(
         tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]],
         *,
         func: Callable,
+        include_strings: bool = False,
         limit: int = None,
-) -> List[Tag]:
+) -> List[Union[Tag, NavigableString]]:
     """
-    Get all tags until a tag matching `tag_filter` is found.
+    Get all elements up to `limit` or until a tag matching `tag_filter` is found.
 
     `tag_filter` can be either a tuple of string names to check against,
-    or a filtering t.Callable that's applied to the tags.
+    or a filtering callable that's applied to tags.
+
+    When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s.
 
     `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`.
-    That method is then iterated over and all tags until the matching tag are added to the return list as strings.
+    The method is then iterated over and all elements until the matching tag or the limit are added to the return list.
     """
+    use_tuple_filter = isinstance(tag_filter, tuple)
     elements = []
 
-    for element in func(start_element, limit=limit):
-        if isinstance(tag_filter, tuple):
-            if element.name in tag_filter:
+    for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit):
+        if isinstance(element, Tag):
+            if use_tuple_filter:
+                if element.name in tag_filter:
+                    break
+            elif tag_filter(element):
                 break
-        elif tag_filter(element):
-            break
         elements.append(element)
 
     return elements
@@ -125,7 +131,7 @@ _find_next_siblings_until_tag = partial(_find_elements_until_tag, func=Beautiful
 _find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings)
 
 
-def get_module_description(start_element: PageElement) -> Optional[str]:
+def _get_module_description(start_element: PageElement) -> Optional[str]:
     """
     Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`.
 
@@ -134,7 +140,9 @@ def get_module_description(start_element: PageElement) -> Optional[str]:
     """
     header = start_element.find("a", attrs={"class": "headerlink"})
     start_tag = header.parent if header is not None else start_element
-    description = "".join(str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag))
+    description = "".join(
+        str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True)
+    )
 
     return description
 
@@ -142,7 +150,7 @@ def get_module_description(start_element: PageElement) -> Optional[str]:
 def _get_symbol_description(symbol: PageElement) -> str:
     """Get the string contents of the next dd tag, up to a dt or a dl tag."""
     description_tag = symbol.find_next("dd")
-    description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"))
+    description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True)
     return "".join(str(tag) for tag in description_contents)
 
 
@@ -253,7 +261,7 @@ async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem
     # or don't contain any useful info to be parsed.
     signature = None
     if symbol_data.group in {"module", "doc"}:
-        description = get_module_description(symbol_heading)
+        description = _get_module_description(symbol_heading)
 
     elif symbol_data.group in _NO_SIGNATURE_GROUPS:
         description = _get_symbol_description(symbol_heading)
author	Numerlor <[email protected]>	2020-07-26 15:06:35 +0200
committer	Numerlor <[email protected]>	2020-07-26 15:06:35 +0200
commit	2cc7ec9e26b013b2967841372898f1f8954d8f8f (patch)
tree	f5a725fc965a4b429a67a41a57a691e0f449e089
parent	Ensure all renamed symbols are kept (diff)