-rw-r--r--  bot/cogs/doc/cog.py     102
-rw-r--r--  bot/cogs/doc/parser.py  102
2 files changed, 108 insertions, 96 deletions
diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py
index 2627951e8..4a275c7c6 100644
--- a/bot/cogs/doc/cog.py
+++ b/bot/cogs/doc/cog.py
@@ -7,12 +7,11 @@ import textwrap
 from collections import OrderedDict
 from contextlib import suppress
 from types import SimpleNamespace
-from typing import Callable, Dict, List, NamedTuple, Optional, Tuple, Union
+from typing import Dict, NamedTuple, Optional, Tuple
 from urllib.parse import urljoin
 
 import discord
-from bs4 import BeautifulSoup
-from bs4.element import PageElement, Tag
+from bs4.element import PageElement
 from discord.ext import commands
 from markdownify import MarkdownConverter
 from requests import ConnectTimeout, ConnectionError, HTTPError
@@ -26,6 +25,7 @@ from bot.decorators import with_role
 from bot.pagination import LinePaginator
 from bot.utils.messages import wait_for_deletion
 from .cache import async_cache
+from .parser import get_soup_from_url, parse_module_symbol, parse_symbol
 
 log = logging.getLogger(__name__)
 logging.getLogger('urllib3').setLevel(logging.WARNING)
@@ -51,19 +51,7 @@ NO_OVERRIDE_PACKAGES = (
     "python",
 )
 
-SEARCH_END_TAG_ATTRS = (
-    "data",
-    "function",
-    "class",
-    "exception",
-    "seealso",
-    "section",
-    "rubric",
-    "sphinxsidebar",
-)
-UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
 WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
-
 FAILED_REQUEST_RETRY_AMOUNT = 3
 NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay
 
@@ -248,7 +236,7 @@ class DocCog(commands.Cog):
             return None
 
         request_url, symbol_id = symbol_info.url.rsplit('#')
-        soup = await self._get_soup_from_url(request_url)
+        soup = await get_soup_from_url(self.bot.http_session, request_url)
         symbol_heading = soup.find(id=symbol_id)
         search_html = str(soup)
 
@@ -256,14 +244,14 @@ class DocCog(commands.Cog):
             return None
 
         if symbol_info.group == "module":
-            parsed_module = self.parse_module_symbol(symbol_heading)
+            parsed_module = parse_module_symbol(symbol_heading)
             if parsed_module is None:
                 return [], ""
             else:
                 signatures, description = parsed_module
 
         else:
-            signatures, description = self.parse_symbol(symbol_heading, search_html)
+            signatures, description = parse_symbol(symbol_heading, search_html)
 
         return signatures, description.replace('¶', '')
 
@@ -331,75 +319,6 @@ class DocCog(commands.Cog):
         )
         return embed
 
-    @classmethod
-    def parse_module_symbol(cls, heading: PageElement) -> Optional[Tuple[None, str]]:
-        """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`."""
-        start_tag = heading.find("a", attrs={"class": "headerlink"})
-        if start_tag is None:
-            return None
-
-        description = cls.find_all_children_until_tag(start_tag, cls._match_end_tag)
-        if description is None:
-            return None
-
-        return None, description
-
-    @classmethod
-    def parse_symbol(cls, heading: PageElement, html: str) -> Tuple[List[str], str]:
-        """
-        Parse the signatures and description of a symbol.
-
-        Collects up to 3 signatures from dt tags and a description from their sibling dd tag.
-        """
-        signatures = []
-        description_element = heading.find_next_sibling("dd")
-        description_pos = html.find(str(description_element))
-        description = cls.find_all_children_until_tag(description_element, tag_filter=("dt", "dl"))
-
-        for element in (
-            *reversed(heading.find_previous_siblings("dt", limit=2)),
-            heading,
-            *heading.find_next_siblings("dt", limit=2),
-        )[-3:]:
-            signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
-
-            if signature and html.find(str(element)) < description_pos:
-                signatures.append(signature)
-
-        return signatures, description
-
-    @staticmethod
-    def find_all_children_until_tag(
-        start_element: PageElement,
-        tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]]
-    ) -> Optional[str]:
-        """
-        Get all direct children until a child matching `tag_filter` is found.
-
-        `tag_filter` can be either a tuple of string names to check against,
-        or a filtering callable that's applied to the tags.
-        """
-        text = ""
-
-        for element in start_element.find_next().find_next_siblings():
-            if isinstance(tag_filter, tuple):
-                if element.name in tag_filter:
-                    break
-            elif tag_filter(element):
-                break
-            text += str(element)
-
-        return text
-
-    @async_cache(arg_offset=1)
-    async def _get_soup_from_url(self, url: str) -> BeautifulSoup:
-        """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
-        log.trace(f"Sending a request to {url}.")
-        async with self.bot.http_session.get(url) as response:
-            soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml')
-        soup.find("head").decompose()  # the head contains no useful data so we can remove it
-        return soup
-
     @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True)
     async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None:
         """Lookup documentation for Python symbols."""
@@ -558,12 +477,3 @@ class DocCog(commands.Cog):
             return package
         log.error(f"Fetching of inventory {inventory_url} failed.")
         return None
-
-    @staticmethod
-    def _match_end_tag(tag: Tag) -> bool:
-        """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
-        for attr in SEARCH_END_TAG_ATTRS:
-            if attr in tag.get("class", ()):
-                return True
-
-        return tag.name == "table"
diff --git a/bot/cogs/doc/parser.py b/bot/cogs/doc/parser.py
new file mode 100644
index 000000000..67621591b
--- /dev/null
+++ b/bot/cogs/doc/parser.py
@@ -0,0 +1,102 @@
+import logging
+import re
+from typing import Callable, List, Optional, Tuple, Union
+
+from aiohttp import ClientSession
+from bs4 import BeautifulSoup
+from bs4.element import PageElement, Tag
+
+from .cache import async_cache
+
+log = logging.getLogger(__name__)
+
+UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
+SEARCH_END_TAG_ATTRS = (
+    "data",
+    "function",
+    "class",
+    "exception",
+    "seealso",
+    "section",
+    "rubric",
+    "sphinxsidebar",
+)
+
+
+def parse_module_symbol(heading: PageElement) -> Optional[Tuple[None, str]]:
+    """Get page content from the headerlink up to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`."""
+    start_tag = heading.find("a", attrs={"class": "headerlink"})
+    if start_tag is None:
+        return None
+
+    description = find_all_children_until_tag(start_tag, _match_end_tag)
+    if description is None:
+        return None
+
+    return None, description
+
+
+def parse_symbol(heading: PageElement, html: str) -> Tuple[List[str], str]:
+    """
+    Parse the signatures and description of a symbol.
+
+    Collects up to 3 signatures from dt tags and a description from their sibling dd tag.
+    """
+    signatures = []
+    description_element = heading.find_next_sibling("dd")
+    description_pos = html.find(str(description_element))
+    description = find_all_children_until_tag(description_element, tag_filter=("dt", "dl"))
+
+    for element in (
+        *reversed(heading.find_previous_siblings("dt", limit=2)),
+        heading,
+        *heading.find_next_siblings("dt", limit=2),
+    )[-3:]:
+        signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text)
+
+        if signature and html.find(str(element)) < description_pos:
+            signatures.append(signature)
+
+    return signatures, description
+
+
+def find_all_children_until_tag(
+    start_element: PageElement,
+    tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]]
+) -> Optional[str]:
+    """
+    Get all direct children until a child matching `tag_filter` is found.
+
+    `tag_filter` can be either a tuple of string names to check against,
+    or a filtering callable that's applied to the tags.
+    """
+    text = ""
+
+    for element in start_element.find_next().find_next_siblings():
+        if isinstance(tag_filter, tuple):
+            if element.name in tag_filter:
+                break
+        elif tag_filter(element):
+            break
+        text += str(element)
+
+    return text
+
+
+@async_cache(arg_offset=1)
+async def get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
+    """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
+    log.trace(f"Sending a request to {url}.")
+    async with http_session.get(url) as response:
+        soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml')
+    soup.find("head").decompose()  # the head contains no useful data so we can remove it
+    return soup
+
+
+def _match_end_tag(tag: Tag) -> bool:
+    """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
+    for attr in SEARCH_END_TAG_ATTRS:
+        if attr in tag.get("class", ()):
+            return True
+
+    return tag.name == "table"
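For context, a minimal sketch of how the relocated helpers compose after this change. It mirrors the flow of DocCog.get_symbol_html as shown in the diff above; the aiohttp.ClientSession stands in for the bot's shared http_session, and fetch_symbol, its URL, and the "function" group are illustrative stand-ins rather than code from this commit.

    import asyncio

    import aiohttp

    from bot.cogs.doc.parser import get_soup_from_url, parse_module_symbol, parse_symbol


    async def fetch_symbol(symbol_url: str, group: str):
        # Split "page.html#anchor" the same way DocCog.get_symbol_html does.
        request_url, symbol_id = symbol_url.rsplit('#')

        async with aiohttp.ClientSession() as session:
            # arg_offset=1 keys the cache on the URL only, skipping the session argument.
            soup = await get_soup_from_url(session, request_url)

        symbol_heading = soup.find(id=symbol_id)
        if symbol_heading is None:
            return None

        if group == "module":
            # Module pages carry a description but no signatures.
            return parse_module_symbol(symbol_heading)
        return parse_symbol(symbol_heading, str(soup))


    # e.g. asyncio.run(fetch_symbol("https://docs.python.org/3/library/os.html#os.getcwd", "function"))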
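One detail of find_all_children_until_tag is easy to misread: it iterates the siblings of start_element.find_next(), so the element immediately following the start tag is itself never collected. A small self-contained trace on invented markup (html.parser is used instead of lxml only to avoid the extra dependency):

    from bs4 import BeautifulSoup

    from bot.cogs.doc.parser import find_all_children_until_tag

    html = '<div><a class="headerlink"></a><p>One.</p><p>Two.</p><table></table><p>After.</p></div>'
    start = BeautifulSoup(html, 'html.parser').find("a")

    # Collection begins at the siblings of start.find_next() (the "One." paragraph),
    # so "One." is skipped, "Two." is kept, and the table ends the walk.
    print(find_all_children_until_tag(start, lambda tag: tag.name == "table"))
    # -> <p>Two.</p>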
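Finally, the .cache module that parser.py imports is not part of this diff. Judging only by the call site, async_cache(arg_offset=1) is assumed to be roughly an LRU-style cache for coroutines, keyed on the positional arguments from arg_offset onward. The sketch below is a hypothetical reconstruction, not the contents of bot/cogs/doc/cache.py.

    import functools
    from collections import OrderedDict
    from typing import Any, Callable


    def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable:
        """Hypothetical: cache awaited results, keyed on str()-ed args from arg_offset on."""
        cache = OrderedDict()

        def decorator(function: Callable) -> Callable:
            @functools.wraps(function)
            async def wrapper(*args: Any) -> Any:
                key = ':'.join(str(arg) for arg in args[arg_offset:])

                if key not in cache:
                    if len(cache) >= max_size:
                        cache.popitem(last=False)  # evict the oldest entry
                    cache[key] = await function(*args)
                return cache[key]
            return wrapper
        return decorator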