-rw-r--r--  LICENSE-THIRD-PARTY                    |  30
-rw-r--r--  Pipfile                                |   2
-rw-r--r--  bot/converters.py                      |  22
-rw-r--r--  bot/exts/info/doc.py                   | 512
-rw-r--r--  bot/exts/info/doc/__init__.py          |   7
-rw-r--r--  bot/exts/info/doc/_cog.py              | 457
-rw-r--r--  bot/exts/info/doc/_html.py             |  33
-rw-r--r--  bot/exts/info/doc/_inventory_parser.py | 120
-rw-r--r--  bot/exts/info/doc/_markdown.py         |  53
-rw-r--r--  bot/exts/info/doc/_parsing.py          | 313
-rw-r--r--  tests/bot/test_converters.py           |  21
11 files changed, 1033 insertions(+), 537 deletions(-)
diff --git a/LICENSE-THIRD-PARTY b/LICENSE-THIRD-PARTY
new file mode 100644
index 000000000..f78491fc1
--- /dev/null
+++ b/LICENSE-THIRD-PARTY
@@ -0,0 +1,30 @@
+License for Sphinx
+Applies to:
+    - bot/cogs/doc/inventory_parser.py: _load_v1, _load_v2 and ZlibStreamReader.__aiter__.
+==================
+
+Copyright (c) 2007-2020 by the Sphinx team (see AUTHORS file).
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Pipfile b/Pipfile
@@ -22,9 +22,7 @@ markdownify = "~=0.4"
 more_itertools = "~=8.2"
 python-dateutil = "~=2.8"
 pyyaml = "~=5.1"
-requests = "~=2.22"
 sentry-sdk = "~=0.14"
-sphinx = "~=2.2"
 statsd = "~=3.3"
 
 [dev-packages]
diff --git a/bot/converters.py b/bot/converters.py
index 2e118d476..6c87a50fe 100644
--- a/bot/converters.py
+++ b/bot/converters.py
@@ -126,22 +126,20 @@ class ValidFilterListType(Converter):
         return list_type
 
 
-class ValidPythonIdentifier(Converter):
+class PackageName(Converter):
     """
-    A converter that checks whether the given string is a valid Python identifier.
+    A converter that checks whether the given string is a valid package name.
 
-    This is used to have package names that correspond to how you would use the package in your
-    code, e.g. `import package`.
-
-    Raises `BadArgument` if the argument is not a valid Python identifier, and simply passes through
-    the given argument otherwise.
+    Package names are used for stats and are restricted to the a-z and _ characters.
""" - @staticmethod - async def convert(ctx: Context, argument: str) -> str: - """Checks whether the given string is a valid Python identifier.""" - if not argument.isidentifier(): - raise BadArgument(f"`{argument}` is not a valid Python identifier") + PACKAGE_NAME_RE = re.compile(r"[^a-z_]") + + @classmethod + async def convert(cls, ctx: Context, argument: str) -> str: + """Checks whether the given string is a valid package name.""" + if cls.PACKAGE_NAME_RE.search(argument): + raise BadArgument("The provided package name is not valid, please only use the _ and a-z characters.") return argument diff --git a/bot/exts/info/doc.py b/bot/exts/info/doc.py deleted file mode 100644 index c16a99225..000000000 --- a/bot/exts/info/doc.py +++ /dev/null @@ -1,512 +0,0 @@ -import asyncio -import functools -import logging -import re -import textwrap -from collections import OrderedDict -from contextlib import suppress -from types import SimpleNamespace -from typing import Any, Callable, Optional, Tuple - -import discord -from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag -from discord.errors import NotFound -from discord.ext import commands -from markdownify import MarkdownConverter -from requests import ConnectTimeout, ConnectionError, HTTPError -from sphinx.ext import intersphinx -from urllib3.exceptions import ProtocolError - -from bot.bot import Bot -from bot.constants import MODERATION_ROLES, RedirectOutput -from bot.converters import ValidPythonIdentifier, ValidURL -from bot.pagination import LinePaginator -from bot.utils.messages import wait_for_deletion - - -log = logging.getLogger(__name__) -logging.getLogger('urllib3').setLevel(logging.WARNING) - -# Since Intersphinx is intended to be used with Sphinx, -# we need to mock its configuration. -SPHINX_MOCK_APP = SimpleNamespace( - config=SimpleNamespace( - intersphinx_timeout=3, - tls_verify=True, - user_agent="python3:python-discord/bot:1.0.0" - ) -) - -NO_OVERRIDE_GROUPS = ( - "2to3fixer", - "token", - "label", - "pdbcommand", - "term", -) -NO_OVERRIDE_PACKAGES = ( - "python", -) - -SEARCH_END_TAG_ATTRS = ( - "data", - "function", - "class", - "exception", - "seealso", - "section", - "rubric", - "sphinxsidebar", -) -UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") -WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") - -FAILED_REQUEST_RETRY_AMOUNT = 3 -NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay - - -def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: - """ - LRU cache implementation for coroutines. - - Once the cache exceeds the maximum size, keys are deleted in FIFO order. - - An offset may be optionally provided to be applied to the coroutine's arguments when creating the cache key. - """ - # Assign the cache to the function itself so we can clear it from outside. 
- async_cache.cache = OrderedDict() - - def decorator(function: Callable) -> Callable: - """Define the async_cache decorator.""" - @functools.wraps(function) - async def wrapper(*args) -> Any: - """Decorator wrapper for the caching logic.""" - key = ':'.join(args[arg_offset:]) - - value = async_cache.cache.get(key) - if value is None: - if len(async_cache.cache) > max_size: - async_cache.cache.popitem(last=False) - - async_cache.cache[key] = await function(*args) - return async_cache.cache[key] - return wrapper - return decorator - - -class DocMarkdownConverter(MarkdownConverter): - """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" - - def convert_code(self, el: PageElement, text: str) -> str: - """Undo `markdownify`s underscore escaping.""" - return f"`{text}`".replace('\\', '') - - def convert_pre(self, el: PageElement, text: str) -> str: - """Wrap any codeblocks in `py` for syntax highlighting.""" - code = ''.join(el.strings) - return f"```py\n{code}```" - - -def markdownify(html: str) -> DocMarkdownConverter: - """Create a DocMarkdownConverter object from the input html.""" - return DocMarkdownConverter(bullets='•').convert(html) - - -class InventoryURL(commands.Converter): - """ - Represents an Intersphinx inventory URL. - - This converter checks whether intersphinx accepts the given inventory URL, and raises - `BadArgument` if that is not the case. - - Otherwise, it simply passes through the given URL. - """ - - @staticmethod - async def convert(ctx: commands.Context, url: str) -> str: - """Convert url to Intersphinx inventory URL.""" - try: - intersphinx.fetch_inventory(SPHINX_MOCK_APP, '', url) - except AttributeError: - raise commands.BadArgument(f"Failed to fetch Intersphinx inventory from URL `{url}`.") - except ConnectionError: - if url.startswith('https'): - raise commands.BadArgument( - f"Cannot establish a connection to `{url}`. Does it support HTTPS?" - ) - raise commands.BadArgument(f"Cannot connect to host with URL `{url}`.") - except ValueError: - raise commands.BadArgument( - f"Failed to read Intersphinx inventory from URL `{url}`. " - "Are you sure that it's a valid inventory file?" - ) - return url - - -class Doc(commands.Cog): - """A set of commands for querying & displaying documentation.""" - - def __init__(self, bot: Bot): - self.base_urls = {} - self.bot = bot - self.inventories = {} - self.renamed_symbols = set() - - self.bot.loop.create_task(self.init_refresh_inventory()) - - async def init_refresh_inventory(self) -> None: - """Refresh documentation inventory on cog initialization.""" - await self.bot.wait_until_guild_available() - await self.refresh_inventory() - - async def update_single( - self, package_name: str, base_url: str, inventory_url: str - ) -> None: - """ - Rebuild the inventory for a single package. 
- - Where: - * `package_name` is the package name to use, appears in the log - * `base_url` is the root documentation URL for the specified package, used to build - absolute paths that link to specific symbols - * `inventory_url` is the absolute URL to the intersphinx inventory, fetched by running - `intersphinx.fetch_inventory` in an executor on the bot's event loop - """ - self.base_urls[package_name] = base_url - - package = await self._fetch_inventory(inventory_url) - if not package: - return None - - for group, value in package.items(): - for symbol, (package_name, _version, relative_doc_url, _) in value.items(): - absolute_doc_url = base_url + relative_doc_url - - if symbol in self.inventories: - group_name = group.split(":")[1] - symbol_base_url = self.inventories[symbol].split("/", 3)[2] - if ( - group_name in NO_OVERRIDE_GROUPS - or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) - ): - - symbol = f"{group_name}.{symbol}" - # If renamed `symbol` already exists, add library name in front to differentiate between them. - if symbol in self.renamed_symbols: - # Split `package_name` because of packages like Pillow that have spaces in them. - symbol = f"{package_name.split()[0]}.{symbol}" - - self.inventories[symbol] = absolute_doc_url - self.renamed_symbols.add(symbol) - continue - - self.inventories[symbol] = absolute_doc_url - - log.trace(f"Fetched inventory for {package_name}.") - - async def refresh_inventory(self) -> None: - """Refresh internal documentation inventory.""" - log.debug("Refreshing documentation inventory...") - - # Clear the old base URLS and inventories to ensure - # that we start from a fresh local dataset. - # Also, reset the cache used for fetching documentation. - self.base_urls.clear() - self.inventories.clear() - self.renamed_symbols.clear() - async_cache.cache = OrderedDict() - - # Run all coroutines concurrently - since each of them performs a HTTP - # request, this speeds up fetching the inventory data heavily. - coros = [ - self.update_single( - package["package"], package["base_url"], package["inventory_url"] - ) for package in await self.bot.api_client.get('bot/documentation-links') - ] - await asyncio.gather(*coros) - - async def get_symbol_html(self, symbol: str) -> Optional[Tuple[list, str]]: - """ - Given a Python symbol, return its signature and description. - - The first tuple element is the signature of the given symbol as a markup-free string, and - the second tuple element is the description of the given symbol with HTML markup included. - - If the given symbol is a module, returns a tuple `(None, str)` - else if the symbol could not be found, returns `None`. - """ - url = self.inventories.get(symbol) - if url is None: - return None - - async with self.bot.http_session.get(url) as response: - html = await response.text(encoding='utf-8') - - # Find the signature header and parse the relevant parts. 
- symbol_id = url.split('#')[-1] - soup = BeautifulSoup(html, 'lxml') - symbol_heading = soup.find(id=symbol_id) - search_html = str(soup) - - if symbol_heading is None: - return None - - if symbol_id == f"module-{symbol}": - # Get page content from the module headerlink to the - # first tag that has its class in `SEARCH_END_TAG_ATTRS` - start_tag = symbol_heading.find("a", attrs={"class": "headerlink"}) - if start_tag is None: - return [], "" - - end_tag = start_tag.find_next(self._match_end_tag) - if end_tag is None: - return [], "" - - description_start_index = search_html.find(str(start_tag.parent)) + len(str(start_tag.parent)) - description_end_index = search_html.find(str(end_tag)) - description = search_html[description_start_index:description_end_index] - signatures = None - - else: - signatures = [] - description = str(symbol_heading.find_next_sibling("dd")) - description_pos = search_html.find(description) - # Get text of up to 3 signatures, remove unwanted symbols - for element in [symbol_heading] + symbol_heading.find_next_siblings("dt", limit=2): - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) - if signature and search_html.find(str(element)) < description_pos: - signatures.append(signature) - - return signatures, description.replace('¶', '') - - @async_cache(arg_offset=1) - async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: - """ - Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. - - If the symbol is known, an Embed with documentation about it is returned. - """ - scraped_html = await self.get_symbol_html(symbol) - if scraped_html is None: - return None - - signatures = scraped_html[0] - permalink = self.inventories[symbol] - description = markdownify(scraped_html[1]) - - # Truncate the description of the embed to the last occurrence - # of a double newline (interpreted as a paragraph) before index 1000. - if len(description) > 1000: - shortened = description[:1000] - description_cutoff = shortened.rfind('\n\n', 100) - if description_cutoff == -1: - # Search the shortened version for cutoff points in decreasing desirability, - # cutoff at 1000 if none are found. - for string in (". ", ", ", ",", " "): - description_cutoff = shortened.rfind(string) - if description_cutoff != -1: - break - else: - description_cutoff = 1000 - description = description[:description_cutoff] - - # If there is an incomplete code block, cut it out - if description.count("```") % 2: - codeblock_start = description.rfind('```py') - description = description[:codeblock_start].rstrip() - description += f"... [read more]({permalink})" - - description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) - if signatures is None: - # If symbol is a module, don't show signature. - embed_description = description - - elif not signatures: - # It's some "meta-page", for example: - # https://docs.djangoproject.com/en/dev/ref/views/#module-django.views - embed_description = "This appears to be a generic page not tied to a specific symbol." - - else: - embed_description = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) - embed_description += f"\n{description}" - - embed = discord.Embed( - title=f'`{symbol}`', - url=permalink, - description=embed_description - ) - # Show all symbols with the same name that were renamed in the footer. 
- embed.set_footer( - text=", ".join(renamed for renamed in self.renamed_symbols - {symbol} if renamed.endswith(f".{symbol}")) - ) - return embed - - @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) - async def docs_group(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: - """Lookup documentation for Python symbols.""" - await self.get_command(ctx, symbol) - - @docs_group.command(name='get', aliases=('g',)) - async def get_command(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: - """ - Return a documentation embed for a given symbol. - - If no symbol is given, return a list of all available inventories. - - Examples: - !docs - !docs aiohttp - !docs aiohttp.ClientSession - !docs get aiohttp.ClientSession - """ - if symbol is None: - inventory_embed = discord.Embed( - title=f"All inventories (`{len(self.base_urls)}` total)", - colour=discord.Colour.blue() - ) - - lines = sorted(f"• [`{name}`]({url})" for name, url in self.base_urls.items()) - if self.base_urls: - await LinePaginator.paginate(lines, ctx, inventory_embed, max_size=400, empty=False) - - else: - inventory_embed.description = "Hmmm, seems like there's nothing here yet." - await ctx.send(embed=inventory_embed) - - else: - # Fetching documentation for a symbol (at least for the first time, since - # caching is used) takes quite some time, so let's send typing to indicate - # that we got the command, but are still working on it. - async with ctx.typing(): - doc_embed = await self.get_symbol_embed(symbol) - - if doc_embed is None: - error_embed = discord.Embed( - description=f"Sorry, I could not find any documentation for `{symbol}`.", - colour=discord.Colour.red() - ) - error_message = await ctx.send(embed=error_embed) - with suppress(NotFound): - await error_message.delete(delay=NOT_FOUND_DELETE_DELAY) - await ctx.message.delete(delay=NOT_FOUND_DELETE_DELAY) - else: - msg = await ctx.send(embed=doc_embed) - await wait_for_deletion(msg, (ctx.author.id,), client=self.bot) - - @docs_group.command(name='set', aliases=('s',)) - @commands.has_any_role(*MODERATION_ROLES) - async def set_command( - self, ctx: commands.Context, package_name: ValidPythonIdentifier, - base_url: ValidURL, inventory_url: InventoryURL - ) -> None: - """ - Adds a new documentation metadata object to the site's database. - - The database will update the object, should an existing item with the specified `package_name` already exist. - - Example: - !docs set \ - python \ - https://docs.python.org/3/ \ - https://docs.python.org/3/objects.inv - """ - body = { - 'package': package_name, - 'base_url': base_url, - 'inventory_url': inventory_url - } - await self.bot.api_client.post('bot/documentation-links', json=body) - - log.info( - f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n" - f"Package name: {package_name}\n" - f"Base url: {base_url}\n" - f"Inventory URL: {inventory_url}" - ) - - # Rebuilding the inventory can take some time, so lets send out a - # typing event to show that the Bot is still working. - async with ctx.typing(): - await self.refresh_inventory() - await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") - - @docs_group.command(name='delete', aliases=('remove', 'rm', 'd')) - @commands.has_any_role(*MODERATION_ROLES) - async def delete_command(self, ctx: commands.Context, package_name: ValidPythonIdentifier) -> None: - """ - Removes the specified package from the database. 
- - Examples: - !docs delete aiohttp - """ - await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') - - async with ctx.typing(): - # Rebuild the inventory to ensure that everything - # that was from this package is properly deleted. - await self.refresh_inventory() - await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") - - @docs_group.command(name="refresh", aliases=("rfsh", "r")) - @commands.has_any_role(*MODERATION_ROLES) - async def refresh_command(self, ctx: commands.Context) -> None: - """Refresh inventories and send differences to channel.""" - old_inventories = set(self.base_urls) - with ctx.typing(): - await self.refresh_inventory() - # Get differences of added and removed inventories - added = ', '.join(inv for inv in self.base_urls if inv not in old_inventories) - if added: - added = f"+ {added}" - - removed = ', '.join(inv for inv in old_inventories if inv not in self.base_urls) - if removed: - removed = f"- {removed}" - - embed = discord.Embed( - title="Inventories refreshed", - description=f"```diff\n{added}\n{removed}```" if added or removed else "" - ) - await ctx.send(embed=embed) - - async def _fetch_inventory(self, inventory_url: str) -> Optional[dict]: - """Get and return inventory from `inventory_url`. If fetching fails, return None.""" - fetch_func = functools.partial(intersphinx.fetch_inventory, SPHINX_MOCK_APP, '', inventory_url) - for retry in range(1, FAILED_REQUEST_RETRY_AMOUNT+1): - try: - package = await self.bot.loop.run_in_executor(None, fetch_func) - except ConnectTimeout: - log.error( - f"Fetching of inventory {inventory_url} timed out," - f" trying again. ({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" - ) - except ProtocolError: - log.error( - f"Connection lost while fetching inventory {inventory_url}," - f" trying again. 
({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" - ) - except HTTPError as e: - log.error(f"Fetching of inventory {inventory_url} failed with status code {e.response.status_code}.") - return None - except ConnectionError: - log.error(f"Couldn't establish connection to inventory {inventory_url}.") - return None - else: - return package - log.error(f"Fetching of inventory {inventory_url} failed.") - return None - - @staticmethod - def _match_end_tag(tag: Tag) -> bool: - """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" - for attr in SEARCH_END_TAG_ATTRS: - if attr in tag.get("class", ()): - return True - - return tag.name == "table" - - -def setup(bot: Bot) -> None: - """Load the Doc cog.""" - bot.add_cog(Doc(bot)) diff --git a/bot/exts/info/doc/__init__.py b/bot/exts/info/doc/__init__.py new file mode 100644 index 000000000..e9eb9428c --- /dev/null +++ b/bot/exts/info/doc/__init__.py @@ -0,0 +1,7 @@ +from bot.bot import Bot +from ._cog import DocCog + + +def setup(bot: Bot) -> None: + """Load the Doc cog.""" + bot.add_cog(DocCog(bot)) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py new file mode 100644 index 000000000..257435e95 --- /dev/null +++ b/bot/exts/info/doc/_cog.py @@ -0,0 +1,457 @@ +from __future__ import annotations + +import asyncio +import logging +import re +import sys +from collections import defaultdict +from contextlib import suppress +from typing import Dict, List, NamedTuple, Optional, Union + +import discord +from aiohttp import ClientSession +from bs4 import BeautifulSoup +from discord.ext import commands + +from bot.bot import Bot +from bot.constants import MODERATION_ROLES, RedirectOutput +from bot.converters import PackageName, ValidURL +from bot.pagination import LinePaginator +from bot.utils.messages import wait_for_deletion +from bot.utils.scheduling import Scheduler +from ._inventory_parser import FAILED_REQUEST_ATTEMPTS, fetch_inventory +from ._parsing import get_symbol_markdown + +log = logging.getLogger(__name__) + +NO_OVERRIDE_GROUPS = ( + "2to3fixer", + "token", + "label", + "pdbcommand", + "term", +) +NO_OVERRIDE_PACKAGES = ( + "python", +) + +WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") +NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay + + +class DocItem(NamedTuple): + """Holds inventory symbol information.""" + + package: str + group: str + base_url: str + relative_url_path: str + symbol_id: str + + @property + def url(self) -> str: + """Return the absolute url to the symbol.""" + return "".join((self.base_url, self.relative_url_path)) + + +class QueueItem(NamedTuple): + """Contains a symbol and the BeautifulSoup object needed to parse it.""" + + symbol: DocItem + soup: BeautifulSoup + + def __eq__(self, other: Union[QueueItem, DocItem]): + if isinstance(other, DocItem): + return self.symbol == other + return NamedTuple.__eq__(self, other) + + +class CachedParser: + """ + Get symbol markdown from pages with smarter caching. + + DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict. + `get_markdown` is used to fetch the markdown; when this is used for the first time on a page, + all of the symbols are queued to be parsed to avoid multiple web requests to the same page. 
+ """ + + def __init__(self): + self._queue: List[QueueItem] = [] + self._results = {} + self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) + self._item_events: Dict[DocItem, asyncio.Event] = {} + self._parse_task = None + + async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str: + """ + Get result markdown of `doc_item`. + + If no symbols were fetched from `doc_item`s page before, + the HTML has to be fetched before parsing can be queued. + """ + if (symbol := self._results.get(doc_item)) is not None: + return symbol + + if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: + async with client_session.get(doc_item.url) as response: + soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") + + self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) + del self._page_symbols[doc_item.url] + log.debug(f"Added symbols from {doc_item.url} to parse queue.") + + if self._parse_task is None: + self._parse_task = asyncio.create_task(self._parse_queue()) + + self._move_to_front(doc_item) + self._item_events[doc_item] = item_event = asyncio.Event() + await item_event.wait() + return self._results[doc_item] + + async def _parse_queue(self) -> None: + """ + Parse all item from the queue, setting associated events for symbols if present. + + The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished. + """ + log.trace("Starting queue parsing.") + while self._queue: + item, soup = self._queue.pop() + try: + self._results[item] = get_symbol_markdown(soup, item) + except Exception: + log.exception(f"Unexpected error when handling {item}") + else: + if (event := self._item_events.get(item)) is not None: + event.set() + await asyncio.sleep(0.1) + + self._parse_task = None + log.trace("Finished parsing queue.") + + def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: + """Move `item` to the front of the parse queue.""" + # The parse queue stores soups along with the doc symbols in QueueItem objects, + # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. + item_index = self._queue.index(item) + queue_item = self._queue[item_index] + + del self._queue[item_index] + self._queue.append(queue_item) + + def add_item(self, doc_item: DocItem) -> None: + """Add a DocItem to `_page_symbols`.""" + self._page_symbols[doc_item.url].append(doc_item) + + async def clear(self) -> None: + """ + Clear all internal symbol data. + + All currently requested items are waited to be parsed before clearing. + """ + for event in self._item_events.values(): + await event.wait() + if self._parse_task is not None: + self._parse_task.cancel() + self._parse_task = None + self._queue.clear() + self._results.clear() + self._page_symbols.clear() + self._item_events.clear() + + +class InventoryURL(commands.Converter): + """ + Represents an Intersphinx inventory URL. + + This converter checks whether intersphinx accepts the given inventory URL, and raises + `BadArgument` if that is not the case. + + Otherwise, it simply passes through the given URL. 
+ """ + + @staticmethod + async def convert(ctx: commands.Context, url: str) -> str: + """Convert url to Intersphinx inventory URL.""" + await ctx.trigger_typing() + if await fetch_inventory(ctx.bot.http_session, url) is None: + raise commands.BadArgument(f"Failed to fetch inventory file after {FAILED_REQUEST_ATTEMPTS}.") + return url + + +class DocCog(commands.Cog): + """A set of commands for querying & displaying documentation.""" + + def __init__(self, bot: Bot): + self.base_urls = {} + self.bot = bot + self.doc_symbols: Dict[str, DocItem] = {} + self.item_fetcher = CachedParser() + self.renamed_symbols = set() + + self.inventory_scheduler = Scheduler(self.__class__.__name__) + self.scheduled_inventories = set() + + self.bot.loop.create_task(self.init_refresh_inventory()) + + async def init_refresh_inventory(self) -> None: + """Refresh documentation inventory on cog initialization.""" + await self.bot.wait_until_guild_available() + await self.refresh_inventory() + + async def update_single( + self, api_package_name: str, base_url: str, inventory_url: str + ) -> bool: + """ + Rebuild the inventory for a single package. + + Where: + * `package_name` is the package name to use, appears in the log + * `base_url` is the root documentation URL for the specified package, used to build + absolute paths that link to specific symbols + * `inventory_url` is the absolute URL to the intersphinx inventory. + + If the inventory file is currently unreachable, + the update is rescheduled to execute in 2 minutes on the first attempt, and 5 minutes on subsequent attempts. + + Return True on success; False if fetching failed and was rescheduled. + """ + self.base_urls[api_package_name] = base_url + package = await fetch_inventory(self.bot.http_session, inventory_url) + + if not package: + delay = 2*60 if inventory_url not in self.scheduled_inventories else 5*60 + log.info(f"Failed to fetch inventory, attempting again in {delay//60} minutes.") + self.inventory_scheduler.schedule_later( + delay, + api_package_name, + fetch_inventory(self.bot.http_session, inventory_url) + ) + self.scheduled_inventories.add(api_package_name) + return False + with suppress(KeyError): + self.scheduled_inventories.discard(api_package_name) + + for group, items in package.items(): + for symbol, relative_doc_url in items: + if "/" in symbol: + continue # skip unreachable symbols with slashes + # Intern the group names since they're reused in all the DocItems + # to remove unnecessary memory consumption from them being unique objects + group_name = sys.intern(group.split(":")[1]) + + if (original_symbol := self.doc_symbols.get(symbol)) is not None: + if ( + group_name in NO_OVERRIDE_GROUPS + or any(package == original_symbol.package for package in NO_OVERRIDE_PACKAGES) + ): + symbol = f"{group_name}.{symbol}" + self.renamed_symbols.add(symbol) + + elif (overridden_symbol_group := original_symbol.group) in NO_OVERRIDE_GROUPS: + overridden_symbol = f"{overridden_symbol_group}.{symbol}" + if overridden_symbol in self.renamed_symbols: + overridden_symbol = f"{api_package_name}.{overridden_symbol}" + + self.doc_symbols[overridden_symbol] = original_symbol + self.renamed_symbols.add(overridden_symbol) + + # If renamed `symbol` already exists, add library name in front to differentiate between them. 
+ elif symbol in self.renamed_symbols: + symbol = f"{api_package_name}.{symbol}" + self.renamed_symbols.add(symbol) + + relative_url_path, _, symbol_id = relative_doc_url.partition("#") + symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id) + self.doc_symbols[symbol] = symbol_item + self.item_fetcher.add_item(symbol_item) + + log.trace(f"Fetched inventory for {api_package_name}.") + return True + + async def refresh_inventory(self) -> None: + """Refresh internal documentation inventory.""" + log.debug("Refreshing documentation inventory...") + + # Clear the old base URLS and doc symbols to ensure + # that we start from a fresh local dataset. + # Also, reset the cache used for fetching documentation. + self.base_urls.clear() + self.doc_symbols.clear() + self.renamed_symbols.clear() + self.scheduled_inventories.clear() + await self.item_fetcher.clear() + + # Run all coroutines concurrently - since each of them performs a HTTP + # request, this speeds up fetching the inventory data heavily. + coros = [ + self.update_single( + package["package"], package["base_url"], package["inventory_url"] + ) for package in await self.bot.api_client.get('bot/documentation-links') + ] + await asyncio.gather(*coros) + + async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: + """ + Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. + + If the symbol is known, an Embed with documentation about it is returned. + """ + symbol_info = self.doc_symbols.get(symbol) + if symbol_info is None: + return None + self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") + + embed = discord.Embed( + title=discord.utils.escape_markdown(symbol), + url=f"{symbol_info.url}#{symbol_info.symbol_id}", + description=await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) + ) + # Show all symbols with the same name that were renamed in the footer. + embed.set_footer( + text=", ".join(renamed for renamed in self.renamed_symbols - {symbol} if renamed.endswith(f".{symbol}")) + ) + return embed + + @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) + async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: + """Lookup documentation for Python symbols.""" + await ctx.invoke(self.get_command, symbol=symbol) + + @docs_group.command(name='getdoc', aliases=('g',)) + async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: + """ + Return a documentation embed for a given symbol. + + If no symbol is given, return a list of all available inventories. + + Examples: + !docs + !docs aiohttp + !docs aiohttp.ClientSession + !docs getdoc aiohttp.ClientSession + """ + if not symbol: + inventory_embed = discord.Embed( + title=f"All inventories (`{len(self.base_urls)}` total)", + colour=discord.Colour.blue() + ) + + lines = sorted(f"• [`{name}`]({url})" for name, url in self.base_urls.items()) + if self.base_urls: + await LinePaginator.paginate(lines, ctx, inventory_embed, max_size=400, empty=False) + + else: + inventory_embed.description = "Hmmm, seems like there's nothing here yet." + await ctx.send(embed=inventory_embed) + + else: + symbol = symbol.strip("`") + # Fetching documentation for a symbol (at least for the first time, since + # caching is used) takes quite some time, so let's send typing to indicate + # that we got the command, but are still working on it. 
+ async with ctx.typing(): + doc_embed = await self.get_symbol_embed(symbol) + + if doc_embed is None: + symbol = await discord.ext.commands.clean_content().convert(ctx, symbol) + error_embed = discord.Embed( + description=f"Sorry, I could not find any documentation for `{(symbol)}`.", + colour=discord.Colour.red() + ) + error_message = await ctx.send(embed=error_embed) + await wait_for_deletion( + error_message, + (ctx.author.id,), + timeout=NOT_FOUND_DELETE_DELAY, + client=self.bot + ) + with suppress(discord.NotFound): + await ctx.message.delete() + with suppress(discord.NotFound): + await error_message.delete() + else: + msg = await ctx.send(embed=doc_embed) + await wait_for_deletion(msg, (ctx.author.id,), client=self.bot) + + @docs_group.command(name='setdoc', aliases=('s',)) + @commands.has_any_role(*MODERATION_ROLES) + async def set_command( + self, ctx: commands.Context, package_name: PackageName, + base_url: ValidURL, inventory_url: InventoryURL + ) -> None: + """ + Adds a new documentation metadata object to the site's database. + + The database will update the object, should an existing item with the specified `package_name` already exist. + + Example: + !docs setdoc \ + python \ + https://docs.python.org/3/ \ + https://docs.python.org/3/objects.inv + """ + body = { + 'package': package_name, + 'base_url': base_url, + 'inventory_url': inventory_url + } + await self.bot.api_client.post('bot/documentation-links', json=body) + + log.info( + f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n" + f"Package name: {package_name}\n" + f"Base url: {base_url}\n" + f"Inventory URL: {inventory_url}" + ) + + if await self.update_single(package_name, base_url, inventory_url) is None: + await ctx.send( + f"Added package `{package_name}` to database but failed to fetch inventory; rescheduled in 2 minutes." + ) + return + await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") + + @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) + @commands.has_any_role(*MODERATION_ROLES) + async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None: + """ + Removes the specified package from the database. + + Examples: + !docs deletedoc aiohttp + """ + await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') + + if package_name in self.scheduled_inventories: + self.inventory_scheduler.cancel(package_name) + + async with ctx.typing(): + # Rebuild the inventory to ensure that everything + # that was from this package is properly deleted. 
+ await self.refresh_inventory() + await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") + + @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) + @commands.has_any_role(*MODERATION_ROLES) + async def refresh_command(self, ctx: commands.Context) -> None: + """Refresh inventories and send differences to channel.""" + for inventory in self.scheduled_inventories: + self.inventory_scheduler.cancel(inventory) + + old_inventories = set(self.base_urls) + with ctx.typing(): + await self.refresh_inventory() + new_inventories = set(self.base_urls) + + if added := ", ".join(new_inventories - old_inventories): + added = "+ " + added + + if removed := ", ".join(old_inventories - new_inventories): + removed = "- " + removed + + embed = discord.Embed( + title="Inventories refreshed", + description=f"```diff\n{added}\n{removed}```" if added or removed else "" + ) + await ctx.send(embed=embed) diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py new file mode 100644 index 000000000..bc705130d --- /dev/null +++ b/bot/exts/info/doc/_html.py @@ -0,0 +1,33 @@ +from collections.abc import Iterable +from typing import List, Union + +from bs4.element import NavigableString, PageElement, SoupStrainer, Tag + + +class Strainer(SoupStrainer): + """Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s.""" + + def __init__(self, *, include_strings: bool, **kwargs): + self.include_strings = include_strings + super().__init__(**kwargs) + + markup_hint = Union[PageElement, List["markup_hint"]] + + def search(self, markup: markup_hint) -> Union[PageElement, str]: + """Extend default SoupStrainer behaviour to allow matching both `Tag`s` and `NavigableString`s.""" + if isinstance(markup, Iterable) and not isinstance(markup, (Tag, str)): + for element in markup: + if isinstance(element, NavigableString) and self.search(element): + return element + elif isinstance(markup, Tag): + # Also include tags while we're searching for strings and tags. + if self.include_strings or (not self.text or self.name or self.attrs): + return self.search_tag(markup) + + elif isinstance(markup, str): + # Let everything through the text filter if we're including strings and tags. 
+ text_filter = None if not self.include_strings else True + if not self.name and not self.attrs and self._matches(markup, text_filter): + return markup + else: + raise Exception(f"I don't know how to match against a {markup.__class__}") diff --git a/bot/exts/info/doc/_inventory_parser.py b/bot/exts/info/doc/_inventory_parser.py new file mode 100644 index 000000000..23931869b --- /dev/null +++ b/bot/exts/info/doc/_inventory_parser.py @@ -0,0 +1,120 @@ +import logging +import re +import zlib +from collections import defaultdict +from typing import AsyncIterator, DefaultDict, List, Optional, Tuple + +import aiohttp + +log = logging.getLogger(__name__) + +FAILED_REQUEST_ATTEMPTS = 3 +_V2_LINE_RE = re.compile(r'(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+?(\S*)\s+(.*)') + + +class ZlibStreamReader: + """Class used for decoding zlib data of a stream line by line.""" + + READ_CHUNK_SIZE = 16 * 1024 + + def __init__(self, stream: aiohttp.StreamReader) -> None: + self.stream = stream + + async def _read_compressed_chunks(self) -> AsyncIterator[bytes]: + """Read zlib data in `READ_CHUNK_SIZE` sized chunks and decompress.""" + decompressor = zlib.decompressobj() + async for chunk in self.stream.iter_chunked(self.READ_CHUNK_SIZE): + yield decompressor.decompress(chunk) + + yield decompressor.flush() + + async def __aiter__(self) -> AsyncIterator[str]: + """Yield lines of decompressed text.""" + buf = b'' + async for chunk in self._read_compressed_chunks(): + buf += chunk + pos = buf.find(b'\n') + while pos != -1: + yield buf[:pos].decode() + buf = buf[pos + 1:] + pos = buf.find(b'\n') + + +async def _load_v1(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: + invdata = defaultdict(list) + + async for line in stream: + name, type_, location = line.decode().rstrip().split(maxsplit=2) + # version 1 did not add anchors to the location + if type_ == 'mod': + type_ = 'py:module' + location += '#module-' + name + else: + type_ = 'py:' + type_ + location += '#' + name + invdata[type_].append((name, location)) + return invdata + + +async def _load_v2(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: + invdata = defaultdict(list) + + async for line in ZlibStreamReader(stream): + m = _V2_LINE_RE.match(line.rstrip()) + name, type_, _prio, location, _dispname = m.groups() # ignore the parsed items we don't need + if location.endswith('$'): + location = location[:-1] + name + + invdata[type_].append((name, location)) + return invdata + + +async def _fetch_inventory(client_session: aiohttp.ClientSession, url: str) -> DefaultDict[str, List[Tuple[str, str]]]: + """Fetch, parse and return an intersphinx inventory file from an url.""" + timeout = aiohttp.ClientTimeout(sock_connect=5, sock_read=5) + async with client_session.get(url, timeout=timeout, raise_for_status=True) as response: + stream = response.content + + inventory_header = (await stream.readline()).decode().rstrip() + inventory_version = int(inventory_header[-1:]) + await stream.readline() # skip project name + await stream.readline() # skip project version + + if inventory_version == 1: + return await _load_v1(stream) + + elif inventory_version == 2: + if b"zlib" not in await stream.readline(): + raise ValueError(f"Invalid inventory file at url {url}.") + return await _load_v2(stream) + + raise ValueError(f"Invalid inventory file at url {url}.") + + +async def fetch_inventory( + client_session: aiohttp.ClientSession, + url: str +) -> Optional[DefaultDict[str, List[Tuple[str, str]]]]: + """Get inventory from `url`, 
retrying `FAILED_REQUEST_ATTEMPTS` times on errors.""" + for attempt in range(1, FAILED_REQUEST_ATTEMPTS+1): + try: + inventory = await _fetch_inventory(client_session, url) + except aiohttp.ClientConnectorError: + log.warning( + f"Failed to connect to inventory url at {url}, " + f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." + ) + except aiohttp.ClientError: + log.error( + f"Failed to get inventory from {url}, " + f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." + ) + except Exception: + log.exception( + f"An unexpected error has occurred during fetching of {url}, " + f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." + ) + else: + return inventory + + return None diff --git a/bot/exts/info/doc/_markdown.py b/bot/exts/info/doc/_markdown.py new file mode 100644 index 000000000..ba35a84c4 --- /dev/null +++ b/bot/exts/info/doc/_markdown.py @@ -0,0 +1,53 @@ +from urllib.parse import urljoin + +from bs4.element import PageElement +from markdownify import MarkdownConverter + + +class DocMarkdownConverter(MarkdownConverter): + """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" + + def __init__(self, *, page_url: str, **options): + super().__init__(**options) + self.page_url = page_url + + def convert_li(self, el: PageElement, text: str) -> str: + """Fix markdownify's erroneous indexing in ol tags.""" + parent = el.parent + if parent is not None and parent.name == "ol": + li_tags = parent.find_all("li") + bullet = f"{li_tags.index(el)+1}." + else: + depth = -1 + while el: + if el.name == "ul": + depth += 1 + el = el.parent + bullets = self.options["bullets"] + bullet = bullets[depth % len(bullets)] + return f"{bullet} {text}\n" + + def convert_hn(self, _n: int, el: PageElement, text: str) -> str: + """Convert h tags to bold text with ** instead of adding #.""" + return f"**{text}**\n\n" + + def convert_code(self, el: PageElement, text: str) -> str: + """Undo `markdownify`s underscore escaping.""" + return f"`{text}`".replace("\\", "") + + def convert_pre(self, el: PageElement, text: str) -> str: + """Wrap any codeblocks in `py` for syntax highlighting.""" + code = "".join(el.strings) + return f"```py\n{code}```" + + def convert_a(self, el: PageElement, text: str) -> str: + """Resolve relative URLs to `self.page_url`.""" + el["href"] = urljoin(self.page_url, el["href"]) + return super().convert_a(el, text) + + def convert_p(self, el: PageElement, text: str) -> str: + """Include only one newline instead of two when the parent is a li tag.""" + parent = el.parent + if parent is not None and parent.name == "li": + return f"{text}\n" + return super().convert_p(el, text) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py new file mode 100644 index 000000000..83e35e2b1 --- /dev/null +++ b/bot/exts/info/doc/_parsing.py @@ -0,0 +1,313 @@ +from __future__ import annotations + +import logging +import re +import string +import textwrap +from functools import partial +from typing import Callable, Collection, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union + +from bs4 import BeautifulSoup +from bs4.element import NavigableString, PageElement, Tag + +from ._html import Strainer +from ._markdown import DocMarkdownConverter +if TYPE_CHECKING: + from ._cog import DocItem + +log = logging.getLogger(__name__) + +_MAX_SIGNATURE_AMOUNT = 3 + +_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") +_PARAMETERS_RE = re.compile(r"\((.+)\)") + +_SEARCH_END_TAG_ATTRS = ( 
+ "data", + "function", + "class", + "exception", + "seealso", + "section", + "rubric", + "sphinxsidebar", +) + +_NO_SIGNATURE_GROUPS = { + "attribute", + "envvar", + "setting", + "tempaltefilter", + "templatetag", + "term", +} +_EMBED_CODE_BLOCK_LENGTH = 61 +# _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight +_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT +# Maximum discord message length - signatures on top +_MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH +_TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace +_BRACKET_PAIRS = { + "{": "}", + "(": ")", + "[": "]", +} + + +def _split_parameters(parameters_string: str) -> List[str]: + """ + Split parameters of a signature into individual parameter strings on commas. + + Long string literals are not accounted for. + """ + parameters_list = [] + last_split = 0 + depth = 0 + expected_end = None + current_search = None + previous_character = "" + + for index, character in enumerate(parameters_string): + if character in _BRACKET_PAIRS: + if current_search is None: + current_search = character + expected_end = _BRACKET_PAIRS[character] + if character == current_search: + depth += 1 + + elif character in {"'", '"'}: + if depth == 0: + depth += 1 + elif not previous_character == "\\": + depth -= 1 + + elif character == expected_end: + depth -= 1 + if depth == 0: + current_search = None + expected_end = None + + elif depth == 0 and character == ",": + parameters_list.append(parameters_string[last_split:index]) + last_split = index + 1 + previous_character = character + + parameters_list.append(parameters_string[last_split:]) + return parameters_list + + +def _find_elements_until_tag( + start_element: PageElement, + tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], + *, + func: Callable, + include_strings: bool = False, + limit: int = None, +) -> List[Union[Tag, NavigableString]]: + """ + Get all elements up to `limit` or until a tag matching `tag_filter` is found. + + `tag_filter` can be either a tuple of string names to check against, + or a filtering callable that's applied to tags. + + When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. + + `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. + The method is then iterated over and all elements until the matching tag or the limit are added to the return list. + """ + use_tuple_filter = isinstance(tag_filter, tuple) + elements = [] + + for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): + if isinstance(element, Tag): + if use_tuple_filter: + if element.name in tag_filter: + break + elif tag_filter(element): + break + elements.append(element) + + return elements + + +_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) +_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) + + +def _get_general_description(start_element: PageElement) -> List[Union[Tag, NavigableString]]: + """ + Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. 
+ + A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, + if it's found it's used as the tag to start the search from instead of the `start_element`. + """ + header = start_element.find_next("a", attrs={"class": "headerlink"}) + start_tag = header.parent if header is not None else start_element + return _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) + + +def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: + """Get the contents of the next dd tag, up to a dt or a dl tag.""" + description_tag = symbol.find_next("dd") + return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) + + +def _get_signatures(start_signature: PageElement) -> List[str]: + """ + Collect up to `_MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag. + + First the signatures under the `start_signature` are included; + if less than 2 are found, tags above the start signature are added to the result if any are present. + """ + signatures = [] + for element in ( + *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), + start_signature, + *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), + )[-(_MAX_SIGNATURE_AMOUNT):]: + signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + + if signature: + signatures.append(signature) + + return signatures + + +def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collection[str]]: + """ + Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`. + + If the signatures need to be truncated, parameters are collapsed until they fit withing the limit. + Individual signatures can consist of max 1, 2, ..., `_MAX_SIGNATURE_AMOUNT` lines of text, + inversely proportional to the amount of signatures. + A maximum of `_MAX_SIGNATURE_AMOUNT` signatures is assumed to be passed. + """ + if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH: + return signatures + + max_signature_length = _EMBED_CODE_BLOCK_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) + formatted_signatures = [] + for signature in signatures: + signature = signature.strip() + if len(signature) > max_signature_length: + if (parameters_match := _PARAMETERS_RE.search(signature)) is None: + formatted_signatures.append(textwrap.shorten(signature, max_signature_length)) + continue + + truncated_signature = [] + parameters_string = parameters_match[1] + running_length = len(signature) - len(parameters_string) + for parameter in _split_parameters(parameters_string): + if (len(parameter) + running_length) <= max_signature_length - 4: # account for comma and placeholder + truncated_signature.append(parameter) + running_length += len(parameter) + 1 + else: + truncated_signature.append(" ...") + formatted_signatures.append(signature.replace(parameters_string, ",".join(truncated_signature))) + break + else: + formatted_signatures.append(signature) + + return formatted_signatures + + +def _get_truncated_description( + elements: Iterable[Union[Tag, NavigableString]], + markdown_converter: DocMarkdownConverter, + max_length: int, +) -> str: + """ + Truncate markdown from `elements` to be at most `max_length` characters visually. 
+ + `max_length` limits the length of the rendered characters in the string, + with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits + """ + visual_length = 0 + real_length = 0 + result = [] + shortened = False + + for element in elements: + is_tag = isinstance(element, Tag) + element_length = len(element.text) if is_tag else len(element) + if visual_length + element_length < max_length: + if is_tag: + element_markdown = markdown_converter.process_tag(element) + else: + element_markdown = markdown_converter.process_text(element) + + element_markdown_length = len(element_markdown) + if real_length + element_markdown_length < _MAX_DESCRIPTION_LENGTH: + result.append(element_markdown) + else: + shortened = True + break + real_length += element_markdown_length + visual_length += element_length + else: + shortened = True + break + + markdown_string = "".join(result) + if shortened: + markdown_string = markdown_string.rstrip(_TRUNCATE_STRIP_CHARACTERS) + "..." + return markdown_string + + +def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: + """ + Create a markdown string with the signatures at the top, and the converted html description below them. + + The signatures are wrapped in python codeblocks, separated from the description by a newline. + The result string is truncated to be max 1000 symbols long. + """ + description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750) + description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) + if signatures is not None: + formatted_markdown = "".join(f"```py\n{signature}```" for signature in _truncate_signatures(signatures)) + else: + formatted_markdown = "" + formatted_markdown += f"\n{description}" + + return formatted_markdown + + +def _match_end_tag(tag: Tag) -> bool: + """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" + for attr in _SEARCH_END_TAG_ATTRS: + if attr in tag.get("class", ()): + return True + + return tag.name == "table" + + +def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: + """ + Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. + + The method of parsing and what information gets included depends on the symbol's group. + """ + symbol_heading = soup.find(id=symbol_data.symbol_id) + if symbol_heading is None: + log.warning("Symbol present in loaded inventories not found on site, consider refreshing inventories.") + return "Unable to parse the requested symbol." + signature = None + # Modules, doc pages and labels don't point to description list tags but to tags like divs, + # no special parsing can be done so we only try to include what's under them. + if symbol_data.group in {"module", "doc", "label"}: + description = _get_general_description(symbol_heading) + + elif symbol_heading.name != "dt": + # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags, + # log info the tag can be looked at. 
+ description = _get_general_description(symbol_heading) + + elif symbol_data.group in _NO_SIGNATURE_GROUPS: + description = _get_dd_description(symbol_heading) + + else: + signature = _get_signatures(symbol_heading) + description = _get_dd_description(symbol_heading) + return _parse_into_markdown(signature, description, symbol_data.url).replace('¶', '') diff --git a/tests/bot/test_converters.py b/tests/bot/test_converters.py index c42111f3f..231798a92 100644 --- a/tests/bot/test_converters.py +++ b/tests/bot/test_converters.py @@ -10,9 +10,9 @@ from bot.converters import ( Duration, HushDurationConverter, ISODateTime, + PackageName, TagContentConverter, TagNameConverter, - ValidPythonIdentifier, ) @@ -78,24 +78,23 @@ class ConverterTests(unittest.IsolatedAsyncioTestCase): with self.assertRaisesRegex(BadArgument, re.escape(exception_message)): await TagNameConverter.convert(self.context, invalid_name) - async def test_valid_python_identifier_for_valid(self): - """ValidPythonIdentifier returns valid identifiers unchanged.""" - test_values = ('foo', 'lemon') + async def test_package_name_for_valid(self): + """PackageName returns valid package names unchanged.""" + test_values = ('foo', 'le_mon') for name in test_values: with self.subTest(identifier=name): - conversion = await ValidPythonIdentifier.convert(self.context, name) + conversion = await PackageName.convert(self.context, name) self.assertEqual(name, conversion) - async def test_valid_python_identifier_for_invalid(self): - """ValidPythonIdentifier raises the proper exception for invalid identifiers.""" - test_values = ('nested.stuff', '#####') + async def test_package_name_for_invalid(self): + """PackageName raises the proper exception for invalid package names.""" + test_values = ('text_with_a_dot.', 'UpperCaseName', "num83r") for name in test_values: with self.subTest(identifier=name): - exception_message = f'`{name}` is not a valid Python identifier' - with self.assertRaisesRegex(BadArgument, re.escape(exception_message)): - await ValidPythonIdentifier.convert(self.context, name) + with self.assertRaises(BadArgument): + await PackageName.convert(self.context, name) async def test_duration_converter_for_valid(self): """Duration returns the correct `datetime` for valid duration strings.""" |
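
Notes with small runnable sketches follow; every helper name and sample input in them is illustrative and not part of the diff above.

The new PackageName converter in bot/converters.py validates with the single character-class pattern shown in the diff. A standalone sketch using that pattern and the values from tests/bot/test_converters.py (the is_valid_package_name helper is hypothetical):

import re

PACKAGE_NAME_RE = re.compile(r"[^a-z_]")  # same pattern as bot/converters.py

def is_valid_package_name(name: str) -> bool:  # illustrative helper, not in the diff
    # A name is valid only if no character outside a-z and _ appears in it.
    return PACKAGE_NAME_RE.search(name) is None

assert is_valid_package_name("le_mon")
for invalid in ("text_with_a_dot.", "UpperCaseName", "num83r"):
    assert not is_valid_package_name(invalid)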
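
For context on _fetch_inventory in _inventory_parser.py: it takes the inventory version from the last character of the header line and checks the fourth line for "zlib" before handing the stream to _load_v2. The first four lines of a v2 objects.inv file have this shape (project name and version here are placeholders):

# Sphinx inventory version 2
# Project: Python
# Version: 3.9
# The remainder of this file is compressed using zlib.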
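
The v2 inventory format parsed by _load_v2 packs five whitespace-separated fields per line, with a trailing "$" in the location standing for an anchor equal to the symbol name. A sketch against one hypothetical decompressed line (the sample line is an assumption, not taken from a real inventory):

import re

_V2_LINE_RE = re.compile(r'(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+?(\S*)\s+(.*)')

line = "asyncio.sleep py:function 1 library/asyncio-task.html#$ -"  # hypothetical sample
name, type_, _prio, location, _dispname = _V2_LINE_RE.match(line).groups()
if location.endswith("$"):
    # "$" is intersphinx shorthand for "the anchor is the same as the name".
    location = location[:-1] + name
print(type_, name, location)
# py:function asyncio.sleep library/asyncio-task.html#asyncio.sleep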
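
ZlibStreamReader's incremental decode is the standard decompressobj pattern; a synchronous sketch of the same buffering and line-splitting logic, purely for illustration:

import zlib

def iter_inventory_lines(chunks):
    """Decompress zlib chunks incrementally and yield complete text lines."""
    decompressor = zlib.decompressobj()
    buf = b""
    for chunk in chunks:
        buf += decompressor.decompress(chunk)
        # Emit every full line currently sitting in the buffer.
        while (pos := buf.find(b"\n")) != -1:
            yield buf[:pos].decode()
            buf = buf[pos + 1:]
    buf += decompressor.flush()
    if buf:
        yield buf.decode()

data = zlib.compress(b"first line\nsecond line\n")
print(list(iter_inventory_lines([data[:5], data[5:]])))
# ['first line', 'second line']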
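
Putting CachedParser's contract from _cog.py in one place, a hypothetical driver (assumes the bot package is importable and the DocItem's page is reachable; only the first get_markdown call for a page performs a web request, later calls are answered from the parsed-result cache):

import aiohttp
from bot.exts.info.doc._cog import CachedParser, DocItem

async def fetch_twice(item: DocItem) -> str:
    parser = CachedParser()
    parser.add_item(item)  # only records the symbol under its page URL, no I/O yet
    async with aiohttp.ClientSession() as session:
        first = await parser.get_markdown(session, item)   # fetches the page, queues every symbol on it
        second = await parser.get_markdown(session, item)  # served from parser._results
    assert first == second
    return first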
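
QueueItem overrides __eq__ so a DocItem compares equal to the queue entry wrapping it, which is what lets _move_to_front call list.index with a plain DocItem. A condensed illustration (the empty BeautifulSoup is a stand-in for a real page soup; note the comparison order, since the override lives on QueueItem):

from bs4 import BeautifulSoup
from bot.exts.info.doc._cog import DocItem, QueueItem

item = DocItem("python", "function", "https://docs.python.org/3/", "library/functions.html", "print")
entry = QueueItem(item, BeautifulSoup("", "lxml"))
assert entry == item            # the QueueItem matches the DocItem it wraps
assert [entry].index(item) == 0  # so list.index works with either type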
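
bs4's find_all accepts a SoupStrainer as its name argument, which is how _find_elements_until_tag in _parsing.py feeds the Strainer subclass from _html.py into BeautifulSoup.find_all. A sketch of the effect (assuming lxml is installed, as elsewhere in the cog):

from bs4 import BeautifulSoup
from bot.exts.info.doc._html import Strainer

soup = BeautifulSoup("<p>intro<b>mid</b>tail</p>", "lxml")
# A stock SoupStrainer drops bare strings; include_strings keeps the tag's
# mixed text/tag children in document order.
print(soup.p.find_all(Strainer(include_strings=True), recursive=False))
# ['intro', <b>mid</b>, 'tail']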
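
The convert_li override in _markdown.py derives the ordinal from the li tag's position in its ol parent rather than trusting markdownify's counter. A quick check (exact whitespace in the output may differ slightly between markdownify versions):

from bot.exts.info.doc._markdown import DocMarkdownConverter

converter = DocMarkdownConverter(bullets="•", page_url="https://docs.python.org/3/")
print(converter.convert("<ol><li>first</li><li>second</li></ol>"))
# 1. first
# 2. second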
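
The depth tracking in _split_parameters protects commas inside brackets and quoted strings from being treated as parameter separators. A behaviour check, traced by hand against the algorithm (note the leading spaces are kept, since the function splits without stripping):

from bot.exts.info.doc._parsing import _split_parameters

print(_split_parameters("arg1, arg2='x,y', arg3=(1, 2)"))
# ['arg1', " arg2='x,y'", ' arg3=(1, 2)']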
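
Finally, the length budget in _parsing.py works out as below; the arithmetic only restates the module constants, and reading the +8 as per-signature codeblock fence overhead is my interpretation of the source comment:

_MAX_SIGNATURE_AMOUNT = 3
_EMBED_CODE_BLOCK_LENGTH = 61
# Each signature is rendered inside its own ```py ... ``` block; the +8
# accounts for that per-signature wrapping overhead.
_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT
assert _MAX_SIGNATURES_LENGTH == 207
# Descriptions get whatever remains of the 2000-character embed limit.
_MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH
assert _MAX_DESCRIPTION_LENGTH == 1793
# In _truncate_signatures, fewer signatures may each use more characters:
for count in (1, 2, 3):
    print(count, _EMBED_CODE_BLOCK_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - count))
# 1 signature -> 183 chars each, 2 -> 122, 3 -> 61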