| -rw-r--r-- | LICENSE-THIRD-PARTY                    |  32 |
| -rw-r--r-- | Pipfile                                |   2 |
| -rw-r--r-- | bot/converters.py                      |  42 |
| -rw-r--r-- | bot/exts/info/doc.py                   | 485 |
| -rw-r--r-- | bot/exts/info/doc/__init__.py          |   7 |
| -rw-r--r-- | bot/exts/info/doc/_cog.py              | 449 |
| -rw-r--r-- | bot/exts/info/doc/_html.py             |  33 |
| -rw-r--r-- | bot/exts/info/doc/_inventory_parser.py | 120 |
| -rw-r--r-- | bot/exts/info/doc/_markdown.py         |  53 |
| -rw-r--r-- | bot/exts/info/doc/_parsing.py          | 312 |
| -rw-r--r-- | bot/exts/info/doc/_redis_cache.py      |  23 |
| -rw-r--r-- | tests/bot/test_converters.py           |  21 |
12 files changed, 1069 insertions, 510 deletions
diff --git a/LICENSE-THIRD-PARTY b/LICENSE-THIRD-PARTY
index eacd9b952..d454070c2 100644
--- a/LICENSE-THIRD-PARTY
+++ b/LICENSE-THIRD-PARTY
@@ -35,6 +35,38 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ---------------------------------------------------------------------------------------------------
+                                       BSD 2-Clause License
+Applies to:
+    - bot/exts/info/doc/_inventory_parser.py: _load_v1, _load_v2 and ZlibStreamReader.__aiter__.
+---------------------------------------------------------------------------------------------------
+
+Copyright (c) 2007-2020 by the Sphinx team (see AUTHORS file).
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+---------------------------------------------------------------------------------------------------
                            PYTHON SOFTWARE FOUNDATION LICENSE VERSION 2
 Applies to:
     - Copyright © 2001-2020 Python Software Foundation. All rights reserved.
diff --git a/Pipfile b/Pipfile
--- a/Pipfile
+++ b/Pipfile
@@ -22,9 +22,7 @@ markdownify = "~=0.4"
 more_itertools = "~=8.2"
 python-dateutil = "~=2.8"
 pyyaml = "~=5.1"
-requests = "~=2.22"
 sentry-sdk = "~=0.14"
-sphinx = "~=2.2"
 statsd = "~=3.3"
 
 [dev-packages]
diff --git a/bot/converters.py b/bot/converters.py
index 2e118d476..3066eaabb 100644
--- a/bot/converters.py
+++ b/bot/converters.py
@@ -15,6 +15,7 @@ from discord.utils import DISCORD_EPOCH, snowflake_time
 
 from bot.api import ResponseCodeError
 from bot.constants import URLs
+from bot.exts.info.doc import _inventory_parser
 from bot.utils.regex import INVITE_RE
 
 log = logging.getLogger(__name__)
@@ -126,22 +127,20 @@ class ValidFilterListType(Converter):
         return list_type
 
 
-class ValidPythonIdentifier(Converter):
+class PackageName(Converter):
     """
-    A converter that checks whether the given string is a valid Python identifier.
+    A converter that checks whether the given string is a valid package name.
 
-    This is used to have package names that correspond to how you would use the package in your
-    code, e.g. `import package`.
-
-    Raises `BadArgument` if the argument is not a valid Python identifier, and simply passes through
-    the given argument otherwise.
+    Package names are used for stats and are restricted to the a-z and _ characters.
     """
 
-    @staticmethod
-    async def convert(ctx: Context, argument: str) -> str:
-        """Checks whether the given string is a valid Python identifier."""
-        if not argument.isidentifier():
-            raise BadArgument(f"`{argument}` is not a valid Python identifier")
+    PACKAGE_NAME_RE = re.compile(r"[^a-z_]")
+
+    @classmethod
+    async def convert(cls, ctx: Context, argument: str) -> str:
+        """Checks whether the given string is a valid package name."""
+        if cls.PACKAGE_NAME_RE.search(argument):
+            raise BadArgument("The provided package name is not valid; please only use the _ and a-z characters.")
         return argument
@@ -177,6 +176,25 @@ class ValidURL(Converter):
         return url
 
 
+class InventoryURL(Converter):
+    """
+    Represents an Intersphinx inventory URL.
+
+    This converter checks whether intersphinx accepts the given inventory URL, and raises
+    `BadArgument` if that is not the case.
+
+    Otherwise, it simply passes through the given URL.
+    """
+
+    @staticmethod
+    async def convert(ctx: Context, url: str) -> str:
+        """Convert url to Intersphinx inventory URL."""
+        await ctx.trigger_typing()
+        if await _inventory_parser.fetch_inventory(ctx.bot.http_session, url) is None:
+            raise BadArgument(f"Failed to fetch inventory file after {_inventory_parser.FAILED_REQUEST_ATTEMPTS} attempts.")
+        return url
+
+
 class Snowflake(IDConverter):
     """
     Converts to an int if the argument is a valid Discord snowflake.
diff --git a/bot/exts/info/doc.py b/bot/exts/info/doc.py
deleted file mode 100644
index 7ec8caa4b..000000000
--- a/bot/exts/info/doc.py
+++ /dev/null
@@ -1,485 +0,0 @@
-import asyncio
-import functools
-import logging
-import re
-import textwrap
-from contextlib import suppress
-from types import SimpleNamespace
-from typing import Optional, Tuple
-
-import discord
-from bs4 import BeautifulSoup
-from bs4.element import PageElement, Tag
-from discord.errors import NotFound
-from discord.ext import commands
-from markdownify import MarkdownConverter
-from requests import ConnectTimeout, ConnectionError, HTTPError
-from sphinx.ext import intersphinx
-from urllib3.exceptions import ProtocolError
-
-from bot.bot import Bot
-from bot.constants import MODERATION_ROLES, RedirectOutput
-from bot.converters import ValidPythonIdentifier, ValidURL
-from bot.pagination import LinePaginator
-from bot.utils.cache import AsyncCache
-from bot.utils.messages import wait_for_deletion
-
-
-log = logging.getLogger(__name__)
-logging.getLogger('urllib3').setLevel(logging.WARNING)
-
-# Since Intersphinx is intended to be used with Sphinx,
-# we need to mock its configuration.
-SPHINX_MOCK_APP = SimpleNamespace( -    config=SimpleNamespace( -        intersphinx_timeout=3, -        tls_verify=True, -        user_agent="python3:python-discord/bot:1.0.0" -    ) -) - -NO_OVERRIDE_GROUPS = ( -    "2to3fixer", -    "token", -    "label", -    "pdbcommand", -    "term", -) -NO_OVERRIDE_PACKAGES = ( -    "python", -) - -SEARCH_END_TAG_ATTRS = ( -    "data", -    "function", -    "class", -    "exception", -    "seealso", -    "section", -    "rubric", -    "sphinxsidebar", -) -UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") -WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") - -FAILED_REQUEST_RETRY_AMOUNT = 3 -NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay - -symbol_cache = AsyncCache() - - -class DocMarkdownConverter(MarkdownConverter): -    """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" - -    def convert_code(self, el: PageElement, text: str) -> str: -        """Undo `markdownify`s underscore escaping.""" -        return f"`{text}`".replace('\\', '') - -    def convert_pre(self, el: PageElement, text: str) -> str: -        """Wrap any codeblocks in `py` for syntax highlighting.""" -        code = ''.join(el.strings) -        return f"```py\n{code}```" - - -def markdownify(html: str) -> DocMarkdownConverter: -    """Create a DocMarkdownConverter object from the input html.""" -    return DocMarkdownConverter(bullets='•').convert(html) - - -class InventoryURL(commands.Converter): -    """ -    Represents an Intersphinx inventory URL. - -    This converter checks whether intersphinx accepts the given inventory URL, and raises -    `BadArgument` if that is not the case. - -    Otherwise, it simply passes through the given URL. -    """ - -    @staticmethod -    async def convert(ctx: commands.Context, url: str) -> str: -        """Convert url to Intersphinx inventory URL.""" -        try: -            intersphinx.fetch_inventory(SPHINX_MOCK_APP, '', url) -        except AttributeError: -            raise commands.BadArgument(f"Failed to fetch Intersphinx inventory from URL `{url}`.") -        except ConnectionError: -            if url.startswith('https'): -                raise commands.BadArgument( -                    f"Cannot establish a connection to `{url}`. Does it support HTTPS?" -                ) -            raise commands.BadArgument(f"Cannot connect to host with URL `{url}`.") -        except ValueError: -            raise commands.BadArgument( -                f"Failed to read Intersphinx inventory from URL `{url}`. " -                "Are you sure that it's a valid inventory file?" -            ) -        return url - - -class Doc(commands.Cog): -    """A set of commands for querying & displaying documentation.""" - -    def __init__(self, bot: Bot): -        self.base_urls = {} -        self.bot = bot -        self.inventories = {} -        self.renamed_symbols = set() - -        self.bot.loop.create_task(self.init_refresh_inventory()) - -    async def init_refresh_inventory(self) -> None: -        """Refresh documentation inventory on cog initialization.""" -        await self.bot.wait_until_guild_available() -        await self.refresh_inventory() - -    async def update_single( -        self, package_name: str, base_url: str, inventory_url: str -    ) -> None: -        """ -        Rebuild the inventory for a single package. 
- -        Where: -            * `package_name` is the package name to use, appears in the log -            * `base_url` is the root documentation URL for the specified package, used to build -                absolute paths that link to specific symbols -            * `inventory_url` is the absolute URL to the intersphinx inventory, fetched by running -                `intersphinx.fetch_inventory` in an executor on the bot's event loop -        """ -        self.base_urls[package_name] = base_url - -        package = await self._fetch_inventory(inventory_url) -        if not package: -            return None - -        for group, value in package.items(): -            for symbol, (package_name, _version, relative_doc_url, _) in value.items(): -                absolute_doc_url = base_url + relative_doc_url - -                if symbol in self.inventories: -                    group_name = group.split(":")[1] -                    symbol_base_url = self.inventories[symbol].split("/", 3)[2] -                    if ( -                        group_name in NO_OVERRIDE_GROUPS -                        or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) -                    ): - -                        symbol = f"{group_name}.{symbol}" -                        # If renamed `symbol` already exists, add library name in front to differentiate between them. -                        if symbol in self.renamed_symbols: -                            # Split `package_name` because of packages like Pillow that have spaces in them. -                            symbol = f"{package_name.split()[0]}.{symbol}" - -                        self.inventories[symbol] = absolute_doc_url -                        self.renamed_symbols.add(symbol) -                        continue - -                self.inventories[symbol] = absolute_doc_url - -        log.trace(f"Fetched inventory for {package_name}.") - -    async def refresh_inventory(self) -> None: -        """Refresh internal documentation inventory.""" -        log.debug("Refreshing documentation inventory...") - -        # Clear the old base URLS and inventories to ensure -        # that we start from a fresh local dataset. -        # Also, reset the cache used for fetching documentation. -        self.base_urls.clear() -        self.inventories.clear() -        self.renamed_symbols.clear() -        symbol_cache.clear() - -        # Run all coroutines concurrently - since each of them performs a HTTP -        # request, this speeds up fetching the inventory data heavily. -        coros = [ -            self.update_single( -                package["package"], package["base_url"], package["inventory_url"] -            ) for package in await self.bot.api_client.get('bot/documentation-links') -        ] -        await asyncio.gather(*coros) - -    async def get_symbol_html(self, symbol: str) -> Optional[Tuple[list, str]]: -        """ -        Given a Python symbol, return its signature and description. - -        The first tuple element is the signature of the given symbol as a markup-free string, and -        the second tuple element is the description of the given symbol with HTML markup included. - -        If the given symbol is a module, returns a tuple `(None, str)` -        else if the symbol could not be found, returns `None`. 
-        """ -        url = self.inventories.get(symbol) -        if url is None: -            return None - -        async with self.bot.http_session.get(url) as response: -            html = await response.text(encoding='utf-8') - -        # Find the signature header and parse the relevant parts. -        symbol_id = url.split('#')[-1] -        soup = BeautifulSoup(html, 'lxml') -        symbol_heading = soup.find(id=symbol_id) -        search_html = str(soup) - -        if symbol_heading is None: -            return None - -        if symbol_id == f"module-{symbol}": -            # Get page content from the module headerlink to the -            # first tag that has its class in `SEARCH_END_TAG_ATTRS` -            start_tag = symbol_heading.find("a", attrs={"class": "headerlink"}) -            if start_tag is None: -                return [], "" - -            end_tag = start_tag.find_next(self._match_end_tag) -            if end_tag is None: -                return [], "" - -            description_start_index = search_html.find(str(start_tag.parent)) + len(str(start_tag.parent)) -            description_end_index = search_html.find(str(end_tag)) -            description = search_html[description_start_index:description_end_index] -            signatures = None - -        else: -            signatures = [] -            description = str(symbol_heading.find_next_sibling("dd")) -            description_pos = search_html.find(description) -            # Get text of up to 3 signatures, remove unwanted symbols -            for element in [symbol_heading] + symbol_heading.find_next_siblings("dt", limit=2): -                signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) -                if signature and search_html.find(str(element)) < description_pos: -                    signatures.append(signature) - -        return signatures, description.replace('¶', '') - -    @symbol_cache(arg_offset=1) -    async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: -        """ -        Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. - -        If the symbol is known, an Embed with documentation about it is returned. -        """ -        scraped_html = await self.get_symbol_html(symbol) -        if scraped_html is None: -            return None - -        signatures = scraped_html[0] -        permalink = self.inventories[symbol] -        description = markdownify(scraped_html[1]) - -        # Truncate the description of the embed to the last occurrence -        # of a double newline (interpreted as a paragraph) before index 1000. -        if len(description) > 1000: -            shortened = description[:1000] -            description_cutoff = shortened.rfind('\n\n', 100) -            if description_cutoff == -1: -                # Search the shortened version for cutoff points in decreasing desirability, -                # cutoff at 1000 if none are found. -                for string in (". 
", ", ", ",", " "): -                    description_cutoff = shortened.rfind(string) -                    if description_cutoff != -1: -                        break -                else: -                    description_cutoff = 1000 -            description = description[:description_cutoff] - -            # If there is an incomplete code block, cut it out -            if description.count("```") % 2: -                codeblock_start = description.rfind('```py') -                description = description[:codeblock_start].rstrip() -            description += f"... [read more]({permalink})" - -        description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) -        if signatures is None: -            # If symbol is a module, don't show signature. -            embed_description = description - -        elif not signatures: -            # It's some "meta-page", for example: -            # https://docs.djangoproject.com/en/dev/ref/views/#module-django.views -            embed_description = "This appears to be a generic page not tied to a specific symbol." - -        else: -            embed_description = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) -            embed_description += f"\n{description}" - -        embed = discord.Embed( -            title=f'`{symbol}`', -            url=permalink, -            description=embed_description -        ) -        # Show all symbols with the same name that were renamed in the footer. -        embed.set_footer( -            text=", ".join(renamed for renamed in self.renamed_symbols - {symbol} if renamed.endswith(f".{symbol}")) -        ) -        return embed - -    @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) -    async def docs_group(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: -        """Lookup documentation for Python symbols.""" -        await self.get_command(ctx, symbol) - -    @docs_group.command(name='get', aliases=('g',)) -    async def get_command(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: -        """ -        Return a documentation embed for a given symbol. - -        If no symbol is given, return a list of all available inventories. - -        Examples: -            !docs -            !docs aiohttp -            !docs aiohttp.ClientSession -            !docs get aiohttp.ClientSession -        """ -        if symbol is None: -            inventory_embed = discord.Embed( -                title=f"All inventories (`{len(self.base_urls)}` total)", -                colour=discord.Colour.blue() -            ) - -            lines = sorted(f"• [`{name}`]({url})" for name, url in self.base_urls.items()) -            if self.base_urls: -                await LinePaginator.paginate(lines, ctx, inventory_embed, max_size=400, empty=False) - -            else: -                inventory_embed.description = "Hmmm, seems like there's nothing here yet." -                await ctx.send(embed=inventory_embed) - -        else: -            # Fetching documentation for a symbol (at least for the first time, since -            # caching is used) takes quite some time, so let's send typing to indicate -            # that we got the command, but are still working on it. 
-            async with ctx.typing(): -                doc_embed = await self.get_symbol_embed(symbol) - -            if doc_embed is None: -                error_embed = discord.Embed( -                    description=f"Sorry, I could not find any documentation for `{symbol}`.", -                    colour=discord.Colour.red() -                ) -                error_message = await ctx.send(embed=error_embed) -                with suppress(NotFound): -                    await error_message.delete(delay=NOT_FOUND_DELETE_DELAY) -                    await ctx.message.delete(delay=NOT_FOUND_DELETE_DELAY) -            else: -                msg = await ctx.send(embed=doc_embed) -                await wait_for_deletion(msg, (ctx.author.id,), client=self.bot) - -    @docs_group.command(name='set', aliases=('s',)) -    @commands.has_any_role(*MODERATION_ROLES) -    async def set_command( -        self, ctx: commands.Context, package_name: ValidPythonIdentifier, -        base_url: ValidURL, inventory_url: InventoryURL -    ) -> None: -        """ -        Adds a new documentation metadata object to the site's database. - -        The database will update the object, should an existing item with the specified `package_name` already exist. - -        Example: -            !docs set \ -                    python \ -                    https://docs.python.org/3/ \ -                    https://docs.python.org/3/objects.inv -        """ -        body = { -            'package': package_name, -            'base_url': base_url, -            'inventory_url': inventory_url -        } -        await self.bot.api_client.post('bot/documentation-links', json=body) - -        log.info( -            f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n" -            f"Package name: {package_name}\n" -            f"Base url: {base_url}\n" -            f"Inventory URL: {inventory_url}" -        ) - -        # Rebuilding the inventory can take some time, so lets send out a -        # typing event to show that the Bot is still working. -        async with ctx.typing(): -            await self.refresh_inventory() -        await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") - -    @docs_group.command(name='delete', aliases=('remove', 'rm', 'd')) -    @commands.has_any_role(*MODERATION_ROLES) -    async def delete_command(self, ctx: commands.Context, package_name: ValidPythonIdentifier) -> None: -        """ -        Removes the specified package from the database. - -        Examples: -            !docs delete aiohttp -        """ -        await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') - -        async with ctx.typing(): -            # Rebuild the inventory to ensure that everything -            # that was from this package is properly deleted. 
-            await self.refresh_inventory() -        await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") - -    @docs_group.command(name="refresh", aliases=("rfsh", "r")) -    @commands.has_any_role(*MODERATION_ROLES) -    async def refresh_command(self, ctx: commands.Context) -> None: -        """Refresh inventories and send differences to channel.""" -        old_inventories = set(self.base_urls) -        with ctx.typing(): -            await self.refresh_inventory() -        # Get differences of added and removed inventories -        added = ', '.join(inv for inv in self.base_urls if inv not in old_inventories) -        if added: -            added = f"+ {added}" - -        removed = ', '.join(inv for inv in old_inventories if inv not in self.base_urls) -        if removed: -            removed = f"- {removed}" - -        embed = discord.Embed( -            title="Inventories refreshed", -            description=f"```diff\n{added}\n{removed}```" if added or removed else "" -        ) -        await ctx.send(embed=embed) - -    async def _fetch_inventory(self, inventory_url: str) -> Optional[dict]: -        """Get and return inventory from `inventory_url`. If fetching fails, return None.""" -        fetch_func = functools.partial(intersphinx.fetch_inventory, SPHINX_MOCK_APP, '', inventory_url) -        for retry in range(1, FAILED_REQUEST_RETRY_AMOUNT+1): -            try: -                package = await self.bot.loop.run_in_executor(None, fetch_func) -            except ConnectTimeout: -                log.error( -                    f"Fetching of inventory {inventory_url} timed out," -                    f" trying again. ({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" -                ) -            except ProtocolError: -                log.error( -                    f"Connection lost while fetching inventory {inventory_url}," -                    f" trying again. 
({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" -                ) -            except HTTPError as e: -                log.error(f"Fetching of inventory {inventory_url} failed with status code {e.response.status_code}.") -                return None -            except ConnectionError: -                log.error(f"Couldn't establish connection to inventory {inventory_url}.") -                return None -            else: -                return package -        log.error(f"Fetching of inventory {inventory_url} failed.") -        return None - -    @staticmethod -    def _match_end_tag(tag: Tag) -> bool: -        """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" -        for attr in SEARCH_END_TAG_ATTRS: -            if attr in tag.get("class", ()): -                return True - -        return tag.name == "table" - - -def setup(bot: Bot) -> None: -    """Load the Doc cog.""" -    bot.add_cog(Doc(bot)) diff --git a/bot/exts/info/doc/__init__.py b/bot/exts/info/doc/__init__.py new file mode 100644 index 000000000..e9eb9428c --- /dev/null +++ b/bot/exts/info/doc/__init__.py @@ -0,0 +1,7 @@ +from bot.bot import Bot +from ._cog import DocCog + + +def setup(bot: Bot) -> None: +    """Load the Doc cog.""" +    bot.add_cog(DocCog(bot)) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py new file mode 100644 index 000000000..25477fe07 --- /dev/null +++ b/bot/exts/info/doc/_cog.py @@ -0,0 +1,449 @@ +from __future__ import annotations + +import asyncio +import logging +import re +import sys +import urllib.parse +from collections import defaultdict +from contextlib import suppress +from typing import Dict, List, NamedTuple, Optional, Union + +import discord +from aiohttp import ClientSession +from bs4 import BeautifulSoup +from discord.ext import commands + +from bot.bot import Bot +from bot.constants import MODERATION_ROLES, RedirectOutput +from bot.converters import InventoryURL, PackageName, ValidURL +from bot.pagination import LinePaginator +from bot.utils.messages import wait_for_deletion +from bot.utils.scheduling import Scheduler +from ._inventory_parser import fetch_inventory +from ._parsing import get_symbol_markdown +from ._redis_cache import DocRedisCache + +log = logging.getLogger(__name__) + +# symbols with a group contained here will get the group prefixed on duplicates +FORCE_PREFIX_GROUPS = ( +    "2to3fixer", +    "token", +    "label", +    "pdbcommand", +    "term", +) +WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") +NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay + + +class DocItem(NamedTuple): +    """Holds inventory symbol information.""" + +    package: str +    group: str +    base_url: str +    relative_url_path: str +    symbol_id: str + +    @property +    def url(self) -> str: +        """Return the absolute url to the symbol.""" +        return "".join((self.base_url, self.relative_url_path)) + + +class QueueItem(NamedTuple): +    """Contains a symbol and the BeautifulSoup object needed to parse it.""" + +    symbol: DocItem +    soup: BeautifulSoup + +    def __eq__(self, other: Union[QueueItem, DocItem]): +        if isinstance(other, DocItem): +            return self.symbol == other +        return NamedTuple.__eq__(self, other) + + +class CachedParser: +    """ +    Get symbol markdown from pages with smarter caching. + +    DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict. 
+    `get_markdown` is used to fetch the markdown; when this is used for the first time on a page,
+    all of the symbols are queued to be parsed to avoid multiple web requests to the same page.
+    """
+
+    def __init__(self):
+        self._queue: List[QueueItem] = []
+        self._results = {}
+        self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list)
+        self._item_events: Dict[DocItem, asyncio.Event] = {}
+        self._parse_task = None
+
+    async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str:
+        """
+        Get result markdown of `doc_item`.
+
+        If no symbols were fetched from `doc_item`'s page before,
+        the HTML has to be fetched before parsing can be queued.
+        """
+        if (symbol := self._results.get(doc_item)) is not None:
+            return symbol
+
+        if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None:
+            async with client_session.get(doc_item.url) as response:
+                soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")
+
+            self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue)
+            del self._page_symbols[doc_item.url]
+            log.debug(f"Added symbols from {doc_item.url} to parse queue.")
+
+            if self._parse_task is None:
+                self._parse_task = asyncio.create_task(self._parse_queue())
+
+        self._move_to_front(doc_item)
+        self._item_events[doc_item] = item_event = asyncio.Event()
+        await item_event.wait()
+        return self._results[doc_item]
+
+    async def _parse_queue(self) -> None:
+        """
+        Parse all items from the queue, setting associated events for symbols if present.
+
+        The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished.
+        """
+        log.trace("Starting queue parsing.")
+        while self._queue:
+            item, soup = self._queue.pop()
+            try:
+                self._results[item] = get_symbol_markdown(soup, item)
+            except Exception:
+                log.exception(f"Unexpected error when handling {item}")
+            else:
+                if (event := self._item_events.get(item)) is not None:
+                    event.set()
+            await asyncio.sleep(0.1)
+
+        self._parse_task = None
+        log.trace("Finished parsing queue.")
+
+    def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None:
+        """Move `item` to the front of the parse queue."""
+        # The parse queue stores soups along with the doc symbols in QueueItem objects,
+        # in case we're moving a DocItem we have to get the associated QueueItem first and then move it.
+        item_index = self._queue.index(item)
+        queue_item = self._queue[item_index]
+
+        del self._queue[item_index]
+        self._queue.append(queue_item)
+
+    def add_item(self, doc_item: DocItem) -> None:
+        """Add a DocItem to `_page_symbols`."""
+        self._page_symbols[doc_item.url].append(doc_item)
+
+    async def clear(self) -> None:
+        """
+        Clear all internal symbol data.
+
+        Wait for all currently requested items to be parsed before clearing.
+        """ +        for event in self._item_events.values(): +            await event.wait() +        if self._parse_task is not None: +            self._parse_task.cancel() +            self._parse_task = None +        self._queue.clear() +        self._results.clear() +        self._page_symbols.clear() +        self._item_events.clear() + + +class DocCog(commands.Cog): +    """A set of commands for querying & displaying documentation.""" + +    doc_cache = DocRedisCache() + +    def __init__(self, bot: Bot): +        self.base_urls = {} +        self.bot = bot +        self.doc_symbols: Dict[str, DocItem] = {} +        self.item_fetcher = CachedParser() +        self.renamed_symbols = set() + +        self.inventory_scheduler = Scheduler(self.__class__.__name__) +        self.scheduled_inventories = set() + +        self.bot.loop.create_task(self.init_refresh_inventory()) + +    async def init_refresh_inventory(self) -> None: +        """Refresh documentation inventory on cog initialization.""" +        await self.bot.wait_until_guild_available() +        await self.refresh_inventory() + +    async def update_single( +        self, api_package_name: str, base_url: str, inventory_url: str +    ) -> bool: +        """ +        Rebuild the inventory for a single package. + +        Where: +            * `package_name` is the package name to use, appears in the log +            * `base_url` is the root documentation URL for the specified package, used to build +                absolute paths that link to specific symbols +            * `inventory_url` is the absolute URL to the intersphinx inventory. + +        If the inventory file is currently unreachable, +        the update is rescheduled to execute in 2 minutes on the first attempt, and 5 minutes on subsequent attempts. + +        Return True on success; False if fetching failed and was rescheduled. 
+        """ +        self.base_urls[api_package_name] = base_url +        package = await fetch_inventory(self.bot.http_session, inventory_url) + +        if not package: +            delay = 2*60 if inventory_url not in self.scheduled_inventories else 5*60 +            log.info(f"Failed to fetch inventory, attempting again in {delay//60} minutes.") +            self.inventory_scheduler.schedule_later( +                delay, +                api_package_name, +                fetch_inventory(self.bot.http_session, inventory_url) +            ) +            self.scheduled_inventories.add(api_package_name) +            return False +        with suppress(KeyError): +            self.scheduled_inventories.discard(api_package_name) + +        for group, items in package.items(): +            for symbol, relative_doc_url in items: +                if "/" in symbol: +                    continue  # skip unreachable symbols with slashes +                # Intern the group names since they're reused in all the DocItems +                # to remove unnecessary memory consumption from them being unique objects +                group_name = sys.intern(group.split(":")[1]) + +                if (original_symbol := self.doc_symbols.get(symbol)) is not None: +                    if group_name in FORCE_PREFIX_GROUPS: +                        symbol = f"{group_name}.{symbol}" +                        self.renamed_symbols.add(symbol) + +                    elif (overridden_symbol_group := original_symbol.group) in FORCE_PREFIX_GROUPS: +                        overridden_symbol = f"{overridden_symbol_group}.{symbol}" +                        if overridden_symbol in self.renamed_symbols: +                            overridden_symbol = f"{api_package_name}.{overridden_symbol}" + +                        self.doc_symbols[overridden_symbol] = original_symbol +                        self.renamed_symbols.add(overridden_symbol) + +                    else: +                        symbol = f"{api_package_name}.{symbol}" +                        self.renamed_symbols.add(symbol) + +                relative_url_path, _, symbol_id = relative_doc_url.partition("#") +                symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id) +                self.doc_symbols[symbol] = symbol_item +                self.item_fetcher.add_item(symbol_item) + +        log.trace(f"Fetched inventory for {api_package_name}.") +        return True + +    async def refresh_inventory(self) -> None: +        """Refresh internal documentation inventory.""" +        log.debug("Refreshing documentation inventory...") + +        # Clear the old base URLS and doc symbols to ensure +        # that we start from a fresh local dataset. +        # Also, reset the cache used for fetching documentation. +        self.base_urls.clear() +        self.doc_symbols.clear() +        self.renamed_symbols.clear() +        self.scheduled_inventories.clear() +        await self.item_fetcher.clear() + +        # Run all coroutines concurrently - since each of them performs a HTTP +        # request, this speeds up fetching the inventory data heavily. 
+        coros = [ +            self.update_single( +                package["package"], package["base_url"], package["inventory_url"] +            ) for package in await self.bot.api_client.get('bot/documentation-links') +        ] +        await asyncio.gather(*coros) + +    async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: +        """ +        Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. + +        If the symbol is known, an Embed with documentation about it is returned. + +        First check the DocRedisCache before querying the cog's `CachedParser`, +        if not present also create a redis entry for the symbol. +        """ +        log.trace(f"Building embed for symbol `{symbol}`") +        symbol_info = self.doc_symbols.get(symbol) +        if symbol_info is None: +            log.debug("Symbol does not exist.") +            return None +        self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") + +        item_url = f"{symbol_info.url}#{symbol_info.symbol_id}" +        redis_key = "".join(urllib.parse.urlparse(item_url)[1:])  # url without scheme + +        markdown = await self.doc_cache.get(redis_key) +        if markdown is None: +            log.debug(f"Redis cache miss for symbol `{symbol}`.") +            markdown = await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) +            await self.doc_cache.set(redis_key, markdown) + +        embed = discord.Embed( +            title=discord.utils.escape_markdown(symbol), +            url=item_url, +            description=markdown +        ) +        # Show all symbols with the same name that were renamed in the footer. +        embed.set_footer( +            text=", ".join(renamed for renamed in self.renamed_symbols - {symbol} if renamed.endswith(f".{symbol}")) +        ) +        return embed + +    @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) +    async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: +        """Lookup documentation for Python symbols.""" +        await ctx.invoke(self.get_command, symbol=symbol) + +    @docs_group.command(name='getdoc', aliases=('g',)) +    async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: +        """ +        Return a documentation embed for a given symbol. + +        If no symbol is given, return a list of all available inventories. + +        Examples: +            !docs +            !docs aiohttp +            !docs aiohttp.ClientSession +            !docs getdoc aiohttp.ClientSession +        """ +        if not symbol: +            inventory_embed = discord.Embed( +                title=f"All inventories (`{len(self.base_urls)}` total)", +                colour=discord.Colour.blue() +            ) + +            lines = sorted(f"• [`{name}`]({url})" for name, url in self.base_urls.items()) +            if self.base_urls: +                await LinePaginator.paginate(lines, ctx, inventory_embed, max_size=400, empty=False) + +            else: +                inventory_embed.description = "Hmmm, seems like there's nothing here yet." 
+                await ctx.send(embed=inventory_embed)
+
+        else:
+            symbol = symbol.strip("`")
+            # Fetching documentation for a symbol (at least for the first time, since
+            # caching is used) takes quite some time, so let's send typing to indicate
+            # that we got the command, but are still working on it.
+            async with ctx.typing():
+                doc_embed = await self.get_symbol_embed(symbol)
+
+            if doc_embed is None:
+                symbol = await discord.ext.commands.clean_content().convert(ctx, symbol)
+                error_embed = discord.Embed(
+                    description=f"Sorry, I could not find any documentation for `{symbol}`.",
+                    colour=discord.Colour.red()
+                )
+                error_message = await ctx.send(embed=error_embed)
+                await wait_for_deletion(
+                    error_message,
+                    (ctx.author.id,),
+                    timeout=NOT_FOUND_DELETE_DELAY,
+                    client=self.bot
+                )
+                with suppress(discord.NotFound):
+                    await ctx.message.delete()
+                with suppress(discord.NotFound):
+                    await error_message.delete()
+            else:
+                msg = await ctx.send(embed=doc_embed)
+                await wait_for_deletion(msg, (ctx.author.id,), client=self.bot)
+
+    @docs_group.command(name='setdoc', aliases=('s',))
+    @commands.has_any_role(*MODERATION_ROLES)
+    async def set_command(
+        self, ctx: commands.Context, package_name: PackageName,
+        base_url: ValidURL, inventory_url: InventoryURL
+    ) -> None:
+        """
+        Adds a new documentation metadata object to the site's database.
+
+        The database will update the object, should an existing item with the specified `package_name` already exist.
+
+        Example:
+            !docs setdoc \
+                    python \
+                    https://docs.python.org/3/ \
+                    https://docs.python.org/3/objects.inv
+        """
+        body = {
+            'package': package_name,
+            'base_url': base_url,
+            'inventory_url': inventory_url
+        }
+        await self.bot.api_client.post('bot/documentation-links', json=body)
+
+        log.info(
+            f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n"
+            f"Package name: {package_name}\n"
+            f"Base url: {base_url}\n"
+            f"Inventory URL: {inventory_url}"
+        )
+
+        if not await self.update_single(package_name, base_url, inventory_url):
+            await ctx.send(
+                f"Added package `{package_name}` to database but failed to fetch inventory; rescheduled in 2 minutes."
+            )
+            return
+        await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.")
+
+    @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd'))
+    @commands.has_any_role(*MODERATION_ROLES)
+    async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None:
+        """
+        Removes the specified package from the database.
+ +        Examples: +            !docs deletedoc aiohttp +        """ +        await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') + +        if package_name in self.scheduled_inventories: +            self.inventory_scheduler.cancel(package_name) + +        async with ctx.typing(): +            # Rebuild the inventory to ensure that everything +            # that was from this package is properly deleted. +            await self.refresh_inventory() +        await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") + +    @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) +    @commands.has_any_role(*MODERATION_ROLES) +    async def refresh_command(self, ctx: commands.Context) -> None: +        """Refresh inventories and send differences to channel.""" +        for inventory in self.scheduled_inventories: +            self.inventory_scheduler.cancel(inventory) + +        old_inventories = set(self.base_urls) +        with ctx.typing(): +            await self.refresh_inventory() +        new_inventories = set(self.base_urls) + +        if added := ", ".join(new_inventories - old_inventories): +            added = "+ " + added + +        if removed := ", ".join(old_inventories - new_inventories): +            removed = "- " + removed + +        embed = discord.Embed( +            title="Inventories refreshed", +            description=f"```diff\n{added}\n{removed}```" if added or removed else "" +        ) +        await ctx.send(embed=embed) diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py new file mode 100644 index 000000000..bc705130d --- /dev/null +++ b/bot/exts/info/doc/_html.py @@ -0,0 +1,33 @@ +from collections.abc import Iterable +from typing import List, Union + +from bs4.element import NavigableString, PageElement, SoupStrainer, Tag + + +class Strainer(SoupStrainer): +    """Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s.""" + +    def __init__(self, *, include_strings: bool, **kwargs): +        self.include_strings = include_strings +        super().__init__(**kwargs) + +    markup_hint = Union[PageElement, List["markup_hint"]] + +    def search(self, markup: markup_hint) -> Union[PageElement, str]: +        """Extend default SoupStrainer behaviour to allow matching both `Tag`s` and `NavigableString`s.""" +        if isinstance(markup, Iterable) and not isinstance(markup, (Tag, str)): +            for element in markup: +                if isinstance(element, NavigableString) and self.search(element): +                    return element +        elif isinstance(markup, Tag): +            # Also include tags while we're searching for strings and tags. +            if self.include_strings or (not self.text or self.name or self.attrs): +                return self.search_tag(markup) + +        elif isinstance(markup, str): +            # Let everything through the text filter if we're including strings and tags. 
+            text_filter = None if not self.include_strings else True +            if not self.name and not self.attrs and self._matches(markup, text_filter): +                return markup +        else: +            raise Exception(f"I don't know how to match against a {markup.__class__}") diff --git a/bot/exts/info/doc/_inventory_parser.py b/bot/exts/info/doc/_inventory_parser.py new file mode 100644 index 000000000..23931869b --- /dev/null +++ b/bot/exts/info/doc/_inventory_parser.py @@ -0,0 +1,120 @@ +import logging +import re +import zlib +from collections import defaultdict +from typing import AsyncIterator, DefaultDict, List, Optional, Tuple + +import aiohttp + +log = logging.getLogger(__name__) + +FAILED_REQUEST_ATTEMPTS = 3 +_V2_LINE_RE = re.compile(r'(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+?(\S*)\s+(.*)') + + +class ZlibStreamReader: +    """Class used for decoding zlib data of a stream line by line.""" + +    READ_CHUNK_SIZE = 16 * 1024 + +    def __init__(self, stream: aiohttp.StreamReader) -> None: +        self.stream = stream + +    async def _read_compressed_chunks(self) -> AsyncIterator[bytes]: +        """Read zlib data in `READ_CHUNK_SIZE` sized chunks and decompress.""" +        decompressor = zlib.decompressobj() +        async for chunk in self.stream.iter_chunked(self.READ_CHUNK_SIZE): +            yield decompressor.decompress(chunk) + +        yield decompressor.flush() + +    async def __aiter__(self) -> AsyncIterator[str]: +        """Yield lines of decompressed text.""" +        buf = b'' +        async for chunk in self._read_compressed_chunks(): +            buf += chunk +            pos = buf.find(b'\n') +            while pos != -1: +                yield buf[:pos].decode() +                buf = buf[pos + 1:] +                pos = buf.find(b'\n') + + +async def _load_v1(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: +    invdata = defaultdict(list) + +    async for line in stream: +        name, type_, location = line.decode().rstrip().split(maxsplit=2) +        # version 1 did not add anchors to the location +        if type_ == 'mod': +            type_ = 'py:module' +            location += '#module-' + name +        else: +            type_ = 'py:' + type_ +            location += '#' + name +        invdata[type_].append((name, location)) +    return invdata + + +async def _load_v2(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: +    invdata = defaultdict(list) + +    async for line in ZlibStreamReader(stream): +        m = _V2_LINE_RE.match(line.rstrip()) +        name, type_, _prio, location, _dispname = m.groups()  # ignore the parsed items we don't need +        if location.endswith('$'): +            location = location[:-1] + name + +        invdata[type_].append((name, location)) +    return invdata + + +async def _fetch_inventory(client_session: aiohttp.ClientSession, url: str) -> DefaultDict[str, List[Tuple[str, str]]]: +    """Fetch, parse and return an intersphinx inventory file from an url.""" +    timeout = aiohttp.ClientTimeout(sock_connect=5, sock_read=5) +    async with client_session.get(url, timeout=timeout, raise_for_status=True) as response: +        stream = response.content + +        inventory_header = (await stream.readline()).decode().rstrip() +        inventory_version = int(inventory_header[-1:]) +        await stream.readline()  # skip project name +        await stream.readline()  # skip project version + +        if inventory_version == 1: +            return await 
_load_v1(stream)
+
+        elif inventory_version == 2:
+            if b"zlib" not in await stream.readline():
+                raise ValueError(f"Invalid inventory file at url {url}.")
+            return await _load_v2(stream)
+
+        raise ValueError(f"Invalid inventory file at url {url}.")
+
+
+async def fetch_inventory(
+        client_session: aiohttp.ClientSession,
+        url: str
+) -> Optional[DefaultDict[str, List[Tuple[str, str]]]]:
+    """Get inventory from `url`, retrying `FAILED_REQUEST_ATTEMPTS` times on errors."""
+    for attempt in range(1, FAILED_REQUEST_ATTEMPTS+1):
+        try:
+            inventory = await _fetch_inventory(client_session, url)
+        except aiohttp.ClientConnectorError:
+            log.warning(
+                f"Failed to connect to inventory url at {url}, "
+                f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})."
+            )
+        except aiohttp.ClientError:
+            log.error(
+                f"Failed to get inventory from {url}, "
+                f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})."
+            )
+        except Exception:
+            log.exception(
+                f"An unexpected error has occurred during fetching of {url}, "
+                f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})."
+            )
+        else:
+            return inventory
+
+    return None
diff --git a/bot/exts/info/doc/_markdown.py b/bot/exts/info/doc/_markdown.py
new file mode 100644
index 000000000..ba35a84c4
--- /dev/null
+++ b/bot/exts/info/doc/_markdown.py
@@ -0,0 +1,53 @@
+from urllib.parse import urljoin
+
+from bs4.element import PageElement
+from markdownify import MarkdownConverter
+
+
+class DocMarkdownConverter(MarkdownConverter):
+    """Subclass markdownify's MarkdownConverter to provide custom conversion methods."""
+
+    def __init__(self, *, page_url: str, **options):
+        super().__init__(**options)
+        self.page_url = page_url
+
+    def convert_li(self, el: PageElement, text: str) -> str:
+        """Fix markdownify's erroneous indexing in ol tags."""
+        parent = el.parent
+        if parent is not None and parent.name == "ol":
+            li_tags = parent.find_all("li")
+            bullet = f"{li_tags.index(el)+1}."
+        else:
+            depth = -1
+            while el:
+                if el.name == "ul":
+                    depth += 1
+                el = el.parent
+            bullets = self.options["bullets"]
+            bullet = bullets[depth % len(bullets)]
+        return f"{bullet} {text}\n"
+
+    def convert_hn(self, _n: int, el: PageElement, text: str) -> str:
+        """Convert h tags to bold text with ** instead of adding #."""
+        return f"**{text}**\n\n"
+
+    def convert_code(self, el: PageElement, text: str) -> str:
+        """Undo `markdownify`'s underscore escaping."""
+        return f"`{text}`".replace("\\", "")
+
+    def convert_pre(self, el: PageElement, text: str) -> str:
+        """Wrap any codeblocks in `py` for syntax highlighting."""
+        code = "".join(el.strings)
+        return f"```py\n{code}```"
+
+    def convert_a(self, el: PageElement, text: str) -> str:
+        """Resolve relative URLs to `self.page_url`."""
+        el["href"] = urljoin(self.page_url, el["href"])
+        return super().convert_a(el, text)
+
+    def convert_p(self, el: PageElement, text: str) -> str:
+        """Include only one newline instead of two when the parent is a li tag."""
+        parent = el.parent
+        if parent is not None and parent.name == "li":
+            return f"{text}\n"
+        return super().convert_p(el, text)
diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py
new file mode 100644
index 000000000..0883b9f42
--- /dev/null
+++ b/bot/exts/info/doc/_parsing.py
@@ -0,0 +1,312 @@
+from __future__ import annotations
+
+import logging
+import re
+import string
+import textwrap
+from functools import partial
+from typing import Callable, Collection, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union
+
+from bs4 import BeautifulSoup
+from bs4.element import NavigableString, PageElement, Tag
+
+from ._html import Strainer
+from ._markdown import DocMarkdownConverter
+if TYPE_CHECKING:
+    from ._cog import DocItem
+
+log = logging.getLogger(__name__)
+
+_MAX_SIGNATURE_AMOUNT = 3
+
+_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")
+_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)")
+_PARAMETERS_RE = re.compile(r"\((.+)\)")
+
+_SEARCH_END_TAG_ATTRS = (
+    "data",
+    "function",
+    "class",
+    "exception",
+    "seealso",
+    "section",
+    "rubric",
+    "sphinxsidebar",
+)
+
+_NO_SIGNATURE_GROUPS = {
+    "attribute",
+    "envvar",
+    "setting",
+    "templatefilter",
+    "templatetag",
+    "term",
+}
+_EMBED_CODE_BLOCK_LENGTH = 61
+# _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight
+_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT
+# Maximum discord message length - signatures on top
+_MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH
+_TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace
+_BRACKET_PAIRS = {
+    "{": "}",
+    "(": ")",
+    "[": "]",
+}
+
+
+def _split_parameters(parameters_string: str) -> List[str]:
+    """
+    Split parameters of a signature into individual parameter strings on commas.
+
+    Long string literals are not accounted for.
+    """ +    parameters_list = [] +    last_split = 0 +    depth = 0 +    expected_end = None +    current_search = None + +    for index, character in enumerate(parameters_string): +        if character in _BRACKET_PAIRS: +            if current_search is None: +                current_search = character +                expected_end = _BRACKET_PAIRS[character] +            if character == current_search: +                depth += 1 + +        elif character in {"'", '"'}: +            if depth == 0: +                depth += 1 +            elif parameters_string[index-1] != "\\": +                depth -= 1 +            elif parameters_string[index-2] == "\\": +                depth -= 1 + +        elif character == expected_end: +            depth -= 1 +            if depth == 0: +                current_search = None +                expected_end = None + +        elif depth == 0 and character == ",": +            parameters_list.append(parameters_string[last_split:index]) +            last_split = index + 1 + +    parameters_list.append(parameters_string[last_split:]) +    return parameters_list + + +def _find_elements_until_tag( +        start_element: PageElement, +        end_tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], +        *, +        func: Callable, +        include_strings: bool = False, +        limit: int = None, +) -> List[Union[Tag, NavigableString]]: +    """ +    Get all elements up to `limit` or until a tag matching `tag_filter` is found. + +    `end_tag_filter` can be either a tuple of string names to check against, +    or a filtering callable that's applied to tags. + +    When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. + +    `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. +    The method is then iterated over and all elements until the matching tag or the limit are added to the return list. +    """ +    use_tuple_filter = isinstance(end_tag_filter, tuple) +    elements = [] + +    for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): +        if isinstance(element, Tag): +            if use_tuple_filter: +                if element.name in end_tag_filter: +                    break +            elif end_tag_filter(element): +                break +        elements.append(element) + +    return elements + + +_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +_find_recursive_children_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_all) +_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) +_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) + + +def _get_general_description(start_element: Tag) -> List[Union[Tag, NavigableString]]: +    """ +    Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. + +    A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, +    if it's found it's used as the tag to start the search from instead of the `start_element`. 
+    """ +    child_tags = _find_recursive_children_until_tag(start_element, _class_filter_factory(["section"]), limit=100) +    header = next(filter(_class_filter_factory(["headerlink"]), child_tags), None) +    start_tag = header.parent if header is not None else start_element +    return _find_next_siblings_until_tag(start_tag, _class_filter_factory(_SEARCH_END_TAG_ATTRS), include_strings=True) + + +def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: +    """Get the contents of the next dd tag, up to a dt or a dl tag.""" +    description_tag = symbol.find_next("dd") +    return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) + + +def _get_signatures(start_signature: PageElement) -> List[str]: +    """ +    Collect up to `_MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag. + +    First the signatures under the `start_signature` are included; +    if less than 2 are found, tags above the start signature are added to the result if any are present. +    """ +    signatures = [] +    for element in ( +            *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), +            start_signature, +            *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), +    )[-(_MAX_SIGNATURE_AMOUNT):]: +        signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + +        if signature: +            signatures.append(signature) + +    return signatures + + +def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collection[str]]: +    """ +    Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`. + +    If the signatures need to be truncated, parameters are collapsed until they fit withing the limit. +    Individual signatures can consist of max 1, 2, ..., `_MAX_SIGNATURE_AMOUNT` lines of text, +    inversely proportional to the amount of signatures. +    A maximum of `_MAX_SIGNATURE_AMOUNT` signatures is assumed to be passed. 
+    """ +    if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH: +        return signatures + +    max_signature_length = _EMBED_CODE_BLOCK_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) +    formatted_signatures = [] +    for signature in signatures: +        signature = signature.strip() +        if len(signature) > max_signature_length: +            if (parameters_match := _PARAMETERS_RE.search(signature)) is None: +                formatted_signatures.append(textwrap.shorten(signature, max_signature_length)) +                continue + +            truncated_signature = [] +            parameters_string = parameters_match[1] +            running_length = len(signature) - len(parameters_string) +            for parameter in _split_parameters(parameters_string): +                if (len(parameter) + running_length) <= max_signature_length - 4:  # account for comma and placeholder +                    truncated_signature.append(parameter) +                    running_length += len(parameter) + 1 +                else: +                    truncated_signature.append(" ...") +                    formatted_signatures.append(signature.replace(parameters_string, ",".join(truncated_signature))) +                    break +        else: +            formatted_signatures.append(signature) + +    return formatted_signatures + + +def _get_truncated_description( +        elements: Iterable[Union[Tag, NavigableString]], +        markdown_converter: DocMarkdownConverter, +        max_length: int, +) -> str: +    """ +    Truncate markdown from `elements` to be at most `max_length` characters visually. + +    `max_length` limits the length of the rendered characters in the string, +    with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits +    """ +    visual_length = 0 +    real_length = 0 +    result = [] +    shortened = False + +    for element in elements: +        is_tag = isinstance(element, Tag) +        element_length = len(element.text) if is_tag else len(element) +        if visual_length + element_length < max_length: +            if is_tag: +                element_markdown = markdown_converter.process_tag(element) +            else: +                element_markdown = markdown_converter.process_text(element) + +            element_markdown_length = len(element_markdown) +            if real_length + element_markdown_length < _MAX_DESCRIPTION_LENGTH: +                result.append(element_markdown) +            else: +                shortened = True +                break +            real_length += element_markdown_length +            visual_length += element_length +        else: +            shortened = True +            break + +    markdown_string = "".join(result) +    if shortened: +        markdown_string = markdown_string.rstrip(_TRUNCATE_STRIP_CHARACTERS) + "..." +    return markdown_string + + +def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: +    """ +    Create a markdown string with the signatures at the top, and the converted html description below them. + +    The signatures are wrapped in python codeblocks, separated from the description by a newline. +    The result string is truncated to be max 1000 symbols long. 
+    """ +    description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750) +    description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) +    if signatures is not None: +        formatted_markdown = "".join(f"```py\n{signature}```" for signature in _truncate_signatures(signatures)) +    else: +        formatted_markdown = "" +    formatted_markdown += f"\n{description}" + +    return formatted_markdown + + +def _class_filter_factory(class_names: Iterable[str]) -> Callable[[Tag], bool]: +    """Create callable that returns True when the passed in tag's class is in `class_names` or when it's is a table.""" +    def match_tag(tag: Tag) -> bool: +        for attr in class_names: +            if attr in tag.get("class", ()): +                return True +        return tag.name == "table" + +    return match_tag + + +def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: +    """ +    Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. + +    The method of parsing and what information gets included depends on the symbol's group. +    """ +    symbol_heading = soup.find(id=symbol_data.symbol_id) +    if symbol_heading is None: +        log.warning("Symbol present in loaded inventories not found on site, consider refreshing inventories.") +        return "Unable to parse the requested symbol." +    signature = None +    # Modules, doc pages and labels don't point to description list tags but to tags like divs, +    # no special parsing can be done so we only try to include what's under them. +    if symbol_data.group in {"module", "doc", "label"} or symbol_heading.name != "dt": +        description = _get_general_description(symbol_heading) + +    elif symbol_data.group in _NO_SIGNATURE_GROUPS: +        description = _get_dd_description(symbol_heading) + +    else: +        signature = _get_signatures(symbol_heading) +        description = _get_dd_description(symbol_heading) +    return _parse_into_markdown(signature, description, symbol_data.url).replace('¶', '') diff --git a/bot/exts/info/doc/_redis_cache.py b/bot/exts/info/doc/_redis_cache.py new file mode 100644 index 000000000..147394ba6 --- /dev/null +++ b/bot/exts/info/doc/_redis_cache.py @@ -0,0 +1,23 @@ +from typing import Optional + +from async_rediscache.types.base import RedisObject, namespace_lock + + +class DocRedisCache(RedisObject): +    """Interface for redis functionality needed by the Doc cog.""" + +    @namespace_lock +    async def set(self, key: str, value: str) -> None: +        """ +        Set markdown `value` for `key`. + +        Keys expire after a week to keep data up to date. 
+        """ +        with await self._get_pool_connection() as connection: +            await connection.setex(f"{self.namespace}:{key}", 7*24*60*60, value) + +    @namespace_lock +    async def get(self, key: str) -> Optional[str]: +        """Get markdown contents for `key`.""" +        with await self._get_pool_connection() as connection: +            return await connection.get(f"{self.namespace}:{key}", encoding="utf8") diff --git a/tests/bot/test_converters.py b/tests/bot/test_converters.py index c42111f3f..231798a92 100644 --- a/tests/bot/test_converters.py +++ b/tests/bot/test_converters.py @@ -10,9 +10,9 @@ from bot.converters import (      Duration,      HushDurationConverter,      ISODateTime, +    PackageName,      TagContentConverter,      TagNameConverter, -    ValidPythonIdentifier,  ) @@ -78,24 +78,23 @@ class ConverterTests(unittest.IsolatedAsyncioTestCase):                  with self.assertRaisesRegex(BadArgument, re.escape(exception_message)):                      await TagNameConverter.convert(self.context, invalid_name) -    async def test_valid_python_identifier_for_valid(self): -        """ValidPythonIdentifier returns valid identifiers unchanged.""" -        test_values = ('foo', 'lemon') +    async def test_package_name_for_valid(self): +        """PackageName returns valid package names unchanged.""" +        test_values = ('foo', 'le_mon')          for name in test_values:              with self.subTest(identifier=name): -                conversion = await ValidPythonIdentifier.convert(self.context, name) +                conversion = await PackageName.convert(self.context, name)                  self.assertEqual(name, conversion) -    async def test_valid_python_identifier_for_invalid(self): -        """ValidPythonIdentifier raises the proper exception for invalid identifiers.""" -        test_values = ('nested.stuff', '#####') +    async def test_package_name_for_invalid(self): +        """PackageName raises the proper exception for invalid package names.""" +        test_values = ('text_with_a_dot.', 'UpperCaseName', "num83r")          for name in test_values:              with self.subTest(identifier=name): -                exception_message = f'`{name}` is not a valid Python identifier' -                with self.assertRaisesRegex(BadArgument, re.escape(exception_message)): -                    await ValidPythonIdentifier.convert(self.context, name) +                with self.assertRaises(BadArgument): +                    await PackageName.convert(self.context, name)      async def test_duration_converter_for_valid(self):          """Duration returns the correct `datetime` for valid duration strings."""  |