diff options
Diffstat (limited to '')
| -rw-r--r-- | bot/cogs/doc/__init__.py | 7 | ||||
| -rw-r--r-- | bot/cogs/doc/cog.py (renamed from bot/cogs/doc.py) | 403 | ||||
| -rw-r--r-- | bot/cogs/doc/html.py | 33 | ||||
| -rw-r--r-- | bot/cogs/doc/markdown.py | 58 | ||||
| -rw-r--r-- | bot/cogs/doc/parsing.py | 208 | ||||
| -rw-r--r-- | bot/converters.py | 22 | ||||
| -rw-r--r-- | tests/bot/test_converters.py | 21 | ||||
7 files changed, 511 insertions, 241 deletions
| diff --git a/bot/cogs/doc/__init__.py b/bot/cogs/doc/__init__.py new file mode 100644 index 000000000..19a71ee66 --- /dev/null +++ b/bot/cogs/doc/__init__.py @@ -0,0 +1,7 @@ +from bot.bot import Bot +from .cog import DocCog + + +def setup(bot: Bot) -> None: +    """Load the Doc cog.""" +    bot.add_cog(DocCog(bot)) diff --git a/bot/cogs/doc.py b/bot/cogs/doc/cog.py index 30c793c75..fc01dfb20 100644 --- a/bot/cogs/doc.py +++ b/bot/cogs/doc/cog.py @@ -1,30 +1,30 @@ +from __future__ import annotations +  import asyncio  import functools  import logging  import re -import textwrap -from collections import OrderedDict +import sys +from collections import defaultdict  from contextlib import suppress  from types import SimpleNamespace -from typing import Any, Callable, Optional, Tuple +from typing import Dict, List, NamedTuple, Optional, Union  import discord +from aiohttp import ClientSession  from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag -from discord.errors import NotFound  from discord.ext import commands -from markdownify import MarkdownConverter  from requests import ConnectTimeout, ConnectionError, HTTPError  from sphinx.ext import intersphinx  from urllib3.exceptions import ProtocolError  from bot.bot import Bot  from bot.constants import MODERATION_ROLES, RedirectOutput -from bot.converters import ValidPythonIdentifier, ValidURL +from bot.converters import PackageName, ValidURL  from bot.decorators import with_role  from bot.pagination import LinePaginator  from bot.utils.messages import wait_for_deletion - +from .parsing import get_symbol_markdown  log = logging.getLogger(__name__)  logging.getLogger('urllib3').setLevel(logging.WARNING) @@ -50,68 +50,126 @@ NO_OVERRIDE_PACKAGES = (      "python",  ) -SEARCH_END_TAG_ATTRS = ( -    "data", -    "function", -    "class", -    "exception", -    "seealso", -    "section", -    "rubric", -    "sphinxsidebar", -) -UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶")  
WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") -  FAILED_REQUEST_RETRY_AMOUNT = 3  NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay -def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: -    """ -    LRU cache implementation for coroutines. +class DocItem(NamedTuple): +    """Holds inventory symbol information.""" + +    package: str +    group: str +    base_url: str +    relative_url_path: str +    symbol_id: str + +    @property +    def url(self) -> str: +        """Return the absolute url to the symbol.""" +        return "".join((self.base_url, self.relative_url_path)) -    Once the cache exceeds the maximum size, keys are deleted in FIFO order. -    An offset may be optionally provided to be applied to the coroutine's arguments when creating the cache key. +class QueueItem(NamedTuple): +    """Contains a symbol and the BeautifulSoup object needed to parse it.""" + +    symbol: DocItem +    soup: BeautifulSoup + +    def __eq__(self, other: Union[QueueItem, DocItem]): +        if isinstance(other, DocItem): +            return self.symbol == other +        return NamedTuple.__eq__(self, other) + + +class CachedParser: +    """ +    Get symbol markdown from pages with smarter caching. + +    DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict. +    `get_markdown` is used to fetch the markdown; when this is used for the first time on a page, +    all of the symbols are queued to be parsed to avoid multiple web requests to the same page.      """ -    # Assign the cache to the function itself so we can clear it from outside. 
-    async_cache.cache = OrderedDict() -    def decorator(function: Callable) -> Callable: -        """Define the async_cache decorator.""" -        @functools.wraps(function) -        async def wrapper(*args) -> Any: -            """Decorator wrapper for the caching logic.""" -            key = ':'.join(args[arg_offset:]) +    def __init__(self): +        self._queue: List[QueueItem] = [] +        self._results = {} +        self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) +        self._item_events: Dict[DocItem, asyncio.Event] = {} +        self._parse_task = None -            value = async_cache.cache.get(key) -            if value is None: -                if len(async_cache.cache) > max_size: -                    async_cache.cache.popitem(last=False) +    async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str: +        """ +        Get result markdown of `doc_item`. -                async_cache.cache[key] = await function(*args) -            return async_cache.cache[key] -        return wrapper -    return decorator +        If no symbols were fetched from `doc_item`s page before, +        the HTML has to be fetched before parsing can be queued. 
+        """ +        if (symbol := self._results.get(doc_item)) is not None: +            return symbol +        if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: +            async with client_session.get(doc_item.url) as response: +                soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") -class DocMarkdownConverter(MarkdownConverter): -    """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" +            self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) +            del self._page_symbols[doc_item.url] +            log.debug(f"Added symbols from {doc_item.url} to parse queue.") -    def convert_code(self, el: PageElement, text: str) -> str: -        """Undo `markdownify`s underscore escaping.""" -        return f"`{text}`".replace('\\', '') +            if self._parse_task is None: +                self._parse_task = asyncio.create_task(self._parse_queue()) -    def convert_pre(self, el: PageElement, text: str) -> str: -        """Wrap any codeblocks in `py` for syntax highlighting.""" -        code = ''.join(el.strings) -        return f"```py\n{code}```" +        self._move_to_front(doc_item) +        self._item_events[doc_item] = item_event = asyncio.Event() +        await item_event.wait() +        return self._results[doc_item] +    async def _parse_queue(self) -> None: +        """ +        Parse all item from the queue, setting associated events for symbols if present. -def markdownify(html: str) -> DocMarkdownConverter: -    """Create a DocMarkdownConverter object from the input html.""" -    return DocMarkdownConverter(bullets='•').convert(html) +        The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished. 
+        """ +        log.trace("Starting queue parsing.") +        while self._queue: +            item, soup = self._queue.pop() +            self._results[item] = get_symbol_markdown(soup, item) +            if (event := self._item_events.get(item)) is not None: +                event.set() +            await asyncio.sleep(0.1) + +        self._parse_task = None +        log.trace("Finished parsing queue.") + +    def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: +        """Move `item` to the front of the parse queue.""" +        # The parse queue stores soups along with the doc symbols in QueueItem objects, +        # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. +        item_index = self._queue.index(item) +        queue_item = self._queue[item_index] + +        del self._queue[item_index] +        self._queue.append(queue_item) + +    def add_item(self, doc_item: DocItem) -> None: +        """Add a DocItem to `_page_symbols`.""" +        self._page_symbols[doc_item.url].append(doc_item) + +    async def clear(self) -> None: +        """ +        Clear all internal symbol data. + +        All currently requested items are waited to be parsed before clearing. 
+        """ +        for event in self._item_events.values(): +            await event.wait() +        if self._parse_task is not None: +            self._parse_task.cancel() +            self._parse_task = None +        self._queue.clear() +        self._results.clear() +        self._page_symbols.clear() +        self._item_events.clear()  class InventoryURL(commands.Converter): @@ -127,6 +185,7 @@ class InventoryURL(commands.Converter):      @staticmethod      async def convert(ctx: commands.Context, url: str) -> str:          """Convert url to Intersphinx inventory URL.""" +        await ctx.trigger_typing()          try:              intersphinx.fetch_inventory(SPHINX_MOCK_APP, '', url)          except AttributeError: @@ -145,13 +204,14 @@ class InventoryURL(commands.Converter):          return url -class Doc(commands.Cog): +class DocCog(commands.Cog):      """A set of commands for querying & displaying documentation."""      def __init__(self, bot: Bot):          self.base_urls = {}          self.bot = bot -        self.inventories = {} +        self.doc_symbols: Dict[str, DocItem] = {} +        self.item_fetcher = CachedParser()          self.renamed_symbols = set()          self.bot.loop.create_task(self.init_refresh_inventory()) @@ -162,7 +222,7 @@ class Doc(commands.Cog):          await self.refresh_inventory()      async def update_single( -        self, package_name: str, base_url: str, inventory_url: str +        self, api_package_name: str, base_url: str, inventory_url: str      ) -> None:          """          Rebuild the inventory for a single package. 
@@ -174,49 +234,59 @@ class Doc(commands.Cog):              * `inventory_url` is the absolute URL to the intersphinx inventory, fetched by running                  `intersphinx.fetch_inventory` in an executor on the bot's event loop          """ -        self.base_urls[package_name] = base_url +        self.base_urls[api_package_name] = base_url          package = await self._fetch_inventory(inventory_url)          if not package:              return None          for group, value in package.items(): -            for symbol, (package_name, _version, relative_doc_url, _) in value.items(): -                absolute_doc_url = base_url + relative_doc_url - -                if symbol in self.inventories: -                    group_name = group.split(":")[1] -                    symbol_base_url = self.inventories[symbol].split("/", 3)[2] +            for symbol, (_package_name, _version, relative_doc_url, _) in value.items(): +                if "/" in symbol: +                    continue  # skip unreachable symbols with slashes +                # Intern the group names since they're reused in all the DocItems +                # to remove unnecessary memory consumption from them being unique objects +                group_name = sys.intern(group.split(":")[1]) + +                if (original_symbol := self.doc_symbols.get(symbol)) is not None:                      if (                          group_name in NO_OVERRIDE_GROUPS -                        or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) +                        or any(package == original_symbol.package for package in NO_OVERRIDE_PACKAGES)                      ): -                          symbol = f"{group_name}.{symbol}" -                        # If renamed `symbol` already exists, add library name in front to differentiate between them. 
-                        if symbol in self.renamed_symbols: -                            # Split `package_name` because of packages like Pillow that have spaces in them. -                            symbol = f"{package_name.split()[0]}.{symbol}" +                        self.renamed_symbols.add(symbol) -                        self.inventories[symbol] = absolute_doc_url +                    elif (overridden_symbol_group := original_symbol.group) in NO_OVERRIDE_GROUPS: +                        overridden_symbol = f"{overridden_symbol_group}.{symbol}" +                        if overridden_symbol in self.renamed_symbols: +                            overridden_symbol = f"{api_package_name}.{overridden_symbol}" + +                        self.doc_symbols[overridden_symbol] = original_symbol +                        self.renamed_symbols.add(overridden_symbol) + +                    # If renamed `symbol` already exists, add library name in front to differentiate between them. +                    elif symbol in self.renamed_symbols: +                        symbol = f"{api_package_name}.{symbol}"                          self.renamed_symbols.add(symbol) -                        continue -                self.inventories[symbol] = absolute_doc_url +                relative_url_path, _, symbol_id = relative_doc_url.partition("#") +                symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id) +                self.doc_symbols[symbol] = symbol_item +                self.item_fetcher.add_item(symbol_item) -        log.trace(f"Fetched inventory for {package_name}.") +        log.trace(f"Fetched inventory for {api_package_name}.")      async def refresh_inventory(self) -> None:          """Refresh internal documentation inventory."""          log.debug("Refreshing documentation inventory...") -        # Clear the old base URLS and inventories to ensure +        # Clear the old base URLS and doc symbols to ensure          # that we 
start from a fresh local dataset.          # Also, reset the cache used for fetching documentation.          self.base_urls.clear() -        self.inventories.clear() +        self.doc_symbols.clear()          self.renamed_symbols.clear() -        async_cache.cache = OrderedDict() +        await self.item_fetcher.clear()          # Run all coroutines concurrently - since each of them performs a HTTP          # request, this speeds up fetching the inventory data heavily. @@ -227,115 +297,21 @@ class Doc(commands.Cog):          ]          await asyncio.gather(*coros) -    async def get_symbol_html(self, symbol: str) -> Optional[Tuple[list, str]]: -        """ -        Given a Python symbol, return its signature and description. - -        The first tuple element is the signature of the given symbol as a markup-free string, and -        the second tuple element is the description of the given symbol with HTML markup included. - -        If the given symbol is a module, returns a tuple `(None, str)` -        else if the symbol could not be found, returns `None`. -        """ -        url = self.inventories.get(symbol) -        if url is None: -            return None - -        async with self.bot.http_session.get(url) as response: -            html = await response.text(encoding='utf-8') - -        # Find the signature header and parse the relevant parts. 
-        symbol_id = url.split('#')[-1] -        soup = BeautifulSoup(html, 'lxml') -        symbol_heading = soup.find(id=symbol_id) -        search_html = str(soup) - -        if symbol_heading is None: -            return None - -        if symbol_id == f"module-{symbol}": -            # Get page content from the module headerlink to the -            # first tag that has its class in `SEARCH_END_TAG_ATTRS` -            start_tag = symbol_heading.find("a", attrs={"class": "headerlink"}) -            if start_tag is None: -                return [], "" - -            end_tag = start_tag.find_next(self._match_end_tag) -            if end_tag is None: -                return [], "" - -            description_start_index = search_html.find(str(start_tag.parent)) + len(str(start_tag.parent)) -            description_end_index = search_html.find(str(end_tag)) -            description = search_html[description_start_index:description_end_index] -            signatures = None - -        else: -            signatures = [] -            description = str(symbol_heading.find_next_sibling("dd")) -            description_pos = search_html.find(description) -            # Get text of up to 3 signatures, remove unwanted symbols -            for element in [symbol_heading] + symbol_heading.find_next_siblings("dt", limit=2): -                signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) -                if signature and search_html.find(str(element)) < description_pos: -                    signatures.append(signature) - -        return signatures, description.replace('¶', '') - -    @async_cache(arg_offset=1)      async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]:          """          Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents.          If the symbol is known, an Embed with documentation about it is returned.          
""" -        scraped_html = await self.get_symbol_html(symbol) -        if scraped_html is None: +        symbol_info = self.doc_symbols.get(symbol) +        if symbol_info is None:              return None - -        signatures = scraped_html[0] -        permalink = self.inventories[symbol] -        description = markdownify(scraped_html[1]) - -        # Truncate the description of the embed to the last occurrence -        # of a double newline (interpreted as a paragraph) before index 1000. -        if len(description) > 1000: -            shortened = description[:1000] -            description_cutoff = shortened.rfind('\n\n', 100) -            if description_cutoff == -1: -                # Search the shortened version for cutoff points in decreasing desirability, -                # cutoff at 1000 if none are found. -                for string in (". ", ", ", ",", " "): -                    description_cutoff = shortened.rfind(string) -                    if description_cutoff != -1: -                        break -                else: -                    description_cutoff = 1000 -            description = description[:description_cutoff] - -            # If there is an incomplete code block, cut it out -            if description.count("```") % 2: -                codeblock_start = description.rfind('```py') -                description = description[:codeblock_start].rstrip() -            description += f"... [read more]({permalink})" - -        description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) -        if signatures is None: -            # If symbol is a module, don't show signature. -            embed_description = description - -        elif not signatures: -            # It's some "meta-page", for example: -            # https://docs.djangoproject.com/en/dev/ref/views/#module-django.views -            embed_description = "This appears to be a generic page not tied to a specific symbol." 
- -        else: -            embed_description = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) -            embed_description += f"\n{description}" +        self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}")          embed = discord.Embed( -            title=f'`{symbol}`', -            url=permalink, -            description=embed_description +            title=discord.utils.escape_markdown(symbol), +            url=f"{symbol_info.url}#{symbol_info.symbol_id}", +            description=await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info)          )          # Show all symbols with the same name that were renamed in the footer.          embed.set_footer( @@ -344,12 +320,12 @@ class Doc(commands.Cog):          return embed      @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) -    async def docs_group(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: +    async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None:          """Lookup documentation for Python symbols.""" -        await ctx.invoke(self.get_command, symbol) +        await ctx.invoke(self.get_command, symbol=symbol) -    @docs_group.command(name='get', aliases=('g',)) -    async def get_command(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: +    @docs_group.command(name='getdoc', aliases=('g',)) +    async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None:          """          Return a documentation embed for a given symbol. 
@@ -359,9 +335,9 @@ class Doc(commands.Cog):              !docs              !docs aiohttp              !docs aiohttp.ClientSession -            !docs get aiohttp.ClientSession +            !docs getdoc aiohttp.ClientSession          """ -        if symbol is None: +        if not symbol:              inventory_embed = discord.Embed(                  title=f"All inventories (`{len(self.base_urls)}` total)",                  colour=discord.Colour.blue() @@ -376,6 +352,7 @@ class Doc(commands.Cog):                  await ctx.send(embed=inventory_embed)          else: +            symbol = symbol.strip("`")              # Fetching documentation for a symbol (at least for the first time, since              # caching is used) takes quite some time, so let's send typing to indicate              # that we got the command, but are still working on it. @@ -383,22 +360,30 @@ class Doc(commands.Cog):                  doc_embed = await self.get_symbol_embed(symbol)              if doc_embed is None: +                symbol = await discord.ext.commands.clean_content().convert(ctx, symbol)                  error_embed = discord.Embed( -                    description=f"Sorry, I could not find any documentation for `{symbol}`.", +                    description=f"Sorry, I could not find any documentation for `{(symbol)}`.",                      colour=discord.Colour.red()                  )                  error_message = await ctx.send(embed=error_embed) -                with suppress(NotFound): -                    await error_message.delete(delay=NOT_FOUND_DELETE_DELAY) -                    await ctx.message.delete(delay=NOT_FOUND_DELETE_DELAY) +                await wait_for_deletion( +                    error_message, +                    (ctx.author.id,), +                    timeout=NOT_FOUND_DELETE_DELAY, +                    client=self.bot +                ) +                with suppress(discord.NotFound): +                    await ctx.message.delete() +             
   with suppress(discord.NotFound): +                    await error_message.delete()              else:                  msg = await ctx.send(embed=doc_embed)                  await wait_for_deletion(msg, (ctx.author.id,), client=self.bot) -    @docs_group.command(name='set', aliases=('s',)) +    @docs_group.command(name='setdoc', aliases=('s',))      @with_role(*MODERATION_ROLES)      async def set_command( -        self, ctx: commands.Context, package_name: ValidPythonIdentifier, +        self, ctx: commands.Context, package_name: PackageName,          base_url: ValidURL, inventory_url: InventoryURL      ) -> None:          """ @@ -407,7 +392,7 @@ class Doc(commands.Cog):          The database will update the object, should an existing item with the specified `package_name` already exist.          Example: -            !docs set \ +            !docs setdoc \                      python \                      https://docs.python.org/3/ \                      https://docs.python.org/3/objects.inv @@ -426,20 +411,17 @@ class Doc(commands.Cog):              f"Inventory URL: {inventory_url}"          ) -        # Rebuilding the inventory can take some time, so lets send out a -        # typing event to show that the Bot is still working. -        async with ctx.typing(): -            await self.refresh_inventory() +        await self.update_single(package_name, base_url, inventory_url)          await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") -    @docs_group.command(name='delete', aliases=('remove', 'rm', 'd')) +    @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd'))      @with_role(*MODERATION_ROLES) -    async def delete_command(self, ctx: commands.Context, package_name: ValidPythonIdentifier) -> None: +    async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None:          """          Removes the specified package from the database.          
Examples: -            !docs delete aiohttp +            !docs deletedoc aiohttp          """          await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') @@ -449,21 +431,20 @@ class Doc(commands.Cog):              await self.refresh_inventory()          await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") -    @docs_group.command(name="refresh", aliases=("rfsh", "r")) +    @docs_group.command(name="refreshdoc", aliases=("rfsh", "r"))      @with_role(*MODERATION_ROLES)      async def refresh_command(self, ctx: commands.Context) -> None:          """Refresh inventories and send differences to channel."""          old_inventories = set(self.base_urls)          with ctx.typing():              await self.refresh_inventory() -        # Get differences of added and removed inventories -        added = ', '.join(inv for inv in self.base_urls if inv not in old_inventories) -        if added: -            added = f"+ {added}" +        new_inventories = set(self.base_urls) -        removed = ', '.join(inv for inv in old_inventories if inv not in self.base_urls) -        if removed: -            removed = f"- {removed}" +        if added := ", ".join(new_inventories - old_inventories): +            added = "+ " + added + +        if removed := ", ".join(old_inventories - new_inventories): +            removed = "- " + removed          embed = discord.Embed(              title="Inventories refreshed", @@ -497,17 +478,3 @@ class Doc(commands.Cog):                  return package          log.error(f"Fetching of inventory {inventory_url} failed.")          return None - -    @staticmethod -    def _match_end_tag(tag: Tag) -> bool: -        """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" -        for attr in SEARCH_END_TAG_ATTRS: -            if attr in tag.get("class", ()): -                return True - -        return tag.name == "table" - - -def setup(bot: Bot) -> None: -    """Load 
the Doc cog.""" -    bot.add_cog(Doc(bot)) diff --git a/bot/cogs/doc/html.py b/bot/cogs/doc/html.py new file mode 100644 index 000000000..bc705130d --- /dev/null +++ b/bot/cogs/doc/html.py @@ -0,0 +1,33 @@ +from collections.abc import Iterable +from typing import List, Union + +from bs4.element import NavigableString, PageElement, SoupStrainer, Tag + + +class Strainer(SoupStrainer): +    """Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s.""" + +    def __init__(self, *, include_strings: bool, **kwargs): +        self.include_strings = include_strings +        super().__init__(**kwargs) + +    markup_hint = Union[PageElement, List["markup_hint"]] + +    def search(self, markup: markup_hint) -> Union[PageElement, str]: +        """Extend default SoupStrainer behaviour to allow matching both `Tag`s` and `NavigableString`s.""" +        if isinstance(markup, Iterable) and not isinstance(markup, (Tag, str)): +            for element in markup: +                if isinstance(element, NavigableString) and self.search(element): +                    return element +        elif isinstance(markup, Tag): +            # Also include tags while we're searching for strings and tags. +            if self.include_strings or (not self.text or self.name or self.attrs): +                return self.search_tag(markup) + +        elif isinstance(markup, str): +            # Let everything through the text filter if we're including strings and tags. 
+    text_filter = None if not self.include_strings else True +            if not self.name and not self.attrs and self._matches(markup, text_filter): +                return markup +        else: +            raise Exception(f"I don't know how to match against a {markup.__class__}") diff --git a/bot/cogs/doc/markdown.py b/bot/cogs/doc/markdown.py new file mode 100644 index 000000000..dca477d35 --- /dev/null +++ b/bot/cogs/doc/markdown.py @@ -0,0 +1,58 @@ +from urllib.parse import urljoin + +from bs4.element import PageElement +from markdownify import MarkdownConverter + + +class _DocMarkdownConverter(MarkdownConverter): +    """Subclass markdownify's MarkdownConverter to provide custom conversion methods.""" + +    def __init__(self, *, page_url: str, **options): +        super().__init__(**options) +        self.page_url = page_url + +    def convert_li(self, el: PageElement, text: str) -> str: +        """Fix markdownify's erroneous indexing in ol tags.""" +        parent = el.parent +        if parent is not None and parent.name == 'ol': +            li_tags = parent.find_all("li") +            bullet = '%s.' 
% (li_tags.index(el)+1) +        else: +            depth = -1 +            while el: +                if el.name == 'ul': +                    depth += 1 +                el = el.parent +            bullets = self.options['bullets'] +            bullet = bullets[depth % len(bullets)] +        return '%s %s\n' % (bullet, text or '') + +    def convert_hn(self, _n: int, el: PageElement, text: str) -> str: +        """Convert h tags to bold text with ** instead of adding #.""" +        return f"**{text}**\n\n" + +    def convert_code(self, el: PageElement, text: str) -> str: +        """Undo `markdownify`s underscore escaping.""" +        return f"`{text}`".replace('\\', '') + +    def convert_pre(self, el: PageElement, text: str) -> str: +        """Wrap any codeblocks in `py` for syntax highlighting.""" +        code = ''.join(el.strings) +        return f"```py\n{code}```" + +    def convert_a(self, el: PageElement, text: str) -> str: +        """Resolve relative URLs to `self.page_url`.""" +        el["href"] = urljoin(self.page_url, el["href"]) +        return super().convert_a(el, text) + +    def convert_p(self, el: PageElement, text: str) -> str: +        """Include only one newline instead of two when the parent is a li tag.""" +        parent = el.parent +        if parent is not None and parent.name == "li": +            return f"{text}\n" +        return super().convert_p(el, text) + + +def markdownify(html: str, *, url: str = "") -> str: +    """Create a DocMarkdownConverter object from the input html.""" +    return _DocMarkdownConverter(bullets='•', page_url=url).convert(html) diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py new file mode 100644 index 000000000..21a3065f4 --- /dev/null +++ b/bot/cogs/doc/parsing.py @@ -0,0 +1,208 @@ +from __future__ import annotations + +import logging +import re +import string +import textwrap +from functools import partial +from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union + +from 
bs4 import BeautifulSoup +from bs4.element import NavigableString, PageElement, Tag + +from .html import Strainer +from .markdown import markdownify +if TYPE_CHECKING: +    from .cog import DocItem + +log = logging.getLogger(__name__) + +_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") + +_SEARCH_END_TAG_ATTRS = ( +    "data", +    "function", +    "class", +    "exception", +    "seealso", +    "section", +    "rubric", +    "sphinxsidebar", +) + +_NO_SIGNATURE_GROUPS = { +    "attribute", +    "envvar", +    "setting", +    "templatefilter", +    "templatetag", +    "term", +} + + +def _find_elements_until_tag( +        start_element: PageElement, +        tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], +        *, +        func: Callable, +        include_strings: bool = False, +        limit: int = None, +) -> List[Union[Tag, NavigableString]]: +    """ +    Get all elements up to `limit` or until a tag matching `tag_filter` is found. + +    `tag_filter` can be either a tuple of string names to check against, +    or a filtering callable that's applied to tags. + +    When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. + +    `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. +    The method is then iterated over and all elements until the matching tag or the limit are added to the return list. 
+    """ +    use_tuple_filter = isinstance(tag_filter, tuple) +    elements = [] + +    for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): +        if isinstance(element, Tag): +            if use_tuple_filter: +                if element.name in tag_filter: +                    break +            elif tag_filter(element): +                break +        elements.append(element) + +    return elements + + +_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) +_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) + + +def _get_general_description(start_element: PageElement) -> Optional[str]: +    """ +    Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. + +    A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, +    if it's found it's used as the tag to start the search from instead of the `start_element`. 
+    """ +    header = start_element.find_next("a", attrs={"class": "headerlink"}) +    start_tag = header.parent if header is not None else start_element +    description = "".join( +        str(tag) for tag in _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) +    ) + +    return description + + +def _get_dd_description(symbol: PageElement) -> str: +    """Get the string contents of the next dd tag, up to a dt or a dl tag.""" +    description_tag = symbol.find_next("dd") +    description_contents = _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) +    return "".join(str(tag) for tag in description_contents) + + +def _get_signatures(start_signature: PageElement) -> List[str]: +    """ +    Collect up to 3 signatures from dt tags around the `start_signature` dt tag. + +    First the signatures under the `start_signature` are included; +    if less than 2 are found, tags above the start signature are added to the result if any are present. +    """ +    signatures = [] +    for element in ( +            *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), +            start_signature, +            *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), +    )[-3:]: +        signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + +        if signature: +            signatures.append(signature) + +    return signatures + + +def _truncate_markdown(markdown: str, max_length: int) -> str: +    """ +    Truncate `markdown` to be at most `max_length` characters. + +    The markdown string is searched for substrings to cut at, to keep its structure, +    but if none are found the string is simply sliced. 
+    """ +    if len(markdown) > max_length: +        shortened = markdown[:max_length] +        description_cutoff = shortened.rfind('\n\n', 100) +        if description_cutoff == -1: +            # Search the shortened version for cutoff points in decreasing desirability, +            # cutoff at 1000 if none are found. +            for cutoff_string in (". ", ", ", ",", " "): +                description_cutoff = shortened.rfind(cutoff_string) +                if description_cutoff != -1: +                    break +            else: +                description_cutoff = max_length +        markdown = markdown[:description_cutoff] + +        # If there is an incomplete code block, cut it out +        if markdown.count("```") % 2: +            codeblock_start = markdown.rfind('```py') +            markdown = markdown[:codeblock_start].rstrip() +        markdown = markdown.rstrip(string.punctuation) + "..." +    return markdown + + +def _parse_into_markdown(signatures: Optional[List[str]], description: str, url: str) -> str: +    """ +    Create a markdown string with the signatures at the top, and the converted html description below them. + +    The signatures are wrapped in python codeblocks, separated from the description by a newline. +    The result string is truncated to be max 1000 symbols long. 
+    """ +    description = _truncate_markdown(markdownify(description, url=url), 1000) +    description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) +    if signatures is not None: +        formatted_markdown = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) +    else: +        formatted_markdown = "" +    formatted_markdown += f"\n{description}" + +    return formatted_markdown + + +def _match_end_tag(tag: Tag) -> bool: +    """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" +    for attr in _SEARCH_END_TAG_ATTRS: +        if attr in tag.get("class", ()): +            return True + +    return tag.name == "table" + + +def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: +    """ +    Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. + +    The method of parsing and what information gets included depends on the symbol's group. +    """ +    symbol_heading = soup.find(id=symbol_data.symbol_id) +    signature = None +    # Modules, doc pages and labels don't point to description list tags but to tags like divs, +    # no special parsing can be done so we only try to include what's under them. +    if symbol_data.group in {"module", "doc", "label"}: +        description = _get_general_description(symbol_heading) + +    elif symbol_heading.name != "dt": +        # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags, +        # log info the tag can be looked at. 
+        description = _get_general_description(symbol_heading) + +    elif symbol_data.group in _NO_SIGNATURE_GROUPS: +        description = _get_dd_description(symbol_heading) + +    else: +        signature = _get_signatures(symbol_heading) +        description = _get_dd_description(symbol_heading) + +    return _parse_into_markdown(signature, description.replace('¶', ''), symbol_data.url) diff --git a/bot/converters.py b/bot/converters.py index 1358cbf1e..26b93120a 100644 --- a/bot/converters.py +++ b/bot/converters.py @@ -121,22 +121,20 @@ class ValidFilterListType(Converter):          return list_type -class ValidPythonIdentifier(Converter): +class PackageName(Converter):      """ -    A converter that checks whether the given string is a valid Python identifier. +    A converter that checks whether the given string is a valid package name. -    This is used to have package names that correspond to how you would use the package in your -    code, e.g. `import package`. - -    Raises `BadArgument` if the argument is not a valid Python identifier, and simply passes through -    the given argument otherwise. +    Package names are used for stats and are restricted to the a-z and _ characters.      
""" -    @staticmethod -    async def convert(ctx: Context, argument: str) -> str: -        """Checks whether the given string is a valid Python identifier.""" -        if not argument.isidentifier(): -            raise BadArgument(f"`{argument}` is not a valid Python identifier") +    PACKAGE_NAME_RE = re.compile(r"[^a-z_]") + +    @classmethod +    async def convert(cls, ctx: Context, argument: str) -> str: +        """Checks whether the given string is a valid package name.""" +        if cls.PACKAGE_NAME_RE.search(argument): +            raise BadArgument("The provided package name is not valid, please only use the _ and a-z characters.")          return argument diff --git a/tests/bot/test_converters.py b/tests/bot/test_converters.py index c42111f3f..231798a92 100644 --- a/tests/bot/test_converters.py +++ b/tests/bot/test_converters.py @@ -10,9 +10,9 @@ from bot.converters import (      Duration,      HushDurationConverter,      ISODateTime, +    PackageName,      TagContentConverter,      TagNameConverter, -    ValidPythonIdentifier,  ) @@ -78,24 +78,23 @@ class ConverterTests(unittest.IsolatedAsyncioTestCase):                  with self.assertRaisesRegex(BadArgument, re.escape(exception_message)):                      await TagNameConverter.convert(self.context, invalid_name) -    async def test_valid_python_identifier_for_valid(self): -        """ValidPythonIdentifier returns valid identifiers unchanged.""" -        test_values = ('foo', 'lemon') +    async def test_package_name_for_valid(self): +        """PackageName returns valid package names unchanged.""" +        test_values = ('foo', 'le_mon')          for name in test_values:              with self.subTest(identifier=name): -                conversion = await ValidPythonIdentifier.convert(self.context, name) +                conversion = await PackageName.convert(self.context, name)                  self.assertEqual(name, conversion) -    async def test_valid_python_identifier_for_invalid(self): -   
     """ValidPythonIdentifier raises the proper exception for invalid identifiers.""" -        test_values = ('nested.stuff', '#####') +    async def test_package_name_for_invalid(self): +        """PackageName raises the proper exception for invalid package names.""" +        test_values = ('text_with_a_dot.', 'UpperCaseName', "num83r")          for name in test_values:              with self.subTest(identifier=name): -                exception_message = f'`{name}` is not a valid Python identifier' -                with self.assertRaisesRegex(BadArgument, re.escape(exception_message)): -                    await ValidPythonIdentifier.convert(self.context, name) +                with self.assertRaises(BadArgument): +                    await PackageName.convert(self.context, name)      async def test_duration_converter_for_valid(self):          """Duration returns the correct `datetime` for valid duration strings.""" | 
