-rw-r--r--  LICENSE-THIRD-PARTY                    |  30
-rw-r--r--  Pipfile                                |   2
-rw-r--r--  bot/converters.py                      |  22
-rw-r--r--  bot/exts/info/doc.py                   | 512
-rw-r--r--  bot/exts/info/doc/__init__.py          |   7
-rw-r--r--  bot/exts/info/doc/_cog.py              | 457
-rw-r--r--  bot/exts/info/doc/_html.py             |  33
-rw-r--r--  bot/exts/info/doc/_inventory_parser.py | 120
-rw-r--r--  bot/exts/info/doc/_markdown.py         |  53
-rw-r--r--  bot/exts/info/doc/_parsing.py          | 313
-rw-r--r--  tests/bot/test_converters.py           |  21
11 files changed, 1033 insertions(+), 537 deletions(-)
diff --git a/LICENSE-THIRD-PARTY b/LICENSE-THIRD-PARTY
new file mode 100644
index 000000000..f78491fc1
--- /dev/null
+++ b/LICENSE-THIRD-PARTY
@@ -0,0 +1,30 @@
+License for Sphinx
+Applies to:
+    - bot/cogs/doc/inventory_parser.py: _load_v1, _load_v2 and ZlibStreamReader.__aiter__.
+==================
+
+Copyright (c) 2007-2020 by the Sphinx team (see AUTHORS file).
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/Pipfile b/Pipfile
@@ -22,9 +22,7 @@ markdownify = "~=0.4"
 more_itertools = "~=8.2"
 python-dateutil = "~=2.8"
 pyyaml = "~=5.1"
-requests = "~=2.22"
 sentry-sdk = "~=0.14"
-sphinx = "~=2.2"
 statsd = "~=3.3"
 
 [dev-packages]
diff --git a/bot/converters.py b/bot/converters.py
index 2e118d476..6c87a50fe 100644
--- a/bot/converters.py
+++ b/bot/converters.py
@@ -126,22 +126,20 @@ class ValidFilterListType(Converter):
         return list_type
 
 
-class ValidPythonIdentifier(Converter):
+class PackageName(Converter):
     """
-    A converter that checks whether the given string is a valid Python identifier.
+    A converter that checks whether the given string is a valid package name.
 
-    This is used to have package names that correspond to how you would use the package in your
-    code, e.g. `import package`.
-
-    Raises `BadArgument` if the argument is not a valid Python identifier, and simply passes through
-    the given argument otherwise.
+    Package names are used for stats and are restricted to the a-z and _ characters.
""" - @staticmethod - async def convert(ctx: Context, argument: str) -> str: - """Checks whether the given string is a valid Python identifier.""" - if not argument.isidentifier(): - raise BadArgument(f"`{argument}` is not a valid Python identifier") + PACKAGE_NAME_RE = re.compile(r"[^a-z_]") + + @classmethod + async def convert(cls, ctx: Context, argument: str) -> str: + """Checks whether the given string is a valid package name.""" + if cls.PACKAGE_NAME_RE.search(argument): + raise BadArgument("The provided package name is not valid, please only use the _ and a-z characters.") return argument diff --git a/bot/exts/info/doc.py b/bot/exts/info/doc.py deleted file mode 100644 index c16a99225..000000000 --- a/bot/exts/info/doc.py +++ /dev/null @@ -1,512 +0,0 @@ -import asyncio -import functools -import logging -import re -import textwrap -from collections import OrderedDict -from contextlib import suppress -from types import SimpleNamespace -from typing import Any, Callable, Optional, Tuple - -import discord -from bs4 import BeautifulSoup -from bs4.element import PageElement, Tag -from discord.errors import NotFound -from discord.ext import commands -from markdownify import MarkdownConverter -from requests import ConnectTimeout, ConnectionError, HTTPError -from sphinx.ext import intersphinx -from urllib3.exceptions import ProtocolError - -from bot.bot import Bot -from bot.constants import MODERATION_ROLES, RedirectOutput -from bot.converters import ValidPythonIdentifier, ValidURL -from bot.pagination import LinePaginator -from bot.utils.messages import wait_for_deletion - - -log = logging.getLogger(__name__) -logging.getLogger('urllib3').setLevel(logging.WARNING) - -# Since Intersphinx is intended to be used with Sphinx, -# we need to mock its configuration. -SPHINX_MOCK_APP = SimpleNamespace( - config=SimpleNamespace( - intersphinx_timeout=3, - tls_verify=True, - user_agent="python3:python-discord/bot:1.0.0" - ) -) - -NO_OVERRIDE_GROUPS = ( - "2to3fixer", - "token", - "label", - "pdbcommand", - "term", -) -NO_OVERRIDE_PACKAGES = ( - "python", -) - -SEARCH_END_TAG_ATTRS = ( - "data", - "function", - "class", - "exception", - "seealso", - "section", - "rubric", - "sphinxsidebar", -) -UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") -WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") - -FAILED_REQUEST_RETRY_AMOUNT = 3 -NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay - - -def async_cache(max_size: int = 128, arg_offset: int = 0) -> Callable: - """ - LRU cache implementation for coroutines. - - Once the cache exceeds the maximum size, keys are deleted in FIFO order. - - An offset may be optionally provided to be applied to the coroutine's arguments when creating the cache key. - """ - # Assign the cache to the function itself so we can clear it from outside. 
- async_cache.cache = OrderedDict() - - def decorator(function: Callable) -> Callable: - """Define the async_cache decorator.""" - @functools.wraps(function) - async def wrapper(*args) -> Any: - """Decorator wrapper for the caching logic.""" - key = ':'.join(args[arg_offset:]) - - value = async_cache.cache.get(key) - if value is None: - if len(async_cache.cache) > max_size: - async_cache.cache.popitem(last=False) - - async_cache.cache[key] = await function(*args) - return async_cache.cache[key] - return wrapper - return decorator - - -class DocMarkdownConverter(MarkdownConverter): - """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" - - def convert_code(self, el: PageElement, text: str) -> str: - """Undo `markdownify`s underscore escaping.""" - return f"`{text}`".replace('\\', '') - - def convert_pre(self, el: PageElement, text: str) -> str: - """Wrap any codeblocks in `py` for syntax highlighting.""" - code = ''.join(el.strings) - return f"```py\n{code}```" - - -def markdownify(html: str) -> DocMarkdownConverter: - """Create a DocMarkdownConverter object from the input html.""" - return DocMarkdownConverter(bullets='•').convert(html) - - -class InventoryURL(commands.Converter): - """ - Represents an Intersphinx inventory URL. - - This converter checks whether intersphinx accepts the given inventory URL, and raises - `BadArgument` if that is not the case. - - Otherwise, it simply passes through the given URL. - """ - - @staticmethod - async def convert(ctx: commands.Context, url: str) -> str: - """Convert url to Intersphinx inventory URL.""" - try: - intersphinx.fetch_inventory(SPHINX_MOCK_APP, '', url) - except AttributeError: - raise commands.BadArgument(f"Failed to fetch Intersphinx inventory from URL `{url}`.") - except ConnectionError: - if url.startswith('https'): - raise commands.BadArgument( - f"Cannot establish a connection to `{url}`. Does it support HTTPS?" - ) - raise commands.BadArgument(f"Cannot connect to host with URL `{url}`.") - except ValueError: - raise commands.BadArgument( - f"Failed to read Intersphinx inventory from URL `{url}`. " - "Are you sure that it's a valid inventory file?" - ) - return url - - -class Doc(commands.Cog): - """A set of commands for querying & displaying documentation.""" - - def __init__(self, bot: Bot): - self.base_urls = {} - self.bot = bot - self.inventories = {} - self.renamed_symbols = set() - - self.bot.loop.create_task(self.init_refresh_inventory()) - - async def init_refresh_inventory(self) -> None: - """Refresh documentation inventory on cog initialization.""" - await self.bot.wait_until_guild_available() - await self.refresh_inventory() - - async def update_single( - self, package_name: str, base_url: str, inventory_url: str - ) -> None: - """ - Rebuild the inventory for a single package. 
- - Where: - * `package_name` is the package name to use, appears in the log - * `base_url` is the root documentation URL for the specified package, used to build - absolute paths that link to specific symbols - * `inventory_url` is the absolute URL to the intersphinx inventory, fetched by running - `intersphinx.fetch_inventory` in an executor on the bot's event loop - """ - self.base_urls[package_name] = base_url - - package = await self._fetch_inventory(inventory_url) - if not package: - return None - - for group, value in package.items(): - for symbol, (package_name, _version, relative_doc_url, _) in value.items(): - absolute_doc_url = base_url + relative_doc_url - - if symbol in self.inventories: - group_name = group.split(":")[1] - symbol_base_url = self.inventories[symbol].split("/", 3)[2] - if ( - group_name in NO_OVERRIDE_GROUPS - or any(package in symbol_base_url for package in NO_OVERRIDE_PACKAGES) - ): - - symbol = f"{group_name}.{symbol}" - # If renamed `symbol` already exists, add library name in front to differentiate between them. - if symbol in self.renamed_symbols: - # Split `package_name` because of packages like Pillow that have spaces in them. - symbol = f"{package_name.split()[0]}.{symbol}" - - self.inventories[symbol] = absolute_doc_url - self.renamed_symbols.add(symbol) - continue - - self.inventories[symbol] = absolute_doc_url - - log.trace(f"Fetched inventory for {package_name}.") - - async def refresh_inventory(self) -> None: - """Refresh internal documentation inventory.""" - log.debug("Refreshing documentation inventory...") - - # Clear the old base URLS and inventories to ensure - # that we start from a fresh local dataset. - # Also, reset the cache used for fetching documentation. - self.base_urls.clear() - self.inventories.clear() - self.renamed_symbols.clear() - async_cache.cache = OrderedDict() - - # Run all coroutines concurrently - since each of them performs a HTTP - # request, this speeds up fetching the inventory data heavily. - coros = [ - self.update_single( - package["package"], package["base_url"], package["inventory_url"] - ) for package in await self.bot.api_client.get('bot/documentation-links') - ] - await asyncio.gather(*coros) - - async def get_symbol_html(self, symbol: str) -> Optional[Tuple[list, str]]: - """ - Given a Python symbol, return its signature and description. - - The first tuple element is the signature of the given symbol as a markup-free string, and - the second tuple element is the description of the given symbol with HTML markup included. - - If the given symbol is a module, returns a tuple `(None, str)` - else if the symbol could not be found, returns `None`. - """ - url = self.inventories.get(symbol) - if url is None: - return None - - async with self.bot.http_session.get(url) as response: - html = await response.text(encoding='utf-8') - - # Find the signature header and parse the relevant parts. 
- symbol_id = url.split('#')[-1] - soup = BeautifulSoup(html, 'lxml') - symbol_heading = soup.find(id=symbol_id) - search_html = str(soup) - - if symbol_heading is None: - return None - - if symbol_id == f"module-{symbol}": - # Get page content from the module headerlink to the - # first tag that has its class in `SEARCH_END_TAG_ATTRS` - start_tag = symbol_heading.find("a", attrs={"class": "headerlink"}) - if start_tag is None: - return [], "" - - end_tag = start_tag.find_next(self._match_end_tag) - if end_tag is None: - return [], "" - - description_start_index = search_html.find(str(start_tag.parent)) + len(str(start_tag.parent)) - description_end_index = search_html.find(str(end_tag)) - description = search_html[description_start_index:description_end_index] - signatures = None - - else: - signatures = [] - description = str(symbol_heading.find_next_sibling("dd")) - description_pos = search_html.find(description) - # Get text of up to 3 signatures, remove unwanted symbols - for element in [symbol_heading] + symbol_heading.find_next_siblings("dt", limit=2): - signature = UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) - if signature and search_html.find(str(element)) < description_pos: - signatures.append(signature) - - return signatures, description.replace('¶', '') - - @async_cache(arg_offset=1) - async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: - """ - Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. - - If the symbol is known, an Embed with documentation about it is returned. - """ - scraped_html = await self.get_symbol_html(symbol) - if scraped_html is None: - return None - - signatures = scraped_html[0] - permalink = self.inventories[symbol] - description = markdownify(scraped_html[1]) - - # Truncate the description of the embed to the last occurrence - # of a double newline (interpreted as a paragraph) before index 1000. - if len(description) > 1000: - shortened = description[:1000] - description_cutoff = shortened.rfind('\n\n', 100) - if description_cutoff == -1: - # Search the shortened version for cutoff points in decreasing desirability, - # cutoff at 1000 if none are found. - for string in (". ", ", ", ",", " "): - description_cutoff = shortened.rfind(string) - if description_cutoff != -1: - break - else: - description_cutoff = 1000 - description = description[:description_cutoff] - - # If there is an incomplete code block, cut it out - if description.count("```") % 2: - codeblock_start = description.rfind('```py') - description = description[:codeblock_start].rstrip() - description += f"... [read more]({permalink})" - - description = WHITESPACE_AFTER_NEWLINES_RE.sub('', description) - if signatures is None: - # If symbol is a module, don't show signature. - embed_description = description - - elif not signatures: - # It's some "meta-page", for example: - # https://docs.djangoproject.com/en/dev/ref/views/#module-django.views - embed_description = "This appears to be a generic page not tied to a specific symbol." - - else: - embed_description = "".join(f"```py\n{textwrap.shorten(signature, 500)}```" for signature in signatures) - embed_description += f"\n{description}" - - embed = discord.Embed( - title=f'`{symbol}`', - url=permalink, - description=embed_description - ) - # Show all symbols with the same name that were renamed in the footer. 
- embed.set_footer( - text=", ".join(renamed for renamed in self.renamed_symbols - {symbol} if renamed.endswith(f".{symbol}")) - ) - return embed - - @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) - async def docs_group(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: - """Lookup documentation for Python symbols.""" - await self.get_command(ctx, symbol) - - @docs_group.command(name='get', aliases=('g',)) - async def get_command(self, ctx: commands.Context, symbol: commands.clean_content = None) -> None: - """ - Return a documentation embed for a given symbol. - - If no symbol is given, return a list of all available inventories. - - Examples: - !docs - !docs aiohttp - !docs aiohttp.ClientSession - !docs get aiohttp.ClientSession - """ - if symbol is None: - inventory_embed = discord.Embed( - title=f"All inventories (`{len(self.base_urls)}` total)", - colour=discord.Colour.blue() - ) - - lines = sorted(f"• [`{name}`]({url})" for name, url in self.base_urls.items()) - if self.base_urls: - await LinePaginator.paginate(lines, ctx, inventory_embed, max_size=400, empty=False) - - else: - inventory_embed.description = "Hmmm, seems like there's nothing here yet." - await ctx.send(embed=inventory_embed) - - else: - # Fetching documentation for a symbol (at least for the first time, since - # caching is used) takes quite some time, so let's send typing to indicate - # that we got the command, but are still working on it. - async with ctx.typing(): - doc_embed = await self.get_symbol_embed(symbol) - - if doc_embed is None: - error_embed = discord.Embed( - description=f"Sorry, I could not find any documentation for `{symbol}`.", - colour=discord.Colour.red() - ) - error_message = await ctx.send(embed=error_embed) - with suppress(NotFound): - await error_message.delete(delay=NOT_FOUND_DELETE_DELAY) - await ctx.message.delete(delay=NOT_FOUND_DELETE_DELAY) - else: - msg = await ctx.send(embed=doc_embed) - await wait_for_deletion(msg, (ctx.author.id,), client=self.bot) - - @docs_group.command(name='set', aliases=('s',)) - @commands.has_any_role(*MODERATION_ROLES) - async def set_command( - self, ctx: commands.Context, package_name: ValidPythonIdentifier, - base_url: ValidURL, inventory_url: InventoryURL - ) -> None: - """ - Adds a new documentation metadata object to the site's database. - - The database will update the object, should an existing item with the specified `package_name` already exist. - - Example: - !docs set \ - python \ - https://docs.python.org/3/ \ - https://docs.python.org/3/objects.inv - """ - body = { - 'package': package_name, - 'base_url': base_url, - 'inventory_url': inventory_url - } - await self.bot.api_client.post('bot/documentation-links', json=body) - - log.info( - f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n" - f"Package name: {package_name}\n" - f"Base url: {base_url}\n" - f"Inventory URL: {inventory_url}" - ) - - # Rebuilding the inventory can take some time, so lets send out a - # typing event to show that the Bot is still working. - async with ctx.typing(): - await self.refresh_inventory() - await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") - - @docs_group.command(name='delete', aliases=('remove', 'rm', 'd')) - @commands.has_any_role(*MODERATION_ROLES) - async def delete_command(self, ctx: commands.Context, package_name: ValidPythonIdentifier) -> None: - """ - Removes the specified package from the database. 
- - Examples: - !docs delete aiohttp - """ - await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') - - async with ctx.typing(): - # Rebuild the inventory to ensure that everything - # that was from this package is properly deleted. - await self.refresh_inventory() - await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") - - @docs_group.command(name="refresh", aliases=("rfsh", "r")) - @commands.has_any_role(*MODERATION_ROLES) - async def refresh_command(self, ctx: commands.Context) -> None: - """Refresh inventories and send differences to channel.""" - old_inventories = set(self.base_urls) - with ctx.typing(): - await self.refresh_inventory() - # Get differences of added and removed inventories - added = ', '.join(inv for inv in self.base_urls if inv not in old_inventories) - if added: - added = f"+ {added}" - - removed = ', '.join(inv for inv in old_inventories if inv not in self.base_urls) - if removed: - removed = f"- {removed}" - - embed = discord.Embed( - title="Inventories refreshed", - description=f"```diff\n{added}\n{removed}```" if added or removed else "" - ) - await ctx.send(embed=embed) - - async def _fetch_inventory(self, inventory_url: str) -> Optional[dict]: - """Get and return inventory from `inventory_url`. If fetching fails, return None.""" - fetch_func = functools.partial(intersphinx.fetch_inventory, SPHINX_MOCK_APP, '', inventory_url) - for retry in range(1, FAILED_REQUEST_RETRY_AMOUNT+1): - try: - package = await self.bot.loop.run_in_executor(None, fetch_func) - except ConnectTimeout: - log.error( - f"Fetching of inventory {inventory_url} timed out," - f" trying again. ({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" - ) - except ProtocolError: - log.error( - f"Connection lost while fetching inventory {inventory_url}," - f" trying again. 
({retry}/{FAILED_REQUEST_RETRY_AMOUNT})" - ) - except HTTPError as e: - log.error(f"Fetching of inventory {inventory_url} failed with status code {e.response.status_code}.") - return None - except ConnectionError: - log.error(f"Couldn't establish connection to inventory {inventory_url}.") - return None - else: - return package - log.error(f"Fetching of inventory {inventory_url} failed.") - return None - - @staticmethod - def _match_end_tag(tag: Tag) -> bool: - """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" - for attr in SEARCH_END_TAG_ATTRS: - if attr in tag.get("class", ()): - return True - - return tag.name == "table" - - -def setup(bot: Bot) -> None: - """Load the Doc cog.""" - bot.add_cog(Doc(bot)) diff --git a/bot/exts/info/doc/__init__.py b/bot/exts/info/doc/__init__.py new file mode 100644 index 000000000..e9eb9428c --- /dev/null +++ b/bot/exts/info/doc/__init__.py @@ -0,0 +1,7 @@ +from bot.bot import Bot +from ._cog import DocCog + + +def setup(bot: Bot) -> None: + """Load the Doc cog.""" + bot.add_cog(DocCog(bot)) diff --git a/bot/exts/info/doc/_cog.py b/bot/exts/info/doc/_cog.py new file mode 100644 index 000000000..257435e95 --- /dev/null +++ b/bot/exts/info/doc/_cog.py @@ -0,0 +1,457 @@ +from __future__ import annotations + +import asyncio +import logging +import re +import sys +from collections import defaultdict +from contextlib import suppress +from typing import Dict, List, NamedTuple, Optional, Union + +import discord +from aiohttp import ClientSession +from bs4 import BeautifulSoup +from discord.ext import commands + +from bot.bot import Bot +from bot.constants import MODERATION_ROLES, RedirectOutput +from bot.converters import PackageName, ValidURL +from bot.pagination import LinePaginator +from bot.utils.messages import wait_for_deletion +from bot.utils.scheduling import Scheduler +from ._inventory_parser import FAILED_REQUEST_ATTEMPTS, fetch_inventory +from ._parsing import get_symbol_markdown + +log = logging.getLogger(__name__) + +NO_OVERRIDE_GROUPS = ( + "2to3fixer", + "token", + "label", + "pdbcommand", + "term", +) +NO_OVERRIDE_PACKAGES = ( + "python", +) + +WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") +NOT_FOUND_DELETE_DELAY = RedirectOutput.delete_delay + + +class DocItem(NamedTuple): + """Holds inventory symbol information.""" + + package: str + group: str + base_url: str + relative_url_path: str + symbol_id: str + + @property + def url(self) -> str: + """Return the absolute url to the symbol.""" + return "".join((self.base_url, self.relative_url_path)) + + +class QueueItem(NamedTuple): + """Contains a symbol and the BeautifulSoup object needed to parse it.""" + + symbol: DocItem + soup: BeautifulSoup + + def __eq__(self, other: Union[QueueItem, DocItem]): + if isinstance(other, DocItem): + return self.symbol == other + return NamedTuple.__eq__(self, other) + + +class CachedParser: + """ + Get symbol markdown from pages with smarter caching. + + DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict. + `get_markdown` is used to fetch the markdown; when this is used for the first time on a page, + all of the symbols are queued to be parsed to avoid multiple web requests to the same page. 
+ """ + + def __init__(self): + self._queue: List[QueueItem] = [] + self._results = {} + self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list) + self._item_events: Dict[DocItem, asyncio.Event] = {} + self._parse_task = None + + async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str: + """ + Get result markdown of `doc_item`. + + If no symbols were fetched from `doc_item`s page before, + the HTML has to be fetched before parsing can be queued. + """ + if (symbol := self._results.get(doc_item)) is not None: + return symbol + + if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None: + async with client_session.get(doc_item.url) as response: + soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml") + + self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue) + del self._page_symbols[doc_item.url] + log.debug(f"Added symbols from {doc_item.url} to parse queue.") + + if self._parse_task is None: + self._parse_task = asyncio.create_task(self._parse_queue()) + + self._move_to_front(doc_item) + self._item_events[doc_item] = item_event = asyncio.Event() + await item_event.wait() + return self._results[doc_item] + + async def _parse_queue(self) -> None: + """ + Parse all item from the queue, setting associated events for symbols if present. + + The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished. + """ + log.trace("Starting queue parsing.") + while self._queue: + item, soup = self._queue.pop() + try: + self._results[item] = get_symbol_markdown(soup, item) + except Exception: + log.exception(f"Unexpected error when handling {item}") + else: + if (event := self._item_events.get(item)) is not None: + event.set() + await asyncio.sleep(0.1) + + self._parse_task = None + log.trace("Finished parsing queue.") + + def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: + """Move `item` to the front of the parse queue.""" + # The parse queue stores soups along with the doc symbols in QueueItem objects, + # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. + item_index = self._queue.index(item) + queue_item = self._queue[item_index] + + del self._queue[item_index] + self._queue.append(queue_item) + + def add_item(self, doc_item: DocItem) -> None: + """Add a DocItem to `_page_symbols`.""" + self._page_symbols[doc_item.url].append(doc_item) + + async def clear(self) -> None: + """ + Clear all internal symbol data. + + All currently requested items are waited to be parsed before clearing. + """ + for event in self._item_events.values(): + await event.wait() + if self._parse_task is not None: + self._parse_task.cancel() + self._parse_task = None + self._queue.clear() + self._results.clear() + self._page_symbols.clear() + self._item_events.clear() + + +class InventoryURL(commands.Converter): + """ + Represents an Intersphinx inventory URL. + + This converter checks whether intersphinx accepts the given inventory URL, and raises + `BadArgument` if that is not the case. + + Otherwise, it simply passes through the given URL. 
+ """ + + @staticmethod + async def convert(ctx: commands.Context, url: str) -> str: + """Convert url to Intersphinx inventory URL.""" + await ctx.trigger_typing() + if await fetch_inventory(ctx.bot.http_session, url) is None: + raise commands.BadArgument(f"Failed to fetch inventory file after {FAILED_REQUEST_ATTEMPTS}.") + return url + + +class DocCog(commands.Cog): + """A set of commands for querying & displaying documentation.""" + + def __init__(self, bot: Bot): + self.base_urls = {} + self.bot = bot + self.doc_symbols: Dict[str, DocItem] = {} + self.item_fetcher = CachedParser() + self.renamed_symbols = set() + + self.inventory_scheduler = Scheduler(self.__class__.__name__) + self.scheduled_inventories = set() + + self.bot.loop.create_task(self.init_refresh_inventory()) + + async def init_refresh_inventory(self) -> None: + """Refresh documentation inventory on cog initialization.""" + await self.bot.wait_until_guild_available() + await self.refresh_inventory() + + async def update_single( + self, api_package_name: str, base_url: str, inventory_url: str + ) -> bool: + """ + Rebuild the inventory for a single package. + + Where: + * `package_name` is the package name to use, appears in the log + * `base_url` is the root documentation URL for the specified package, used to build + absolute paths that link to specific symbols + * `inventory_url` is the absolute URL to the intersphinx inventory. + + If the inventory file is currently unreachable, + the update is rescheduled to execute in 2 minutes on the first attempt, and 5 minutes on subsequent attempts. + + Return True on success; False if fetching failed and was rescheduled. + """ + self.base_urls[api_package_name] = base_url + package = await fetch_inventory(self.bot.http_session, inventory_url) + + if not package: + delay = 2*60 if inventory_url not in self.scheduled_inventories else 5*60 + log.info(f"Failed to fetch inventory, attempting again in {delay//60} minutes.") + self.inventory_scheduler.schedule_later( + delay, + api_package_name, + fetch_inventory(self.bot.http_session, inventory_url) + ) + self.scheduled_inventories.add(api_package_name) + return False + with suppress(KeyError): + self.scheduled_inventories.discard(api_package_name) + + for group, items in package.items(): + for symbol, relative_doc_url in items: + if "/" in symbol: + continue # skip unreachable symbols with slashes + # Intern the group names since they're reused in all the DocItems + # to remove unnecessary memory consumption from them being unique objects + group_name = sys.intern(group.split(":")[1]) + + if (original_symbol := self.doc_symbols.get(symbol)) is not None: + if ( + group_name in NO_OVERRIDE_GROUPS + or any(package == original_symbol.package for package in NO_OVERRIDE_PACKAGES) + ): + symbol = f"{group_name}.{symbol}" + self.renamed_symbols.add(symbol) + + elif (overridden_symbol_group := original_symbol.group) in NO_OVERRIDE_GROUPS: + overridden_symbol = f"{overridden_symbol_group}.{symbol}" + if overridden_symbol in self.renamed_symbols: + overridden_symbol = f"{api_package_name}.{overridden_symbol}" + + self.doc_symbols[overridden_symbol] = original_symbol + self.renamed_symbols.add(overridden_symbol) + + # If renamed `symbol` already exists, add library name in front to differentiate between them. 
+ elif symbol in self.renamed_symbols: + symbol = f"{api_package_name}.{symbol}" + self.renamed_symbols.add(symbol) + + relative_url_path, _, symbol_id = relative_doc_url.partition("#") + symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id) + self.doc_symbols[symbol] = symbol_item + self.item_fetcher.add_item(symbol_item) + + log.trace(f"Fetched inventory for {api_package_name}.") + return True + + async def refresh_inventory(self) -> None: + """Refresh internal documentation inventory.""" + log.debug("Refreshing documentation inventory...") + + # Clear the old base URLS and doc symbols to ensure + # that we start from a fresh local dataset. + # Also, reset the cache used for fetching documentation. + self.base_urls.clear() + self.doc_symbols.clear() + self.renamed_symbols.clear() + self.scheduled_inventories.clear() + await self.item_fetcher.clear() + + # Run all coroutines concurrently - since each of them performs a HTTP + # request, this speeds up fetching the inventory data heavily. + coros = [ + self.update_single( + package["package"], package["base_url"], package["inventory_url"] + ) for package in await self.bot.api_client.get('bot/documentation-links') + ] + await asyncio.gather(*coros) + + async def get_symbol_embed(self, symbol: str) -> Optional[discord.Embed]: + """ + Attempt to scrape and fetch the data for the given `symbol`, and build an embed from its contents. + + If the symbol is known, an Embed with documentation about it is returned. + """ + symbol_info = self.doc_symbols.get(symbol) + if symbol_info is None: + return None + self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") + + embed = discord.Embed( + title=discord.utils.escape_markdown(symbol), + url=f"{symbol_info.url}#{symbol_info.symbol_id}", + description=await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) + ) + # Show all symbols with the same name that were renamed in the footer. + embed.set_footer( + text=", ".join(renamed for renamed in self.renamed_symbols - {symbol} if renamed.endswith(f".{symbol}")) + ) + return embed + + @commands.group(name='docs', aliases=('doc', 'd'), invoke_without_command=True) + async def docs_group(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: + """Lookup documentation for Python symbols.""" + await ctx.invoke(self.get_command, symbol=symbol) + + @docs_group.command(name='getdoc', aliases=('g',)) + async def get_command(self, ctx: commands.Context, *, symbol: Optional[str]) -> None: + """ + Return a documentation embed for a given symbol. + + If no symbol is given, return a list of all available inventories. + + Examples: + !docs + !docs aiohttp + !docs aiohttp.ClientSession + !docs getdoc aiohttp.ClientSession + """ + if not symbol: + inventory_embed = discord.Embed( + title=f"All inventories (`{len(self.base_urls)}` total)", + colour=discord.Colour.blue() + ) + + lines = sorted(f"• [`{name}`]({url})" for name, url in self.base_urls.items()) + if self.base_urls: + await LinePaginator.paginate(lines, ctx, inventory_embed, max_size=400, empty=False) + + else: + inventory_embed.description = "Hmmm, seems like there's nothing here yet." + await ctx.send(embed=inventory_embed) + + else: + symbol = symbol.strip("`") + # Fetching documentation for a symbol (at least for the first time, since + # caching is used) takes quite some time, so let's send typing to indicate + # that we got the command, but are still working on it. 
+ async with ctx.typing(): + doc_embed = await self.get_symbol_embed(symbol) + + if doc_embed is None: + symbol = await discord.ext.commands.clean_content().convert(ctx, symbol) + error_embed = discord.Embed( + description=f"Sorry, I could not find any documentation for `{(symbol)}`.", + colour=discord.Colour.red() + ) + error_message = await ctx.send(embed=error_embed) + await wait_for_deletion( + error_message, + (ctx.author.id,), + timeout=NOT_FOUND_DELETE_DELAY, + client=self.bot + ) + with suppress(discord.NotFound): + await ctx.message.delete() + with suppress(discord.NotFound): + await error_message.delete() + else: + msg = await ctx.send(embed=doc_embed) + await wait_for_deletion(msg, (ctx.author.id,), client=self.bot) + + @docs_group.command(name='setdoc', aliases=('s',)) + @commands.has_any_role(*MODERATION_ROLES) + async def set_command( + self, ctx: commands.Context, package_name: PackageName, + base_url: ValidURL, inventory_url: InventoryURL + ) -> None: + """ + Adds a new documentation metadata object to the site's database. + + The database will update the object, should an existing item with the specified `package_name` already exist. + + Example: + !docs setdoc \ + python \ + https://docs.python.org/3/ \ + https://docs.python.org/3/objects.inv + """ + body = { + 'package': package_name, + 'base_url': base_url, + 'inventory_url': inventory_url + } + await self.bot.api_client.post('bot/documentation-links', json=body) + + log.info( + f"User @{ctx.author} ({ctx.author.id}) added a new documentation package:\n" + f"Package name: {package_name}\n" + f"Base url: {base_url}\n" + f"Inventory URL: {inventory_url}" + ) + + if await self.update_single(package_name, base_url, inventory_url) is None: + await ctx.send( + f"Added package `{package_name}` to database but failed to fetch inventory; rescheduled in 2 minutes." + ) + return + await ctx.send(f"Added package `{package_name}` to database and refreshed inventory.") + + @docs_group.command(name='deletedoc', aliases=('removedoc', 'rm', 'd')) + @commands.has_any_role(*MODERATION_ROLES) + async def delete_command(self, ctx: commands.Context, package_name: PackageName) -> None: + """ + Removes the specified package from the database. + + Examples: + !docs deletedoc aiohttp + """ + await self.bot.api_client.delete(f'bot/documentation-links/{package_name}') + + if package_name in self.scheduled_inventories: + self.inventory_scheduler.cancel(package_name) + + async with ctx.typing(): + # Rebuild the inventory to ensure that everything + # that was from this package is properly deleted. 
+ await self.refresh_inventory() + await ctx.send(f"Successfully deleted `{package_name}` and refreshed inventory.") + + @docs_group.command(name="refreshdoc", aliases=("rfsh", "r")) + @commands.has_any_role(*MODERATION_ROLES) + async def refresh_command(self, ctx: commands.Context) -> None: + """Refresh inventories and send differences to channel.""" + for inventory in self.scheduled_inventories: + self.inventory_scheduler.cancel(inventory) + + old_inventories = set(self.base_urls) + with ctx.typing(): + await self.refresh_inventory() + new_inventories = set(self.base_urls) + + if added := ", ".join(new_inventories - old_inventories): + added = "+ " + added + + if removed := ", ".join(old_inventories - new_inventories): + removed = "- " + removed + + embed = discord.Embed( + title="Inventories refreshed", + description=f"```diff\n{added}\n{removed}```" if added or removed else "" + ) + await ctx.send(embed=embed) diff --git a/bot/exts/info/doc/_html.py b/bot/exts/info/doc/_html.py new file mode 100644 index 000000000..bc705130d --- /dev/null +++ b/bot/exts/info/doc/_html.py @@ -0,0 +1,33 @@ +from collections.abc import Iterable +from typing import List, Union + +from bs4.element import NavigableString, PageElement, SoupStrainer, Tag + + +class Strainer(SoupStrainer): + """Subclass of SoupStrainer to allow matching of both `Tag`s and `NavigableString`s.""" + + def __init__(self, *, include_strings: bool, **kwargs): + self.include_strings = include_strings + super().__init__(**kwargs) + + markup_hint = Union[PageElement, List["markup_hint"]] + + def search(self, markup: markup_hint) -> Union[PageElement, str]: + """Extend default SoupStrainer behaviour to allow matching both `Tag`s` and `NavigableString`s.""" + if isinstance(markup, Iterable) and not isinstance(markup, (Tag, str)): + for element in markup: + if isinstance(element, NavigableString) and self.search(element): + return element + elif isinstance(markup, Tag): + # Also include tags while we're searching for strings and tags. + if self.include_strings or (not self.text or self.name or self.attrs): + return self.search_tag(markup) + + elif isinstance(markup, str): + # Let everything through the text filter if we're including strings and tags. 
+ text_filter = None if not self.include_strings else True + if not self.name and not self.attrs and self._matches(markup, text_filter): + return markup + else: + raise Exception(f"I don't know how to match against a {markup.__class__}") diff --git a/bot/exts/info/doc/_inventory_parser.py b/bot/exts/info/doc/_inventory_parser.py new file mode 100644 index 000000000..23931869b --- /dev/null +++ b/bot/exts/info/doc/_inventory_parser.py @@ -0,0 +1,120 @@ +import logging +import re +import zlib +from collections import defaultdict +from typing import AsyncIterator, DefaultDict, List, Optional, Tuple + +import aiohttp + +log = logging.getLogger(__name__) + +FAILED_REQUEST_ATTEMPTS = 3 +_V2_LINE_RE = re.compile(r'(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+?(\S*)\s+(.*)') + + +class ZlibStreamReader: + """Class used for decoding zlib data of a stream line by line.""" + + READ_CHUNK_SIZE = 16 * 1024 + + def __init__(self, stream: aiohttp.StreamReader) -> None: + self.stream = stream + + async def _read_compressed_chunks(self) -> AsyncIterator[bytes]: + """Read zlib data in `READ_CHUNK_SIZE` sized chunks and decompress.""" + decompressor = zlib.decompressobj() + async for chunk in self.stream.iter_chunked(self.READ_CHUNK_SIZE): + yield decompressor.decompress(chunk) + + yield decompressor.flush() + + async def __aiter__(self) -> AsyncIterator[str]: + """Yield lines of decompressed text.""" + buf = b'' + async for chunk in self._read_compressed_chunks(): + buf += chunk + pos = buf.find(b'\n') + while pos != -1: + yield buf[:pos].decode() + buf = buf[pos + 1:] + pos = buf.find(b'\n') + + +async def _load_v1(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: + invdata = defaultdict(list) + + async for line in stream: + name, type_, location = line.decode().rstrip().split(maxsplit=2) + # version 1 did not add anchors to the location + if type_ == 'mod': + type_ = 'py:module' + location += '#module-' + name + else: + type_ = 'py:' + type_ + location += '#' + name + invdata[type_].append((name, location)) + return invdata + + +async def _load_v2(stream: aiohttp.StreamReader) -> DefaultDict[str, List[Tuple[str, str]]]: + invdata = defaultdict(list) + + async for line in ZlibStreamReader(stream): + m = _V2_LINE_RE.match(line.rstrip()) + name, type_, _prio, location, _dispname = m.groups() # ignore the parsed items we don't need + if location.endswith('$'): + location = location[:-1] + name + + invdata[type_].append((name, location)) + return invdata + + +async def _fetch_inventory(client_session: aiohttp.ClientSession, url: str) -> DefaultDict[str, List[Tuple[str, str]]]: + """Fetch, parse and return an intersphinx inventory file from an url.""" + timeout = aiohttp.ClientTimeout(sock_connect=5, sock_read=5) + async with client_session.get(url, timeout=timeout, raise_for_status=True) as response: + stream = response.content + + inventory_header = (await stream.readline()).decode().rstrip() + inventory_version = int(inventory_header[-1:]) + await stream.readline() # skip project name + await stream.readline() # skip project version + + if inventory_version == 1: + return await _load_v1(stream) + + elif inventory_version == 2: + if b"zlib" not in await stream.readline(): + raise ValueError(f"Invalid inventory file at url {url}.") + return await _load_v2(stream) + + raise ValueError(f"Invalid inventory file at url {url}.") + + +async def fetch_inventory( + client_session: aiohttp.ClientSession, + url: str +) -> Optional[DefaultDict[str, List[Tuple[str, str]]]]: + """Get inventory from `url`, 
retrying `FAILED_REQUEST_ATTEMPTS` times on errors.""" + for attempt in range(1, FAILED_REQUEST_ATTEMPTS+1): + try: + inventory = await _fetch_inventory(client_session, url) + except aiohttp.ClientConnectorError: + log.warning( + f"Failed to connect to inventory url at {url}, " + f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." + ) + except aiohttp.ClientError: + log.error( + f"Failed to get inventory from {url}, " + f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." + ) + except Exception: + log.exception( + f"An unexpected error has occurred during fetching of {url}, " + f"trying again ({attempt}/{FAILED_REQUEST_ATTEMPTS})." + ) + else: + return inventory + + return None diff --git a/bot/exts/info/doc/_markdown.py b/bot/exts/info/doc/_markdown.py new file mode 100644 index 000000000..ba35a84c4 --- /dev/null +++ b/bot/exts/info/doc/_markdown.py @@ -0,0 +1,53 @@ +from urllib.parse import urljoin + +from bs4.element import PageElement +from markdownify import MarkdownConverter + + +class DocMarkdownConverter(MarkdownConverter): + """Subclass markdownify's MarkdownCoverter to provide custom conversion methods.""" + + def __init__(self, *, page_url: str, **options): + super().__init__(**options) + self.page_url = page_url + + def convert_li(self, el: PageElement, text: str) -> str: + """Fix markdownify's erroneous indexing in ol tags.""" + parent = el.parent + if parent is not None and parent.name == "ol": + li_tags = parent.find_all("li") + bullet = f"{li_tags.index(el)+1}." + else: + depth = -1 + while el: + if el.name == "ul": + depth += 1 + el = el.parent + bullets = self.options["bullets"] + bullet = bullets[depth % len(bullets)] + return f"{bullet} {text}\n" + + def convert_hn(self, _n: int, el: PageElement, text: str) -> str: + """Convert h tags to bold text with ** instead of adding #.""" + return f"**{text}**\n\n" + + def convert_code(self, el: PageElement, text: str) -> str: + """Undo `markdownify`s underscore escaping.""" + return f"`{text}`".replace("\\", "") + + def convert_pre(self, el: PageElement, text: str) -> str: + """Wrap any codeblocks in `py` for syntax highlighting.""" + code = "".join(el.strings) + return f"```py\n{code}```" + + def convert_a(self, el: PageElement, text: str) -> str: + """Resolve relative URLs to `self.page_url`.""" + el["href"] = urljoin(self.page_url, el["href"]) + return super().convert_a(el, text) + + def convert_p(self, el: PageElement, text: str) -> str: + """Include only one newline instead of two when the parent is a li tag.""" + parent = el.parent + if parent is not None and parent.name == "li": + return f"{text}\n" + return super().convert_p(el, text) diff --git a/bot/exts/info/doc/_parsing.py b/bot/exts/info/doc/_parsing.py new file mode 100644 index 000000000..83e35e2b1 --- /dev/null +++ b/bot/exts/info/doc/_parsing.py @@ -0,0 +1,313 @@ +from __future__ import annotations + +import logging +import re +import string +import textwrap +from functools import partial +from typing import Callable, Collection, Iterable, List, Optional, TYPE_CHECKING, Tuple, Union + +from bs4 import BeautifulSoup +from bs4.element import NavigableString, PageElement, Tag + +from ._html import Strainer +from ._markdown import DocMarkdownConverter +if TYPE_CHECKING: + from ._cog import DocItem + +log = logging.getLogger(__name__) + +_MAX_SIGNATURE_AMOUNT = 3 + +_UNWANTED_SIGNATURE_SYMBOLS_RE = re.compile(r"\[source]|\\\\|¶") +_WHITESPACE_AFTER_NEWLINES_RE = re.compile(r"(?<=\n\n)(\s+)") +_PARAMETERS_RE = re.compile(r"\((.+)\)") + +_SEARCH_END_TAG_ATTRS = ( 
+ "data", + "function", + "class", + "exception", + "seealso", + "section", + "rubric", + "sphinxsidebar", +) + +_NO_SIGNATURE_GROUPS = { + "attribute", + "envvar", + "setting", + "tempaltefilter", + "templatetag", + "term", +} +_EMBED_CODE_BLOCK_LENGTH = 61 +# _MAX_SIGNATURE_AMOUNT code block wrapped lines with py syntax highlight +_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT +# Maximum discord message length - signatures on top +_MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH +_TRUNCATE_STRIP_CHARACTERS = "!?:;." + string.whitespace +_BRACKET_PAIRS = { + "{": "}", + "(": ")", + "[": "]", +} + + +def _split_parameters(parameters_string: str) -> List[str]: + """ + Split parameters of a signature into individual parameter strings on commas. + + Long string literals are not accounted for. + """ + parameters_list = [] + last_split = 0 + depth = 0 + expected_end = None + current_search = None + previous_character = "" + + for index, character in enumerate(parameters_string): + if character in _BRACKET_PAIRS: + if current_search is None: + current_search = character + expected_end = _BRACKET_PAIRS[character] + if character == current_search: + depth += 1 + + elif character in {"'", '"'}: + if depth == 0: + depth += 1 + elif not previous_character == "\\": + depth -= 1 + + elif character == expected_end: + depth -= 1 + if depth == 0: + current_search = None + expected_end = None + + elif depth == 0 and character == ",": + parameters_list.append(parameters_string[last_split:index]) + last_split = index + 1 + previous_character = character + + parameters_list.append(parameters_string[last_split:]) + return parameters_list + + +def _find_elements_until_tag( + start_element: PageElement, + tag_filter: Union[Tuple[str, ...], Callable[[Tag], bool]], + *, + func: Callable, + include_strings: bool = False, + limit: int = None, +) -> List[Union[Tag, NavigableString]]: + """ + Get all elements up to `limit` or until a tag matching `tag_filter` is found. + + `tag_filter` can be either a tuple of string names to check against, + or a filtering callable that's applied to tags. + + When `include_strings` is True, `NavigableString`s from the document will be included in the result along `Tag`s. + + `func` takes in a BeautifulSoup unbound method for finding multiple elements, such as `BeautifulSoup.find_all`. + The method is then iterated over and all elements until the matching tag or the limit are added to the return list. + """ + use_tuple_filter = isinstance(tag_filter, tuple) + elements = [] + + for element in func(start_element, name=Strainer(include_strings=include_strings), limit=limit): + if isinstance(element, Tag): + if use_tuple_filter: + if element.name in tag_filter: + break + elif tag_filter(element): + break + elements.append(element) + + return elements + + +_find_next_children_until_tag = partial(_find_elements_until_tag, func=partial(BeautifulSoup.find_all, recursive=False)) +_find_next_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_next_siblings) +_find_previous_siblings_until_tag = partial(_find_elements_until_tag, func=BeautifulSoup.find_previous_siblings) + + +def _get_general_description(start_element: PageElement) -> List[Union[Tag, NavigableString]]: + """ + Get page content to a table or a tag with its class in `SEARCH_END_TAG_ATTRS`. 
+ + A headerlink a tag is attempted to be found to skip repeating the symbol information in the description, + if it's found it's used as the tag to start the search from instead of the `start_element`. + """ + header = start_element.find_next("a", attrs={"class": "headerlink"}) + start_tag = header.parent if header is not None else start_element + return _find_next_siblings_until_tag(start_tag, _match_end_tag, include_strings=True) + + +def _get_dd_description(symbol: PageElement) -> List[Union[Tag, NavigableString]]: + """Get the contents of the next dd tag, up to a dt or a dl tag.""" + description_tag = symbol.find_next("dd") + return _find_next_children_until_tag(description_tag, ("dt", "dl"), include_strings=True) + + +def _get_signatures(start_signature: PageElement) -> List[str]: + """ + Collect up to `_MAX_SIGNATURE_AMOUNT` signatures from dt tags around the `start_signature` dt tag. + + First the signatures under the `start_signature` are included; + if less than 2 are found, tags above the start signature are added to the result if any are present. + """ + signatures = [] + for element in ( + *reversed(_find_previous_siblings_until_tag(start_signature, ("dd",), limit=2)), + start_signature, + *_find_next_siblings_until_tag(start_signature, ("dd",), limit=2), + )[-(_MAX_SIGNATURE_AMOUNT):]: + signature = _UNWANTED_SIGNATURE_SYMBOLS_RE.sub("", element.text) + + if signature: + signatures.append(signature) + + return signatures + + +def _truncate_signatures(signatures: Collection[str]) -> Union[List[str], Collection[str]]: + """ + Truncate passed signatures to not exceed `_MAX_SIGNAUTRES_LENGTH`. + + If the signatures need to be truncated, parameters are collapsed until they fit withing the limit. + Individual signatures can consist of max 1, 2, ..., `_MAX_SIGNATURE_AMOUNT` lines of text, + inversely proportional to the amount of signatures. + A maximum of `_MAX_SIGNATURE_AMOUNT` signatures is assumed to be passed. + """ + if not sum(len(signature) for signature in signatures) > _MAX_SIGNATURES_LENGTH: + return signatures + + max_signature_length = _EMBED_CODE_BLOCK_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - len(signatures)) + formatted_signatures = [] + for signature in signatures: + signature = signature.strip() + if len(signature) > max_signature_length: + if (parameters_match := _PARAMETERS_RE.search(signature)) is None: + formatted_signatures.append(textwrap.shorten(signature, max_signature_length)) + continue + + truncated_signature = [] + parameters_string = parameters_match[1] + running_length = len(signature) - len(parameters_string) + for parameter in _split_parameters(parameters_string): + if (len(parameter) + running_length) <= max_signature_length - 4: # account for comma and placeholder + truncated_signature.append(parameter) + running_length += len(parameter) + 1 + else: + truncated_signature.append(" ...") + formatted_signatures.append(signature.replace(parameters_string, ",".join(truncated_signature))) + break + else: + formatted_signatures.append(signature) + + return formatted_signatures + + +def _get_truncated_description( + elements: Iterable[Union[Tag, NavigableString]], + markdown_converter: DocMarkdownConverter, + max_length: int, +) -> str: + """ + Truncate markdown from `elements` to be at most `max_length` characters visually. 
+ + `max_length` limits the length of the rendered characters in the string, + with the real string length limited to `_MAX_DESCRIPTION_LENGTH` to accommodate discord length limits + """ + visual_length = 0 + real_length = 0 + result = [] + shortened = False + + for element in elements: + is_tag = isinstance(element, Tag) + element_length = len(element.text) if is_tag else len(element) + if visual_length + element_length < max_length: + if is_tag: + element_markdown = markdown_converter.process_tag(element) + else: + element_markdown = markdown_converter.process_text(element) + + element_markdown_length = len(element_markdown) + if real_length + element_markdown_length < _MAX_DESCRIPTION_LENGTH: + result.append(element_markdown) + else: + shortened = True + break + real_length += element_markdown_length + visual_length += element_length + else: + shortened = True + break + + markdown_string = "".join(result) + if shortened: + markdown_string = markdown_string.rstrip(_TRUNCATE_STRIP_CHARACTERS) + "..." + return markdown_string + + +def _parse_into_markdown(signatures: Optional[List[str]], description: Iterable[Tag], url: str) -> str: + """ + Create a markdown string with the signatures at the top, and the converted html description below them. + + The signatures are wrapped in python codeblocks, separated from the description by a newline. + The result string is truncated to be max 1000 symbols long. + """ + description = _get_truncated_description(description, DocMarkdownConverter(bullets="•", page_url=url), 750) + description = _WHITESPACE_AFTER_NEWLINES_RE.sub('', description) + if signatures is not None: + formatted_markdown = "".join(f"```py\n{signature}```" for signature in _truncate_signatures(signatures)) + else: + formatted_markdown = "" + formatted_markdown += f"\n{description}" + + return formatted_markdown + + +def _match_end_tag(tag: Tag) -> bool: + """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table.""" + for attr in _SEARCH_END_TAG_ATTRS: + if attr in tag.get("class", ()): + return True + + return tag.name == "table" + + +def get_symbol_markdown(soup: BeautifulSoup, symbol_data: DocItem) -> str: + """ + Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters. + + The method of parsing and what information gets included depends on the symbol's group. + """ + symbol_heading = soup.find(id=symbol_data.symbol_id) + if symbol_heading is None: + log.warning("Symbol present in loaded inventories not found on site, consider refreshing inventories.") + return "Unable to parse the requested symbol." + signature = None + # Modules, doc pages and labels don't point to description list tags but to tags like divs, + # no special parsing can be done so we only try to include what's under them. + if symbol_data.group in {"module", "doc", "label"}: + description = _get_general_description(symbol_heading) + + elif symbol_heading.name != "dt": + # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags, + # log info the tag can be looked at. 
+ description = _get_general_description(symbol_heading) + + elif symbol_data.group in _NO_SIGNATURE_GROUPS: + description = _get_dd_description(symbol_heading) + + else: + signature = _get_signatures(symbol_heading) + description = _get_dd_description(symbol_heading) + return _parse_into_markdown(signature, description, symbol_data.url).replace('¶', '') diff --git a/tests/bot/test_converters.py b/tests/bot/test_converters.py index c42111f3f..231798a92 100644 --- a/tests/bot/test_converters.py +++ b/tests/bot/test_converters.py @@ -10,9 +10,9 @@ from bot.converters import ( Duration, HushDurationConverter, ISODateTime, + PackageName, TagContentConverter, TagNameConverter, - ValidPythonIdentifier, ) @@ -78,24 +78,23 @@ class ConverterTests(unittest.IsolatedAsyncioTestCase): with self.assertRaisesRegex(BadArgument, re.escape(exception_message)): await TagNameConverter.convert(self.context, invalid_name) - async def test_valid_python_identifier_for_valid(self): - """ValidPythonIdentifier returns valid identifiers unchanged.""" - test_values = ('foo', 'lemon') + async def test_package_name_for_valid(self): + """PackageName returns valid package names unchanged.""" + test_values = ('foo', 'le_mon') for name in test_values: with self.subTest(identifier=name): - conversion = await ValidPythonIdentifier.convert(self.context, name) + conversion = await PackageName.convert(self.context, name) self.assertEqual(name, conversion) - async def test_valid_python_identifier_for_invalid(self): - """ValidPythonIdentifier raises the proper exception for invalid identifiers.""" - test_values = ('nested.stuff', '#####') + async def test_package_name_for_invalid(self): + """PackageName raises the proper exception for invalid package names.""" + test_values = ('text_with_a_dot.', 'UpperCaseName', "num83r") for name in test_values: with self.subTest(identifier=name): - exception_message = f'`{name}` is not a valid Python identifier' - with self.assertRaisesRegex(BadArgument, re.escape(exception_message)): - await ValidPythonIdentifier.convert(self.context, name) + with self.assertRaises(BadArgument): + await PackageName.convert(self.context, name) async def test_duration_converter_for_valid(self): """Duration returns the correct `datetime` for valid duration strings.""" |
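
Notes with small runnable sketches follow; every helper name and sample input in them is illustrative and not part of the diff above.

The new PackageName converter in bot/converters.py validates with the single character-class pattern shown in the diff. A standalone sketch using that pattern and the values from tests/bot/test_converters.py (the is_valid_package_name helper is hypothetical):

import re

PACKAGE_NAME_RE = re.compile(r"[^a-z_]")  # same pattern as bot/converters.py

def is_valid_package_name(name: str) -> bool:  # illustrative helper, not in the diff
    # A name is valid only if no character outside a-z and _ appears in it.
    return PACKAGE_NAME_RE.search(name) is None

assert is_valid_package_name("le_mon")
for invalid in ("text_with_a_dot.", "UpperCaseName", "num83r"):
    assert not is_valid_package_name(invalid)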
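
For context on _fetch_inventory in _inventory_parser.py: it takes the inventory version from the last character of the header line and checks the fourth line for "zlib" before handing the stream to _load_v2. The first four lines of a v2 objects.inv file have this shape (project name and version here are placeholders):

# Sphinx inventory version 2
# Project: Python
# Version: 3.9
# The remainder of this file is compressed using zlib.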
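
The v2 inventory format parsed by _load_v2 packs five whitespace-separated fields per line, with a trailing "$" in the location standing for an anchor equal to the symbol name. A sketch against one hypothetical decompressed line (the sample line is an assumption, not taken from a real inventory):

import re

_V2_LINE_RE = re.compile(r'(?x)(.+?)\s+(\S*:\S*)\s+(-?\d+)\s+?(\S*)\s+(.*)')

line = "asyncio.sleep py:function 1 library/asyncio-task.html#$ -"  # hypothetical sample
name, type_, _prio, location, _dispname = _V2_LINE_RE.match(line).groups()
if location.endswith("$"):
    # "$" is intersphinx shorthand for "the anchor is the same as the name".
    location = location[:-1] + name
print(type_, name, location)
# py:function asyncio.sleep library/asyncio-task.html#asyncio.sleep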
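
ZlibStreamReader's incremental decode is the standard decompressobj pattern; a synchronous sketch of the same buffering and line-splitting logic, purely for illustration:

import zlib

def iter_inventory_lines(chunks):
    """Decompress zlib chunks incrementally and yield complete text lines."""
    decompressor = zlib.decompressobj()
    buf = b""
    for chunk in chunks:
        buf += decompressor.decompress(chunk)
        # Emit every full line currently sitting in the buffer.
        while (pos := buf.find(b"\n")) != -1:
            yield buf[:pos].decode()
            buf = buf[pos + 1:]
    buf += decompressor.flush()
    if buf:
        yield buf.decode()

data = zlib.compress(b"first line\nsecond line\n")
print(list(iter_inventory_lines([data[:5], data[5:]])))
# ['first line', 'second line']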
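
Putting CachedParser's contract from _cog.py in one place, a hypothetical driver (assumes the bot package is importable and the DocItem's page is reachable; only the first get_markdown call for a page performs a web request, later calls are answered from the parsed-result cache):

import aiohttp
from bot.exts.info.doc._cog import CachedParser, DocItem

async def fetch_twice(item: DocItem) -> str:
    parser = CachedParser()
    parser.add_item(item)  # only records the symbol under its page URL, no I/O yet
    async with aiohttp.ClientSession() as session:
        first = await parser.get_markdown(session, item)   # fetches the page, queues every symbol on it
        second = await parser.get_markdown(session, item)  # served from parser._results
    assert first == second
    return first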
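
QueueItem overrides __eq__ so a DocItem compares equal to the queue entry wrapping it, which is what lets _move_to_front call list.index with a plain DocItem. A condensed illustration (the empty BeautifulSoup is a stand-in for a real page soup; note the comparison order, since the override lives on QueueItem):

from bs4 import BeautifulSoup
from bot.exts.info.doc._cog import DocItem, QueueItem

item = DocItem("python", "function", "https://docs.python.org/3/", "library/functions.html", "print")
entry = QueueItem(item, BeautifulSoup("", "lxml"))
assert entry == item            # the QueueItem matches the DocItem it wraps
assert [entry].index(item) == 0  # so list.index works with either type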
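
bs4's find_all accepts a SoupStrainer as its name argument, which is how _find_elements_until_tag in _parsing.py feeds the Strainer subclass from _html.py into BeautifulSoup.find_all. A sketch of the effect (assuming lxml is installed, as elsewhere in the cog):

from bs4 import BeautifulSoup
from bot.exts.info.doc._html import Strainer

soup = BeautifulSoup("<p>intro<b>mid</b>tail</p>", "lxml")
# A stock SoupStrainer drops bare strings; include_strings keeps the tag's
# mixed text/tag children in document order.
print(soup.p.find_all(Strainer(include_strings=True), recursive=False))
# ['intro', <b>mid</b>, 'tail']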
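
The convert_li override in _markdown.py derives the ordinal from the li tag's position in its ol parent rather than trusting markdownify's counter. A quick check (exact whitespace in the output may differ slightly between markdownify versions):

from bot.exts.info.doc._markdown import DocMarkdownConverter

converter = DocMarkdownConverter(bullets="•", page_url="https://docs.python.org/3/")
print(converter.convert("<ol><li>first</li><li>second</li></ol>"))
# 1. first
# 2. second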
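
The depth tracking in _split_parameters protects commas inside brackets and quoted strings from being treated as parameter separators. A behaviour check, traced by hand against the algorithm (note the leading spaces are kept, since the function splits without stripping):

from bot.exts.info.doc._parsing import _split_parameters

print(_split_parameters("arg1, arg2='x,y', arg3=(1, 2)"))
# ['arg1', " arg2='x,y'", ' arg3=(1, 2)']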
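
Finally, the length budget in _parsing.py works out as below; the arithmetic only restates the module constants, and reading the +8 as per-signature codeblock fence overhead is my interpretation of the source comment:

_MAX_SIGNATURE_AMOUNT = 3
_EMBED_CODE_BLOCK_LENGTH = 61
# Each signature is rendered inside its own ```py ... ``` block; the +8
# accounts for that per-signature wrapping overhead.
_MAX_SIGNATURES_LENGTH = (_EMBED_CODE_BLOCK_LENGTH + 8) * _MAX_SIGNATURE_AMOUNT
assert _MAX_SIGNATURES_LENGTH == 207
# Descriptions get whatever remains of the 2000-character embed limit.
_MAX_DESCRIPTION_LENGTH = 2000 - _MAX_SIGNATURES_LENGTH
assert _MAX_DESCRIPTION_LENGTH == 1793
# In _truncate_signatures, fewer signatures may each use more characters:
for count in (1, 2, 3):
    print(count, _EMBED_CODE_BLOCK_LENGTH * (_MAX_SIGNATURE_AMOUNT + 1 - count))
# 1 signature -> 183 chars each, 2 -> 122, 3 -> 61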