| field | value |
|---|---|
| author | 2020-09-20 03:06:59 +0200 |
| committer | 2020-09-20 03:06:59 +0200 |
| commit | 75f95a110ce96734cb64f89321f9a6eeb0d79463 (patch) |
| tree | 58210c5244b4a38eaf04e404db5bf9df6729f7f1 |
| parent | Don't return fragment in DocItem url (diff) |
Replace caching of soups with new class.
Storing BeautifulSoup objects could lead to memory problems because
of their large footprint. The new class replaces the long-term storage
of soups by parsing all items on the first fetch of a page and storing
only their markdown strings.
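
The idea in miniature, as a hedged sketch: `MarkdownCache`, `parse_symbol`, and the string-keyed API below are simplifications invented for illustration; the committed `CachedParser` in the diff works with `DocItem`s and additionally runs a background parse queue with per-item `asyncio.Event`s.

```python
from collections import defaultdict
from typing import Dict, List

from aiohttp import ClientSession
from bs4 import BeautifulSoup


def parse_symbol(soup: BeautifulSoup, symbol_id: str) -> str:
    """Stand-in for get_symbol_markdown: reduce a soup to a small string."""
    heading = soup.find(id=symbol_id)
    return heading.get_text(" ", strip=True)[:1000] if heading else ""


class MarkdownCache:
    """Cache rendered markdown strings instead of whole BeautifulSoup objects."""

    def __init__(self) -> None:
        self._results: Dict[str, str] = {}  # symbol id -> markdown string
        self._page_symbols: Dict[str, List[str]] = defaultdict(list)  # page url -> symbol ids

    def add_item(self, url: str, symbol_id: str) -> None:
        self._page_symbols[url].append(symbol_id)

    async def get_markdown(self, session: ClientSession, url: str, symbol_id: str) -> str:
        if symbol_id in self._results:
            return self._results[symbol_id]
        # First request for this page: fetch the HTML once and parse every
        # known symbol on it, so the large soup can be discarded immediately
        # and later lookups need no further web requests.
        async with session.get(url) as response:
            soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")
        for symbol in self._page_symbols.pop(url, [symbol_id]):
            self._results[symbol] = parse_symbol(soup, symbol)
        return self._results.get(symbol_id, "")
```

The essential trade: one fetch and one full parse per page, after which only small strings stay resident.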
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | bot/cogs/doc/cog.py | 122 |
| -rw-r--r-- | bot/cogs/doc/parsing.py | 36 |

2 files changed, 119 insertions(+), 39 deletions(-)
```diff
diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py
index 2e49fcd38..d57e76ebd 100644
--- a/bot/cogs/doc/cog.py
+++ b/bot/cogs/doc/cog.py
@@ -1,14 +1,18 @@
+from __future__ import annotations
+
 import asyncio
 import functools
 import logging
 import re
 import sys
-from collections import OrderedDict
+from collections import defaultdict
 from contextlib import suppress
 from types import SimpleNamespace
-from typing import Dict, NamedTuple, Optional
+from typing import Dict, List, NamedTuple, Optional, Union
 
 import discord
+from aiohttp import ClientSession
+from bs4 import BeautifulSoup
 from discord.ext import commands
 from requests import ConnectTimeout, ConnectionError, HTTPError
 from sphinx.ext import intersphinx
@@ -20,7 +24,6 @@ from bot.converters import PackageName, ValidURL
 from bot.decorators import with_role
 from bot.pagination import LinePaginator
 from bot.utils.messages import wait_for_deletion
-from .cache import async_cache
 from .parsing import get_symbol_markdown
 
 log = logging.getLogger(__name__)
@@ -67,6 +70,108 @@ class DocItem(NamedTuple):
         return "".join((self.base_url, self.relative_url_path))
 
 
+class QueueItem(NamedTuple):
+    """Contains a symbol and the BeautifulSoup object needed to parse it."""
+
+    symbol: DocItem
+    soup: BeautifulSoup
+
+    def __eq__(self, other: Union[QueueItem, DocItem]):
+        if isinstance(other, DocItem):
+            return self.symbol == other
+        return NamedTuple.__eq__(self, other)
+
+
+class CachedParser:
+    """
+    Get symbol markdown from pages with smarter caching.
+
+    DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict.
+    `get_markdown` is used to fetch the markdown; when this is used for the first time on a page,
+    all of the symbols are queued to be parsed to avoid multiple web requests to the same page.
+    """
+
+    def __init__(self):
+        self._queue: List[QueueItem] = []
+        self._results = {}
+        self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list)
+        self._item_events: Dict[DocItem, asyncio.Event] = {}
+        self._parse_task = None
+
+    async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str:
+        """
+        Get result markdown of `doc_item`.
+
+        If no symbols were fetched from `doc_item`'s page before,
+        the HTML has to be fetched before parsing can be queued.
+        """
+        if (symbol := self._results.get(doc_item)) is not None:
+            return symbol
+
+        if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None:
+            async with client_session.get(doc_item.url) as response:
+                soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")
+
+            self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue)
+            del self._page_symbols[doc_item.url]
+            log.debug(f"Added symbols from {doc_item.url} to parse queue.")
+
+            if self._parse_task is None:
+                self._parse_task = asyncio.create_task(self._parse_queue())
+
+        self._move_to_front(doc_item)
+        self._item_events[doc_item] = item_event = asyncio.Event()
+        await item_event.wait()
+        return self._results[doc_item]
+
+    async def _parse_queue(self) -> None:
+        """
+        Parse all items from the queue, setting associated events for symbols if present.
+
+        The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished.
+        """
+        log.trace("Starting queue parsing.")
+        while self._queue:
+            item, soup = self._queue.pop()
+            self._results[item] = get_symbol_markdown(soup, item)
+            if (event := self._item_events.get(item)) is not None:
+                event.set()
+            await asyncio.sleep(0.1)
+
+        self._parse_task = None
+        log.trace("Finished parsing queue.")
+
+    def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None:
+        """Move `item` to the front of the parse queue."""
+        # The parse queue stores soups along with the doc symbols in QueueItem objects;
+        # in case we're moving a DocItem, we have to get the associated QueueItem first and then move it.
+        item_index = self._queue.index(item)
+        queue_item = self._queue[item_index]
+
+        del self._queue[item_index]
+        self._queue.append(queue_item)
+
+    def add_item(self, doc_item: DocItem) -> None:
+        """Add a DocItem to `_page_symbols`."""
+        self._page_symbols[doc_item.url].append(doc_item)
+
+    async def clear(self) -> None:
+        """
+        Clear all internal symbol data.
+
+        All currently requested items are waited to be parsed before clearing.
+        """
+        for event in self._item_events.values():
+            await event.wait()
+        if self._parse_task is not None:
+            self._parse_task.cancel()
+        self._parse_task = None
+        self._queue.clear()
+        self._results.clear()
+        self._page_symbols.clear()
+        self._item_events.clear()
+
+
 class InventoryURL(commands.Converter):
     """
     Represents an Intersphinx inventory URL.
@@ -106,6 +211,7 @@ class DocCog(commands.Cog):
         self.base_urls = {}
         self.bot = bot
         self.doc_symbols: Dict[str, DocItem] = {}
+        self.item_fetcher = CachedParser()
         self.renamed_symbols = set()
 
         self.bot.loop.create_task(self.init_refresh_inventory())
@@ -163,7 +269,10 @@ class DocCog(commands.Cog):
                     symbol = f"{api_package_name}.{symbol}"
                     self.renamed_symbols.add(symbol)
 
-                self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name)
+                relative_url_path, _, symbol_id = relative_doc_url.partition("#")
+                symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id)
+                self.doc_symbols[symbol] = symbol_item
+                self.item_fetcher.add_item(symbol_item)
 
         log.trace(f"Fetched inventory for {api_package_name}.")
 
@@ -177,7 +286,7 @@ class DocCog(commands.Cog):
         self.base_urls.clear()
         self.doc_symbols.clear()
         self.renamed_symbols.clear()
-        async_cache.cache = OrderedDict()
+        await self.item_fetcher.clear()
 
         # Run all coroutines concurrently - since each of them performs a HTTP
         # request, this speeds up fetching the inventory data heavily.
@@ -198,12 +307,11 @@ class DocCog(commands.Cog):
         if symbol_info is None:
             return None
         self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}")
-        embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info)
 
         embed = discord.Embed(
             title=discord.utils.escape_markdown(symbol),
             url=f"{symbol_info.url}#{symbol_info.symbol_id}",
-            description=embed_description
+            description=await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info)
         )
         # Show all symbols with the same name that were renamed in the footer.
         embed.set_footer(
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index 1271953d4..9fbce7bed 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -5,11 +5,9 @@ import textwrap
 from functools import partial
 from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union
 
-from aiohttp import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import NavigableString, PageElement, Tag
 
-from .cache import async_cache
 from .html import Strainer
 from .markdown import markdownify
 if TYPE_CHECKING:
@@ -171,16 +169,6 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url
     return formatted_markdown
 
 
-@async_cache(arg_offset=1)
-async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
-    """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
-    log.trace(f"Sending a request to {url}.")
-    async with http_session.get(url) as response:
-        soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml')
-    soup.find("head").decompose()  # the head contains no useful data so we can remove it
-    return soup
-
-
 def _match_end_tag(tag: Tag) -> bool:
     """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
     for attr in _SEARCH_END_TAG_ATTRS:
@@ -190,44 +178,28 @@ def _match_end_tag(tag: Tag) -> bool:
     return tag.name == "table"
 
 
-async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str:
+def get_symbol_markdown(soup: BeautifulSoup, symbol_data: "DocItem") -> str:
     """
-    Return parsed markdown of the passed symbol, truncated to 1000 characters.
+    Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters.
 
-    A request through `http_session` is made to the url associated with `symbol_data` for the html contents;
-    the contents are then parsed depending on what group the symbol belongs to.
+    The method of parsing and what information gets included depends on the symbol's group.
     """
-    log.trace(f"Parsing symbol from url {symbol_data.url}.")
-    if "#" in symbol_data.url:
-        request_url, symbol_id = symbol_data.url.rsplit('#')
-    else:
-        request_url = symbol_data.url
-        symbol_id = None
-
-    soup = await _get_soup_from_url(http_session, request_url)
-    symbol_heading = soup.find(id=symbol_id)
+    symbol_heading = soup.find(id=symbol_data.symbol_id)
     signature = None
     # Modules, doc pages and labels don't point to description list tags but to tags like divs,
    # no special parsing can be done so we only try to include what's under them.
     if symbol_data.group in {"module", "doc", "label"}:
-        log.trace("Symbol is a module, doc or a label; using general description parsing.")
         description = _get_general_description(symbol_heading)
 
     elif symbol_heading.name != "dt":
         # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags,
         # log info the tag can be looked at.
-        log.info(
-            f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it,"
-            f"handling as general description."
-        )
         description = _get_general_description(symbol_heading)
 
     elif symbol_data.group in _NO_SIGNATURE_GROUPS:
-        log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.")
         description = _get_dd_description(symbol_heading)
 
     else:
-        log.trace("Parsing both signature and description of symbol.")
         signature = _get_signatures(symbol_heading)
         description = _get_dd_description(symbol_heading)
 
```
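
One subtlety in the committed class: `_move_to_front` may be handed a bare DocItem, and `self._queue.index(item)` still finds the wrapping QueueItem because `QueueItem.__eq__` treats a DocItem as equal to the entry that wraps it. A standalone sketch of that lookup trick follows; `Item` and `Wrapped` are hypothetical stand-ins, and the fallback here delegates to `tuple.__eq__` (which a NamedTuple ultimately is) rather than the commit's `NamedTuple.__eq__`.

```python
from typing import List, NamedTuple


class Item(NamedTuple):
    """Simplified stand-in for DocItem."""

    name: str


class Wrapped(NamedTuple):
    """Simplified stand-in for QueueItem: an item paired with its parse context."""

    item: Item
    context: str

    def __eq__(self, other):
        # A bare Item compares equal to the Wrapped entry holding it,
        # so list.index() can find the entry from the Item alone.
        if isinstance(other, Item):
            return self.item == other
        return tuple.__eq__(self, other)


queue: List[Wrapped] = [Wrapped(Item("a"), "soup-a"), Wrapped(Item("b"), "soup-b")]
assert queue.index(Item("b")) == 1  # resolved through the custom __eq__
```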