| field | value |
|---|---|
| author | 2020-09-20 03:06:59 +0200 |
| committer | 2020-09-20 03:06:59 +0200 |
| commit | 75f95a110ce96734cb64f89321f9a6eeb0d79463 (patch) |
| tree | 58210c5244b4a38eaf04e404db5bf9df6729f7f1 |
| parent | Don't return fragment in DocItem url (diff) |
Replace caching of soups with new class.
Storing BeautifulSoup objects could lead to memory problems because
of their large footprint. The new class replaces the long-term storage
of soups by parsing all items on the first fetch of a page and storing
only their markdown strings.
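
The idea in miniature, as a hedged sketch: `MarkdownCache`, `parse_symbol`, and the string-keyed API below are simplifications invented for illustration; the committed `CachedParser` in the diff works with `DocItem`s and additionally runs a background parse queue with per-item `asyncio.Event`s.

```python
from collections import defaultdict
from typing import Dict, List

from aiohttp import ClientSession
from bs4 import BeautifulSoup


def parse_symbol(soup: BeautifulSoup, symbol_id: str) -> str:
    """Stand-in for get_symbol_markdown: reduce a soup to a small string."""
    heading = soup.find(id=symbol_id)
    return heading.get_text(" ", strip=True)[:1000] if heading else ""


class MarkdownCache:
    """Cache rendered markdown strings instead of whole BeautifulSoup objects."""

    def __init__(self) -> None:
        self._results: Dict[str, str] = {}  # symbol id -> markdown string
        self._page_symbols: Dict[str, List[str]] = defaultdict(list)  # page url -> symbol ids

    def add_item(self, url: str, symbol_id: str) -> None:
        self._page_symbols[url].append(symbol_id)

    async def get_markdown(self, session: ClientSession, url: str, symbol_id: str) -> str:
        if symbol_id in self._results:
            return self._results[symbol_id]
        # First request for this page: fetch the HTML once and parse every
        # known symbol on it, so the large soup can be discarded immediately
        # and later lookups need no further web requests.
        async with session.get(url) as response:
            soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")
        for symbol in self._page_symbols.pop(url, [symbol_id]):
            self._results[symbol] = parse_symbol(soup, symbol)
        return self._results.get(symbol_id, "")
```

The essential trade: one fetch and one full parse per page, after which only small strings stay resident.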
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | bot/cogs/doc/cog.py | 122 |
| -rw-r--r-- | bot/cogs/doc/parsing.py | 36 |

2 files changed, 119 insertions(+), 39 deletions(-)
```diff
diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py
index 2e49fcd38..d57e76ebd 100644
--- a/bot/cogs/doc/cog.py
+++ b/bot/cogs/doc/cog.py
@@ -1,14 +1,18 @@
+from __future__ import annotations
+
 import asyncio
 import functools
 import logging
 import re
 import sys
-from collections import OrderedDict
+from collections import defaultdict
 from contextlib import suppress
 from types import SimpleNamespace
-from typing import Dict, NamedTuple, Optional
+from typing import Dict, List, NamedTuple, Optional, Union
 
 import discord
+from aiohttp import ClientSession
+from bs4 import BeautifulSoup
 from discord.ext import commands
 from requests import ConnectTimeout, ConnectionError, HTTPError
 from sphinx.ext import intersphinx
@@ -20,7 +24,6 @@ from bot.converters import PackageName, ValidURL
 from bot.decorators import with_role
 from bot.pagination import LinePaginator
 from bot.utils.messages import wait_for_deletion
-from .cache import async_cache
 from .parsing import get_symbol_markdown
 
 log = logging.getLogger(__name__)
@@ -67,6 +70,108 @@ class DocItem(NamedTuple):
         return "".join((self.base_url, self.relative_url_path))
 
 
+class QueueItem(NamedTuple):
+    """Contains a symbol and the BeautifulSoup object needed to parse it."""
+
+    symbol: DocItem
+    soup: BeautifulSoup
+
+    def __eq__(self, other: Union[QueueItem, DocItem]):
+        if isinstance(other, DocItem):
+            return self.symbol == other
+        return NamedTuple.__eq__(self, other)
+
+
+class CachedParser:
+    """
+    Get symbol markdown from pages with smarter caching.
+
+    DocItems are added through the `add_item` method which adds them to the `_page_symbols` dict.
+    `get_markdown` is used to fetch the markdown; when this is used for the first time on a page,
+    all of the symbols are queued to be parsed to avoid multiple web requests to the same page.
+    """
+
+    def __init__(self):
+        self._queue: List[QueueItem] = []
+        self._results = {}
+        self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list)
+        self._item_events: Dict[DocItem, asyncio.Event] = {}
+        self._parse_task = None
+
+    async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str:
+        """
+        Get result markdown of `doc_item`.
+
+        If no symbols were fetched from `doc_item`'s page before,
+        the HTML has to be fetched before parsing can be queued.
+        """
+        if (symbol := self._results.get(doc_item)) is not None:
+            return symbol
+
+        if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None:
+            async with client_session.get(doc_item.url) as response:
+                soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")
+
+            self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue)
+            del self._page_symbols[doc_item.url]
+            log.debug(f"Added symbols from {doc_item.url} to parse queue.")
+
+            if self._parse_task is None:
+                self._parse_task = asyncio.create_task(self._parse_queue())
+
+        self._move_to_front(doc_item)
+        self._item_events[doc_item] = item_event = asyncio.Event()
+        await item_event.wait()
+        return self._results[doc_item]
+
+    async def _parse_queue(self) -> None:
+        """
+        Parse all items from the queue, setting associated events for symbols if present.
+
+        The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished.
+        """
+        log.trace("Starting queue parsing.")
+        while self._queue:
+            item, soup = self._queue.pop()
+            self._results[item] = get_symbol_markdown(soup, item)
+            if (event := self._item_events.get(item)) is not None:
+                event.set()
+            await asyncio.sleep(0.1)
+
+        self._parse_task = None
+        log.trace("Finished parsing queue.")
+
+    def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None:
+        """Move `item` to the front of the parse queue."""
+        # The parse queue stores soups along with the doc symbols in QueueItem objects;
+        # in case we're moving a DocItem, we have to get the associated QueueItem first and then move it.
+        item_index = self._queue.index(item)
+        queue_item = self._queue[item_index]
+
+        del self._queue[item_index]
+        self._queue.append(queue_item)
+
+    def add_item(self, doc_item: DocItem) -> None:
+        """Add a DocItem to `_page_symbols`."""
+        self._page_symbols[doc_item.url].append(doc_item)
+
+    async def clear(self) -> None:
+        """
+        Clear all internal symbol data.
+
+        All currently requested items are waited to be parsed before clearing.
+        """
+        for event in self._item_events.values():
+            await event.wait()
+        if self._parse_task is not None:
+            self._parse_task.cancel()
+        self._parse_task = None
+        self._queue.clear()
+        self._results.clear()
+        self._page_symbols.clear()
+        self._item_events.clear()
+
+
 class InventoryURL(commands.Converter):
     """
     Represents an Intersphinx inventory URL.
@@ -106,6 +211,7 @@ class DocCog(commands.Cog):
         self.base_urls = {}
         self.bot = bot
         self.doc_symbols: Dict[str, DocItem] = {}
+        self.item_fetcher = CachedParser()
         self.renamed_symbols = set()
 
         self.bot.loop.create_task(self.init_refresh_inventory())
@@ -163,7 +269,10 @@ class DocCog(commands.Cog):
                     symbol = f"{api_package_name}.{symbol}"
                     self.renamed_symbols.add(symbol)
 
-                self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name)
+                relative_url_path, _, symbol_id = relative_doc_url.partition("#")
+                symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id)
+                self.doc_symbols[symbol] = symbol_item
+                self.item_fetcher.add_item(symbol_item)
 
         log.trace(f"Fetched inventory for {api_package_name}.")
 
@@ -177,7 +286,7 @@ class DocCog(commands.Cog):
         self.base_urls.clear()
         self.doc_symbols.clear()
         self.renamed_symbols.clear()
-        async_cache.cache = OrderedDict()
+        await self.item_fetcher.clear()
 
         # Run all coroutines concurrently - since each of them performs a HTTP
         # request, this speeds up fetching the inventory data heavily.
@@ -198,12 +307,11 @@ class DocCog(commands.Cog):
         if symbol_info is None:
             return None
         self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}")
-        embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info)
 
         embed = discord.Embed(
             title=discord.utils.escape_markdown(symbol),
             url=f"{symbol_info.url}#{symbol_info.symbol_id}",
-            description=embed_description
+            description=await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info)
         )
         # Show all symbols with the same name that were renamed in the footer.
         embed.set_footer(
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index 1271953d4..9fbce7bed 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -5,11 +5,9 @@ import textwrap
 from functools import partial
 from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union
 
-from aiohttp import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import NavigableString, PageElement, Tag
 
-from .cache import async_cache
 from .html import Strainer
 from .markdown import markdownify
 if TYPE_CHECKING:
@@ -171,16 +169,6 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url
     return formatted_markdown
 
 
-@async_cache(arg_offset=1)
-async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
-    """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
-    log.trace(f"Sending a request to {url}.")
-    async with http_session.get(url) as response:
-        soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml')
-    soup.find("head").decompose()  # the head contains no useful data so we can remove it
-    return soup
-
-
 def _match_end_tag(tag: Tag) -> bool:
     """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
     for attr in _SEARCH_END_TAG_ATTRS:
@@ -190,44 +178,28 @@ def _match_end_tag(tag: Tag) -> bool:
     return tag.name == "table"
 
 
-async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str:
+def get_symbol_markdown(soup: BeautifulSoup, symbol_data: "DocItem") -> str:
     """
-    Return parsed markdown of the passed symbol, truncated to 1000 characters.
+    Return parsed markdown of the passed symbol using the passed in soup, truncated to 1000 characters.
 
-    A request through `http_session` is made to the url associated with `symbol_data` for the html contents;
-    the contents are then parsed depending on what group the symbol belongs to.
+    The method of parsing and what information gets included depends on the symbol's group.
     """
-    log.trace(f"Parsing symbol from url {symbol_data.url}.")
-    if "#" in symbol_data.url:
-        request_url, symbol_id = symbol_data.url.rsplit('#')
-    else:
-        request_url = symbol_data.url
-        symbol_id = None
-
-    soup = await _get_soup_from_url(http_session, request_url)
-    symbol_heading = soup.find(id=symbol_id)
+    symbol_heading = soup.find(id=symbol_data.symbol_id)
     signature = None
     # Modules, doc pages and labels don't point to description list tags but to tags like divs,
    # no special parsing can be done so we only try to include what's under them.
     if symbol_data.group in {"module", "doc", "label"}:
-        log.trace("Symbol is a module, doc or a label; using general description parsing.")
         description = _get_general_description(symbol_heading)
 
     elif symbol_heading.name != "dt":
         # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags,
         # log info the tag can be looked at.
-        log.info(
-            f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it,"
-            f"handling as general description."
-        )
         description = _get_general_description(symbol_heading)
 
     elif symbol_data.group in _NO_SIGNATURE_GROUPS:
-        log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.")
         description = _get_dd_description(symbol_heading)
 
     else:
-        log.trace("Parsing both signature and description of symbol.")
         signature = _get_signatures(symbol_heading)
         description = _get_dd_description(symbol_heading)
 
```
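
One subtlety in the committed class: `_move_to_front` may be handed a bare DocItem, and `self._queue.index(item)` still finds the wrapping QueueItem because `QueueItem.__eq__` treats a DocItem as equal to the entry that wraps it. A standalone sketch of that lookup trick follows; `Item` and `Wrapped` are hypothetical stand-ins, and the fallback here delegates to `tuple.__eq__` (which a NamedTuple ultimately is) rather than the commit's `NamedTuple.__eq__`.

```python
from typing import List, NamedTuple


class Item(NamedTuple):
    """Simplified stand-in for DocItem."""

    name: str


class Wrapped(NamedTuple):
    """Simplified stand-in for QueueItem: an item paired with its parse context."""

    item: Item
    context: str

    def __eq__(self, other):
        # A bare Item compares equal to the Wrapped entry holding it,
        # so list.index() can find the entry from the Item alone.
        if isinstance(other, Item):
            return self.item == other
        return tuple.__eq__(self, other)


queue: List[Wrapped] = [Wrapped(Item("a"), "soup-a"), Wrapped(Item("b"), "soup-b")]
assert queue.index(Item("b")) == 1  # resolved through the custom __eq__
```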