-rw-r--r--  bot/cogs/doc/cog.py     | 122
-rw-r--r--  bot/cogs/doc/parsing.py |  36
2 files changed, 119 insertions, 39 deletions
diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py
index 2e49fcd38..d57e76ebd 100644
--- a/bot/cogs/doc/cog.py
+++ b/bot/cogs/doc/cog.py
@@ -1,14 +1,18 @@
+from __future__ import annotations
+
 import asyncio
 import functools
 import logging
 import re
 import sys
-from collections import OrderedDict
+from collections import defaultdict
 from contextlib import suppress
 from types import SimpleNamespace
-from typing import Dict, NamedTuple, Optional
+from typing import Dict, List, NamedTuple, Optional, Union
 
 import discord
+from aiohttp import ClientSession
+from bs4 import BeautifulSoup
 from discord.ext import commands
 from requests import ConnectTimeout, ConnectionError, HTTPError
 from sphinx.ext import intersphinx
@@ -20,7 +24,6 @@ from bot.converters import PackageName, ValidURL
 from bot.decorators import with_role
 from bot.pagination import LinePaginator
 from bot.utils.messages import wait_for_deletion
-from .cache import async_cache
 from .parsing import get_symbol_markdown
 
 log = logging.getLogger(__name__)
@@ -67,6 +70,108 @@ class DocItem(NamedTuple):
         return "".join((self.base_url, self.relative_url_path))
 
 
+class QueueItem(NamedTuple):
+    """Contains a symbol and the BeautifulSoup object needed to parse it."""
+
+    symbol: DocItem
+    soup: BeautifulSoup
+
+    def __eq__(self, other: Union[QueueItem, DocItem]) -> bool:
+        if isinstance(other, DocItem):
+            return self.symbol == other
+        return tuple.__eq__(self, other)
+
+
+class CachedParser:
+    """
+    Get symbol markdown from documentation pages, caching fetched pages and parsed symbols.
+
+    DocItems are added through the `add_item` method, which stores them in the `_page_symbols` dict.
+    `get_markdown` is used to fetch the markdown; the first time it is used on a page,
+    all of that page's symbols are queued for parsing to avoid multiple web requests to the same page.
+    """
+
+    def __init__(self):
+        self._queue: List[QueueItem] = []
+        self._results: Dict[DocItem, str] = {}
+        self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list)
+        self._item_events: Dict[DocItem, asyncio.Event] = {}
+        self._parse_task: Optional[asyncio.Task] = None
+
+    async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str:
+        """
+        Get the result markdown of `doc_item`.
+
+        If none of the symbols from `doc_item`'s page were fetched before,
+        the page's HTML has to be fetched before parsing can be queued.
+        """
+        if (symbol := self._results.get(doc_item)) is not None:
+            return symbol
+
+        if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None:
+            async with client_session.get(doc_item.url) as response:
+                soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")
+
+            self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue)
+            del self._page_symbols[doc_item.url]
+            log.debug(f"Added symbols from {doc_item.url} to parse queue.")
+
+        if self._parse_task is None:
+            self._parse_task = asyncio.create_task(self._parse_queue())
+
+        self._move_to_front(doc_item)
+        self._item_events[doc_item] = item_event = asyncio.Event()
+        await item_event.wait()
+        return self._results[doc_item]
+
+    async def _parse_queue(self) -> None:
+        """
+        Parse all items from the queue, setting the events of symbols that have one.
+
+        The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished.
+        """
+        log.trace("Starting queue parsing.")
+        while self._queue:
+            item, soup = self._queue.pop()
+            self._results[item] = get_symbol_markdown(soup, item)
+            if (event := self._item_events.get(item)) is not None:
+                event.set()
+            await asyncio.sleep(0.1)
+
+        self._parse_task = None
+        log.trace("Finished parsing queue.")
+
+    def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None:
+        """Move `item` to the front of the parse queue (the end of the list, as items are popped from the end)."""
+        # The parse queue stores soups along with the doc symbols in QueueItem objects;
+        # if we're moving a DocItem, we have to find the associated QueueItem first and then move it.
+        item_index = self._queue.index(item)
+        queue_item = self._queue[item_index]
+
+        del self._queue[item_index]
+        self._queue.append(queue_item)
+
+    def add_item(self, doc_item: DocItem) -> None:
+        """Add a DocItem to `_page_symbols`."""
+        self._page_symbols[doc_item.url].append(doc_item)
+
+    async def clear(self) -> None:
+        """
+        Clear all internal symbol data.
+
+        All currently requested items are waited on before clearing.
+        """
+        for event in self._item_events.values():
+            await event.wait()
+        if self._parse_task is not None:
+            self._parse_task.cancel()
+            self._parse_task = None
+        self._queue.clear()
+        self._results.clear()
+        self._page_symbols.clear()
+        self._item_events.clear()
+
+
 class InventoryURL(commands.Converter):
     """
     Represents an Intersphinx inventory URL.
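The handoff between get_markdown and _parse_queue above is the heart of the change: waiters queue work and block on an asyncio.Event while a single background task drains the queue and wakes each waiter once its result is stored. A minimal, self-contained sketch of that pattern follows; the names (parse, get_result) and the module-level dicts are illustrative stand-ins, not code from the bot.

# Standalone sketch of CachedParser's event-based handoff, assuming
# nothing from the bot's codebase; parse() stands in for the real
# get_symbol_markdown and the dicts for the instance state.
import asyncio
from typing import Dict, List

queue: List[str] = []
results: Dict[str, str] = {}
events: Dict[str, asyncio.Event] = {}


def parse(item: str) -> str:
    """Stand-in for the actual parsing work."""
    return item.upper()


async def parse_queue() -> None:
    # Drain the queue, storing each result and waking its waiter, if any.
    while queue:
        item = queue.pop()
        results[item] = parse(item)
        if (event := events.get(item)) is not None:
            event.set()
        await asyncio.sleep(0)  # yield control between items


async def get_result(item: str) -> str:
    # Return a cached result, or queue the item and wait for the parser.
    if (result := results.get(item)) is not None:
        return result
    queue.append(item)
    parse_task = asyncio.create_task(parse_queue())
    events[item] = event = asyncio.Event()
    await event.wait()
    await parse_task
    return results[item]


print(asyncio.run(get_result("asyncio.Event")))  # -> ASYNCIO.EVENT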
+ """ + log.trace("Starting queue parsing.") + while self._queue: + item, soup = self._queue.pop() + self._results[item] = get_symbol_markdown(soup, item) + if (event := self._item_events.get(item)) is not None: + event.set() + await asyncio.sleep(0.1) + + self._parse_task = None + log.trace("Finished parsing queue.") + + def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None: + """Move `item` to the front of the parse queue.""" + # The parse queue stores soups along with the doc symbols in QueueItem objects, + # in case we're moving a DocItem we have to get the associated QueueItem first and then move it. + item_index = self._queue.index(item) + queue_item = self._queue[item_index] + + del self._queue[item_index] + self._queue.append(queue_item) + + def add_item(self, doc_item: DocItem) -> None: + """Add a DocItem to `_page_symbols`.""" + self._page_symbols[doc_item.url].append(doc_item) + + async def clear(self) -> None: + """ + Clear all internal symbol data. + + All currently requested items are waited to be parsed before clearing. + """ + for event in self._item_events.values(): + await event.wait() + if self._parse_task is not None: + self._parse_task.cancel() + self._parse_task = None + self._queue.clear() + self._results.clear() + self._page_symbols.clear() + self._item_events.clear() + + class InventoryURL(commands.Converter): """ Represents an Intersphinx inventory URL. @@ -106,6 +211,7 @@ class DocCog(commands.Cog): self.base_urls = {} self.bot = bot self.doc_symbols: Dict[str, DocItem] = {} + self.item_fetcher = CachedParser() self.renamed_symbols = set() self.bot.loop.create_task(self.init_refresh_inventory()) @@ -163,7 +269,10 @@ class DocCog(commands.Cog): symbol = f"{api_package_name}.{symbol}" self.renamed_symbols.add(symbol) - self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name) + relative_url_path, _, symbol_id = relative_doc_url.partition("#") + symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id) + self.doc_symbols[symbol] = symbol_item + self.item_fetcher.add_item(symbol_item) log.trace(f"Fetched inventory for {api_package_name}.") @@ -177,7 +286,7 @@ class DocCog(commands.Cog): self.base_urls.clear() self.doc_symbols.clear() self.renamed_symbols.clear() - async_cache.cache = OrderedDict() + await self.item_fetcher.clear() # Run all coroutines concurrently - since each of them performs a HTTP # request, this speeds up fetching the inventory data heavily. @@ -198,12 +307,11 @@ class DocCog(commands.Cog): if symbol_info is None: return None self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}") - embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info) embed = discord.Embed( title=discord.utils.escape_markdown(symbol), url=f"{symbol_info.url}#{symbol_info.symbol_id}", - description=embed_description + description=await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info) ) # Show all symbols with the same name that were renamed in the footer. 
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index 1271953d4..9fbce7bed 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -5,11 +5,9 @@ import textwrap
 from functools import partial
 from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union
 
-from aiohttp import ClientSession
 from bs4 import BeautifulSoup
 from bs4.element import NavigableString, PageElement, Tag
 
-from .cache import async_cache
 from .html import Strainer
 from .markdown import markdownify
 
 if TYPE_CHECKING:
@@ -171,16 +169,6 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url:
     return formatted_markdown
 
 
-@async_cache(arg_offset=1)
-async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
-    """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
-    log.trace(f"Sending a request to {url}.")
-    async with http_session.get(url) as response:
-        soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml')
-    soup.find("head").decompose()  # the head contains no useful data so we can remove it
-    return soup
-
-
 def _match_end_tag(tag: Tag) -> bool:
     """Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
     for attr in _SEARCH_END_TAG_ATTRS:
@@ -190,44 +178,28 @@ def _match_end_tag(tag: Tag) -> bool:
     return tag.name == "table"
 
 
-async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str:
+def get_symbol_markdown(soup: BeautifulSoup, symbol_data: "DocItem") -> str:
     """
-    Return parsed markdown of the passed symbol, truncated to 1000 characters.
+    Return parsed markdown of the passed symbol from the passed-in soup, truncated to 1000 characters.
 
-    A request through `http_session` is made to the url associated with `symbol_data` for the html contents;
-    the contents are then parsed depending on what group the symbol belongs to.
+    The method of parsing and what information gets included depends on the symbol's group.
     """
-    log.trace(f"Parsing symbol from url {symbol_data.url}.")
-    if "#" in symbol_data.url:
-        request_url, symbol_id = symbol_data.url.rsplit('#')
-    else:
-        request_url = symbol_data.url
-        symbol_id = None
-
-    soup = await _get_soup_from_url(http_session, request_url)
-    symbol_heading = soup.find(id=symbol_id)
+    symbol_heading = soup.find(id=symbol_data.symbol_id)
     signature = None
     # Modules, doc pages and labels don't point to description list tags but to tags like divs,
     # no special parsing can be done so we only try to include what's under them.
     if symbol_data.group in {"module", "doc", "label"}:
-        log.trace("Symbol is a module, doc or a label; using general description parsing.")
         description = _get_general_description(symbol_heading)
 
     elif symbol_heading.name != "dt":
         # Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags,
         # log info the tag can be looked at.
-        log.info(
-            f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it,"
-            f"handling as general description."
-        )
         description = _get_general_description(symbol_heading)
 
     elif symbol_data.group in _NO_SIGNATURE_GROUPS:
-        log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.")
         description = _get_dd_description(symbol_heading)
 
     else:
-        log.trace("Parsing both signature and description of symbol.")
         signature = _get_signatures(symbol_heading)
         description = _get_dd_description(symbol_heading)
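With the request and the async_cache removed from parsing.py, get_symbol_markdown is now a plain synchronous function over a soup the caller already fetched, so a single request can serve every symbol on a page. A rough sketch of the resulting call pattern, assuming only aiohttp, BeautifulSoup, and lxml are installed; the URL and symbol ids below are examples, not values from the cog's inventories:

# Fetch a documentation page once, then look up any number of symbol
# headings in the same soup, as get_symbol_markdown now does with
# DocItem.symbol_id.
import asyncio

import aiohttp
from bs4 import BeautifulSoup


async def main() -> None:
    url = "https://docs.python.org/3/library/functools.html"
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")

    # One soup, many lookups -- previously each symbol could cost a
    # request unless async_cache happened to hold the page.
    for symbol_id in ("functools.reduce", "functools.wraps"):
        heading = soup.find(id=symbol_id)
        print(symbol_id, "found" if heading is not None else "missing")


asyncio.run(main())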