-rw-r--r--  bot/cogs/doc/cog.py      122
-rw-r--r--  bot/cogs/doc/parsing.py   36
2 files changed, 119 insertions, 39 deletions
diff --git a/bot/cogs/doc/cog.py b/bot/cogs/doc/cog.py
index 2e49fcd38..d57e76ebd 100644
--- a/bot/cogs/doc/cog.py
+++ b/bot/cogs/doc/cog.py
@@ -1,14 +1,18 @@
+from __future__ import annotations
+
import asyncio
import functools
import logging
import re
import sys
-from collections import OrderedDict
+from collections import defaultdict
from contextlib import suppress
from types import SimpleNamespace
-from typing import Dict, NamedTuple, Optional
+from typing import Dict, List, NamedTuple, Optional, Union
import discord
+from aiohttp import ClientSession
+from bs4 import BeautifulSoup
from discord.ext import commands
from requests import ConnectTimeout, ConnectionError, HTTPError
from sphinx.ext import intersphinx
@@ -20,7 +24,6 @@ from bot.converters import PackageName, ValidURL
from bot.decorators import with_role
from bot.pagination import LinePaginator
from bot.utils.messages import wait_for_deletion
-from .cache import async_cache
from .parsing import get_symbol_markdown
log = logging.getLogger(__name__)
@@ -67,6 +70,108 @@ class DocItem(NamedTuple):
return "".join((self.base_url, self.relative_url_path))
+class QueueItem(NamedTuple):
+ """Contains a symbol and the BeautifulSoup object needed to parse it."""
+
+ symbol: DocItem
+ soup: BeautifulSoup
+
+ def __eq__(self, other: Union[QueueItem, DocItem]) -> bool:
+ if isinstance(other, DocItem):
+ return self.symbol == other
+ return tuple.__eq__(self, other)
+
+
+class CachedParser:
+ """
+ Get symbol markdown from pages, with all symbols on a page parsed and cached together.
+
+ DocItems are registered through the `add_item` method, which adds them to the `_page_symbols` dict.
+ `get_markdown` is used to fetch the markdown; the first time it is called for a page,
+ all of the page's symbols are queued to be parsed, avoiding repeated web requests to the same page.
+ """
+
+ def __init__(self):
+ self._queue: List[QueueItem] = []
+ self._results: Dict[DocItem, str] = {}
+ self._page_symbols: Dict[str, List[DocItem]] = defaultdict(list)
+ self._item_events: Dict[DocItem, asyncio.Event] = {}
+ self._parse_task: Optional[asyncio.Task] = None
+
+ async def get_markdown(self, client_session: ClientSession, doc_item: DocItem) -> str:
+ """
+ Get result markdown of `doc_item`.
+
+ If no symbols were fetched from `doc_item`'s page before,
+ the HTML has to be fetched before parsing can be queued.
+ """
+ if (markdown := self._results.get(doc_item)) is not None:
+ return markdown
+
+ if (symbols_to_queue := self._page_symbols.get(doc_item.url)) is not None:
+ async with client_session.get(doc_item.url) as response:
+ soup = BeautifulSoup(await response.text(encoding="utf8"), "lxml")
+
+ self._queue.extend(QueueItem(symbol, soup) for symbol in symbols_to_queue)
+ del self._page_symbols[doc_item.url]
+ log.debug(f"Added symbols from {doc_item.url} to parse queue.")
+
+ if self._parse_task is None:
+ self._parse_task = asyncio.create_task(self._parse_queue())
+
+ self._move_to_front(doc_item)
+ self._item_events[doc_item] = item_event = asyncio.Event()
+ await item_event.wait()
+ return self._results[doc_item]
+
+ async def _parse_queue(self) -> None:
+ """
+ Parse all items from the queue, setting associated events for symbols if present.
+
+ The coroutine will run as long as the queue is not empty, resetting `self._parse_task` to None when finished.
+ """
+ log.trace("Starting queue parsing.")
+ while self._queue:
+ item, soup = self._queue.pop()
+ self._results[item] = get_symbol_markdown(soup, item)
+ if (event := self._item_events.get(item)) is not None:
+ event.set()
+ await asyncio.sleep(0.1)
+
+ self._parse_task = None
+ log.trace("Finished parsing queue.")
+
+ def _move_to_front(self, item: Union[QueueItem, DocItem]) -> None:
+ """Move `item` to the front of the parse queue."""
+ # The parse queue stores soups along with the doc symbols in QueueItem objects,
+ # in case we're moving a DocItem we have to get the associated QueueItem first and then move it.
+ item_index = self._queue.index(item)
+ queue_item = self._queue[item_index]
+
+ del self._queue[item_index]
+ self._queue.append(queue_item)
+
+ def add_item(self, doc_item: DocItem) -> None:
+ """Add a DocItem to `_page_symbols`."""
+ self._page_symbols[doc_item.url].append(doc_item)
+
+ async def clear(self) -> None:
+ """
+ Clear all internal symbol data.
+
+ All currently requested items are waited on to finish parsing before clearing.
+ """
+ for event in self._item_events.values():
+ await event.wait()
+ if self._parse_task is not None:
+ self._parse_task.cancel()
+ self._parse_task = None
+ self._queue.clear()
+ self._results.clear()
+ self._page_symbols.clear()
+ self._item_events.clear()
+
+
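The custom `__eq__` above is what lets `_move_to_front` call `self._queue.index(item)` with a bare DocItem even though the queue holds QueueItems. A quick illustration, with made-up values not taken from this patch:

    from bs4 import BeautifulSoup

    item = DocItem("mypkg", "function", "https://example.com/", "api.html", "mypkg.func")
    queued = QueueItem(item, BeautifulSoup("<p></p>", "lxml"))

    assert queued == item             # a QueueItem compares equal to its DocItem
    assert [queued].index(item) == 0  # so list.index works with the bare DocItem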
class InventoryURL(commands.Converter):
"""
Represents an Intersphinx inventory URL.
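For reviewers, a minimal sketch of how the new pieces fit together; the `demo` coroutine and its items are hypothetical, not part of this patch. Symbols are registered up front with `add_item`, and the first `get_markdown` call for a page fetches its HTML once, queues all of its symbols, and starts the background parse task:

    from typing import List

    from aiohttp import ClientSession

    async def demo(items: List[DocItem]) -> None:
        fetcher = CachedParser()
        for item in items:
            fetcher.add_item(item)

        async with ClientSession() as session:
            # The first call fetches the page, queues every symbol on it and
            # starts _parse_queue(); later calls are served from the cache.
            print(await fetcher.get_markdown(session, items[0]))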
@@ -106,6 +211,7 @@ class DocCog(commands.Cog):
self.base_urls = {}
self.bot = bot
self.doc_symbols: Dict[str, DocItem] = {}
+ self.item_fetcher = CachedParser()
self.renamed_symbols = set()
self.bot.loop.create_task(self.init_refresh_inventory())
@@ -163,7 +269,10 @@ class DocCog(commands.Cog):
symbol = f"{api_package_name}.{symbol}"
self.renamed_symbols.add(symbol)
- self.doc_symbols[symbol] = DocItem(base_url, relative_doc_url, api_package_name, group_name)
+ relative_url_path, _, symbol_id = relative_doc_url.partition("#")
+ symbol_item = DocItem(api_package_name, group_name, base_url, relative_url_path, symbol_id)
+ self.doc_symbols[symbol] = symbol_item
+ self.item_fetcher.add_item(symbol_item)
log.trace(f"Fetched inventory for {api_package_name}.")
@@ -177,7 +286,7 @@ class DocCog(commands.Cog):
self.base_urls.clear()
self.doc_symbols.clear()
self.renamed_symbols.clear()
- async_cache.cache = OrderedDict()
+ await self.item_fetcher.clear()
# Run all coroutines concurrently - since each of them performs an HTTP
# request, this significantly speeds up fetching the inventory data.
@@ -198,12 +307,11 @@ class DocCog(commands.Cog):
if symbol_info is None:
return None
self.bot.stats.incr(f"doc_fetches.{symbol_info.package.lower()}")
- embed_description = await get_symbol_markdown(self.bot.http_session, symbol_info)
embed = discord.Embed(
title=discord.utils.escape_markdown(symbol),
url=f"{symbol_info.url}#{symbol_info.symbol_id}",
- description=embed_description
+ description=await self.item_fetcher.get_markdown(self.bot.http_session, symbol_info)
)
# Show all symbols with the same name that were renamed in the footer.
embed.set_footer(
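The handshake between `get_markdown` and `_parse_queue` is a per-item asyncio.Event: the requester parks on `wait()` while the background task parses and calls `set()`. A stripped-down sketch of the pattern, independent of the cog:

    import asyncio

    async def consumer(events: dict, results: dict, key: str) -> str:
        events[key] = event = asyncio.Event()
        await event.wait()              # parked until the producer handles `key`
        return results[key]

    async def producer(events: dict, results: dict, queue: list) -> None:
        while queue:
            key = queue.pop()
            results[key] = key.upper()  # stand-in for get_symbol_markdown
            if (event := events.get(key)) is not None:
                event.set()             # wake any waiting consumer
            await asyncio.sleep(0.1)    # yield control so the bot stays responsive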
diff --git a/bot/cogs/doc/parsing.py b/bot/cogs/doc/parsing.py
index 1271953d4..9fbce7bed 100644
--- a/bot/cogs/doc/parsing.py
+++ b/bot/cogs/doc/parsing.py
@@ -5,11 +5,9 @@ import textwrap
from functools import partial
from typing import Callable, List, Optional, TYPE_CHECKING, Tuple, Union
-from aiohttp import ClientSession
from bs4 import BeautifulSoup
from bs4.element import NavigableString, PageElement, Tag
-from .cache import async_cache
from .html import Strainer
from .markdown import markdownify
if TYPE_CHECKING:
@@ -171,16 +169,6 @@ def _parse_into_markdown(signatures: Optional[List[str]], description: str, url:
return formatted_markdown
-@async_cache(arg_offset=1)
-async def _get_soup_from_url(http_session: ClientSession, url: str) -> BeautifulSoup:
- """Create a BeautifulSoup object from the HTML data in `url` with the head tag removed."""
- log.trace(f"Sending a request to {url}.")
- async with http_session.get(url) as response:
- soup = BeautifulSoup(await response.text(encoding="utf8"), 'lxml')
- soup.find("head").decompose() # the head contains no useful data so we can remove it
- return soup
-
-
def _match_end_tag(tag: Tag) -> bool:
"""Matches `tag` if its class value is in `SEARCH_END_TAG_ATTRS` or the tag is table."""
for attr in _SEARCH_END_TAG_ATTRS:
@@ -190,44 +178,28 @@ def _match_end_tag(tag: Tag) -> bool:
return tag.name == "table"
-async def get_symbol_markdown(http_session: ClientSession, symbol_data: "DocItem") -> str:
+def get_symbol_markdown(soup: BeautifulSoup, symbol_data: "DocItem") -> str:
"""
- Return parsed markdown of the passed symbol, truncated to 1000 characters.
+ Return parsed markdown of the passed symbol, generated from `soup` and truncated to 1000 characters.
- A request through `http_session` is made to the url associated with `symbol_data` for the html contents;
- the contents are then parsed depending on what group the symbol belongs to.
+ The method of parsing and what information gets included depends on the symbol's group.
"""
- log.trace(f"Parsing symbol from url {symbol_data.url}.")
- if "#" in symbol_data.url:
- request_url, symbol_id = symbol_data.url.rsplit('#')
- else:
- request_url = symbol_data.url
- symbol_id = None
-
- soup = await _get_soup_from_url(http_session, request_url)
- symbol_heading = soup.find(id=symbol_id)
+ symbol_heading = soup.find(id=symbol_data.symbol_id)
signature = None
# Modules, doc pages and labels don't point to description list tags but to tags like divs;
# no special parsing can be done, so we only try to include what's under them.
if symbol_data.group in {"module", "doc", "label"}:
- log.trace("Symbol is a module, doc or a label; using general description parsing.")
description = _get_general_description(symbol_heading)
elif symbol_heading.name != "dt":
# Use the general parsing for symbols that aren't modules, docs or labels and aren't dt tags;
# log info so the tag can be looked at.
- log.info(
- f"Symbol heading at url {symbol_data.url} was not a dt tag or from known groups that lack it,"
- f"handling as general description."
- )
description = _get_general_description(symbol_heading)
elif symbol_data.group in _NO_SIGNATURE_GROUPS:
- log.trace("Symbol's group is in the group signature blacklist, skipping parsing of signature.")
description = _get_dd_description(symbol_heading)
else:
- log.trace("Parsing both signature and description of symbol.")
signature = _get_signatures(symbol_heading)
description = _get_dd_description(symbol_heading)
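Because `get_symbol_markdown` no longer does any I/O, it can be exercised synchronously against a hand-built soup. The HTML fixture and DocItem values here are invented for illustration:

    from bs4 import BeautifulSoup

    html = '<dl><dt id="mypkg.func">mypkg.func(x)</dt><dd>Do the thing.</dd></dl>'
    soup = BeautifulSoup(html, "lxml")
    item = DocItem("mypkg", "function", "https://example.com/", "api.html", "mypkg.func")
    print(get_symbol_markdown(soup, item))  # signature and description as markdown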